# SPDX-License-Identifier: AGPL-3.0-or-later
"""Quark (Shenma) search engine for searxng"""

from urllib.parse import urlencode
from datetime import datetime
import re
import json

from searx.utils import html_to_text, gen_useragent
from searx.exceptions import SearxEngineAPIException, SearxEngineCaptchaException

# Metadata
about = {
    "website": "https://m.quark.cn/",
    "wikidata_id": "Q48816502",
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
    "language": "zh",
}

# Engine Configuration
categories = []
paging = True
results_per_page = 10

quark_category = 'general'

time_range_support = True
time_range_dict = {'day': '4', 'week': '3', 'month': '2', 'year': '1'}
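
# The module can be wired into SearXNG's settings.yml once for web results and once
# for image results by overriding ``quark_category``. A minimal sketch; the entry
# names and shortcuts below are assumptions, not taken from this file:
#
#   - name: quark
#     engine: quark
#     shortcut: qk
#     categories: [general]
#
#   - name: quark images
#     engine: quark
#     shortcut: qki
#     categories: [images]
#     quark_category: images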

CAPTCHA_PATTERN = r'\{[^{]*?"action"\s*:\s*"captcha"\s*,\s*"url"\s*:\s*"([^"]+)"[^{]*?\}'
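# CAPTCHA_PATTERN looks for an inline JSON fragment embedded in the returned HTML.
# A matching fragment would look roughly like this (illustrative, reconstructed from
# the regex itself rather than captured from a live response):
#   {"action": "captcha", "url": "https://x5sec.example/validate"}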


def is_alibaba_captcha(html):
    """
    Detects if the response contains an Alibaba X5SEC CAPTCHA page.

    Quark may return a CAPTCHA challenge after 9 requests in a short period.

    Typically, the ban duration is around 15 minutes.
    """
    return bool(re.search(CAPTCHA_PATTERN, html))


def init(_):
    if quark_category not in ('general', 'images'):
        raise SearxEngineAPIException(f"Unsupported category: {quark_category}")


def request(query, params):
    page_num = params["pageno"]

    category_config = {
        'general': {
            'endpoint': 'https://m.quark.cn/s',
            'params': {
                "q": query,
                "layout": "html",
                "page": page_num,
            },
        },
        'images': {
            'endpoint': 'https://vt.sm.cn/api/pic/list',
            'params': {
                "query": query,
                "limit": results_per_page,
                "start": (page_num - 1) * results_per_page,
            },
        },
    }

    query_params = category_config[quark_category]['params']
    query_url = category_config[quark_category]['endpoint']

    if time_range_dict.get(params['time_range']) and quark_category == 'general':
        query_params["tl_request"] = time_range_dict.get(params['time_range'])

    params["url"] = f"{query_url}?{urlencode(query_params)}"
    params["headers"] = {
        "User-Agent": gen_useragent(),
    }
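    # For the default 'general' category and page 1, the generated URL looks like
    # (illustrative query, derived from the parameters above):
    #   https://m.quark.cn/s?q=searxng&layout=html&page=1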
    return params


def response(resp):
    results = []
    text = resp.text

    if is_alibaba_captcha(text):
        raise SearxEngineCaptchaException(
            suspended_time=900, message="Alibaba CAPTCHA detected. Please try again later."
        )
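
    # In the 'images' category the endpoint returns JSON; the items consumed below sit
    # under data -> hit -> imgInfo -> item (field names taken from this parser; the
    # full structure of the live payload is not documented here).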
    if quark_category == 'images':
        data = json.loads(text)
        for item in data.get('data', {}).get('hit', {}).get('imgInfo', {}).get('item', []):
            try:
                published_date = datetime.fromtimestamp(int(item.get("publish_time")))
            except (ValueError, TypeError):
                published_date = None

            results.append(
                {
                    "template": "images.html",
                    "url": item.get("imgUrl"),
                    "thumbnail_src": item.get("img"),
                    "img_src": item.get("bigPicUrl"),
                    "title": item.get("title"),
                    "source": item.get("site"),
                    "resolution": f"{item['width']} x {item['height']}",
                    "publishedDate": published_date,
                }
            )
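
    # In the 'general' category the results are embedded in the HTML as hydrate
    # <script> blocks; each block decodes to JSON shaped roughly like this
    # (illustrative, field names taken from the code below):
    #   {"data": {"initialData": {...}}, "extraData": {"sc": "ss_doc"}}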
    if quark_category == 'general':
        # Quark returns a variety of different sc values on a single page, depending on the query type.
        source_category_parsers = {
            'addition': parse_addition,
            'ai_page': parse_ai_page,
            'baike_sc': parse_baike_sc,
            'finance_shuidi': parse_finance_shuidi,
            'kk_yidian_all': parse_kk_yidian_all,
            'life_show_general_image': parse_life_show_general_image,
            'med_struct': parse_med_struct,
            'music_new_song': parse_music_new_song,
            'nature_result': parse_nature_result,
            'news_uchq': parse_news_uchq,
            'ss_note': parse_ss_note,
            # ss_kv, ss_pic, ss_text, ss_video, baike, structure_web_novel use the same struct as ss_doc
            'ss_doc': parse_ss_doc,
            'ss_kv': parse_ss_doc,
            'ss_pic': parse_ss_doc,
            'ss_text': parse_ss_doc,
            'ss_video': parse_ss_doc,
            'baike': parse_ss_doc,
            'structure_web_novel': parse_ss_doc,
            'travel_dest_overview': parse_travel_dest_overview,
            'travel_ranking_list': parse_travel_ranking_list,
        }

        pattern = r'<script\s+type="application/json"\s+id="s-data-[^"]+"\s+data-used-by="hydrate">(.*?)</script>'
        matches = re.findall(pattern, text, re.DOTALL)

        for match in matches:
            data = json.loads(match)
            initial_data = data.get('data', {}).get('initialData', {})
            extra_data = data.get('extraData', {})

            source_category = extra_data.get('sc')

            parsers = source_category_parsers.get(source_category)
            if parsers:
                parsed_results = parsers(initial_data)
                if isinstance(parsed_results, list):
                    # Extend if the result is a list
                    results.extend(parsed_results)
                else:
                    # Append if it's a single result
                    results.append(parsed_results)

    return results


def parse_addition(data):
    return {
        "title": html_to_text(data.get('title', {}).get('content')),
        "url": data.get('source', {}).get('url'),
        "content": html_to_text(data.get('summary', {}).get('content')),
    }


def parse_ai_page(data):
    results = []
    for item in data.get('list', []):
        content = (
            " | ".join(map(str, item.get('content', [])))
            if isinstance(item.get('content'), list)
            else str(item.get('content'))
        )
        try:
            published_date = datetime.fromtimestamp(int(item.get('source', {}).get('time')))
        except (ValueError, TypeError):
            published_date = None
        results.append(
            {
                "title": html_to_text(item.get('title')),
                "url": item.get('url'),
                "content": html_to_text(content),
                "publishedDate": published_date,
            }
        )
    return results


def parse_baike_sc(data):
    return {
        "title": html_to_text(data.get('data', {}).get('title')),
        "url": data.get('data', {}).get('url'),
        "content": html_to_text(data.get('data', {}).get('abstract')),
        "thumbnail": data.get('data', {}).get('img').replace("http://", "https://"),
    }


def parse_finance_shuidi(data):
    content = " | ".join(
        (
            info
            for info in [
                data.get('establish_time'),
                data.get('company_status'),
                data.get('controled_type'),
                data.get('company_type'),
                data.get('capital'),
                data.get('address'),
                data.get('business_scope'),
            ]
            if info
        )
    )

    return {
        "title": html_to_text(data.get('company_name')),
        "url": data.get('title_url'),
        "content": html_to_text(content),
    }


def parse_kk_yidian_all(data):
    content_list = []
    for section in data.get('list_container', []):
        for item in section.get('list_container', []):
            if 'dot_text' in item:
                content_list.append(item['dot_text'])

    return {
        "title": html_to_text(data.get('title')),
        "url": data.get('title_url'),
        "content": html_to_text(' '.join(content_list)),
    }


def parse_life_show_general_image(data):
    results = []
    for item in data.get('image', []):
        try:
            published_date = datetime.fromtimestamp(int(item.get("publish_time")))
        except (ValueError, TypeError):
            published_date = None

        results.append(
            {
                "template": "images.html",
                "url": item.get("imgUrl"),
                "thumbnail_src": item.get("img"),
                "img_src": item.get("bigPicUrl"),
                "title": item.get("title"),
                "source": item.get("site"),
                "resolution": f"{item['width']} x {item['height']}",
                "publishedDate": published_date,
            }
        )
    return results


def parse_med_struct(data):
    return {
        "title": html_to_text(data.get('title')),
        "url": data.get('message', {}).get('statistics', {}).get('nu'),
        "content": html_to_text(data.get('message', {}).get('content_text')),
        "thumbnail": data.get('message', {}).get('video_img').replace("http://", "https://"),
    }


def parse_music_new_song(data):
    results = []
    for item in data.get('hit3', []):
        results.append(
            {
                "title": f"{item['song_name']} | {item['song_singer']}",
                "url": item.get("play_url"),
                "content": html_to_text(item.get("lyrics")),
                "thumbnail": item.get("image_url").replace("http://", "https://"),
            }
        )
    return results


def parse_nature_result(data):
    return {"title": html_to_text(data.get('title')), "url": data.get('url'), "content": html_to_text(data.get('desc'))}


def parse_news_uchq(data):
    results = []
    for item in data.get('feed', []):
        try:
            published_date = datetime.strptime(item.get('time'), "%Y-%m-%d")
        except (ValueError, TypeError):
            # Sometimes Quark returns a non-standard format such as "1天前" ("1 day ago"); set published_date to None
            published_date = None

        results.append(
            {
                "title": html_to_text(item.get('title')),
                "url": item.get('url'),
                "content": html_to_text(item.get('summary')),
                "thumbnail": item.get('image').replace("http://", "https://"),
                "publishedDate": published_date,
            }
        )
    return results


def parse_ss_doc(data):
    published_date = None
    try:
        timestamp = int(data.get('sourceProps', {}).get('time'))
        # Sometimes Quark returns 0; keep published_date as None in that case
        if timestamp != 0:
            published_date = datetime.fromtimestamp(timestamp)
    except (ValueError, TypeError):
        pass

    try:
        thumbnail = data.get('picListProps', [])[0].get('src').replace("http://", "https://")
    except (ValueError, TypeError, IndexError):
        thumbnail = None

    return {
        "title": html_to_text(
            data.get('titleProps', {}).get('content')
            # ss_kv variant 1 & 2
            or data.get('title')
        ),
        "url": data.get('sourceProps', {}).get('dest_url')
        # ss_kv variant 1
        or data.get('normal_url')
        # ss_kv variant 2
        or data.get('url'),
        "content": html_to_text(
            data.get('summaryProps', {}).get('content')
            # ss_doc variant 1
            or data.get('message', {}).get('replyContent')
            # ss_kv variant 1
            or data.get('show_body')
            # ss_kv variant 2
            or data.get('desc')
        ),
        "publishedDate": published_date,
        "thumbnail": thumbnail,
    }


def parse_ss_note(data):
    try:
        published_date = datetime.fromtimestamp(int(data.get('source', {}).get('time')))
    except (ValueError, TypeError):
        published_date = None

    return {
        "title": html_to_text(data.get('title', {}).get('content')),
        "url": data.get('source', {}).get('dest_url'),
        "content": html_to_text(data.get('summary', {}).get('content')),
        "publishedDate": published_date,
    }


def parse_travel_dest_overview(data):
    return {
        "title": html_to_text(data.get('strong', {}).get('title')),
        "url": data.get('strong', {}).get('baike_url'),
        "content": html_to_text(data.get('strong', {}).get('baike_text')),
    }


def parse_travel_ranking_list(data):
    return {
        "title": html_to_text(data.get('title', {}).get('text')),
        "url": data.get('title', {}).get('url'),
        "content": html_to_text(data.get('title', {}).get('title_tag')),
    }