# SPDX-License-Identifier: AGPL-3.0-or-later
"""Quark (Shenma) search engine for searxng"""

from urllib.parse import urlencode
from datetime import datetime
import re
import json

from searx.utils import html_to_text, gen_useragent
from searx.exceptions import SearxEngineAPIException, SearxEngineCaptchaException

# Metadata
about = {
    "website": "https://quark.sm.cn/",
    "wikidata_id": "Q48816502",
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
    "language": "zh",
}

# Engine Configuration
categories = []
paging = True
results_per_page = 10
quark_category = 'general'
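
# quark_category selects which Quark endpoint this module queries ('general' or 'images').
# It is normally overridden per engine instance in settings.yml; an entry might look roughly
# like this (illustrative sketch, not copied from the SearXNG documentation):
#
#   - name: quark images
#     engine: quark
#     quark_category: images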

time_range_support = True
time_range_dict = {'day': '4', 'week': '3', 'month': '2', 'year': '1'}
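
# When Quark rate-limits a client it serves an Alibaba/X5SEC CAPTCHA page that embeds a JSON
# fragment roughly of the form {"action": "captcha", "url": "https://..."} (illustrative values);
# the pattern below matches that fragment and captures the verification URL.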
CAPTCHA_PATTERN = r'\{[^{]*?"action"\s*:\s*"captcha"\s*,\s*"url"\s*:\s*"([^"]+)"[^{]*?\}'


def is_alibaba_captcha(html):
    """
    Detects if the response contains an Alibaba X5SEC CAPTCHA page.
    Quark may return a CAPTCHA challenge after 9 requests in a short period.
    Typically, the ban duration is around 15 minutes.
    """
    return bool(re.search(CAPTCHA_PATTERN, html))


def init(_):
    if quark_category not in ('general', 'images'):
        raise SearxEngineAPIException(f"Unsupported category: {quark_category}")


def request(query, params):
    page_num = params["pageno"]
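
    # The two endpoints paginate differently: the HTML search endpoint takes a 1-based page
    # number, while the image API takes an offset ("start") and a page size ("limit").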
    category_config = {
        'general': {
            'endpoint': 'https://quark.sm.cn/s',
            'params': {
                "q": query,
                "layout": "html",
                "page": page_num,
            },
        },
        'images': {
            'endpoint': 'https://vt.sm.cn/api/pic/list',
            'params': {
                "query": query,
                "limit": results_per_page,
                "start": (page_num - 1) * results_per_page,
            },
        },
    }

    query_params = category_config[quark_category]['params']
    query_url = category_config[quark_category]['endpoint']

    if time_range_dict.get(params['time_range']) and quark_category == 'general':
        query_params["tl_request"] = time_range_dict.get(params['time_range'])

    params["url"] = f"{query_url}?{urlencode(query_params)}"
- params["headers"] = {
- "User-Agent": gen_useragent(),
- }
- return params
- def response(resp):
- results = []
- text = resp.text
- if is_alibaba_captcha(text):
- raise SearxEngineCaptchaException(
- suspended_time=900, message="Alibaba CAPTCHA detected. Please try again later."
- )
- if quark_category == 'images':
- data = json.loads(text)
- for item in data.get('data', {}).get('hit', {}).get('imgInfo', {}).get('item', []):
- try:
- published_date = datetime.fromtimestamp(int(item.get("publish_time")))
- except (ValueError, TypeError):
- published_date = None
- results.append(
- {
- "template": "images.html",
- "url": item.get("imgUrl"),
- "thumbnail_src": item.get("img"),
- "img_src": item.get("bigPicUrl"),
- "title": item.get("title"),
- "source": item.get("site"),
- "resolution": f"{item['width']} x {item['height']}",
- "publishedDate": published_date,
- }
- )
- if quark_category == 'general':
- # Quark returns a variety of different sc values on a single page, depending on the query type.
- source_category_parsers = {
- 'addition': parse_addition,
- 'ai_page': parse_ai_page,
- 'baike_sc': parse_baike_sc,
- 'finance_shuidi': parse_finance_shuidi,
- 'kk_yidian_all': parse_kk_yidian_all,
- 'life_show_general_image': parse_life_show_general_image,
- 'med_struct': parse_med_struct,
- 'music_new_song': parse_music_new_song,
- 'nature_result': parse_nature_result,
- 'news_uchq': parse_news_uchq,
- 'ss_note': parse_ss_note,
- # ss_kv, ss_pic, ss_text, ss_video, baike, structure_web_novel use the same struct as ss_doc
- 'ss_doc': parse_ss_doc,
- 'ss_kv': parse_ss_doc,
- 'ss_pic': parse_ss_doc,
- 'ss_text': parse_ss_doc,
- 'ss_video': parse_ss_doc,
- 'baike': parse_ss_doc,
- 'structure_web_novel': parse_ss_doc,
- 'travel_dest_overview': parse_travel_dest_overview,
- 'travel_ranking_list': parse_travel_ranking_list,
- }
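
        # Each organic result is embedded in the page as a hydration <script> block whose JSON body
        # looks roughly like {"data": {"initialData": {...}}, "extraData": {"sc": "ss_doc"}}
        # (field names taken from the extraction below); the parser is chosen by the "sc" value and
        # receives the "initialData" payload.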
        pattern = r'<script\s+type="application/json"\s+id="s-data-[^"]+"\s+data-used-by="hydrate">(.*?)</script>'
        matches = re.findall(pattern, text, re.DOTALL)

        for match in matches:
            data = json.loads(match)
            initial_data = data.get('data', {}).get('initialData', {})
            extra_data = data.get('extraData', {})
            source_category = extra_data.get('sc')

            parsers = source_category_parsers.get(source_category)
            if parsers:
                parsed_results = parsers(initial_data)
                if isinstance(parsed_results, list):
                    # Extend if the result is a list
                    results.extend(parsed_results)
                else:
                    # Append if it's a single result
                    results.append(parsed_results)

    return results
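

# Parsers for the individual "sc" payload types. Each parser receives the "initialData" dict of
# one hydration block and returns either a single result dict or a list of result dicts.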
def parse_addition(data):
    return {
        "title": html_to_text(data.get('title', {}).get('content')),
        "url": data.get('source', {}).get('url'),
        "content": html_to_text(data.get('summary', {}).get('content')),
    }


def parse_ai_page(data):
    results = []
    for item in data.get('list', []):
        content = (
            " | ".join(map(str, item.get('content', [])))
            if isinstance(item.get('content'), list)
            else str(item.get('content'))
        )
        try:
            published_date = datetime.fromtimestamp(int(item.get('source', {}).get('time')))
        except (ValueError, TypeError):
            published_date = None

        results.append(
            {
                "title": html_to_text(item.get('title')),
                "url": item.get('url'),
                "content": html_to_text(content),
                "publishedDate": published_date,
            }
        )
    return results


def parse_baike_sc(data):
    return {
        "title": html_to_text(data.get('data', {}).get('title')),
        "url": data.get('data', {}).get('url'),
        "content": html_to_text(data.get('data', {}).get('abstract')),
        "thumbnail": data.get('data', {}).get('img').replace("http://", "https://"),
    }


def parse_finance_shuidi(data):
    content = " | ".join(
        (
            info
            for info in [
                data.get('establish_time'),
                data.get('company_status'),
                data.get('controled_type'),
                data.get('company_type'),
                data.get('capital'),
                data.get('address'),
                data.get('business_scope'),
            ]
            if info
        )
    )

    return {
        "title": html_to_text(data.get('company_name')),
        "url": data.get('title_url'),
        "content": html_to_text(content),
    }


def parse_kk_yidian_all(data):
    content_list = []
    for section in data.get('list_container', []):
        for item in section.get('list_container', []):
            if 'dot_text' in item:
                content_list.append(item['dot_text'])

    return {
        "title": html_to_text(data.get('title')),
        "url": data.get('title_url'),
        "content": html_to_text(' '.join(content_list)),
    }


def parse_life_show_general_image(data):
    results = []
    for item in data.get('image', []):
        try:
            published_date = datetime.fromtimestamp(int(item.get("publish_time")))
        except (ValueError, TypeError):
            published_date = None

        results.append(
            {
                "template": "images.html",
                "url": item.get("imgUrl"),
                "thumbnail_src": item.get("img"),
                "img_src": item.get("bigPicUrl"),
                "title": item.get("title"),
                "source": item.get("site"),
                "resolution": f"{item['width']} x {item['height']}",
                "publishedDate": published_date,
            }
        )
    return results


def parse_med_struct(data):
    return {
        "title": html_to_text(data.get('title')),
        "url": data.get('message', {}).get('statistics', {}).get('nu'),
        "content": html_to_text(data.get('message', {}).get('content_text')),
        "thumbnail": data.get('message', {}).get('video_img').replace("http://", "https://"),
    }


def parse_music_new_song(data):
    results = []
    for item in data.get('hit3', []):
        results.append(
            {
                "title": f"{item['song_name']} | {item['song_singer']}",
                "url": item.get("play_url"),
                "content": html_to_text(item.get("lyrics")),
                "thumbnail": item.get("image_url").replace("http://", "https://"),
            }
        )
    return results


def parse_nature_result(data):
    return {"title": html_to_text(data.get('title')), "url": data.get('url'), "content": html_to_text(data.get('desc'))}


def parse_news_uchq(data):
    results = []
    for item in data.get('feed', []):
        try:
            published_date = datetime.strptime(item.get('time'), "%Y-%m-%d")
        except (ValueError, TypeError):
            # Quark sometimes returns a non-standard relative time such as "1天前" ("1 day ago");
            # leave published_date as None in that case
            published_date = None

        results.append(
            {
                "title": html_to_text(item.get('title')),
                "url": item.get('url'),
                "content": html_to_text(item.get('summary')),
                "thumbnail": item.get('image').replace("http://", "https://"),
                "publishedDate": published_date,
            }
        )
    return results
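

# parse_ss_doc handles several structurally similar "sc" types (ss_doc, ss_kv, ss_pic, ss_text,
# ss_video, baike, structure_web_novel); the or-fallbacks below cover the known field-name variants.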
def parse_ss_doc(data):
    published_date = None
    try:
        timestamp = int(data.get('sourceProps', {}).get('time'))
        # Quark sometimes returns 0 as the timestamp; leave published_date as None in that case
        if timestamp != 0:
            published_date = datetime.fromtimestamp(timestamp)
    except (ValueError, TypeError):
        pass

    try:
        thumbnail = data.get('picListProps', [])[0].get('src').replace("http://", "https://")
    except (ValueError, TypeError, IndexError):
        thumbnail = None

    return {
        "title": html_to_text(
            data.get('titleProps', {}).get('content')
            # ss_kv variant 1 & 2
            or data.get('title')
        ),
        "url": data.get('sourceProps', {}).get('dest_url')
        # ss_kv variant 1
        or data.get('normal_url')
        # ss_kv variant 2
        or data.get('url'),
        "content": html_to_text(
            data.get('summaryProps', {}).get('content')
            # ss_doc variant 1
            or data.get('message', {}).get('replyContent')
            # ss_kv variant 1
            or data.get('show_body')
            # ss_kv variant 2
            or data.get('desc')
        ),
        "publishedDate": published_date,
        "thumbnail": thumbnail,
    }


def parse_ss_note(data):
    try:
        published_date = datetime.fromtimestamp(int(data.get('source', {}).get('time')))
    except (ValueError, TypeError):
        published_date = None

    return {
        "title": html_to_text(data.get('title', {}).get('content')),
        "url": data.get('source', {}).get('dest_url'),
        "content": html_to_text(data.get('summary', {}).get('content')),
        "publishedDate": published_date,
    }


def parse_travel_dest_overview(data):
    return {
        "title": html_to_text(data.get('strong', {}).get('title')),
        "url": data.get('strong', {}).get('baike_url'),
        "content": html_to_text(data.get('strong', {}).get('baike_text')),
    }


def parse_travel_ranking_list(data):
    return {
        "title": html_to_text(data.get('title', {}).get('text')),
        "url": data.get('title', {}).get('url'),
        "content": html_to_text(data.get('title', {}).get('title_tag')),
    }