123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109 |
- # SPDX-License-Identifier: AGPL-3.0-or-later
- """Acfun search engine for searxng"""
- from urllib.parse import urlencode
- import re
- import json
- from datetime import datetime, timedelta
- from lxml import html
- from searx.utils import extract_text
# Engine metadata: describes the upstream site and how results are obtained.
about = {
    "website": "https://www.acfun.cn/",
    "wikidata_id": "Q3077675",
    # Results are scraped from the HTML search page, not fetched from an API.
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
    "language": "zh",
}

# Engine configuration: a paged video engine.
categories = ["videos"]
paging = True

# Root of every URL built by this module (search page, video page, player).
base_url = "https://www.acfun.cn"
def request(query, params):
    """Store the Acfun search URL for *query* in ``params["url"]``.

    The search keyword and the requested page number (``params["pageno"]``,
    sent as ``pCursor``) are URL-encoded into the query string of
    ``{base_url}/search``.  ``params`` is updated in place and returned.
    """
    query_string = urlencode({"keyword": query, "pCursor": params["pageno"]})
    params["url"] = f"{base_url}/search?{query_string}"
    return params
def response(resp):
    """Parse an Acfun search page into a list of video result dicts.

    The page embeds its content as JSON objects passed to
    ``bigPipe.onPageletArrive(...)`` calls.  Each such object is decoded, its
    ``"html"`` fragment is parsed, and every ``div`` whose class contains
    ``search-video`` is handed to :func:`extract_video_data`.

    Pagelets that cannot be decoded or parsed are skipped, so one malformed
    pagelet does not discard the results found in the others.

    Returns:
        A list of result dicts; only entries with a non-empty ``title`` and
        ``url`` are kept.  The list is empty when nothing matched.
    """
    # Function-scope import: only the ParserError class is needed, and the
    # module's top-level import list is left as it is.
    from lxml import etree  # pylint: disable=import-outside-toplevel

    results = []
    pagelets = re.findall(r'bigPipe\.onPageletArrive\((\{.*?\})\);', resp.text, re.DOTALL)

    for pagelet in pagelets:
        try:
            json_data = json.loads(pagelet)
        except json.JSONDecodeError:
            # The non-greedy regex may cut a payload short; ignore such captures.
            continue

        raw_html = json_data.get("html", "")
        # A missing, non-string or whitespace-only fragment has nothing to parse.
        if not isinstance(raw_html, str) or not raw_html.strip():
            continue

        try:
            tree = html.fromstring(raw_html)
        except etree.ParserError:
            # lxml raises ParserError ("Document is empty") for fragments that
            # contain no element at all.
            continue

        for video_block in tree.xpath('//div[contains(@class, "search-video")]'):
            video_info = extract_video_data(video_block)
            if video_info and video_info["title"] and video_info["url"]:
                results.append(video_info)

    return results
def _parse_duration(text):
    """Convert an ``M:S`` or ``H:M:S`` clock string to a ``timedelta``, or ``None`` if malformed."""
    try:
        fields = [int(field) for field in text.strip().split(":")]
    except (AttributeError, ValueError):
        return None
    if len(fields) not in (2, 3) or any(value < 0 for value in fields):
        return None
    # Left-pad with zero hours so both accepted shapes unpack the same way.
    hours, minutes, seconds = [0] * (3 - len(fields)) + fields
    return timedelta(hours=hours, minutes=minutes, seconds=seconds)


def extract_video_data(video_block):
    """Build one result dict from a ``search-video`` element.

    The element's ``data-exposure-log`` attribute is decoded as JSON and its
    ``content_id`` and ``title`` keys are read; the video and player URLs are
    derived from ``content_id``.  Creation date, duration, thumbnail and
    description are read from child elements and are all optional.

    Args:
        video_block: lxml element for a single search hit.

    Returns:
        A dict with the keys ``title``, ``url``, ``content``, ``thumbnail``,
        ``length``, ``publishedDate`` and ``iframe_src``, or ``None`` when the
        exposure log is missing or invalid or carries no ``content_id``.
        ``length`` and ``publishedDate`` are ``None`` when absent or
        unparsable; ``thumbnail`` is ``""`` when the block has no cover image.
    """
    try:
        video_data = json.loads(video_block.get('data-exposure-log'))

        content_id = video_data.get("content_id", "")
        if not content_id:
            # Without an id the URL would be the meaningless ".../v/ac".
            return None
        title = video_data.get("title", "")

        url = f"{base_url}/v/ac{content_id}"
        iframe_src = f"{base_url}/player/ac{content_id}"

        create_time = extract_text(video_block.xpath('.//span[contains(@class, "info__create-time")]'))
        video_duration = extract_text(video_block.xpath('.//span[contains(@class, "video__duration")]'))
        video_intro = extract_text(video_block.xpath('.//div[contains(@class, "video__main__intro")]'))

        # The cover image is optional: indexing an empty xpath result would
        # raise IndexError, which the handler below does not catch.
        cover_sources = video_block.xpath('.//div[contains(@class, "video__cover")]/a/img/@src')
        video_cover = extract_text(cover_sources[0]) if cover_sources else ""

        # Best effort: a date in any other format simply leaves the field empty.
        try:
            published_date = datetime.strptime(create_time.strip(), "%Y-%m-%d")
        except (AttributeError, TypeError, ValueError):
            published_date = None

        length = _parse_duration(video_duration) if video_duration else None

        return {
            "title": title,
            "url": url,
            "content": video_intro,
            "thumbnail": video_cover,
            "length": length,
            "publishedDate": published_date,
            "iframe_src": iframe_src,
        }
    except (json.JSONDecodeError, AttributeError, TypeError, ValueError):
        # Missing attribute (json.loads(None) -> TypeError), invalid JSON, or a
        # non-object payload: treat the block as not being a usable result.
        return None
|