# SPDX-License-Identifier: AGPL-3.0-or-later
"""Acfun search engine for searxng"""

from urllib.parse import urlencode
import re
import json
from datetime import datetime, timedelta

from lxml import html

from searx.utils import extract_text

# Metadata
about = {
    "website": "https://www.acfun.cn/",
    "wikidata_id": "Q3077675",
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
    "language": "zh",
}

# Engine Configuration
categories = ["videos"]
paging = True

# Base URL
base_url = "https://www.acfun.cn"

def request(query, params):
    query_params = {"keyword": query, "pCursor": params["pageno"]}
    params["url"] = f"{base_url}/search?{urlencode(query_params)}"
    return params
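

# Illustrative only: with request() above, the query "anime" on page 2 would set
# params["url"] to "https://www.acfun.cn/search?keyword=anime&pCursor=2".
# pCursor appears to act as a 1-based page cursor; that reading is inferred from
# the paging flag above, not from any documented Acfun API.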

def response(resp):
    results = []

    # Acfun renders its search page through BigPipe: each pagelet arrives as a
    # bigPipe.onPageletArrive({...}) call whose JSON payload carries the actual
    # result markup in an "html" field.
    matches = re.findall(r'bigPipe\.onPageletArrive\((\{.*?\})\);', resp.text, re.DOTALL)
    if not matches:
        return results

    for match in matches:
        try:
            json_data = json.loads(match)
            raw_html = json_data.get("html", "")
            if not raw_html:
                continue

            tree = html.fromstring(raw_html)

            video_blocks = tree.xpath('//div[contains(@class, "search-video")]')
            if not video_blocks:
                continue

            for video_block in video_blocks:
                video_info = extract_video_data(video_block)
                if video_info and video_info["title"] and video_info["url"]:
                    results.append(video_info)

        except json.JSONDecodeError:
            continue

    return results
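

# For orientation, one matched BigPipe payload roughly looks like this (a
# hand-written sketch inferred from the parsing above, not an official schema):
#   {"html": "<div class=\"search-video\" data-exposure-log='{...}'>...</div>", ...}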

def extract_video_data(video_block):
    try:
        # Each result block carries its metadata as a JSON string in the
        # data-exposure-log attribute.
        data_exposure_log = video_block.get('data-exposure-log')
        video_data = json.loads(data_exposure_log)

        content_id = video_data.get("content_id", "")
        title = video_data.get("title", "")

        url = f"{base_url}/v/ac{content_id}"
        iframe_src = f"{base_url}/player/ac{content_id}"

        create_time = extract_text(video_block.xpath('.//span[contains(@class, "info__create-time")]'))
        # Guard the cover lookup: indexing an empty XPath result with [0] would
        # raise an IndexError, which the except clause below does not catch.
        cover_srcs = video_block.xpath('.//div[contains(@class, "video__cover")]/a/img/@src')
        video_cover = cover_srcs[0] if cover_srcs else None
        video_duration = extract_text(video_block.xpath('.//span[contains(@class, "video__duration")]'))
        video_intro = extract_text(video_block.xpath('.//div[contains(@class, "video__main__intro")]'))

        published_date = None
        if create_time:
            try:
                published_date = datetime.strptime(create_time.strip(), "%Y-%m-%d")
            except (ValueError, TypeError):
                pass

        length = None
        if video_duration:
            try:
                # Durations are "MM:SS"; an "H:MM:SS" string (videos of an hour
                # or longer) fails to parse here and leaves length as None.
                timediff = datetime.strptime(video_duration.strip(), "%M:%S")
                length = timedelta(minutes=timediff.minute, seconds=timediff.second)
            except (ValueError, TypeError):
                pass

        return {
            "title": title,
            "url": url,
            "content": video_intro,
            "thumbnail": video_cover,
            "length": length,
            "publishedDate": published_date,
            "iframe_src": iframe_src,
        }

    except (json.JSONDecodeError, AttributeError, TypeError, ValueError):
        return None
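

# A minimal local smoke test, assuming a searxng checkout (for searx.utils) and
# lxml are available; the markup below is a hand-made stand-in for one
# "search-video" block, not real Acfun HTML.
if __name__ == "__main__":
    sample_block = html.fromstring(
        '<div class="search-video" '
        'data-exposure-log=\'{"content_id": "123456", "title": "demo"}\'>'
        '<span class="info__create-time">2024-01-01</span>'
        '<span class="video__duration">03:45</span>'
        '<div class="video__main__intro">intro text</div>'
        "</div>"
    )
    # Expected: a dict with url ".../v/ac123456", a 3m45s length and a parsed
    # publishedDate; thumbnail is None because the sample carries no cover image.
    print(extract_video_data(sample_block))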