acfun.py

# SPDX-License-Identifier: AGPL-3.0-or-later
"""Acfun search engine for searxng"""

from urllib.parse import urlencode
import re
import json
from datetime import datetime, timedelta
from lxml import html

from searx.utils import extract_text

# Metadata
about = {
    "website": "https://www.acfun.cn/",
    "wikidata_id": "Q3077675",
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
}

# Engine Configuration
categories = ["videos"]
paging = True

# Base URL
base_url = "https://www.acfun.cn"


def request(query, params):
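    # SearXNG's page number is passed straight through as AcFun's
    # "pCursor" paging parameter.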
    query_params = {"keyword": query, "pCursor": params["pageno"]}
    params["url"] = f"{base_url}/search?{urlencode(query_params)}"
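    # e.g. the query "東方" on page 2 produces
    #   https://www.acfun.cn/search?keyword=%E6%9D%B1%E6%96%B9&pCursor=2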
    return params


def response(resp):
    results = []
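
    # AcFun renders its search page through BigPipe: each pagelet arrives as
    # a bigPipe.onPageletArrive({...}) call whose JSON payload carries the
    # result markup in an "html" field.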
    matches = re.findall(r'bigPipe\.onPageletArrive\((\{.*?\})\);', resp.text, re.DOTALL)
    if not matches:
        return results

    for match in matches:
        try:
            json_data = json.loads(match)
            raw_html = json_data.get("html", "")
            if not raw_html:
                continue

            tree = html.fromstring(raw_html)

            video_blocks = tree.xpath('//div[contains(@class, "search-video")]')
            if not video_blocks:
                continue

            for video_block in video_blocks:
                video_info = extract_video_data(video_block)
                if video_info and video_info["title"] and video_info["url"]:
                    results.append(video_info)

        except json.JSONDecodeError:
            continue

    return results


def extract_video_data(video_block):
    try:
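        # Each result block exposes its metadata (content id, title) as JSON
        # in its data-exposure-log attribute.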
        data_exposure_log = video_block.get('data-exposure-log')
        video_data = json.loads(data_exposure_log)

        content_id = video_data.get("content_id", "")
        title = video_data.get("title", "")
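
        # The watch page lives at /v/ac<id>, the embeddable player at
        # /player/ac<id>.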
        url = f"{base_url}/v/ac{content_id}"
        iframe_src = f"{base_url}/player/ac{content_id}"

        create_time = extract_text(video_block.xpath('.//span[contains(@class, "info__create-time")]'))
        # extract_text() flattens the node list itself; indexing it with [0]
        # would raise an uncaught IndexError for results without a cover image.
        video_cover = extract_text(video_block.xpath('.//div[contains(@class, "video__cover")]/a/img/@src'))
        video_duration = extract_text(video_block.xpath('.//span[contains(@class, "video__duration")]'))
        video_intro = extract_text(video_block.xpath('.//div[contains(@class, "video__main__intro")]'))

        published_date = None
        if create_time:
            try:
                published_date = datetime.strptime(create_time.strip(), "%Y-%m-%d")
            except (ValueError, TypeError):
                pass

        length = None
        if video_duration:
            try:
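                # Durations are rendered as MM:SS; anything that does not
                # match (e.g. an H:MM:SS value for long videos) fails this
                # parse and leaves length as None.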
                timediff = datetime.strptime(video_duration.strip(), "%M:%S")
                length = timedelta(minutes=timediff.minute, seconds=timediff.second)
            except (ValueError, TypeError):
                pass

        return {
            "title": title,
            "url": url,
            "content": video_intro,
            "thumbnail": video_cover,
            "length": length,
            "publishedDate": published_date,
            "iframe_src": iframe_src,
        }

    except (json.JSONDecodeError, AttributeError, TypeError, ValueError):
        return None
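

# A minimal offline sketch of the parsing pipeline, assuming a SearXNG
# checkout on PYTHONPATH (for searx.utils). The FakeResponse class and the
# embedded sample payload are illustrative stand-ins, not part of the
# engine interface.
if __name__ == "__main__":
    class FakeResponse:
        # Mimics the single attribute that response() reads.
        text = r'''bigPipe.onPageletArrive({"html": "<div class=\"search-video\" data-exposure-log='{\"content_id\": \"123\", \"title\": \"demo\"}'></div>"});'''

    for item in response(FakeResponse()):
        print(item["title"], "->", item["url"])
    # expected output: demo -> https://www.acfun.cn/v/ac123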