# sogou_wechat.py 2.0 KB
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
"""Sogou-WeChat search engine for retrieving WeChat articles from Sogou."""
  3. from urllib.parse import urlencode
  4. from datetime import datetime
  5. import re
  6. from lxml import html
  7. from searx.utils import extract_text
  8. # Metadata
  9. about = {
  10. "website": "https://weixin.sogou.com/",
  11. "use_official_api": False,
  12. "require_api_key": False,
  13. "results": "HTML",
  14. "language": "zh",
  15. }
  16. # Engine Configuration
  17. categories = ["news"]
  18. paging = True
  19. # Base URL
  20. base_url = "https://weixin.sogou.com"
  21. def request(query, params):
  22. query_params = {
  23. "query": query,
  24. "page": params["pageno"],
  25. "type": 2,
  26. }
  27. params["url"] = f"{base_url}/weixin?{urlencode(query_params)}"
  28. return params
  29. def response(resp):
  30. dom = html.fromstring(resp.text)
  31. results = []
  32. for item in dom.xpath('//li[contains(@id, "sogou_vr_")]'):
  33. title = extract_text(item.xpath('.//h3/a'))
  34. url = extract_text(item.xpath('.//h3/a/@href'))
  35. if url.startswith("/link?url="):
  36. url = f"{base_url}{url}"
  37. content = extract_text(item.xpath('.//p[@class="txt-info"]'))
  38. if not content:
  39. content = extract_text(item.xpath('.//p[contains(@class, "txt-info")]'))
  40. thumbnail = extract_text(item.xpath('.//div[@class="img-box"]/a/img/@src'))
  41. if thumbnail and thumbnail.startswith("//"):
  42. thumbnail = f"https:{thumbnail}"
  43. published_date = None
  44. timestamp = extract_text(item.xpath('.//script[contains(text(), "timeConvert")]'))
  45. if timestamp:
  46. match = re.search(r"timeConvert\('(\d+)'\)", timestamp)
  47. if match:
  48. published_date = datetime.fromtimestamp(int(match.group(1)))
  49. if title and url:
  50. results.append(
  51. {
  52. "title": title,
  53. "url": url,
  54. "content": content,
  55. 'thumbnail': thumbnail,
  56. "publishedDate": published_date,
  57. }
  58. )
  59. return results