sogou_wechat.py 2.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """Sogou-WeChat search engine for retrieving WeChat Article from Sogou"""
  3. from urllib.parse import urlencode
  4. from datetime import datetime
  5. import re
  6. from lxml import html
  7. from searx.utils import extract_text
  8. # Metadata
  9. about = {
  10. "website": "https://weixin.sogou.com/",
  11. "use_official_api": False,
  12. "require_api_key": False,
  13. "results": "HTML",
  14. }
  15. # Engine Configuration
  16. categories = ["news"]
  17. paging = True
  18. # Base URL
  19. base_url = "https://weixin.sogou.com"
  20. def request(query, params):
  21. query_params = {
  22. "query": query,
  23. "page": params["pageno"],
  24. "type": 2,
  25. }
  26. params["url"] = f"{base_url}/weixin?{urlencode(query_params)}"
  27. return params
  28. def response(resp):
  29. dom = html.fromstring(resp.text)
  30. results = []
  31. for item in dom.xpath('//li[contains(@id, "sogou_vr_")]'):
  32. title = extract_text(item.xpath('.//h3/a'))
  33. url = extract_text(item.xpath('.//h3/a/@href'))
  34. if url.startswith("/link?url="):
  35. url = f"{base_url}{url}"
  36. content = extract_text(item.xpath('.//p[@class="txt-info"]'))
  37. if not content:
  38. content = extract_text(item.xpath('.//p[contains(@class, "txt-info")]'))
  39. thumbnail = extract_text(item.xpath('.//div[@class="img-box"]/a/img/@src'))
  40. if thumbnail and thumbnail.startswith("//"):
  41. thumbnail = f"https:{thumbnail}"
  42. published_date = None
  43. timestamp = extract_text(item.xpath('.//script[contains(text(), "timeConvert")]'))
  44. if timestamp:
  45. match = re.search(r"timeConvert\('(\d+)'\)", timestamp)
  46. if match:
  47. published_date = datetime.fromtimestamp(int(match.group(1)))
  48. if title and url:
  49. results.append(
  50. {
  51. "title": title,
  52. "url": url,
  53. "content": content,
  54. 'thumbnail': thumbnail,
  55. "publishedDate": published_date,
  56. }
  57. )
  58. return results