mojeek.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """Mojeek (general, images, news)"""
  3. from datetime import datetime
  4. from urllib.parse import urlencode
  5. from lxml import html
  6. from dateutil.relativedelta import relativedelta
  7. from searx.utils import eval_xpath, eval_xpath_list, extract_text
  8. about = {
  9. 'website': 'https://mojeek.com',
  10. 'wikidata_id': 'Q60747299',
  11. 'official_api_documentation': 'https://www.mojeek.com/support/api/search/request_parameters.html',
  12. 'use_official_api': False,
  13. 'require_api_key': False,
  14. 'results': 'HTML',
  15. }
  16. paging = True # paging is only supported for general search
  17. safesearch = True
  18. time_range_support = True # time range search is supported for general and news
  19. max_page = 10
  20. base_url = "https://www.mojeek.com"
  21. categories = ["general", "web"]
  22. search_type = "" # leave blank for general, other possible values: images, news
  23. results_xpath = '//ul[@class="results-standard"]/li/a[@class="ob"]'
  24. url_xpath = './@href'
  25. title_xpath = '../h2/a'
  26. content_xpath = '..//p[@class="s"]'
  27. suggestion_xpath = '//div[@class="top-info"]/p[@class="top-info spell"]/em/a'
  28. image_results_xpath = '//div[@id="results"]/div[contains(@class, "image")]'
  29. image_url_xpath = './a/@href'
  30. image_title_xpath = './a/@data-title'
  31. image_img_src_xpath = './a/img/@src'
  32. news_results_xpath = '//section[contains(@class, "news-search-result")]//article'
  33. news_url_xpath = './/h2/a/@href'
  34. news_title_xpath = './/h2/a'
  35. news_content_xpath = './/p[@class="s"]'
  36. def init(_):
  37. if search_type not in ('', 'images', 'news'):
  38. raise ValueError(f"Invalid search type {search_type}")
  39. def request(query, params):
  40. args = {
  41. 'q': query,
  42. 'safe': min(params['safesearch'], 1),
  43. 'fmt': search_type,
  44. }
  45. if search_type == '':
  46. args['s'] = 10 * (params['pageno'] - 1)
  47. if params['time_range'] and search_type != 'images':
  48. args["since"] = (datetime.now() - relativedelta(**{f"{params['time_range']}s": 1})).strftime("%Y%m%d")
  49. logger.debug(args["since"])
  50. params['url'] = f"{base_url}/search?{urlencode(args)}"
  51. return params
  52. def _general_results(dom):
  53. results = []
  54. for result in eval_xpath_list(dom, results_xpath):
  55. results.append(
  56. {
  57. 'url': extract_text(eval_xpath(result, url_xpath)),
  58. 'title': extract_text(eval_xpath(result, title_xpath)),
  59. 'content': extract_text(eval_xpath(result, content_xpath)),
  60. }
  61. )
  62. for suggestion in eval_xpath(dom, suggestion_xpath):
  63. results.append({'suggestion': extract_text(suggestion)})
  64. return results
  65. def _image_results(dom):
  66. results = []
  67. for result in eval_xpath_list(dom, image_results_xpath):
  68. results.append(
  69. {
  70. 'template': 'images.html',
  71. 'url': extract_text(eval_xpath(result, image_url_xpath)),
  72. 'title': extract_text(eval_xpath(result, image_title_xpath)),
  73. 'img_src': base_url + extract_text(eval_xpath(result, image_img_src_xpath)),
  74. 'content': '',
  75. }
  76. )
  77. return results
  78. def _news_results(dom):
  79. results = []
  80. for result in eval_xpath_list(dom, news_results_xpath):
  81. results.append(
  82. {
  83. 'url': extract_text(eval_xpath(result, news_url_xpath)),
  84. 'title': extract_text(eval_xpath(result, news_title_xpath)),
  85. 'content': extract_text(eval_xpath(result, news_content_xpath)),
  86. }
  87. )
  88. return results
  89. def response(resp):
  90. dom = html.fromstring(resp.text)
  91. if search_type == '':
  92. return _general_results(dom)
  93. if search_type == 'images':
  94. return _image_results(dom)
  95. if search_type == 'news':
  96. return _news_results(dom)
  97. raise ValueError(f"Invalid search type {search_type}")