# SPDX-License-Identifier: AGPL-3.0-or-later
"""
DuckDuckGo (Web)
"""

from lxml.html import fromstring
from json import loads
from searx.utils import extract_text, match_language, eval_xpath

# about
about = {
    "website": 'https://duckduckgo.com/',
    "wikidata_id": 'Q12805',
    "official_api_documentation": 'https://duckduckgo.com/api',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ['general']
paging = False
supported_languages_url = 'https://duckduckgo.com/util/u172.js'
time_range_support = True

language_aliases = {
    'ar-SA': 'ar-XA',
    'es-419': 'es-XL',
    'ja': 'jp-JP',
    'ko': 'kr-KR',
    'sl-SI': 'sl-SL',
    'zh-TW': 'tzh-TW',
    'zh-HK': 'tzh-HK'
}

# search-url
url = 'https://html.duckduckgo.com/html'

time_range_dict = {'day': 'd',
                   'week': 'w',
                   'month': 'm'}

# specific xpath variables
result_xpath = '//div[@class="result results_links results_links_deep web-result "]'  # noqa
url_xpath = './/a[@class="result__a"]/@href'
title_xpath = './/a[@class="result__a"]'
content_xpath = './/a[@class="result__snippet"]'
correction_xpath = '//div[@id="did_you_mean"]//a'


# match the query's language to a region code that DuckDuckGo will accept
def get_region_code(lang, lang_list=None):
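    """Map a searx language code to DuckDuckGo's region code format.

    DuckDuckGo puts the country first: 'en-US' becomes 'us-en'.
    Returns None for the special 'all' language.
    """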
    if lang == 'all':
        return None

    lang_code = match_language(lang, lang_list or [], language_aliases, 'wt-WT')
    lang_parts = lang_code.split('-')

    # country code goes first
    return lang_parts[1].lower() + '-' + lang_parts[0].lower()


def request(query, params):
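    """Build the POST search request for DuckDuckGo's HTML endpoint.

    Note: `supported_languages` is not defined in this module; searx's
    engine loader injects it at runtime (see _fetch_supported_languages).
    """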
    if params['time_range'] is not None and params['time_range'] not in time_range_dict:
        return params

    params['url'] = url
    params['method'] = 'POST'
    params['data']['b'] = ''
    params['data']['q'] = query
    params['data']['df'] = ''

    region_code = get_region_code(params['language'], supported_languages)
    if region_code:
        params['data']['kl'] = region_code
        params['cookies']['kl'] = region_code

    if params['time_range'] in time_range_dict:
        params['data']['df'] = time_range_dict[params['time_range']]

    return params
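
# Minimal usage sketch (hypothetical values; in searx the framework
# builds `params` and sets `supported_languages` before calling request):
#
#     params = {'time_range': None, 'language': 'en-US',
#               'data': {}, 'cookies': {}}
#     request('free software', params)
#     # params['method']  -> 'POST'
#     # params['data']    -> {'b': '', 'q': 'free software', 'df': '',
#     #                       'kl': 'us-en'}  (when 'en-US' is supported)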


# get response from search-request
def response(resp):
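    """Parse the HTML result page into searx result dicts.

    Returns at most 30 web results, plus any spelling correction
    ('did you mean') links DuckDuckGo offers.
    """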
    results = []

    doc = fromstring(resp.text)

    # parse results
    for i, r in enumerate(eval_xpath(doc, result_xpath)):
        if i >= 30:
            break
        try:
            res_url = eval_xpath(r, url_xpath)[-1]
        except IndexError:
            # result block without a link, skip it
            continue

        if not res_url:
            continue

        title = extract_text(eval_xpath(r, title_xpath))
        content = extract_text(eval_xpath(r, content_xpath))

        # append result
        results.append({'title': title,
                        'content': content,
                        'url': res_url})

    # parse correction
    for correction in eval_xpath(doc, correction_xpath):
        # append correction
        results.append({'correction': extract_text(correction)})

    return results


# get supported languages from their site
def _fetch_supported_languages(resp):
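    """Extract the region list embedded in DuckDuckGo's u172.js file.

    Region keys arrive as 'country-language' (e.g. 'us-en') and are
    flipped into searx's 'language-COUNTRY' form (e.g. 'en-US').
    """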
    # the response is a js file with the regions as an embedded object
    response_page = resp.text
    # skip past 'regions:' (8 characters), keeping the opening brace
    response_page = response_page[response_page.find('regions:{') + 8:]
    # cut everything after the first closing brace of the regions object
    response_page = response_page[:response_page.find('}') + 1]

    regions_json = loads(response_page)
    supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys())

    return list(supported_languages)
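
# Rough sketch of the expected transformation (the exact contents of
# u172.js are an assumption; only the embedded regions object matters):
#
#     resp.text ~ '...regions:{"us-en":"United States","de-de":"Germany"}...'
#     _fetch_supported_languages(resp)  ->  ['en-US', 'de-DE']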