duckduckgo.py
  1. """
  2. DuckDuckGo (Web)
  3. @website https://duckduckgo.com/
  4. @provide-api yes (https://duckduckgo.com/api),
  5. but not all results from search-site
  6. @using-api no
  7. @results HTML (using search portal)
  8. @stable no (HTML can change)
  9. @parse url, title, content
  10. @todo rewrite to api
  11. """
  12. from lxml.html import fromstring
  13. from json import loads
  14. from urllib.parse import urlencode
  15. from searx.poolrequests import get
  16. from searx.utils import extract_text, match_language, eval_xpath
# engine dependent config
categories = ['general']
paging = True
language_support = True
# JS file on duckduckgo.com that embeds the list of supported regions
supported_languages_url = 'https://duckduckgo.com/util/u172.js'
time_range_support = True

# searx language codes whose DuckDuckGo region naming differs
language_aliases = {
    'ar-SA': 'ar-XA',
    'es-419': 'es-XL',
    'ja': 'jp-JP',
    'ko': 'kr-KR',
    'sl-SI': 'sl-SL',
    'zh-TW': 'tzh-TW',
    'zh-HK': 'tzh-HK'
}

# search-url
url = 'https://duckduckgo.com/html?{query}&s={offset}&dc={dc_param}'
time_range_url = '&df={range}'
# maps searx time-range names to duckduckgo's 'df' parameter values
time_range_dict = {'day': 'd',
                   'week': 'w',
                   'month': 'm'}

# specific xpath variables
result_xpath = '//div[@class="result results_links results_links_deep web-result "]'  # noqa
url_xpath = './/a[@class="result__a"]/@href'
title_xpath = './/a[@class="result__a"]'
content_xpath = './/a[@class="result__snippet"]'
correction_xpath = '//div[@id="did_you_mean"]//a'
  44. # match query's language to a region code that duckduckgo will accept
  45. def get_region_code(lang, lang_list=[]):
  46. if lang == 'all':
  47. return None
  48. lang_code = match_language(lang, lang_list, language_aliases, 'wt-WT')
  49. lang_parts = lang_code.split('-')
  50. # country code goes first
  51. return lang_parts[1].lower() + '-' + lang_parts[0].lower()
  52. def request(query, params):
  53. if params['time_range'] not in (None, 'None', '') and params['time_range'] not in time_range_dict:
  54. return params
  55. offset = (params['pageno'] - 1) * 30
  56. region_code = get_region_code(params['language'], supported_languages)
  57. params['url'] = 'https://duckduckgo.com/html/'
  58. if params['pageno'] > 1:
  59. params['method'] = 'POST'
  60. params['data']['q'] = query
  61. params['data']['s'] = offset
  62. params['data']['dc'] = 30
  63. params['data']['nextParams'] = ''
  64. params['data']['v'] = 'l'
  65. params['data']['o'] = 'json'
  66. params['data']['api'] = '/d.js'
  67. if params['time_range'] in time_range_dict:
  68. params['data']['df'] = time_range_dict[params['time_range']]
  69. if region_code:
  70. params['data']['kl'] = region_code
  71. else:
  72. if region_code:
  73. params['url'] = url.format(
  74. query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset)
  75. else:
  76. params['url'] = url.format(
  77. query=urlencode({'q': query}), offset=offset, dc_param=offset)
  78. if params['time_range'] in time_range_dict:
  79. params['url'] += time_range_url.format(range=time_range_dict[params['time_range']])
  80. return params
  81. # get response from search-request
  82. def response(resp):
  83. results = []
  84. doc = fromstring(resp.text)
  85. # parse results
  86. for i, r in enumerate(eval_xpath(doc, result_xpath)):
  87. if i >= 30:
  88. break
  89. try:
  90. res_url = eval_xpath(r, url_xpath)[-1]
  91. except:
  92. continue
  93. if not res_url:
  94. continue
  95. title = extract_text(eval_xpath(r, title_xpath))
  96. content = extract_text(eval_xpath(r, content_xpath))
  97. # append result
  98. results.append({'title': title,
  99. 'content': content,
  100. 'url': res_url})
  101. # parse correction
  102. for correction in eval_xpath(doc, correction_xpath):
  103. # append correction
  104. results.append({'correction': extract_text(correction)})
  105. # return results
  106. return results
  107. # get supported languages from their site
  108. def _fetch_supported_languages(resp):
  109. # response is a js file with regions as an embedded object
  110. response_page = resp.text
  111. response_page = response_page[response_page.find('regions:{') + 8:]
  112. response_page = response_page[:response_page.find('}') + 1]
  113. regions_json = loads(response_page)
  114. supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys())
  115. return list(supported_languages)