duckduckgo.py

  1. """
  2. DuckDuckGo (Web)
  3. @website https://duckduckgo.com/
  4. @provide-api yes (https://duckduckgo.com/api),
  5. but not all results from search-site
  6. @using-api no
  7. @results HTML (using search portal)
  8. @stable no (HTML can change)
  9. @parse url, title, content
  10. @todo rewrite to api
  11. """

from lxml.html import fromstring
from json import loads
from urllib.parse import urlencode
from searx.poolrequests import get
from searx.utils import extract_text, match_language, eval_xpath

# engine dependent config
categories = ['general']
paging = False
language_support = True
supported_languages_url = 'https://duckduckgo.com/util/u172.js'
time_range_support = True

language_aliases = {
    'ar-SA': 'ar-XA',
    'es-419': 'es-XL',
    'ja': 'jp-JP',
    'ko': 'kr-KR',
    'sl-SI': 'sl-SL',
    'zh-TW': 'tzh-TW',
    'zh-HK': 'tzh-HK'
}

# search-url
url = 'https://html.duckduckgo.com/html'
time_range_dict = {'day': 'd',
                   'week': 'w',
                   'month': 'm'}

# specific xpath variables
result_xpath = '//div[@class="result results_links results_links_deep web-result "]'  # noqa
url_xpath = './/a[@class="result__a"]/@href'
title_xpath = './/a[@class="result__a"]'
content_xpath = './/a[@class="result__snippet"]'
correction_xpath = '//div[@id="did_you_mean"]//a'


# match query's language to a region code that duckduckgo will accept
def get_region_code(lang, lang_list=[]):
    if lang == 'all':
        return None

    lang_code = match_language(lang, lang_list, language_aliases, 'wt-WT')
    lang_parts = lang_code.split('-')

    # country code goes first
    return lang_parts[1].lower() + '-' + lang_parts[0].lower()
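
# Rough sketch of the expected mapping (illustrative values, assuming these
# codes appear in supported_languages; not executed by searx itself):
#
#   get_region_code('fr-FR', supported_languages)  # -> 'fr-fr'
#   get_region_code('zh-TW', supported_languages)  # -> 'tw-tzh' (via the
#                                                  #    'zh-TW' -> 'tzh-TW' alias)
#   get_region_code('all', supported_languages)    # -> None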


# do search-request
def request(query, params):
    if params['time_range'] is not None and params['time_range'] not in time_range_dict:
        return params

    params['url'] = url
    params['method'] = 'POST'
    params['data']['b'] = ''
    params['data']['q'] = query
    params['data']['df'] = ''

    region_code = get_region_code(params['language'], supported_languages)
    if region_code:
        params['data']['kl'] = region_code
        params['cookies']['kl'] = region_code

    if params['time_range'] in time_range_dict:
        params['data']['df'] = time_range_dict[params['time_range']]

    return params
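
# Sketch of what a filled-in request looks like, assuming the framework hands
# in a params dict with 'data', 'cookies', 'time_range' and 'language' keys
# (values here are illustrative only):
#
#   params = {'data': {}, 'cookies': {}, 'time_range': 'week', 'language': 'de-DE'}
#   request('searx', params)
#   # params['url']     -> 'https://html.duckduckgo.com/html'
#   # params['method']  -> 'POST'
#   # params['data']    -> {'b': '', 'q': 'searx', 'df': 'w', 'kl': 'de-de'}
#   # params['cookies'] -> {'kl': 'de-de'}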


# get response from search-request
def response(resp):
    results = []

    doc = fromstring(resp.text)

    # parse results
    for i, r in enumerate(eval_xpath(doc, result_xpath)):
        # stop after the first 30 results
        if i >= 30:
            break

        try:
            res_url = eval_xpath(r, url_xpath)[-1]
        except IndexError:
            continue

        if not res_url:
            continue

        title = extract_text(eval_xpath(r, title_xpath))
        content = extract_text(eval_xpath(r, content_xpath))

        # append result
        results.append({'title': title,
                        'content': content,
                        'url': res_url})

    # parse correction
    for correction in eval_xpath(doc, correction_xpath):
        # append correction
        results.append({'correction': extract_text(correction)})

    # return results
    return results
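
# Minimal local check (a sketch, not part of searx): feed response() a stub
# object with a canned result block and inspect what comes back.
#
#   from types import SimpleNamespace
#   html = ('<div class="result results_links results_links_deep web-result ">'
#           '<a class="result__a" href="https://example.org">Example</a>'
#           '<a class="result__snippet">snippet text</a></div>')
#   response(SimpleNamespace(text=html))
#   # -> [{'title': 'Example', 'content': 'snippet text',
#   #      'url': 'https://example.org'}]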


# get supported languages from their site
def _fetch_supported_languages(resp):

    # response is a js file with regions as an embedded object
    response_page = resp.text
    # len('regions:') == 8, so the slice starts at the opening '{'
    response_page = response_page[response_page.find('regions:{') + 8:]
    # the regions object is flat, so the first '}' is its closing brace
    response_page = response_page[:response_page.find('}') + 1]

    regions_json = loads(response_page)
    # keys look like 'us-en'; turn them into 'en-US' style language codes
    supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys())

    return list(supported_languages)
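

# Standalone sketch of the parsing above (hypothetical stub; in searx the
# resp object comes from fetching supported_languages_url):
if __name__ == '__main__':
    from types import SimpleNamespace
    js = 'languages:{...},regions:{"us-en":"United States","tw-tzh":"Taiwan"},...'
    print(_fetch_supported_languages(SimpleNamespace(text=js)))
    # expected output: ['en-US', 'tzh-TW']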