# duckduckgo.py
"""
 DuckDuckGo (Web)

 @website     https://duckduckgo.com/
 @provide-api yes (https://duckduckgo.com/api),
              but not all results from search-site

 @using-api   no
 @results     HTML (using search portal)
 @stable      no (HTML can change)
 @parse       url, title, content

 @todo        rewrite to api
"""
from urllib import urlencode
from lxml.html import fromstring
from requests import get
from json import loads
from searx.engines.xpath import extract_text
  17. # engine dependent config
  18. categories = ['general']
  19. paging = True
  20. language_support = True
  21. supported_languages_url = 'https://duckduckgo.com/d2030.js'
  22. time_range_support = True
  23. # search-url
  24. url = 'https://duckduckgo.com/html?{query}&s={offset}'
  25. time_range_url = '&df={range}'
  26. time_range_dict = {'day': 'd',
  27. 'week': 'w',
  28. 'month': 'm'}
  29. # specific xpath variables
  30. result_xpath = '//div[@class="result results_links results_links_deep web-result "]' # noqa
  31. url_xpath = './/a[@class="result__a"]/@href'
  32. title_xpath = './/a[@class="result__a"]'
  33. content_xpath = './/a[@class="result__snippet"]'
  34. # do search-request
  35. def request(query, params):
  36. if params['time_range'] and params['time_range'] not in time_range_dict:
  37. return params
  38. offset = (params['pageno'] - 1) * 30
  39. # custom fixes for languages
  40. if params['language'] == 'all':
  41. locale = None
  42. elif params['language'][:2] == 'ja':
  43. locale = 'jp-jp'
  44. elif params['language'] == 'zh-TW':
  45. locale = 'tw-tzh'
  46. elif params['language'] == 'zh-HK':
  47. locale = 'hk-tzh'
  48. elif params['language'][-2:] == 'SA':
  49. locale = 'xa' + params['language'].split('-')[0]
  50. elif params['language'][-2:] == 'GB':
  51. locale = 'uk' + params['language'].split('-')[0]
  52. else:
  53. locale = params['language'].split('-')
  54. if len(locale) == 2:
  55. # country code goes first
  56. locale = locale[1].lower() + '-' + locale[0].lower()
  57. else:
  58. # tries to get a country code from language
  59. locale = locale[0].lower()
  60. for lc in supported_languages:
  61. lc = lc.split('-')
  62. if locale == lc[0]:
  63. locale = lc[1].lower() + '-' + lc[0].lower()
  64. break
  65. if locale:
  66. params['url'] = url.format(
  67. query=urlencode({'q': query, 'kl': locale}), offset=offset)
  68. else:
  69. params['url'] = url.format(
  70. query=urlencode({'q': query}), offset=offset)
  71. if params['time_range'] in time_range_dict:
  72. params['url'] += time_range_url.format(range=time_range_dict[params['time_range']])
  73. return params
  74. # get response from search-request
  75. def response(resp):
  76. results = []
  77. doc = fromstring(resp.text)
  78. # parse results
  79. for r in doc.xpath(result_xpath):
  80. try:
  81. res_url = r.xpath(url_xpath)[-1]
  82. except:
  83. continue
  84. if not res_url:
  85. continue
  86. title = extract_text(r.xpath(title_xpath))
  87. content = extract_text(r.xpath(content_xpath))
  88. # append result
  89. results.append({'title': title,
  90. 'content': content,
  91. 'url': res_url})
  92. # return results
  93. return results
  94. # get supported languages from their site
  95. def fetch_supported_languages():
  96. response = get(supported_languages_url)
  97. # response is a js file with regions as an embedded object
  98. response_page = response.text
  99. response_page = response_page[response_page.find('regions:{') + 8:]
  100. response_page = response_page[:response_page.find('}') + 1]
  101. regions_json = loads(response_page)
  102. supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys())
  103. return supported_languages