duckduckgo.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  """
   DuckDuckGo (Web)

   @website     https://duckduckgo.com/
   @provide-api yes (https://duckduckgo.com/api),
                but not all results from search-site

   @using-api   no
   @results     HTML (using search portal)
   @stable      no (HTML can change)
   @parse       url, title, content

   @todo        rewrite to api
  """
  12. from urllib import urlencode
  13. from lxml.html import fromstring
  14. from requests import get
  15. from json import loads
  16. from searx.engines.xpath import extract_text
  # engine dependent config
  categories = ['general']
  paging = True
  language_support = True
  supported_languages_url = 'https://duckduckgo.com/d2030.js'
  time_range_support = True

  # search-url; {query} takes an already url-encoded query string,
  # {offset} fills the "s" parameter and {dc_param} the "dc" parameter
  url = 'https://duckduckgo.com/html?{query}&s={offset}&api=/d.js&o=json&dc={dc_param}'
  # suffix appended to the search url when a time range is requested
  time_range_url = '&df={range}'
  # time range name -> value substituted for {range} in time_range_url
  time_range_dict = {'day': 'd',
                     'week': 'w',
                     'month': 'm'}

  # specific xpath variables
  # NOTE: the trailing blank inside the class value is part of the match
  result_xpath = '//div[@class="result results_links results_links_deep web-result "]'  # noqa
  url_xpath = './/a[@class="result__a"]/@href'
  title_xpath = './/a[@class="result__a"]'
  content_xpath = './/a[@class="result__snippet"]'
  34. # do search-request
  def request(query, params):
      """Build the DuckDuckGo HTML search URL and store it in ``params['url']``.

      ``params`` must provide the keys ``time_range``, ``pageno`` and
      ``language``. If ``time_range`` is set but has no entry in
      ``time_range_dict``, the dict is returned unchanged, i.e. without a
      ``url`` entry.

      The language is translated into a region-first ``kl`` value (for
      example ``en-US`` becomes ``us-en``); the language ``all`` sends no
      ``kl`` parameter at all.

      Returns the very ``params`` dict that was passed in.
      """
      time_range = params['time_range']
      if time_range and time_range not in time_range_dict:
          return params

      # NOTE(review): page 1 already yields offset 30 and every further page
      # advances by 50 -- confirm this matches the paging of the HTML portal
      offset = 30 + (params['pageno'] - 1) * 50
      dc_param = offset + 1

      language = params['language']

      # custom fixes for languages
      if language == 'all':
          locale = None
      elif language[:2] == 'ja':
          locale = 'jp-jp'
      elif language[:2] == 'sl':
          locale = 'sl-sl'
      elif language == 'zh-TW':
          locale = 'tw-tzh'
      elif language == 'zh-HK':
          locale = 'hk-tzh'
      elif language[-2:] == 'SA':
          locale = 'xa-' + language.split('-')[0]
      elif language[-2:] == 'GB':
          locale = 'uk-' + language.split('-')[0]
      else:
          parts = language.split('-')
          if len(parts) == 2:
              # country code goes first
              locale = parts[1].lower() + '-' + parts[0].lower()
          else:
              # tries to get a country code from language; if none is found
              # the bare language code is sent as-is
              locale = parts[0].lower()
              # supported_languages is not defined in this file; presumably
              # it is injected by the engine loader -- TODO confirm
              for lc in supported_languages:
                  lc = lc.split('-')
                  # an entry without a region part cannot supply a country
                  # code, so skip it instead of failing on lc[1]
                  if len(lc) >= 2 and locale == lc[0]:
                      locale = lc[1].lower() + '-' + lc[0].lower()
                      break

      if locale:
          query_args = {'q': query, 'kl': locale}
      else:
          query_args = {'q': query}

      params['url'] = url.format(
          query=urlencode(query_args), offset=offset, dc_param=dc_param)

      if time_range in time_range_dict:
          params['url'] += time_range_url.format(range=time_range_dict[time_range])

      return params
  77. # get response from search-request
  def response(resp):
      """Turn a DuckDuckGo HTML result page into a list of result dicts.

      ``resp.text`` is parsed with ``lxml.html.fromstring``. Every node
      matched by ``result_xpath`` that carries a non-empty link yields one
      dict with the keys ``title``, ``content`` and ``url``; nodes without a
      link are skipped.

      Returns the list of result dicts (empty when nothing matched).
      """
      results = []

      doc = fromstring(resp.text)

      # parse results
      for r in doc.xpath(result_xpath):
          try:
              # when several links match, the last one is used
              res_url = r.xpath(url_xpath)[-1]
          except IndexError:
              # no link inside this result node
              continue

          if not res_url:
              continue

          title = extract_text(r.xpath(title_xpath))
          content = extract_text(r.xpath(content_xpath))

          # append result
          results.append({'title': title,
                          'content': content,
                          'url': res_url})

      # return results
      return results
  97. # get supported languages from their site
  98. def _fetch_supported_languages(resp):
  99. # response is a js file with regions as an embedded object
  100. response_page = resp.text
  101. response_page = response_page[response_page.find('regions:{') + 8:]
  102. response_page = response_page[:response_page.find('}') + 1]
  103. regions_json = loads(response_page)
  104. supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys())
  105. return supported_languages