duckduckgo.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. # lint: pylint
  3. """
  4. DuckDuckGo Lite
  5. ~~~~~~~~~~~~~~~
  6. """
  7. from typing import TYPE_CHECKING
  8. import re
  9. from urllib.parse import urlencode
  10. import json
  11. import babel
  12. import lxml.html
  13. from searx import (
  14. locales,
  15. redislib,
  16. external_bang,
  17. )
  18. from searx.utils import (
  19. eval_xpath,
  20. eval_xpath_getindex,
  21. extract_text,
  22. )
  23. from searx.network import get # see https://github.com/searxng/searxng/issues/762
  24. from searx import redisdb
  25. from searx.enginelib.traits import EngineTraits
  26. if TYPE_CHECKING:
  27. import logging
  28. logger: logging.Logger
  29. traits: EngineTraits
  30. about = {
  31. "website": 'https://lite.duckduckgo.com/lite/',
  32. "wikidata_id": 'Q12805',
  33. "use_official_api": False,
  34. "require_api_key": False,
  35. "results": 'HTML',
  36. }
  37. send_accept_language_header = True
  38. """DuckDuckGo-Lite tries to guess user's prefered language from the HTTP
  39. ``Accept-Language``. Optional the user can select a region filter (but not a
  40. language).
  41. """
  42. # engine dependent config
  43. categories = ['general', 'web']
  44. paging = True
  45. time_range_support = True
  46. safesearch = True # user can't select but the results are filtered
  47. url = 'https://lite.duckduckgo.com/lite/'
  48. # url_ping = 'https://duckduckgo.com/t/sl_l'
  49. time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
  50. def cache_vqd(query, value):
  51. """Caches a ``vqd`` token from a query, if token is None the cached value
  52. is deleted.
  53. The vqd token depends on the query string and is needed for the follow up
  54. pages or the images loaded by a XMLHttpRequest:
  55. - DuckDuckGo Web: `https://links.duckduckgo.com/d.js?q=...&vqd=...`
  56. - DuckDuckGo Images: `https://duckduckgo.com/i.js??q=...&vqd=...`
  57. """
  58. c = redisdb.client()
  59. if c:
  60. key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query)
  61. if value is not None:
  62. logger.debug("cache vqd value: %s", value)
  63. c.set(key, value, ex=600)
  64. else:
  65. # remove from cache
  66. c.delete(key)
  67. def _get_vqd_value(query):
  68. res = get('https://lite.duckduckgo.com/lite/?' + urlencode({'q': query}))
  69. doc = lxml.html.fromstring(res.text)
  70. return eval_xpath_getindex(doc, "//input[@name='vqd']/@value", 0, None)
  71. def get_vqd(query):
  72. """Returns the ``vqd`` that fits to the *query*. If there is no ``vqd`` cached
  73. (:py:obj:`cache_vqd`) the query is sent to DDG to get a vqd token from the
  74. response.
  75. """
  76. c = redisdb.client()
  77. if c:
  78. key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query)
  79. value = c.get(key)
  80. if value:
  81. value = value.decode('utf-8')
  82. logger.debug("re-use cached vqd value: %s", value)
  83. return value
  84. value = _get_vqd_value(query)
  85. if not value:
  86. # seems we got a CAPTCHA for this query string, send a dummy request to
  87. # release the captcha and then fetch the vqd value for the query string
  88. # again.
  89. logger.warning("vqd token will no longer work, trying to get a new one by sending another query")
  90. _get_vqd_value(f'{query[:3]} duckduckgo')
  91. value = _get_vqd_value(query)
  92. if not value:
  93. logger.error("was not able to fetch a valid vqd token from DDG")
  94. else:
  95. logger.debug("new vqd value: %s", value)
  96. cache_vqd(query, value)
  97. return value
  98. def get_ddg_lang(eng_traits: EngineTraits, sxng_locale, default='en_US'):
  99. """Get DuckDuckGo's language identifier from SearXNG's locale.
  100. DuckDuckGo defines its languages by region codes (see
  101. :py:obj:`fetch_traits`).
  102. To get region and language of a DDG service use:
  103. .. code: python
  104. eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
  105. eng_lang = get_ddg_lang(traits, params['searxng_locale'])
  106. It might confuse, but the ``l`` value of the cookie is what SearXNG calls
  107. the *region*:
  108. .. code:: python
  109. # !ddi paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'}
  110. params['cookies']['ad'] = eng_lang
  111. params['cookies']['ah'] = eng_region
  112. params['cookies']['l'] = eng_region
  113. .. hint::
  114. `DDG-lite <https://lite.duckduckgo.com/lite>`__ does not offer a language
  115. selection to the user, only a region can be selected by the user
  116. (``eng_region`` from the example above). DDG-lite stores the selected
  117. region in a cookie::
  118. params['cookies']['kl'] = eng_region # 'ar-es'
  119. """
  120. return eng_traits.custom['lang_region'].get( # type: ignore
  121. sxng_locale, eng_traits.get_language(sxng_locale, default)
  122. )
  123. ddg_reg_map = {
  124. 'tw-tzh': 'zh_TW',
  125. 'hk-tzh': 'zh_HK',
  126. 'ct-ca': 'skip', # ct-ca and es-ca both map to ca_ES
  127. 'es-ca': 'ca_ES',
  128. 'id-en': 'id_ID',
  129. 'no-no': 'nb_NO',
  130. 'jp-jp': 'ja_JP',
  131. 'kr-kr': 'ko_KR',
  132. 'xa-ar': 'ar_SA',
  133. 'sl-sl': 'sl_SI',
  134. 'th-en': 'th_TH',
  135. 'vn-en': 'vi_VN',
  136. }
  137. ddg_lang_map = {
  138. # use ar --> ar_EG (Egypt's arabic)
  139. "ar_DZ": 'lang_region',
  140. "ar_JO": 'lang_region',
  141. "ar_SA": 'lang_region',
  142. # use bn --> bn_BD
  143. 'bn_IN': 'lang_region',
  144. # use de --> de_DE
  145. 'de_CH': 'lang_region',
  146. # use en --> en_US,
  147. 'en_AU': 'lang_region',
  148. 'en_CA': 'lang_region',
  149. 'en_GB': 'lang_region',
  150. # Esperanto
  151. 'eo_XX': 'eo',
  152. # use es --> es_ES,
  153. 'es_AR': 'lang_region',
  154. 'es_CL': 'lang_region',
  155. 'es_CO': 'lang_region',
  156. 'es_CR': 'lang_region',
  157. 'es_EC': 'lang_region',
  158. 'es_MX': 'lang_region',
  159. 'es_PE': 'lang_region',
  160. 'es_UY': 'lang_region',
  161. 'es_VE': 'lang_region',
  162. # use fr --> rf_FR
  163. 'fr_CA': 'lang_region',
  164. 'fr_CH': 'lang_region',
  165. 'fr_BE': 'lang_region',
  166. # use nl --> nl_NL
  167. 'nl_BE': 'lang_region',
  168. # use pt --> pt_PT
  169. 'pt_BR': 'lang_region',
  170. # skip these languages
  171. 'od_IN': 'skip',
  172. 'io_XX': 'skip',
  173. 'tokipona_XX': 'skip',
  174. }
  175. def request(query, params):
  176. # quote ddg bangs
  177. query_parts = []
  178. # for val in re.split(r'(\s+)', query):
  179. for val in re.split(r'(\s+)', query):
  180. if not val.strip():
  181. continue
  182. if val.startswith('!') and external_bang.get_node(external_bang.EXTERNAL_BANGS, val[1:]):
  183. val = f"'{val}'"
  184. query_parts.append(val)
  185. query = ' '.join(query_parts)
  186. eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
  187. # eng_lang = get_ddg_lang(traits, params['searxng_locale'])
  188. params['url'] = url
  189. params['method'] = 'POST'
  190. params['data']['q'] = query
  191. # The API is not documented, so we do some reverse engineering and emulate
  192. # what https://lite.duckduckgo.com/lite/ does when you press "next Page"
  193. # link again and again ..
  194. params['headers']['Content-Type'] = 'application/x-www-form-urlencoded'
  195. params['headers']['Referer'] = 'https://google.com/'
  196. # initial page does not have an offset
  197. if params['pageno'] == 2:
  198. # second page does have an offset of 30
  199. offset = (params['pageno'] - 1) * 30
  200. params['data']['s'] = offset
  201. params['data']['dc'] = offset + 1
  202. elif params['pageno'] > 2:
  203. # third and following pages do have an offset of 30 + n*50
  204. offset = 30 + (params['pageno'] - 2) * 50
  205. params['data']['s'] = offset
  206. params['data']['dc'] = offset + 1
  207. # request needs a vqd argument
  208. params['data']['vqd'] = get_vqd(query)
  209. # initial page does not have additional data in the input form
  210. if params['pageno'] > 1:
  211. params['data']['o'] = 'json'
  212. params['data']['api'] = 'd.js'
  213. params['data']['nextParams'] = ''
  214. params['data']['v'] = 'l'
  215. params['data']['kl'] = eng_region
  216. params['cookies']['kl'] = eng_region
  217. params['data']['df'] = ''
  218. if params['time_range'] in time_range_dict:
  219. params['data']['df'] = time_range_dict[params['time_range']]
  220. params['cookies']['df'] = time_range_dict[params['time_range']]
  221. logger.debug("param data: %s", params['data'])
  222. logger.debug("param cookies: %s", params['cookies'])
  223. return params
  224. def response(resp):
  225. if resp.status_code == 303:
  226. return []
  227. results = []
  228. doc = lxml.html.fromstring(resp.text)
  229. result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table')
  230. if len(result_table) == 2:
  231. # some locales (at least China) does not have a "next page" button and
  232. # the layout of the HTML tables is different.
  233. result_table = result_table[1]
  234. elif not len(result_table) >= 3:
  235. # no more results / if we have the vqd token in cache, it's no longer
  236. # valid and has to be deleted
  237. cache_vqd(resp.search_params['data']['q'], None)
  238. return []
  239. else:
  240. result_table = result_table[2]
  241. # update form data from response
  242. form = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table//input/..')
  243. if len(form):
  244. value = eval_xpath_getindex(form[0], "//input[@name='vqd']/@value", 0, None)
  245. query = resp.search_params['data']['q']
  246. if value:
  247. cache_vqd(query, value)
  248. tr_rows = eval_xpath(result_table, './/tr')
  249. # In the last <tr> is the form of the 'previous/next page' links
  250. tr_rows = tr_rows[:-1]
  251. len_tr_rows = len(tr_rows)
  252. offset = 0
  253. while len_tr_rows >= offset + 4:
  254. # assemble table rows we need to scrap
  255. tr_title = tr_rows[offset]
  256. tr_content = tr_rows[offset + 1]
  257. offset += 4
  258. # ignore sponsored Adds <tr class="result-sponsored">
  259. if tr_content.get('class') == 'result-sponsored':
  260. continue
  261. a_tag = eval_xpath_getindex(tr_title, './/td//a[@class="result-link"]', 0, None)
  262. if a_tag is None:
  263. continue
  264. td_content = eval_xpath_getindex(tr_content, './/td[@class="result-snippet"]', 0, None)
  265. if td_content is None:
  266. continue
  267. results.append(
  268. {
  269. 'title': a_tag.text_content(),
  270. 'content': extract_text(td_content),
  271. 'url': a_tag.get('href'),
  272. }
  273. )
  274. return results
  275. def fetch_traits(engine_traits: EngineTraits):
  276. """Fetch languages & regions from DuckDuckGo.
  277. SearXNG's ``all`` locale maps DuckDuckGo's "Alle regions" (``wt-wt``).
  278. DuckDuckGo's language "Browsers prefered language" (``wt_WT``) makes no
  279. sense in a SearXNG request since SearXNG's ``all`` will not add a
  280. ``Accept-Language`` HTTP header. The value in ``engine_traits.all_locale``
  281. is ``wt-wt`` (the region).
  282. Beside regions DuckDuckGo also defines its languages by region codes. By
  283. example these are the english languages in DuckDuckGo:
  284. - en_US
  285. - en_AU
  286. - en_CA
  287. - en_GB
  288. The function :py:obj:`get_ddg_lang` evaluates DuckDuckGo's language from
  289. SearXNG's locale.
  290. """
  291. # pylint: disable=too-many-branches, too-many-statements
  292. # fetch regions
  293. engine_traits.all_locale = 'wt-wt'
  294. # updated from u588 to u661 / should be updated automatically?
  295. resp = get('https://duckduckgo.com/util/u661.js')
  296. if not resp.ok: # type: ignore
  297. print("ERROR: response from DuckDuckGo is not OK.")
  298. pos = resp.text.find('regions:{') + 8 # type: ignore
  299. js_code = resp.text[pos:] # type: ignore
  300. pos = js_code.find('}') + 1
  301. regions = json.loads(js_code[:pos])
  302. for eng_tag, name in regions.items():
  303. if eng_tag == 'wt-wt':
  304. engine_traits.all_locale = 'wt-wt'
  305. continue
  306. region = ddg_reg_map.get(eng_tag)
  307. if region == 'skip':
  308. continue
  309. if not region:
  310. eng_territory, eng_lang = eng_tag.split('-')
  311. region = eng_lang + '_' + eng_territory.upper()
  312. try:
  313. sxng_tag = locales.region_tag(babel.Locale.parse(region))
  314. except babel.UnknownLocaleError:
  315. print("ERROR: %s (%s) -> %s is unknown by babel" % (name, eng_tag, region))
  316. continue
  317. conflict = engine_traits.regions.get(sxng_tag)
  318. if conflict:
  319. if conflict != eng_tag:
  320. print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
  321. continue
  322. engine_traits.regions[sxng_tag] = eng_tag
  323. # fetch languages
  324. engine_traits.custom['lang_region'] = {}
  325. pos = resp.text.find('languages:{') + 10 # type: ignore
  326. js_code = resp.text[pos:] # type: ignore
  327. pos = js_code.find('}') + 1
  328. js_code = '{"' + js_code[1:pos].replace(':', '":').replace(',', ',"')
  329. languages = json.loads(js_code)
  330. for eng_lang, name in languages.items():
  331. if eng_lang == 'wt_WT':
  332. continue
  333. babel_tag = ddg_lang_map.get(eng_lang, eng_lang)
  334. if babel_tag == 'skip':
  335. continue
  336. try:
  337. if babel_tag == 'lang_region':
  338. sxng_tag = locales.region_tag(babel.Locale.parse(eng_lang))
  339. engine_traits.custom['lang_region'][sxng_tag] = eng_lang
  340. continue
  341. sxng_tag = locales.language_tag(babel.Locale.parse(babel_tag))
  342. except babel.UnknownLocaleError:
  343. print("ERROR: language %s (%s) is unknown by babel" % (name, eng_lang))
  344. continue
  345. conflict = engine_traits.languages.get(sxng_tag)
  346. if conflict:
  347. if conflict != eng_lang:
  348. print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang))
  349. continue
  350. engine_traits.languages[sxng_tag] = eng_lang