# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Qwant (Web, Images, News, Social)
"""
from datetime import datetime
from json import loads
from urllib.parse import urlencode

from searx.utils import html_to_text, match_language
from searx.exceptions import SearxEngineAPIException, SearxEngineCaptchaException
from searx.raise_for_httperror import raise_for_httperror
# about
about = {
    "website": 'https://www.qwant.com/',
    "wikidata_id": 'Q14657870',
    "official_api_documentation": None,
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}
# engine dependent config
categories = []       # filled in from the instance settings by the engine loader
paging = True
language_support = True
supported_languages_url = 'https://qwant.com/region'

# maps a searx category to the keyword Qwant's API expects in its URL path
category_to_keyword = {'general': 'web',
                       'images': 'images',
                       'news': 'news'}

# search-url template; {keyword} selects the vertical, {offset} the result page
url = 'https://api.qwant.com/api/search/{keyword}?count=10&offset={offset}&f=&{query}&t={keyword}&uiv=4'
  30. # do search-request
  31. def request(query, params):
  32. offset = (params['pageno'] - 1) * 10
  33. if categories[0] and categories[0] in category_to_keyword:
  34. params['url'] = url.format(keyword=category_to_keyword[categories[0]],
  35. query=urlencode({'q': query}),
  36. offset=offset)
  37. else:
  38. params['url'] = url.format(keyword='web',
  39. query=urlencode({'q': query}),
  40. offset=offset)
  41. # add language tag
  42. if params['language'] != 'all':
  43. language = match_language(params['language'], supported_languages, language_aliases)
  44. params['url'] += '&locale=' + language.replace('-', '_').lower()
  45. params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0'
  46. params['raise_for_httperror'] = False
  47. return params
  48. # get response from search-request
  49. def response(resp):
  50. results = []
  51. # According to https://www.qwant.com/js/app.js
  52. if resp.status_code == 429:
  53. raise SearxEngineCaptchaException()
  54. # raise for other errors
  55. raise_for_httperror(resp)
  56. # load JSON result
  57. search_results = loads(resp.text)
  58. # check for an API error
  59. if search_results.get('status') != 'success':
  60. raise SearxEngineAPIException('API error ' + str(search_results.get('error', '')))
  61. # return empty array if there are no results
  62. if 'data' not in search_results:
  63. return []
  64. data = search_results.get('data', {})
  65. res = data.get('result', {})
  66. # parse results
  67. for result in res.get('items', {}):
  68. title = html_to_text(result['title'])
  69. res_url = result['url']
  70. content = html_to_text(result['desc'])
  71. if category_to_keyword.get(categories[0], '') == 'web':
  72. results.append({'title': title,
  73. 'content': content,
  74. 'url': res_url})
  75. elif category_to_keyword.get(categories[0], '') == 'images':
  76. thumbnail_src = result['thumbnail']
  77. img_src = result['media']
  78. results.append({'template': 'images.html',
  79. 'url': res_url,
  80. 'title': title,
  81. 'content': '',
  82. 'thumbnail_src': thumbnail_src,
  83. 'img_src': img_src})
  84. elif category_to_keyword.get(categories[0], '') == 'news':
  85. published_date = datetime.fromtimestamp(result['date'], None)
  86. media = result.get('media', [])
  87. if len(media) > 0:
  88. img_src = media[0].get('pict', {}).get('url', None)
  89. else:
  90. img_src = None
  91. results.append({'url': res_url,
  92. 'title': title,
  93. 'publishedDate': published_date,
  94. 'content': content,
  95. 'img_src': img_src})
  96. return results
  97. # get supported languages from their site
  98. def _fetch_supported_languages(resp):
  99. # list of regions is embedded in page as a js object
  100. response_text = resp.text
  101. response_text = response_text[response_text.find('regionalisation'):]
  102. response_text = response_text[response_text.find('{'):response_text.find(');')]
  103. regions_json = loads(response_text)
  104. supported_languages = {}
  105. for lang in regions_json['languages'].values():
  106. for country in lang['countries']:
  107. lang_code = "{lang}-{country}".format(lang=lang['code'], country=country)
  108. supported_languages[lang_code] = {'name': lang['name']}
  109. return supported_languages