# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Qwant (Web, News, Images, Videos)

This engine uses the Qwant API (https://api.qwant.com/v3). The API is
undocumented but can be reverse engineered by reading the network log of
https://www.qwant.com/ queries.

This implementation is used by different qwant engines in the settings.yml::

  - name: qwant
    categories: general
    ...
  - name: qwant news
    categories: news
    ...
  - name: qwant images
    categories: images
    ...
  - name: qwant videos
    categories: videos
    ...

"""

from datetime import (
    datetime,
    timedelta,
)
from json import loads
from urllib.parse import urlencode

# from searx import logger
from searx.utils import match_language
from searx.exceptions import SearxEngineAPIException
from searx.network import raise_for_httperror

# logger = logger.getChild('qwant')

# about
about = {
    "website": 'https://www.qwant.com/',
    "wikidata_id": 'Q14657870',
    "official_api_documentation": None,
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

# engine dependent config
categories = []
paging = True
supported_languages_url = about['website']

category_to_keyword = {
    'general': 'web',
    'news': 'news',
    'images': 'images',
    'videos': 'videos',
}

# search-url; the {query} placeholder receives an already url-encoded
# 'q=...' pair built by request() below
url = 'https://api.qwant.com/v3/search/{keyword}?{query}&count={count}&offset={offset}'
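
# For illustration only (the endpoint is undocumented; this is inferred from
# the template above): an images query "cats" on page 1 expands to
#   https://api.qwant.com/v3/search/images?q=cats&count=50&offset=0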


def request(query, params):
    """Qwant search request"""
    keyword = category_to_keyword[categories[0]]
    count = 10  # web: count must be equal to 10

    if keyword == 'images':
        count = 50
        offset = (params['pageno'] - 1) * count
        # count + offset must be lower than 250
        offset = min(offset, 199)
    else:
        offset = (params['pageno'] - 1) * count
        # count + offset must be lower than 50
        offset = min(offset, 40)

    params['url'] = url.format(
        keyword=keyword,
        query=urlencode({'q': query}),
        offset=offset,
        count=count,
    )

    # add language tag
    if params['language'] != 'all':
        language = match_language(
            params['language'],
            # pylint: disable=undefined-variable
            supported_languages,
            language_aliases,
        )
        params['url'] += '&locale=' + language.replace('-', '_')

    params['raise_for_httperror'] = False
    return params
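
# Illustrative request flow (hypothetical values; in searx the engine loader
# fills `categories` from settings.yml before request() is called):
#
#   categories = ['general']
#   params = {'pageno': 2, 'language': 'all'}
#   request('paris', params)
#   # params['url'] is now:
#   #   https://api.qwant.com/v3/search/web?q=paris&count=10&offset=10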


def response(resp):
    """Get response from Qwant's search request"""
    keyword = category_to_keyword[categories[0]]
    results = []

    # load JSON result
    search_results = loads(resp.text)
    data = search_results.get('data', {})

    # check for an API error
    if search_results.get('status') != 'success':
        msg = ",".join(data.get('message', ['unknown']))
        raise SearxEngineAPIException('API error::' + msg)

    # raise for other errors
    raise_for_httperror(resp)

    if keyword == 'web':
        # The WEB query contains a list named 'mainline'.  This list can
        # contain different result types (e.g. mainline[0]['type'] is the
        # type of the result items in mainline[0]['items']).
        mainline = data.get('result', {}).get('items', {}).get('mainline', {})
    else:
        # Queries on News, Images and Videos do not have a list named
        # 'mainline' in the response.  The result items are directly in the
        # list result['items'].  Wrap them so the loop below can treat all
        # categories alike.
        mainline = data.get('result', {}).get('items', [])
        mainline = [
            {'type': keyword, 'items': mainline},
        ]
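
    # Illustrative (truncated) shape of a successful WEB response, inferred
    # from the accessors above; this is not an official schema:
    #
    #   {"status": "success",
    #    "data": {"result": {"items": {"mainline": [
    #        {"type": "web", "items": [
    #            {"title": "...", "url": "...", "desc": "..."}]},
    #        {"type": "ads", "items": ["..."]}]}}}}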

    # return empty array if there are no results
    if not mainline:
        return []

    for row in mainline:

        mainline_type = row.get('type', 'web')
        if mainline_type == 'ads':
            # ignore ads
            continue

        mainline_items = row.get('items', [])
        for item in mainline_items:

            title = item['title']
            res_url = item['url']

            if mainline_type == 'web':
                content = item['desc']
                results.append({
                    'title': title,
                    'url': res_url,
                    'content': content,
                })

            elif mainline_type == 'news':
                pub_date = datetime.fromtimestamp(item['date'], None)
                news_media = item.get('media', [])
                img_src = None
                if news_media:
                    img_src = news_media[0].get('pict', {}).get('url', None)
                results.append({
                    'title': title,
                    'url': res_url,
                    'publishedDate': pub_date,
                    'img_src': img_src,
                })

            elif mainline_type == 'images':
                thumbnail = item['thumbnail']
                img_src = item['media']
                results.append({
                    'title': title,
                    'url': res_url,
                    'template': 'images.html',
                    'thumbnail_src': thumbnail,
                    'img_src': img_src,
                })

            elif mainline_type == 'videos':
                content = item['desc']
                length = timedelta(seconds=item['duration'])
                pub_date = datetime.fromtimestamp(item['date'])
                thumbnail = item['thumbnail']
                results.append({
                    'title': title,
                    'url': res_url,
                    'content': content,
                    'publishedDate': pub_date,
                    'thumbnail': thumbnail,
                    'template': 'videos.html',
                    'length': length,
                })

    return results


# get supported languages from their site
def _fetch_supported_languages(resp):
    # list of regions is embedded in page as a js object
    response_text = resp.text
    response_text = response_text[response_text.find('INITIAL_PROPS'):]
    response_text = response_text[response_text.find('{'):response_text.find('</script>')]

    regions_json = loads(response_text)

    supported_languages = []
    for country, langs in regions_json['locales'].items():
        for lang in langs['langs']:
            lang_code = "{lang}-{country}".format(lang=lang, country=country)
            supported_languages.append(lang_code)

    return supported_languages
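
# For example, if the embedded INITIAL_PROPS object contained this
# hypothetical excerpt:
#
#   {"locales": {"fr": {"langs": ["fr"]}, "ch": {"langs": ["de", "fr", "it"]}}}
#
# the function would return ['fr-fr', 'de-ch', 'fr-ch', 'it-ch'].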