# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Qwant (Web, News, Images, Videos)

This engine uses the Qwant API (https://api.qwant.com/v3). The API is
undocumented but can be reverse engineered by reading the network log of
https://www.qwant.com/ queries.

This implementation is used by different qwant engines in the settings.yml::

  - name: qwant
    qwant_categ: web
    ...
  - name: qwant news
    qwant_categ: news
    ...
  - name: qwant images
    qwant_categ: images
    ...
  - name: qwant videos
    qwant_categ: videos
    ...

"""

from datetime import (
    datetime,
    timedelta,
)
from json import loads
from urllib.parse import urlencode

from flask_babel import gettext
import babel

from searx.exceptions import SearxEngineAPIException
from searx.network import raise_for_httperror
from searx.locales import get_engine_locale

# about
about = {
    "website": 'https://www.qwant.com/',
    "wikidata_id": 'Q14657870',
    "official_api_documentation": None,
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

# engine dependent config
categories = []
paging = True
# NOTE: the `supported_languages` mapping used in request() is generated from
# this URL via _fetch_supported_languages() (see below)
supported_languages_url = about['website']
qwant_categ = None  # web|news|images|videos

safesearch = True
safe_search_map = {0: '&safesearch=0', 1: '&safesearch=1', 2: '&safesearch=2'}

# fmt: off
qwant_news_locales = [
    'ca_ad', 'ca_es', 'ca_fr', 'co_fr', 'de_at', 'de_ch', 'de_de', 'en_au',
    'en_ca', 'en_gb', 'en_ie', 'en_my', 'en_nz', 'en_us', 'es_ad', 'es_ar',
    'es_cl', 'es_co', 'es_es', 'es_mx', 'es_pe', 'eu_es', 'eu_fr', 'fc_ca',
    'fr_ad', 'fr_be', 'fr_ca', 'fr_ch', 'fr_fr', 'it_ch', 'it_it', 'nl_be',
    'nl_nl', 'pt_ad', 'pt_pt',
]
# fmt: on

# search-url
url = 'https://api.qwant.com/v3/search/{keyword}?{query}&count={count}&offset={offset}'
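# The template above is filled in request() below; an illustrative example of
# a final URL (the parameter values are assumptions, not captured traffic):
#   https://api.qwant.com/v3/search/web?q=foo&count=10&offset=0&locale=en_US&safesearch=1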


def request(query, params):
    """Qwant search request"""

    if not query:
        return None

    count = 10  # web: count must be equal to 10

    if qwant_categ == 'images':
        count = 50
        offset = (params['pageno'] - 1) * count
        # count + offset must be lower than 250
        offset = min(offset, 199)
    else:
        offset = (params['pageno'] - 1) * count
        # count + offset must be lower than 50
        offset = min(offset, 40)

    params['url'] = url.format(
        keyword=qwant_categ,
        query=urlencode({'q': query}),
        offset=offset,
        count=count,
    )

    # add Qwant's locale
    q_locale = get_engine_locale(params['language'], supported_languages, default='en_US')
    params['url'] += '&locale=' + q_locale

    # add safesearch option
    params['url'] += safe_search_map.get(params['safesearch'], '')

    params['raise_for_httperror'] = False
    return params


def response(resp):
    """Get response from Qwant's search request"""
    # pylint: disable=too-many-locals, too-many-branches, too-many-statements

    results = []

    # load JSON result
    search_results = loads(resp.text)
    data = search_results.get('data', {})

    # check for an API error
    if search_results.get('status') != 'success':
        msg = ",".join(data.get('message', ['unknown']))
        raise SearxEngineAPIException('API error::' + msg)

    # raise for other errors
    raise_for_httperror(resp)
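    # shape of the WEB payload handled below, reconstructed from this parser
    # (the API is undocumented, so this sketch is an assumption):
    #
    #   {"status": "success",
    #    "data": {"result": {"items": {"mainline": [
    #        {"type": "web", "items": [{"title": ..., "url": ..., "desc": ...}]},
    #        {"type": "ads", "items": [...]},
    #    ]}}}}
    #
    # for News, Images and Videos, result['items'] is the item list itself.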
    if qwant_categ == 'web':
        # The WEB query contains a list named 'mainline'.  This list can
        # contain different result types (e.g. mainline[0]['type'] returns the
        # type of the result items in mainline[0]['items']).
        mainline = data.get('result', {}).get('items', {}).get('mainline', {})
    else:
        # Queries on News, Images and Videos do not have a list named
        # 'mainline' in the response.  The result items are directly in the
        # list result['items'].
        mainline = data.get('result', {}).get('items', [])
        mainline = [
            {'type': qwant_categ, 'items': mainline},
        ]

    # return empty array if there are no results
    if not mainline:
        return []
    for row in mainline:

        mainline_type = row.get('type', 'web')
        if mainline_type != qwant_categ:
            continue

        if mainline_type == 'ads':
            # ignore ads
            continue

        mainline_items = row.get('items', [])
        for item in mainline_items:

            title = item.get('title', None)
            res_url = item.get('url', None)

            if mainline_type == 'web':
                content = item['desc']
                results.append(
                    {
                        'title': title,
                        'url': res_url,
                        'content': content,
                    }
                )
            elif mainline_type == 'news':

                pub_date = item['date']
                if pub_date is not None:
                    pub_date = datetime.fromtimestamp(pub_date)
                news_media = item.get('media', [])
                img_src = None
                if news_media:
                    img_src = news_media[0].get('pict', {}).get('url', None)
                results.append(
                    {
                        'title': title,
                        'url': res_url,
                        'publishedDate': pub_date,
                        'img_src': img_src,
                    }
                )
            elif mainline_type == 'images':
                thumbnail = item['thumbnail']
                img_src = item['media']
                results.append(
                    {
                        'title': title,
                        'url': res_url,
                        'template': 'images.html',
                        'thumbnail_src': thumbnail,
                        'img_src': img_src,
                    }
                )
            elif mainline_type == 'videos':
                # Some videos do not have a description: while qwant-video
                # returns an empty string, such a video from a qwant-web query
                # is missing the 'desc' key entirely.
                d, s, c = item.get('desc'), item.get('source'), item.get('channel')
                content_parts = []
                if d:
                    content_parts.append(d)
                if s:
                    content_parts.append("%s: %s " % (gettext("Source"), s))
                if c:
                    content_parts.append("%s: %s " % (gettext("Channel"), c))
                content = ' // '.join(content_parts)
                length = item['duration']
                if length is not None:
                    length = timedelta(milliseconds=length)
                pub_date = item['date']
                if pub_date is not None:
                    pub_date = datetime.fromtimestamp(pub_date)
                thumbnail = item['thumbnail']
                # from some locations (DE and others?) the s2 link responds
                # with a 'Please wait ..' page and does not deliver the
                # thumbnail
                thumbnail = thumbnail.replace('https://s2.qwant.com', 'https://s1.qwant.com', 1)
                results.append(
                    {
                        'title': title,
                        'url': res_url,
                        'content': content,
                        'publishedDate': pub_date,
                        'thumbnail': thumbnail,
                        'template': 'videos.html',
                        'length': length,
                    }
                )

    return results


def _fetch_supported_languages(resp):
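    """Parse the locales Qwant supports out of the INITIAL_PROPS JSON that is
    embedded in a <script> tag of https://www.qwant.com/ (the engine's
    ``supported_languages_url``) and map them to SearXNG's notation."""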

    text = resp.text
    text = text[text.find('INITIAL_PROPS') :]
    text = text[text.find('{') : text.find('</script>')]

    q_initial_props = loads(text)
    q_locales = q_initial_props.get('locales')
    q_valid_locales = []

    for country, v in q_locales.items():
        for lang in v['langs']:
            _locale = "{lang}_{country}".format(lang=lang, country=country)

            if qwant_categ == 'news' and _locale.lower() not in qwant_news_locales:
                # qwant-news does not support all locales from qwant-web:
                continue

            q_valid_locales.append(_locale)

    supported_languages = {}

    for q_locale in q_valid_locales:
        try:
            locale = babel.Locale.parse(q_locale, sep='_')
        except babel.core.UnknownLocaleError:
            print("ERROR: can't determine babel locale of Qwant's locale %s" % q_locale)
            continue

        # note: supported_languages (dict)
        #
        # dict's key is a string built up from a babel.Locale object / the
        # notation 'xx-XX' (and 'xx') conforms to SearXNG's locale (and
        # language) notation and dict's values are the locale strings used by
        # the engine.
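        #
        # an illustrative (assumed) entry: supported_languages['fr-FR'] = 'fr_FR'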
        searxng_locale = locale.language + '-' + locale.territory  # --> params['language']
        supported_languages[searxng_locale] = q_locale

    return supported_languages