# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Brave supports the categories listed in :py:obj:`brave_category` (General,
news, videos, images).  Support for :py:obj:`paging` and :py:obj:`time range
<time_range_support>` is limited (see remarks).

Configured ``brave`` engines:

.. code:: yaml

  - name: brave
    engine: brave
    ...
    brave_category: search
    time_range_support: true
    paging: true

  - name: brave.images
    engine: brave
    ...
    brave_category: images

  - name: brave.videos
    engine: brave
    ...
    brave_category: videos

  - name: brave.news
    engine: brave
    ...
    brave_category: news

Implementations
===============

"""
# pylint: disable=fixme

from urllib.parse import (
    urlencode,
    urlparse,
    parse_qs,
)

import chompjs
from lxml import html

from searx.utils import (
    extract_text,
    eval_xpath_list,
    eval_xpath_getindex,
)

about = {
    "website": 'https://search.brave.com/',
    "wikidata_id": 'Q22906900',
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

base_url = "https://search.brave.com/"
categories = []
brave_category = 'search'
  53. """Brave supports common web-search, video search, image and video search.
  54. - ``search``: Common WEB search
  55. - ``videos``: search for videos
  56. - ``images``: search for images
  57. - ``news``: search for news
  58. """
brave_spellcheck = False
"""Brave supports some kind of spell checking.  When activated, Brave tries to
fix typos, e.g. it searches for ``food`` when the user queries for ``fooh``.
In the UI of Brave the user gets warned about this.  Since we cannot warn the
user in SearXNG, spellchecking is disabled by default.
"""
send_accept_language_header = True

paging = False
"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
category All)."""

safesearch = True
safesearch_map = {2: 'strict', 1: 'moderate', 0: 'off'}  # cookie: safesearch=off

time_range_support = False
"""Brave only supports time-range in :py:obj:`brave_category` ``search`` (UI
category All)."""

time_range_map = {
    'day': 'pd',
    'week': 'pw',
    'month': 'pm',
    'year': 'py',
}
def request(query, params):

    # Don't accept br encoding / see https://github.com/searxng/searxng/pull/1787
    params['headers']['Accept-Encoding'] = 'gzip, deflate'

    args = {
        'q': query,
    }
    if brave_spellcheck:
        args['spellcheck'] = '1'

    if brave_category == 'search':
        # paging and time-range are only supported in the 'search' category
        if params.get('pageno', 1) - 1:
            args['offset'] = params.get('pageno', 1) - 1
        if time_range_map.get(params['time_range']):
            args['tf'] = time_range_map.get(params['time_range'])

    params["url"] = f"{base_url}{brave_category}?{urlencode(args)}"

    # set preferences in cookie
    params['cookies']['safesearch'] = safesearch_map.get(params['safesearch'], 'off')

    # ToDo: we need a fetch_traits(..) implementation / the ui_lang of Brave are
    # limited and the country handling has its quirks
    eng_locale = params.get('searxng_locale')
    params['cookies']['useLocation'] = '0'  # the useLocation is IP based, we use 'country'
    params['cookies']['summarizer'] = '0'

    if not eng_locale or eng_locale == 'all':
        params['cookies']['country'] = 'all'  # country=all
    else:
        params['cookies']['country'] = eng_locale.split('-')[-1].lower()
        params['cookies']['ui_lang'] = eng_locale.split('-')[0].lower()

    # logger.debug("cookies %s", params['cookies'])
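
# A rough sketch of what ``request`` produces for the default ``search``
# category.  The ``params`` dict below is a simplified stand-in for the one
# SearXNG passes in, not the full structure:
#
#     params = {'headers': {}, 'cookies': {}, 'pageno': 2,
#               'time_range': 'week', 'safesearch': 1,
#               'searxng_locale': 'en-US'}
#     request('searxng', params)
#     # params['url']
#     #   -> 'https://search.brave.com/search?q=searxng&offset=1&tf=pw'
#     # params['cookies']
#     #   -> {'safesearch': 'moderate', 'useLocation': '0', 'summarizer': '0',
#     #       'country': 'us', 'ui_lang': 'en'}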
def response(resp):

    if brave_category == 'search':
        return _parse_search(resp)

    # the other categories embed their results in a JS data structure that is
    # assigned to ``const data = ...`` somewhere in the page
    datastr = ""
    for line in resp.text.split("\n"):
        if "const data = " in line:
            datastr = line.replace("const data = ", "").strip()[:-1]
            break

    json_data = chompjs.parse_js_object(datastr)
    json_resp = json_data[1]['data']['body']['response']

    if brave_category == 'news':
        json_resp = json_resp['news']
        return _parse_news(json_resp)

    if brave_category == 'images':
        return _parse_images(json_resp)
    if brave_category == 'videos':
        return _parse_videos(json_resp)

    return []
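
# Reconstructed from the lookups above, the embedded payload has roughly this
# shape (only the keys this module reads are shown; the surrounding structure
# is an assumption):
#
#     const data = [_, {'data': {'body': {'response': {
#         'results': [...],            # images & videos
#         'news': {'results': [...]},  # news
#     }}}}];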
def _parse_search(resp):

    result_list = []
    dom = html.fromstring(resp.text)

    answer_tag = eval_xpath_getindex(dom, '//div[@class="answer"]', 0, default=None)
    if answer_tag:
        result_list.append({'answer': extract_text(answer_tag)})

    # xpath_results = '//div[contains(@class, "snippet fdb") and @data-type="web"]'
    xpath_results = '//div[contains(@class, "snippet")]'

    for result in eval_xpath_list(dom, xpath_results):

        url = eval_xpath_getindex(result, './/a[@class="result-header"]/@href', 0, default=None)
        title_tag = eval_xpath_getindex(result, './/span[@class="snippet-title"]', 0, default=None)
        if not (url and title_tag):
            continue

        content_tag = eval_xpath_getindex(result, './/p[@class="snippet-description"]', 0, default='')
        img_src = eval_xpath_getindex(result, './/img[@class="thumb"]/@src', 0, default='')

        item = {
            'url': url,
            'title': extract_text(title_tag),
            'content': extract_text(content_tag),
            'img_src': img_src,
        }

        video_tag = eval_xpath_getindex(
            result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None
        )
        if video_tag:

            # In my tests a video tag in the WEB search was most often not a
            # video, except the ones from youtube ..

            iframe_src = _get_iframe_src(url)
            if iframe_src:
                item['iframe_src'] = iframe_src
                item['template'] = 'videos.html'
                item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')
            else:
                item['img_src'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')

        result_list.append(item)

    return result_list
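
# The selectors above correspond to markup of roughly this shape; this sketch
# is reconstructed from the XPath expressions, not an authoritative dump of
# Brave's HTML:
#
#     <div class="snippet ...">
#       <a class="result-header" href="...">
#         <span class="snippet-title">...</span>
#       </a>
#       <p class="snippet-description">...</p>
#       <img class="thumb" src="...">
#     </div>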
def _get_iframe_src(url):
    parsed_url = urlparse(url)
    if parsed_url.path == '/watch' and parsed_url.query:
        video_id = parse_qs(parsed_url.query).get('v', [])  # type: ignore
        if video_id:
            return 'https://www.youtube-nocookie.com/embed/' + video_id[0]  # type: ignore
    return None
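
# For example, a YouTube watch URL is rewritten to its privacy-friendly embed
# counterpart:
#
#     _get_iframe_src('https://www.youtube.com/watch?v=dQw4w9WgXcQ')
#     # -> 'https://www.youtube-nocookie.com/embed/dQw4w9WgXcQ'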
def _parse_news(json_resp):
    result_list = []

    for result in json_resp["results"]:
        item = {
            'url': result['url'],
            'title': result['title'],
            'content': result['description'],
        }
        if result['thumbnail'] != "null":
            item['img_src'] = result['thumbnail']['src']
        result_list.append(item)

    return result_list
def _parse_images(json_resp):
    result_list = []

    for result in json_resp["results"]:
        item = {
            'url': result['url'],
            'title': result['title'],
            'content': result['description'],
            'template': 'images.html',
            'img_format': result['properties']['format'],
            'source': result['source'],
            'img_src': result['properties']['url'],
        }
        result_list.append(item)

    return result_list
def _parse_videos(json_resp):
    result_list = []

    for result in json_resp["results"]:

        url = result['url']
        item = {
            'url': url,
            'title': result['title'],
            'content': result['description'],
            'template': 'videos.html',
            'length': result['video']['duration'],
            'duration': result['video']['duration'],
        }

        if result['thumbnail'] != "null":
            item['thumbnail'] = result['thumbnail']['src']

        iframe_src = _get_iframe_src(url)
        if iframe_src:
            item['iframe_src'] = iframe_src

        result_list.append(item)

    return result_list
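
# The fields read by the three parsers above imply result entries of roughly
# this shape (keys not listed may exist but are ignored; judging from the
# comparisons above, 'thumbnail' is either an object with a 'src' or the
# string "null"):
#
#     {'url': '...', 'title': '...', 'description': '...',
#      'source': '...',                                # images
#      'properties': {'format': '...', 'url': '...'},  # images
#      'video': {'duration': '...'},                   # videos
#      'thumbnail': {'src': '...'}}                    # news, videos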