bing.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. # lint: pylint
  3. """This is the implementation of the Bing-WEB engine. Some of this
  4. implementations are shared by other engines:
  5. - :ref:`bing images engine`
  6. - :ref:`bing news engine`
  7. - :ref:`bing videos engine`
  8. On the `preference page`_ Bing offers a lot of languages and regions (see section
  9. 'Search results languages' and 'Country/region'). However, the abundant choice
  10. does not correspond to reality, where Bing has a full-text indexer only for a
  11. limited number of languages. By example: you can select a language like Māori
  12. but you never get a result in this language.
  13. What comes a bit closer to the truth are the `search-APIs`_ but they don't seem
  14. to be completely correct either (if you take a closer look you will find some
  15. inaccuracies there too):
  16. - :py:obj:`searx.engines.bing.bing_traits_url`
  17. - :py:obj:`searx.engines.bing_videos.bing_traits_url`
  18. - :py:obj:`searx.engines.bing_images.bing_traits_url`
  19. - :py:obj:`searx.engines.bing_news.bing_traits_url`
  20. .. _preference page: https://www.bing.com/account/general
  21. .. _search-APIs: https://learn.microsoft.com/en-us/bing/search-apis/
  22. """
  23. # pylint: disable=too-many-branches, invalid-name
  24. from typing import TYPE_CHECKING
  25. import datetime
  26. import re
  27. import uuid
  28. from urllib.parse import urlencode
  29. from lxml import html
  30. import babel
  31. import babel.languages
  32. from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex
  33. from searx import network
  34. from searx.locales import language_tag, region_tag
  35. from searx.enginelib.traits import EngineTraits
  36. if TYPE_CHECKING:
  37. import logging
  38. logger: logging.Logger
  39. traits: EngineTraits
# Engine metadata shown on the SearXNG preferences/about pages.
about = {
    "website": 'https://www.bing.com',
    "wikidata_id": 'Q182496',
    "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-web-search-api',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

send_accept_language_header = True
"""Bing tries to guess user's language and territory from the HTTP
Accept-Language. Optional the user can select a search-language (can be
different to the UI language) and a region (market code)."""

# engine dependent config
categories = ['general', 'web']
paging = True
time_range_support = True
safesearch = True
# Maps SearXNG's numeric safesearch level to Bing's ADLT cookie value.
safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'}  # cookie: ADLT=STRICT

base_url = 'https://www.bing.com/search'
"""Bing (Web) search URL"""

bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/reference/market-codes'
"""Bing (Web) search API description"""
  62. def _get_offset_from_pageno(pageno):
  63. return (pageno - 1) * 10 + 1
  64. def set_bing_cookies(params, engine_language, engine_region, SID):
  65. # set cookies
  66. # -----------
  67. params['cookies']['_EDGE_V'] = '1'
  68. # _EDGE_S: F=1&SID=3A5253BD6BCA609509B741876AF961CA&mkt=zh-tw
  69. _EDGE_S = [
  70. 'F=1',
  71. 'SID=%s' % SID,
  72. 'mkt=%s' % engine_region.lower(),
  73. 'ui=%s' % engine_language.lower(),
  74. ]
  75. params['cookies']['_EDGE_S'] = '&'.join(_EDGE_S)
  76. logger.debug("cookie _EDGE_S=%s", params['cookies']['_EDGE_S'])
  77. # "_EDGE_CD": "m=zh-tw",
  78. _EDGE_CD = [ # pylint: disable=invalid-name
  79. 'm=%s' % engine_region.lower(), # search region: zh-cn
  80. 'u=%s' % engine_language.lower(), # UI: en-us
  81. ]
  82. params['cookies']['_EDGE_CD'] = '&'.join(_EDGE_CD) + ';'
  83. logger.debug("cookie _EDGE_CD=%s", params['cookies']['_EDGE_CD'])
  84. SRCHHPGUSR = [ # pylint: disable=invalid-name
  85. 'SRCHLANG=%s' % engine_language,
  86. # Trying to set ADLT cookie here seems not to have any effect, I assume
  87. # there is some age verification by a cookie (and/or session ID) needed,
  88. # to disable the SafeSearch.
  89. 'ADLT=%s' % safesearch_types.get(params['safesearch'], 'DEMOTE'),
  90. ]
  91. params['cookies']['SRCHHPGUSR'] = '&'.join(SRCHHPGUSR)
  92. logger.debug("cookie SRCHHPGUSR=%s", params['cookies']['SRCHHPGUSR'])
  93. def request(query, params):
  94. """Assemble a Bing-Web request."""
  95. engine_region = traits.get_region(params['searxng_locale'], 'en-US')
  96. engine_language = traits.get_language(params['searxng_locale'], 'en')
  97. SID = uuid.uuid1().hex.upper()
  98. CVID = uuid.uuid1().hex.upper()
  99. set_bing_cookies(params, engine_language, engine_region, SID)
  100. # build URL query
  101. # ---------------
  102. # query term
  103. page = int(params.get('pageno', 1))
  104. query_params = {
  105. # fmt: off
  106. 'q': query,
  107. 'pq': query,
  108. 'cvid': CVID,
  109. 'qs': 'n',
  110. 'sp': '-1'
  111. # fmt: on
  112. }
  113. # page
  114. if page > 1:
  115. referer = base_url + '?' + urlencode(query_params)
  116. params['headers']['Referer'] = referer
  117. logger.debug("headers.Referer --> %s", referer)
  118. query_params['first'] = _get_offset_from_pageno(page)
  119. if page == 2:
  120. query_params['FORM'] = 'PERE'
  121. elif page > 2:
  122. query_params['FORM'] = 'PERE%s' % (page - 2)
  123. filters = ''
  124. if params['time_range']:
  125. query_params['filt'] = 'custom'
  126. if params['time_range'] == 'day':
  127. filters = 'ex1:"ez1"'
  128. elif params['time_range'] == 'week':
  129. filters = 'ex1:"ez2"'
  130. elif params['time_range'] == 'month':
  131. filters = 'ex1:"ez3"'
  132. elif params['time_range'] == 'year':
  133. epoch_1970 = datetime.date(1970, 1, 1)
  134. today_no = (datetime.date.today() - epoch_1970).days
  135. filters = 'ex1:"ez5_%s_%s"' % (today_no - 365, today_no)
  136. params['url'] = base_url + '?' + urlencode(query_params)
  137. if filters:
  138. params['url'] = params['url'] + '&filters=' + filters
  139. return params
def response(resp):
    """Parse a Bing-Web response into a list of SearXNG result dicts.

    Returns result dicts with ``url``/``title``/``content`` keys plus one
    trailing ``{'number_of_results': ...}`` entry, or ``[]`` when the
    requested page offset lies beyond the reported result count.
    """
    results = []
    result_len = 0

    dom = html.fromstring(resp.text)

    # parse results again if nothing is found yet

    # Bookkeeping for redirect URLs that need an extra HTTP request:
    # url_to_resolve[k] belongs to results[url_to_resolve_index[k]].
    url_to_resolve = []
    url_to_resolve_index = []
    i = 0
    for result in eval_xpath_list(dom, '//ol[@id="b_results"]/li[contains(@class, "b_algo")]'):

        link = eval_xpath_getindex(result, './/h2/a', 0, None)
        if link is None:
            # entry without a title link --> not a usable result
            continue
        url = link.attrib.get('href')
        title = extract_text(link)

        # Make sure that the element is free of <a href> links and <span class='algoSlug_icon'>
        content = eval_xpath(result, '(.//p)[1]')
        for p in content:
            for e in p.xpath('.//a'):
                e.getparent().remove(e)
            for e in p.xpath('.//span[@class="algoSlug_icon"]'):
                e.getparent().remove(e)
        content = extract_text(content)

        # get the real URL either using the URL shown to user or following the Bing URL
        if url.startswith('https://www.bing.com/ck/a?'):
            # Bing wraps result URLs in a redirect; prefer the URL Bing
            # displays in the attribution line, if it is complete.
            url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite'))
            # Bing can shorten the URL either at the end or in the middle of the string
            if (
                url_cite
                and url_cite.startswith('https://')
                and '…' not in url_cite
                and '...' not in url_cite
                and '›' not in url_cite
            ):
                # no need for an additional HTTP request
                url = url_cite
            else:
                # resolve the URL with an additional HTTP request
                url_to_resolve.append(url.replace('&ntb=1', '&ntb=F'))
                url_to_resolve_index.append(i)
                url = None  # remove the result if the HTTP Bing redirect raise an exception

        # append result
        results.append({'url': url, 'title': title, 'content': content})
        # increment result pointer for the next iteration in this loop
        i += 1

    # resolve all Bing redirections in parallel
    request_list = [
        network.Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve
    ]
    response_list = network.multi_requests(request_list)
    for i, redirect_response in enumerate(response_list):
        # a failed redirect leaves the corresponding result's url as None
        if not isinstance(redirect_response, Exception):
            results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location']

    # get number_of_results
    try:
        result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
        if "-" in result_len_container:
            # Remove the part "from-to" for paginated request ...
            # NOTE(review): the slice offset `find("-") * 2 + 2` presumably
            # skips past a "from-to of total" pattern -- confirm against a
            # live sb_count string before changing.
            result_len_container = result_len_container[result_len_container.find("-") * 2 + 2 :]
        result_len_container = re.sub('[^0-9]', '', result_len_container)
        if len(result_len_container) > 0:
            result_len = int(result_len_container)
    except Exception as e:  # pylint: disable=broad-except
        logger.debug('result error :\n%s', e)

    # drop everything when the requested offset exceeds the total count
    if result_len and _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len:
        return []

    results.append({'number_of_results': result_len})
    return results
  207. def fetch_traits(engine_traits: EngineTraits):
  208. """Fetch languages and regions from Bing-Web."""
  209. xpath_market_codes = '//table[1]/tbody/tr/td[3]'
  210. # xpath_country_codes = '//table[2]/tbody/tr/td[2]'
  211. xpath_language_codes = '//table[3]/tbody/tr/td[2]'
  212. _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)
  213. def _fetch_traits(engine_traits: EngineTraits, url: str, xpath_language_codes: str, xpath_market_codes: str):
  214. # insert alias to map from a language (zh) to a language + script (zh_Hans)
  215. engine_traits.languages['zh'] = 'zh-hans'
  216. resp = network.get(url)
  217. if not resp.ok:
  218. print("ERROR: response from peertube is not OK.")
  219. dom = html.fromstring(resp.text)
  220. map_lang = {'jp': 'ja'}
  221. for td in eval_xpath(dom, xpath_language_codes):
  222. eng_lang = td.text
  223. if eng_lang in ('en-gb', 'pt-br'):
  224. # language 'en' is already in the list and a language 'en-gb' can't
  225. # be handled in SearXNG, same with pt-br which is covered by pt-pt.
  226. continue
  227. babel_lang = map_lang.get(eng_lang, eng_lang).replace('-', '_')
  228. try:
  229. sxng_tag = language_tag(babel.Locale.parse(babel_lang))
  230. except babel.UnknownLocaleError:
  231. print("ERROR: language (%s) is unknown by babel" % (eng_lang))
  232. continue
  233. conflict = engine_traits.languages.get(sxng_tag)
  234. if conflict:
  235. if conflict != eng_lang:
  236. print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang))
  237. continue
  238. engine_traits.languages[sxng_tag] = eng_lang
  239. map_region = {
  240. 'en-ID': 'id_ID',
  241. 'no-NO': 'nb_NO',
  242. }
  243. for td in eval_xpath(dom, xpath_market_codes):
  244. eng_region = td.text
  245. babel_region = map_region.get(eng_region, eng_region).replace('-', '_')
  246. if eng_region == 'en-WW':
  247. engine_traits.all_locale = eng_region
  248. continue
  249. try:
  250. sxng_tag = region_tag(babel.Locale.parse(babel_region))
  251. except babel.UnknownLocaleError:
  252. print("ERROR: region (%s) is unknown by babel" % (eng_region))
  253. continue
  254. conflict = engine_traits.regions.get(sxng_tag)
  255. if conflict:
  256. if conflict != eng_region:
  257. print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_region))
  258. continue
  259. engine_traits.regions[sxng_tag] = eng_region