bing.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. # lint: pylint
  3. """This is the implementation of the Bing-WEB engine. Some of this
  4. implementations are shared by other engines:
  5. - :ref:`bing images engine`
  6. - :ref:`bing news engine`
  7. - :ref:`bing videos engine`
  8. On the `preference page`_ Bing offers a lot of languages and regions (see section
  9. 'Search results languages' and 'Country/region'). However, the abundant choice
  10. does not correspond to reality, where Bing has a full-text indexer only for a
  11. limited number of languages. For example: you can select a language like Māori
  12. but you never get a result in this language.
  13. What comes a bit closer to the truth are the `search-APIs`_ but they don't seem
  14. to be completely correct either (if you take a closer look you will find some
  15. inaccuracies there too):
  16. - :py:obj:`searx.engines.bing.bing_traits_url`
  17. - :py:obj:`searx.engines.bing_videos.bing_traits_url`
  18. - :py:obj:`searx.engines.bing_images.bing_traits_url`
  19. - :py:obj:`searx.engines.bing_news.bing_traits_url`
  20. .. _preference page: https://www.bing.com/account/general
  21. .. _search-APIs: https://learn.microsoft.com/en-us/bing/search-apis/
  22. """
  23. # pylint: disable=too-many-branches, invalid-name
  24. from typing import TYPE_CHECKING
  25. import datetime
  26. import re
  27. import uuid
  28. from urllib.parse import urlencode
  29. from lxml import html
  30. import babel
  31. import babel.languages
  32. from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex
  33. from searx import network
  34. from searx.locales import language_tag, region_tag
  35. from searx.enginelib.traits import EngineTraits
  36. if TYPE_CHECKING:
  37. import logging
  38. logger: logging.Logger
  39. traits: EngineTraits
# Engine metadata shown on the SearXNG preferences / about pages.
about = {
    "website": 'https://www.bing.com',
    "wikidata_id": 'Q182496',
    "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-web-search-api',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

send_accept_language_header = True
"""Bing tries to guess user's language and territory from the HTTP
Accept-Language.  Optionally the user can select a search-language (can be
different to the UI language) and a region (market code)."""

# engine dependent config
categories = ['general', 'web']
paging = True
time_range_support = True
safesearch = True
# Maps SearXNG's safesearch levels (0/1/2) to the values of Bing's ADLT
# cookie (see set_bing_cookies below).
safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'}  # cookie: ADLT=STRICT

base_url = 'https://www.bing.com/search'
"""Bing (Web) search URL"""

bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/reference/market-codes'
"""Bing (Web) search API description"""
  62. def _get_offset_from_pageno(pageno):
  63. return (pageno - 1) * 10 + 1
  64. def set_bing_cookies(params, engine_language, engine_region, SID):
  65. # set cookies
  66. # -----------
  67. params['cookies']['_EDGE_V'] = '1'
  68. # _EDGE_S: F=1&SID=3A5253BD6BCA609509B741876AF961CA&mkt=zh-tw
  69. _EDGE_S = [
  70. 'F=1',
  71. 'SID=%s' % SID,
  72. 'mkt=%s' % engine_region.lower(),
  73. 'ui=%s' % engine_language.lower(),
  74. ]
  75. params['cookies']['_EDGE_S'] = '&'.join(_EDGE_S)
  76. logger.debug("cookie _EDGE_S=%s", params['cookies']['_EDGE_S'])
  77. # "_EDGE_CD": "m=zh-tw",
  78. _EDGE_CD = [ # pylint: disable=invalid-name
  79. 'm=%s' % engine_region.lower(), # search region: zh-cn
  80. 'u=%s' % engine_language.lower(), # UI: en-us
  81. ]
  82. params['cookies']['_EDGE_CD'] = '&'.join(_EDGE_CD) + ';'
  83. logger.debug("cookie _EDGE_CD=%s", params['cookies']['_EDGE_CD'])
  84. SRCHHPGUSR = [ # pylint: disable=invalid-name
  85. 'SRCHLANG=%s' % engine_language,
  86. # Trying to set ADLT cookie here seems not to have any effect, I assume
  87. # there is some age verification by a cookie (and/or session ID) needed,
  88. # to disable the SafeSearch.
  89. 'ADLT=%s' % safesearch_types.get(params['safesearch'], 'DEMOTE'),
  90. ]
  91. params['cookies']['SRCHHPGUSR'] = '&'.join(SRCHHPGUSR)
  92. logger.debug("cookie SRCHHPGUSR=%s", params['cookies']['SRCHHPGUSR'])
  93. def request(query, params):
  94. """Assemble a Bing-Web request."""
  95. engine_region = traits.get_region(params['searxng_locale'], 'en-US')
  96. engine_language = traits.get_language(params['searxng_locale'], 'en')
  97. SID = uuid.uuid1().hex.upper()
  98. CVID = uuid.uuid1().hex.upper()
  99. set_bing_cookies(params, engine_language, engine_region, SID)
  100. # build URL query
  101. # ---------------
  102. # query term
  103. page = int(params.get('pageno', 1))
  104. query_params = {
  105. # fmt: off
  106. 'q': query,
  107. 'pq': query,
  108. 'cvid': CVID,
  109. 'qs': 'n',
  110. 'sp': '-1'
  111. # fmt: on
  112. }
  113. # page
  114. if page > 1:
  115. referer = base_url + '?' + urlencode(query_params)
  116. params['headers']['Referer'] = referer
  117. logger.debug("headers.Referer --> %s", referer)
  118. query_params['first'] = _get_offset_from_pageno(page)
  119. if page == 2:
  120. query_params['FORM'] = 'PERE'
  121. elif page > 2:
  122. query_params['FORM'] = 'PERE%s' % (page - 2)
  123. filters = ''
  124. if params['time_range']:
  125. query_params['filt'] = 'custom'
  126. if params['time_range'] == 'day':
  127. filters = 'ex1:"ez1"'
  128. elif params['time_range'] == 'week':
  129. filters = 'ex1:"ez2"'
  130. elif params['time_range'] == 'month':
  131. filters = 'ex1:"ez3"'
  132. elif params['time_range'] == 'year':
  133. epoch_1970 = datetime.date(1970, 1, 1)
  134. today_no = (datetime.date.today() - epoch_1970).days
  135. filters = 'ex1:"ez5_%s_%s"' % (today_no - 365, today_no)
  136. params['url'] = base_url + '?' + urlencode(query_params)
  137. if filters:
  138. params['url'] = params['url'] + '&filters=' + filters
  139. return params
def response(resp):
    """Parse Bing-Web's HTML response into a list of SearXNG result dicts.

    Appends a trailing ``{'number_of_results': ...}`` entry; returns an empty
    list when the requested page offset lies beyond the reported result count.
    """
    results = []
    result_len = 0

    dom = html.fromstring(resp.text)

    # parse results again if nothing is found yet

    # Bing wraps result URLs in a redirect (https://www.bing.com/ck/a?...).
    # URLs that can't be recovered from the visible <cite> are collected here
    # together with the index of the result they belong to, and resolved with
    # extra HTTP requests after the loop.
    url_to_resolve = []
    url_to_resolve_index = []
    i = 0
    for result in eval_xpath_list(dom, '//ol[@id="b_results"]/li[contains(@class, "b_algo")]'):

        link = eval_xpath_getindex(result, './/h2/a', 0, None)
        if link is None:
            continue
        url = link.attrib.get('href')
        title = extract_text(link)

        content = eval_xpath(result, '(.//p)[1]')
        for p in content:
            # Make sure that the element is free of <a href> links
            for e in p.xpath('.//a'):
                e.getparent().remove(e)
        content = extract_text(content)

        # get the real URL either using the URL shown to user or following the Bing URL
        if url.startswith('https://www.bing.com/ck/a?'):
            url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite'))
            # Bing can shorten the URL either at the end or in the middle of the string
            if (
                url_cite
                and url_cite.startswith('https://')
                and '…' not in url_cite
                and '...' not in url_cite
                and '›' not in url_cite
            ):
                # no need for an additional HTTP request
                url = url_cite
            else:
                # resolve the URL with an additional HTTP request
                url_to_resolve.append(url.replace('&ntb=1', '&ntb=F'))
                url_to_resolve_index.append(i)
                url = None  # remove the result if the HTTP Bing redirect raise an exception

        # append result
        results.append({'url': url, 'title': title, 'content': content})
        # increment result pointer for the next iteration in this loop
        i += 1

    # resolve all Bing redirections in parallel
    request_list = [
        network.Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve
    ]
    response_list = network.multi_requests(request_list)
    for i, redirect_response in enumerate(response_list):
        # on exception the result's url stays None (filtered out downstream)
        if not isinstance(redirect_response, Exception):
            results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location']

    # get number_of_results
    try:
        result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
        if "-" in result_len_container:
            # Remove the part "from-to" for paginated request ...
            # heuristic: assumes "from" and "to" have roughly equal width, so
            # everything up to twice the dash position (+2) is cut off
            result_len_container = result_len_container[result_len_container.find("-") * 2 + 2 :]
        result_len_container = re.sub('[^0-9]', '', result_len_container)
        if len(result_len_container) > 0:
            result_len = int(result_len_container)
    except Exception as e:  # pylint: disable=broad-except
        logger.debug('result error :\n%s', e)

    if result_len and _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len:
        return []

    results.append({'number_of_results': result_len})
    return results
  205. def fetch_traits(engine_traits: EngineTraits):
  206. """Fetch languages and regions from Bing-Web."""
  207. xpath_market_codes = '//table[1]/tbody/tr/td[3]'
  208. # xpath_country_codes = '//table[2]/tbody/tr/td[2]'
  209. xpath_language_codes = '//table[3]/tbody/tr/td[2]'
  210. _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)
  211. def _fetch_traits(engine_traits: EngineTraits, url: str, xpath_language_codes: str, xpath_market_codes: str):
  212. # insert alias to map from a language (zh) to a language + script (zh_Hans)
  213. engine_traits.languages['zh'] = 'zh-hans'
  214. resp = network.get(url)
  215. if not resp.ok:
  216. print("ERROR: response from peertube is not OK.")
  217. dom = html.fromstring(resp.text)
  218. map_lang = {'jp': 'ja'}
  219. for td in eval_xpath(dom, xpath_language_codes):
  220. eng_lang = td.text
  221. if eng_lang in ('en-gb', 'pt-br'):
  222. # language 'en' is already in the list and a language 'en-gb' can't
  223. # be handled in SearXNG, same with pt-br which is covered by pt-pt.
  224. continue
  225. babel_lang = map_lang.get(eng_lang, eng_lang).replace('-', '_')
  226. try:
  227. sxng_tag = language_tag(babel.Locale.parse(babel_lang))
  228. except babel.UnknownLocaleError:
  229. print("ERROR: language (%s) is unknown by babel" % (eng_lang))
  230. continue
  231. conflict = engine_traits.languages.get(sxng_tag)
  232. if conflict:
  233. if conflict != eng_lang:
  234. print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang))
  235. continue
  236. engine_traits.languages[sxng_tag] = eng_lang
  237. map_region = {
  238. 'en-ID': 'id_ID',
  239. 'no-NO': 'nb_NO',
  240. }
  241. for td in eval_xpath(dom, xpath_market_codes):
  242. eng_region = td.text
  243. babel_region = map_region.get(eng_region, eng_region).replace('-', '_')
  244. if eng_region == 'en-WW':
  245. engine_traits.all_locale = eng_region
  246. continue
  247. try:
  248. sxng_tag = region_tag(babel.Locale.parse(babel_region))
  249. except babel.UnknownLocaleError:
  250. print("ERROR: region (%s) is unknown by babel" % (eng_region))
  251. continue
  252. conflict = engine_traits.regions.get(sxng_tag)
  253. if conflict:
  254. if conflict != eng_region:
  255. print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_region))
  256. continue
  257. engine_traits.regions[sxng_tag] = eng_region