bing.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. # lint: pylint
  3. """This is the implementation of the Bing-WEB engine. Some of this
  4. implementations are shared by other engines:
  5. - :ref:`bing images engine`
  6. - :ref:`bing news engine`
  7. - :ref:`bing videos engine`
  8. On the `preference page`_ Bing offers a lot of languages and regions (see section
  9. 'Search results languages' and 'Country/region'). However, the abundant choice
  10. does not correspond to reality, where Bing has a full-text indexer only for a
  11. limited number of languages. For example: you can select a language like Māori,
  12. but you will never get a result in that language.
  13. What comes a bit closer to the truth are the `search-APIs`_ but they don't seem
  14. to be completely correct either (if you take a closer look you will find some
  15. inaccuracies there too):
  16. - :py:obj:`searx.engines.bing.bing_traits_url`
  17. - :py:obj:`searx.engines.bing_videos.bing_traits_url`
  18. - :py:obj:`searx.engines.bing_images.bing_traits_url`
  19. - :py:obj:`searx.engines.bing_news.bing_traits_url`
  20. .. _preference page: https://www.bing.com/account/general
  21. .. _search-APIs: https://learn.microsoft.com/en-us/bing/search-apis/
  22. """
  23. # pylint: disable=too-many-branches, invalid-name
  24. from typing import TYPE_CHECKING
  25. import datetime
  26. import re
  27. import uuid
  28. from urllib.parse import urlencode
  29. from lxml import html
  30. import babel
  31. import babel.languages
  32. from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex
  33. from searx.locales import language_tag, region_tag
  34. from searx.enginelib.traits import EngineTraits
  35. if TYPE_CHECKING:
  36. import logging
  37. logger: logging.Logger
  38. traits: EngineTraits
  39. about = {
  40. "website": 'https://www.bing.com',
  41. "wikidata_id": 'Q182496',
  42. "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-web-search-api',
  43. "use_official_api": False,
  44. "require_api_key": False,
  45. "results": 'HTML',
  46. }
  47. send_accept_language_header = True
  48. """Bing tries to guess user's language and territory from the HTTP
  49. Accept-Language. Optional the user can select a search-language (can be
  50. different to the UI language) and a region (market code)."""
  51. # engine dependent config
  52. categories = ['general', 'web']
  53. paging = True
  54. time_range_support = True
  55. safesearch = True
  56. safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'} # cookie: ADLT=STRICT
  57. base_url = 'https://www.bing.com/search'
  58. """Bing (Web) search URL"""
  59. bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/reference/market-codes'
  60. """Bing (Web) search API description"""
  61. def _get_offset_from_pageno(pageno):
  62. return (pageno - 1) * 10 + 1
  63. def set_bing_cookies(params, engine_language, engine_region, SID):
  64. # set cookies
  65. # -----------
  66. params['cookies']['_EDGE_V'] = '1'
  67. # _EDGE_S: F=1&SID=3A5253BD6BCA609509B741876AF961CA&mkt=zh-tw
  68. _EDGE_S = [
  69. 'F=1',
  70. 'SID=%s' % SID,
  71. 'mkt=%s' % engine_region.lower(),
  72. 'ui=%s' % engine_language.lower(),
  73. ]
  74. params['cookies']['_EDGE_S'] = '&'.join(_EDGE_S)
  75. logger.debug("cookie _EDGE_S=%s", params['cookies']['_EDGE_S'])
  76. # "_EDGE_CD": "m=zh-tw",
  77. _EDGE_CD = [ # pylint: disable=invalid-name
  78. 'm=%s' % engine_region.lower(), # search region: zh-cn
  79. 'u=%s' % engine_language.lower(), # UI: en-us
  80. ]
  81. params['cookies']['_EDGE_CD'] = '&'.join(_EDGE_CD) + ';'
  82. logger.debug("cookie _EDGE_CD=%s", params['cookies']['_EDGE_CD'])
  83. SRCHHPGUSR = [ # pylint: disable=invalid-name
  84. 'SRCHLANG=%s' % engine_language,
  85. # Trying to set ADLT cookie here seems not to have any effect, I assume
  86. # there is some age verification by a cookie (and/or session ID) needed,
  87. # to disable the SafeSearch.
  88. 'ADLT=%s' % safesearch_types.get(params['safesearch'], 'DEMOTE'),
  89. ]
  90. params['cookies']['SRCHHPGUSR'] = '&'.join(SRCHHPGUSR)
  91. logger.debug("cookie SRCHHPGUSR=%s", params['cookies']['SRCHHPGUSR'])
  92. def request(query, params):
  93. """Assemble a Bing-Web request."""
  94. engine_region = traits.get_region(params['searxng_locale'], 'en-US')
  95. engine_language = traits.get_language(params['searxng_locale'], 'en')
  96. SID = uuid.uuid1().hex.upper()
  97. CVID = uuid.uuid1().hex.upper()
  98. set_bing_cookies(params, engine_language, engine_region, SID)
  99. # build URL query
  100. # ---------------
  101. # query term
  102. page = int(params.get('pageno', 1))
  103. query_params = {
  104. # fmt: off
  105. 'q': query,
  106. 'pq': query,
  107. 'cvid': CVID,
  108. 'qs': 'n',
  109. 'sp': '-1'
  110. # fmt: on
  111. }
  112. # page
  113. if page > 1:
  114. referer = base_url + '?' + urlencode(query_params)
  115. params['headers']['Referer'] = referer
  116. logger.debug("headers.Referer --> %s", referer)
  117. query_params['first'] = _get_offset_from_pageno(page)
  118. if page == 2:
  119. query_params['FORM'] = 'PERE'
  120. elif page > 2:
  121. query_params['FORM'] = 'PERE%s' % (page - 2)
  122. filters = ''
  123. if params['time_range']:
  124. query_params['filt'] = 'custom'
  125. if params['time_range'] == 'day':
  126. filters = 'ex1:"ez1"'
  127. elif params['time_range'] == 'week':
  128. filters = 'ex1:"ez2"'
  129. elif params['time_range'] == 'month':
  130. filters = 'ex1:"ez3"'
  131. elif params['time_range'] == 'year':
  132. epoch_1970 = datetime.date(1970, 1, 1)
  133. today_no = (datetime.date.today() - epoch_1970).days
  134. filters = 'ex1:"ez5_%s_%s"' % (today_no - 365, today_no)
  135. params['url'] = base_url + '?' + urlencode(query_params)
  136. if filters:
  137. params['url'] = params['url'] + '&filters=' + filters
  138. return params
def response(resp):
    """Parse a Bing-Web response page into SearXNG results.

    Extracts url/title/content triples from the ``b_algo`` list items and,
    for results whose href is a Bing click-tracking redirect
    (``https://www.bing.com/ck/a?...``), resolves the real URL — either from
    the visible ``cite`` text or via an extra (parallel) HTTP request.
    Appends a ``{'number_of_results': ...}`` entry at the end.
    """
    # pylint: disable=too-many-locals,import-outside-toplevel

    from searx.network import Request, multi_requests  # see https://github.com/searxng/searxng/issues/762

    results = []
    result_len = 0

    dom = html.fromstring(resp.text)

    # parse results again if nothing is found yet

    # bookkeeping for redirect resolution: URL to fetch and the index of the
    # result (in `results`) it belongs to
    url_to_resolve = []
    url_to_resolve_index = []
    i = 0
    for result in eval_xpath_list(dom, '//ol[@id="b_results"]/li[contains(@class, "b_algo")]'):

        link = eval_xpath_getindex(result, './/h2/a', 0, None)
        if link is None:
            continue
        url = link.attrib.get('href')
        title = extract_text(link)

        content = eval_xpath(result, '(.//p)[1]')
        for p in content:
            # Make sure that the element is free of <a href> links
            for e in p.xpath('.//a'):
                e.getparent().remove(e)
        content = extract_text(content)

        # get the real URL either using the URL shown to user or following the Bing URL
        if url.startswith('https://www.bing.com/ck/a?'):
            url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite'))
            # Bing can shorten the URL either at the end or in the middle of the string
            if (
                url_cite
                and url_cite.startswith('https://')
                and '…' not in url_cite
                and '...' not in url_cite
                and '›' not in url_cite
            ):
                # no need for an additional HTTP request
                url = url_cite
            else:
                # resolve the URL with an additional HTTP request
                url_to_resolve.append(url.replace('&ntb=1', '&ntb=F'))
                url_to_resolve_index.append(i)
                url = None  # remove the result if the HTTP Bing redirect raise an exception

        # append result
        results.append({'url': url, 'title': title, 'content': content})
        # increment result pointer for the next iteration in this loop
        i += 1

    # resolve all Bing redirections in parallel
    request_list = [
        Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve
    ]
    response_list = multi_requests(request_list)
    for i, redirect_response in enumerate(response_list):
        if not isinstance(redirect_response, Exception):
            # real target URL is in the redirect's Location header
            results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location']

    # get number_of_results
    try:
        result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
        if "-" in result_len_container:
            # Remove the part "from-to" for paginated request ...
            # NOTE(review): heuristic slice — appears to assume the "from" and
            # "to" numbers around the '-' have the same number of digits;
            # confirm against live sb_count markup.
            result_len_container = result_len_container[result_len_container.find("-") * 2 + 2 :]
        result_len_container = re.sub('[^0-9]', '', result_len_container)
        if len(result_len_container) > 0:
            result_len = int(result_len_container)
    except Exception as e:  # pylint: disable=broad-except
        logger.debug('result error :\n%s', e)

    # requesting past the last page --> no results
    if result_len and _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len:
        return []

    results.append({'number_of_results': result_len})
    return results
  206. def fetch_traits(engine_traits: EngineTraits):
  207. """Fetch languages and regions from Bing-Web."""
  208. xpath_market_codes = '//table[1]/tbody/tr/td[3]'
  209. # xpath_country_codes = '//table[2]/tbody/tr/td[2]'
  210. xpath_language_codes = '//table[3]/tbody/tr/td[2]'
  211. _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)
  212. def _fetch_traits(engine_traits: EngineTraits, url: str, xpath_language_codes: str, xpath_market_codes: str):
  213. # pylint: disable=too-many-locals,import-outside-toplevel
  214. from searx.network import get # see https://github.com/searxng/searxng/issues/762
  215. # insert alias to map from a language (zh) to a language + script (zh_Hans)
  216. engine_traits.languages['zh'] = 'zh-hans'
  217. resp = get(url)
  218. if not resp.ok: # type: ignore
  219. print("ERROR: response from peertube is not OK.")
  220. dom = html.fromstring(resp.text) # type: ignore
  221. map_lang = {'jp': 'ja'}
  222. for td in eval_xpath(dom, xpath_language_codes):
  223. eng_lang = td.text
  224. if eng_lang in ('en-gb', 'pt-br'):
  225. # language 'en' is already in the list and a language 'en-gb' can't
  226. # be handled in SearXNG, same with pt-br which is covered by pt-pt.
  227. continue
  228. babel_lang = map_lang.get(eng_lang, eng_lang).replace('-', '_')
  229. try:
  230. sxng_tag = language_tag(babel.Locale.parse(babel_lang))
  231. except babel.UnknownLocaleError:
  232. print("ERROR: language (%s) is unknown by babel" % (eng_lang))
  233. continue
  234. conflict = engine_traits.languages.get(sxng_tag)
  235. if conflict:
  236. if conflict != eng_lang:
  237. print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang))
  238. continue
  239. engine_traits.languages[sxng_tag] = eng_lang
  240. map_region = {
  241. 'en-ID': 'id_ID',
  242. 'no-NO': 'nb_NO',
  243. }
  244. for td in eval_xpath(dom, xpath_market_codes):
  245. eng_region = td.text
  246. babel_region = map_region.get(eng_region, eng_region).replace('-', '_')
  247. if eng_region == 'en-WW':
  248. engine_traits.all_locale = eng_region
  249. continue
  250. try:
  251. sxng_tag = region_tag(babel.Locale.parse(babel_region))
  252. except babel.UnknownLocaleError:
  253. print("ERROR: region (%s) is unknown by babel" % (eng_region))
  254. continue
  255. conflict = engine_traits.regions.get(sxng_tag)
  256. if conflict:
  257. if conflict != eng_region:
  258. print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_region))
  259. continue
  260. engine_traits.regions[sxng_tag] = eng_region