bing.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. # lint: pylint
  3. """Bing (Web)
  4. - https://github.com/searx/searx/issues/2019#issuecomment-648227442
  5. """
  6. # pylint: disable=too-many-branches, invalid-name
  7. from typing import TYPE_CHECKING
  8. import datetime
  9. import re
  10. import uuid
  11. from urllib.parse import urlencode
  12. from lxml import html
  13. import babel
  14. import babel.languages
  15. from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex
  16. from searx import network
  17. from searx.locales import language_tag, region_tag
  18. from searx.enginelib.traits import EngineTraits
  19. if TYPE_CHECKING:
  20. import logging
  21. logger: logging.Logger
  22. traits: EngineTraits
# Engine metadata shown on SearXNG's about/preferences pages.
about = {
    "website": 'https://www.bing.com',
    "wikidata_id": 'Q182496',
    "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-web-search-api',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

send_accept_language_header = True
"""Bing tries to guess user's language and territory from the HTTP
Accept-Language. Optional the user can select a search-language (can be
different to the UI language) and a region (market code)."""

# engine dependent config
categories = ['general', 'web']
paging = True
time_range_support = True
safesearch = True
# SearXNG safesearch level (0/1/2) --> value of Bing's ADLT cookie
# (see set_bing_cookies below).
safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'}  # cookie: ADLT=STRICT

base_url = 'https://www.bing.com/search'
"""Bing (Web) search URL"""

bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/reference/market-codes'
"""Bing (Web) search API description"""
  45. def _get_offset_from_pageno(pageno):
  46. return (pageno - 1) * 10 + 1
  47. def set_bing_cookies(params, engine_language, engine_region, SID):
  48. # set cookies
  49. # -----------
  50. params['cookies']['_EDGE_V'] = '1'
  51. # _EDGE_S: F=1&SID=3A5253BD6BCA609509B741876AF961CA&mkt=zh-tw
  52. _EDGE_S = [
  53. 'F=1',
  54. 'SID=%s' % SID,
  55. 'mkt=%s' % engine_region.lower(),
  56. 'ui=%s' % engine_language.lower(),
  57. ]
  58. params['cookies']['_EDGE_S'] = '&'.join(_EDGE_S)
  59. logger.debug("cookie _EDGE_S=%s", params['cookies']['_EDGE_S'])
  60. # "_EDGE_CD": "m=zh-tw",
  61. _EDGE_CD = [ # pylint: disable=invalid-name
  62. 'm=%s' % engine_region.lower(), # search region: zh-cn
  63. 'u=%s' % engine_language.lower(), # UI: en-us
  64. ]
  65. params['cookies']['_EDGE_CD'] = '&'.join(_EDGE_CD) + ';'
  66. logger.debug("cookie _EDGE_CD=%s", params['cookies']['_EDGE_CD'])
  67. SRCHHPGUSR = [ # pylint: disable=invalid-name
  68. 'SRCHLANG=%s' % engine_language,
  69. # Trying to set ADLT cookie here seems not to have any effect, I assume
  70. # there is some age verification by a cookie (and/or session ID) needed,
  71. # to disable the SafeSearch.
  72. 'ADLT=%s' % safesearch_types.get(params['safesearch'], 'DEMOTE'),
  73. ]
  74. params['cookies']['SRCHHPGUSR'] = '&'.join(SRCHHPGUSR)
  75. logger.debug("cookie SRCHHPGUSR=%s", params['cookies']['SRCHHPGUSR'])
  76. def request(query, params):
  77. """Assemble a Bing-Web request."""
  78. engine_region = traits.get_region(params['searxng_locale'], 'en-US')
  79. engine_language = traits.get_language(params['searxng_locale'], 'en')
  80. SID = uuid.uuid1().hex.upper()
  81. CVID = uuid.uuid1().hex.upper()
  82. set_bing_cookies(params, engine_language, engine_region, SID)
  83. # build URL query
  84. # ---------------
  85. # query term
  86. page = int(params.get('pageno', 1))
  87. query_params = {
  88. # fmt: off
  89. 'q': query,
  90. 'pq': query,
  91. 'cvid': CVID,
  92. 'qs': 'n',
  93. 'sp': '-1'
  94. # fmt: on
  95. }
  96. # page
  97. if page > 1:
  98. referer = base_url + '?' + urlencode(query_params)
  99. params['headers']['Referer'] = referer
  100. logger.debug("headers.Referer --> %s", referer)
  101. query_params['first'] = _get_offset_from_pageno(page)
  102. if page == 2:
  103. query_params['FORM'] = 'PERE'
  104. elif page > 2:
  105. query_params['FORM'] = 'PERE%s' % (page - 2)
  106. filters = ''
  107. if params['time_range']:
  108. query_params['filt'] = 'custom'
  109. if params['time_range'] == 'day':
  110. filters = 'ex1:"ez1"'
  111. elif params['time_range'] == 'week':
  112. filters = 'ex1:"ez2"'
  113. elif params['time_range'] == 'month':
  114. filters = 'ex1:"ez3"'
  115. elif params['time_range'] == 'year':
  116. epoch_1970 = datetime.date(1970, 1, 1)
  117. today_no = (datetime.date.today() - epoch_1970).days
  118. filters = 'ex1:"ez5_%s_%s"' % (today_no - 365, today_no)
  119. params['url'] = base_url + '?' + urlencode(query_params)
  120. if filters:
  121. params['url'] = params['url'] + '&filters=' + filters
  122. return params
def response(resp):
    """Parse the HTML of a Bing-Web response page into a SearXNG result list.

    Besides the per-result dicts (``url``/``title``/``content``) the returned
    list carries one ``{'number_of_results': ...}`` entry.  Bing redirect URLs
    (``/ck/a?...``) are resolved to their real target with extra parallel HTTP
    requests when the shown ``cite`` text is truncated.
    """
    results = []
    result_len = 0
    dom = html.fromstring(resp.text)
    # parse results again if nothing is found yet
    # collected Bing redirect URLs that need an extra HTTP request, and the
    # index (into results) of the result each one belongs to
    url_to_resolve = []
    url_to_resolve_index = []
    i = 0
    for result in eval_xpath_list(dom, '//ol[@id="b_results"]/li[contains(@class, "b_algo")]'):
        link = eval_xpath_getindex(result, './/h2/a', 0, None)
        if link is None:
            continue
        url = link.attrib.get('href')
        title = extract_text(link)
        # Make sure that the element is free of <a href> links and <span class='algoSlug_icon'>
        content = eval_xpath(result, '(.//p)[1]')
        for p in content:
            for e in p.xpath('.//a'):
                e.getparent().remove(e)
            for e in p.xpath('.//span[@class="algoSlug_icon"]'):
                e.getparent().remove(e)
        content = extract_text(content)
        # get the real URL either using the URL shown to user or following the Bing URL
        if url.startswith('https://www.bing.com/ck/a?'):
            # the displayed URL in the attribution line
            url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite'))
            # Bing can shorten the URL either at the end or in the middle of the string
            if (
                url_cite
                and url_cite.startswith('https://')
                and '…' not in url_cite
                and '...' not in url_cite
                and '›' not in url_cite
            ):
                # no need for an additional HTTP request
                url = url_cite
            else:
                # resolve the URL with an additional HTTP request
                # (ntb=F appears to make Bing answer with a redirect -- TODO confirm)
                url_to_resolve.append(url.replace('&ntb=1', '&ntb=F'))
                url_to_resolve_index.append(i)
                url = None  # remove the result if the HTTP Bing redirect raise an exception
        # append result
        results.append({'url': url, 'title': title, 'content': content})
        # increment result pointer for the next iteration in this loop
        i += 1
    # resolve all Bing redirections in parallel
    request_list = [
        network.Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve
    ]
    response_list = network.multi_requests(request_list)
    for i, redirect_response in enumerate(response_list):
        # a failed redirect keeps url=None, which drops the link from that result
        if not isinstance(redirect_response, Exception):
            results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location']
    # get number_of_results
    try:
        result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
        if "-" in result_len_container:
            # Remove the part "from-to" for paginated request ...
            # NOTE(review): the `find("-") * 2 + 2` slice assumes the "from"
            # and "to" parts have equal length (e.g. "11-20 of ...") -- verify
            # against Bing's localized count strings.
            result_len_container = result_len_container[result_len_container.find("-") * 2 + 2 :]
        # keep only the digits of the (possibly thousands-separated) count
        result_len_container = re.sub('[^0-9]', '', result_len_container)
        if len(result_len_container) > 0:
            result_len = int(result_len_container)
    except Exception as e:  # pylint: disable=broad-except
        logger.debug('result error :\n%s', e)
    # requested page lies beyond the reported number of results --> no results
    if result_len and _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len:
        return []
    results.append({'number_of_results': result_len})
    return results
  190. def fetch_traits(engine_traits: EngineTraits):
  191. """Fetch languages and regions from Bing-Web."""
  192. xpath_market_codes = '//table[1]/tbody/tr/td[3]'
  193. # xpath_country_codes = '//table[2]/tbody/tr/td[2]'
  194. xpath_language_codes = '//table[3]/tbody/tr/td[2]'
  195. _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)
  196. def _fetch_traits(engine_traits: EngineTraits, url: str, xpath_language_codes: str, xpath_market_codes: str):
  197. # insert alias to map from a language (zh) to a language + script (zh_Hans)
  198. engine_traits.languages['zh'] = 'zh-hans'
  199. resp = network.get(url)
  200. if not resp.ok:
  201. print("ERROR: response from peertube is not OK.")
  202. dom = html.fromstring(resp.text)
  203. map_lang = {'jp': 'ja'}
  204. for td in eval_xpath(dom, xpath_language_codes):
  205. eng_lang = td.text
  206. if eng_lang in ('en-gb', 'pt-br'):
  207. # language 'en' is already in the list and a language 'en-gb' can't
  208. # be handled in SearXNG, same with pt-br which is covered by pt-pt.
  209. continue
  210. babel_lang = map_lang.get(eng_lang, eng_lang).replace('-', '_')
  211. try:
  212. sxng_tag = language_tag(babel.Locale.parse(babel_lang))
  213. except babel.UnknownLocaleError:
  214. print("ERROR: language (%s) is unknown by babel" % (eng_lang))
  215. continue
  216. conflict = engine_traits.languages.get(sxng_tag)
  217. if conflict:
  218. if conflict != eng_lang:
  219. print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang))
  220. continue
  221. engine_traits.languages[sxng_tag] = eng_lang
  222. map_region = {
  223. 'en-ID': 'id_ID',
  224. 'no-NO': 'nb_NO',
  225. }
  226. for td in eval_xpath(dom, xpath_market_codes):
  227. eng_region = td.text
  228. babel_region = map_region.get(eng_region, eng_region).replace('-', '_')
  229. if eng_region == 'en-WW':
  230. engine_traits.all_locale = eng_region
  231. continue
  232. try:
  233. sxng_tag = region_tag(babel.Locale.parse(babel_region))
  234. except babel.UnknownLocaleError:
  235. print("ERROR: region (%s) is unknown by babel" % (eng_region))
  236. continue
  237. conflict = engine_traits.regions.get(sxng_tag)
  238. if conflict:
  239. if conflict != eng_region:
  240. print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_region))
  241. continue
  242. engine_traits.regions[sxng_tag] = eng_region