bing.py 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. # lint: pylint
  3. """This is the implementation of the Bing-WEB engine. Some of this
  4. implementations are shared by other engines:
  5. - :ref:`bing images engine`
  6. - :ref:`bing news engine`
  7. - :ref:`bing videos engine`
On the `preference page`_ Bing offers a lot of languages and regions (see section
  9. 'Search results languages' and 'Country/region'). However, the abundant choice
  10. does not correspond to reality, where Bing has a full-text indexer only for a
limited number of languages. For example: you can select a language like Māori
  12. but you never get a result in this language.
What comes a bit closer to the truth are the `search-APIs`_ but they don't seem
  14. to be completely correct either (if you take a closer look you will find some
  15. inaccuracies there too):
  16. - :py:obj:`searx.engines.bing.bing_traits_url`
  17. - :py:obj:`searx.engines.bing_videos.bing_traits_url`
  18. - :py:obj:`searx.engines.bing_images.bing_traits_url`
  19. - :py:obj:`searx.engines.bing_news.bing_traits_url`
  20. .. _preference page: https://www.bing.com/account/general
  21. .. _search-APIs: https://learn.microsoft.com/en-us/bing/search-apis/
  22. """
  23. # pylint: disable=too-many-branches, invalid-name
  24. from typing import TYPE_CHECKING
  25. import base64
  26. import re
  27. import time
  28. from urllib.parse import parse_qs, urlencode, urlparse
  29. from lxml import html
  30. import babel
  31. import babel.languages
  32. from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex
  33. from searx.locales import language_tag, region_tag
  34. from searx.enginelib.traits import EngineTraits
  35. if TYPE_CHECKING:
  36. import logging
  37. logger: logging.Logger
  38. traits: EngineTraits
# Metadata about this engine, shown on the SearXNG "about" / preferences pages.
about = {
    "website": 'https://www.bing.com',
    "wikidata_id": 'Q182496',
    "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-web-search-api',
    "use_official_api": False,  # results are scraped from the HTML pages
    "require_api_key": False,
    "results": 'HTML',
}
# engine dependent config
categories = ['general', 'web']
paging = True
time_range_support = True
base_url = 'https://www.bing.com/search'
"""Bing (Web) search URL"""
bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/reference/market-codes'
"""Bing (Web) search API description"""
  55. def _page_offset(pageno):
  56. return (int(pageno) - 1) * 10 + 1
  57. def set_bing_cookies(params, engine_language, engine_region):
  58. params['cookies']['_EDGE_CD'] = f'm={engine_region.lower()}&u={engine_language.lower()};'
  59. def request(query, params):
  60. """Assemble a Bing-Web request."""
  61. engine_region = traits.get_region(params['searxng_locale'], 'en-us')
  62. engine_language = traits.get_language(params['searxng_locale'], 'en-us')
  63. set_bing_cookies(params, engine_language, engine_region)
  64. query_params = {'q': query, 'first': _page_offset(params.get('pageno', 1))}
  65. params['url'] = f'{base_url}?{urlencode(query_params)}'
  66. unix_day = int(time.time() / 86400)
  67. time_ranges = {'day': '1', 'week': '2', 'month': '3', 'year': f'5_{unix_day-365}_{unix_day}'}
  68. if params.get('time_range') in time_ranges:
  69. params['url'] += f'&filters=ex1:"ez{time_ranges[params["time_range"]]}"'
  70. return params
def response(resp):
    """Parse the HTML of a Bing-Web response into a list of result dicts,
    plus one trailing ``{'number_of_results': ...}`` entry.
    """
    # pylint: disable=too-many-locals

    results = []
    result_len = 0

    dom = html.fromstring(resp.text)

    # iterate over the organic result items of the result list
    for result in eval_xpath_list(dom, '//ol[@id="b_results"]/li[contains(@class, "b_algo")]'):
        link = eval_xpath_getindex(result, './/h2/a', 0, None)
        if link is None:
            continue
        url = link.attrib.get('href')
        title = extract_text(link)

        content = eval_xpath(result, '(.//p)[1]')
        for p in content:
            # Make sure that the element is free of <a href> links
            for e in p.xpath('.//a'):
                e.getparent().remove(e)
        content = extract_text(content)

        # get the real URL: Bing wraps result links in a /ck/a redirect whose
        # 'u' parameter holds the target URL base64-encoded
        if url.startswith('https://www.bing.com/ck/a?'):
            # get the first value of u parameter
            url_query = urlparse(url).query
            parsed_url_query = parse_qs(url_query)
            param_u = parsed_url_query["u"][0]
            # remove "a1" in front
            encoded_url = param_u[2:]
            # add padding (base64 requires a length that is a multiple of 4)
            encoded_url = encoded_url + '=' * (-len(encoded_url) % 4)
            # decode base64 encoded URL
            url = base64.urlsafe_b64decode(encoded_url).decode()

        # append result
        results.append({'url': url, 'title': title, 'content': content})

    # get number_of_results from the "sb_count" span (e.g. "10-20 of 1,234 results")
    try:
        result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
        if "-" in result_len_container:
            # Remove the part "from-to" for paginated request ...
            result_len_container = result_len_container[result_len_container.find("-") * 2 + 2 :]
        result_len_container = re.sub('[^0-9]', '', result_len_container)
        if len(result_len_container) > 0:
            result_len = int(result_len_container)
    except Exception as e:  # pylint: disable=broad-except
        logger.debug('result error :\n%s', e)

    if result_len and _page_offset(resp.search_params.get("pageno", 0)) > result_len:
        # Avoid reading more results than available.
        # For example, if there is 100 results from some search and we try to get results from 120 to 130,
        # Bing will send back the results from 0 to 10 and no error.
        # If we compare results count with the first parameter of the request we can avoid this "invalid" results.
        return []

    results.append({'number_of_results': result_len})
    return results
  122. def fetch_traits(engine_traits: EngineTraits):
  123. """Fetch languages and regions from Bing-Web."""
  124. xpath_market_codes = '//table[1]/tbody/tr/td[3]'
  125. # xpath_country_codes = '//table[2]/tbody/tr/td[2]'
  126. xpath_language_codes = '//table[3]/tbody/tr/td[2]'
  127. _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)
  128. def _fetch_traits(engine_traits: EngineTraits, url: str, xpath_language_codes: str, xpath_market_codes: str):
  129. # pylint: disable=too-many-locals,import-outside-toplevel
  130. from searx.network import get # see https://github.com/searxng/searxng/issues/762
  131. # insert alias to map from a language (zh) to a language + script (zh_Hans)
  132. engine_traits.languages['zh'] = 'zh-hans'
  133. resp = get(url)
  134. if not resp.ok: # type: ignore
  135. print("ERROR: response from peertube is not OK.")
  136. dom = html.fromstring(resp.text) # type: ignore
  137. map_lang = {'jp': 'ja'}
  138. for td in eval_xpath(dom, xpath_language_codes):
  139. eng_lang = td.text
  140. if eng_lang in ('en-gb', 'pt-br'):
  141. # language 'en' is already in the list and a language 'en-gb' can't
  142. # be handled in SearXNG, same with pt-br which is covered by pt-pt.
  143. continue
  144. babel_lang = map_lang.get(eng_lang, eng_lang).replace('-', '_')
  145. try:
  146. sxng_tag = language_tag(babel.Locale.parse(babel_lang))
  147. except babel.UnknownLocaleError:
  148. print("ERROR: language (%s) is unknown by babel" % (eng_lang))
  149. continue
  150. conflict = engine_traits.languages.get(sxng_tag)
  151. if conflict:
  152. if conflict != eng_lang:
  153. print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang))
  154. continue
  155. engine_traits.languages[sxng_tag] = eng_lang
  156. map_region = {
  157. 'en-ID': 'id_ID',
  158. 'no-NO': 'nb_NO',
  159. }
  160. for td in eval_xpath(dom, xpath_market_codes):
  161. eng_region = td.text
  162. babel_region = map_region.get(eng_region, eng_region).replace('-', '_')
  163. if eng_region == 'en-WW':
  164. engine_traits.all_locale = eng_region
  165. continue
  166. try:
  167. sxng_tag = region_tag(babel.Locale.parse(babel_region))
  168. except babel.UnknownLocaleError:
  169. print("ERROR: region (%s) is unknown by babel" % (eng_region))
  170. continue
  171. conflict = engine_traits.regions.get(sxng_tag)
  172. if conflict:
  173. if conflict != eng_region:
  174. print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_region))
  175. continue
  176. engine_traits.regions[sxng_tag] = eng_region