# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This is the implementation of the Bing-WEB engine. Parts of this
implementation are shared by other engines:

- :ref:`bing images engine`
- :ref:`bing news engine`
- :ref:`bing videos engine`

On the `preference page`_ Bing offers a lot of languages and regions (see
section 'Search results languages' and 'Country/region').  However, the
abundant choice does not correspond to reality, where Bing has a full-text
indexer only for a limited number of languages.  For example: you can select a
language like Māori, but you will never get a result in this language.

What comes a bit closer to the truth are the `search-APIs`_, but they don't
seem to be completely correct either (if you take a closer look you will find
some inaccuracies there too):

- :py:obj:`searx.engines.bing.bing_traits_url`
- :py:obj:`searx.engines.bing_videos.bing_traits_url`
- :py:obj:`searx.engines.bing_images.bing_traits_url`
- :py:obj:`searx.engines.bing_news.bing_traits_url`

.. _preference page: https://www.bing.com/account/general
.. _search-APIs: https://learn.microsoft.com/en-us/bing/search-apis/
"""
# pylint: disable=too-many-branches, invalid-name

from typing import TYPE_CHECKING
import datetime
import re
import uuid
from urllib.parse import urlencode

from lxml import html
import babel
import babel.languages

from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex
from searx import network
from searx.locales import language_tag, region_tag
from searx.enginelib.traits import EngineTraits

if TYPE_CHECKING:
    import logging

    logger: logging.Logger

traits: EngineTraits

about = {
    "website": 'https://www.bing.com',
    "wikidata_id": 'Q182496',
    "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-web-search-api',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

send_accept_language_header = True
"""Bing tries to guess the user's language and territory from the HTTP
Accept-Language header.  Optionally, the user can select a search language
(which can differ from the UI language) and a region (market code)."""

# engine dependent config
categories = ['general', 'web']
paging = True
time_range_support = True
safesearch = True
safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'}  # cookie: ADLT=STRICT

base_url = 'https://www.bing.com/search'
"""Bing (Web) search URL"""

bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/reference/market-codes'
"""Bing (Web) search API description"""


def _get_offset_from_pageno(pageno):
    return (pageno - 1) * 10 + 1
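
# Illustration of the offset calculation (Bing's "first" URL parameter is
# 1-based): pageno 1 -> first=1, pageno 2 -> first=11, pageno 3 -> first=21.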


def set_bing_cookies(params, engine_language, engine_region, SID):

    # set cookies
    # -----------

    params['cookies']['_EDGE_V'] = '1'

    # _EDGE_S: F=1&SID=3A5253BD6BCA609509B741876AF961CA&mkt=zh-tw
    _EDGE_S = [
        'F=1',
        'SID=%s' % SID,
        'mkt=%s' % engine_region.lower(),
        'ui=%s' % engine_language.lower(),
    ]
    params['cookies']['_EDGE_S'] = '&'.join(_EDGE_S)
    logger.debug("cookie _EDGE_S=%s", params['cookies']['_EDGE_S'])

    # "_EDGE_CD": "m=zh-tw",

    _EDGE_CD = [  # pylint: disable=invalid-name
        'm=%s' % engine_region.lower(),  # search region: zh-cn
        'u=%s' % engine_language.lower(),  # UI: en-us
    ]

    params['cookies']['_EDGE_CD'] = '&'.join(_EDGE_CD) + ';'
    logger.debug("cookie _EDGE_CD=%s", params['cookies']['_EDGE_CD'])

    SRCHHPGUSR = [  # pylint: disable=invalid-name
        'SRCHLANG=%s' % engine_language,
        # Trying to set the ADLT cookie here seems to have no effect; I assume
        # some age verification by a cookie (and/or session ID) is needed to
        # disable SafeSearch.
        'ADLT=%s' % safesearch_types.get(params['safesearch'], 'DEMOTE'),
    ]
    params['cookies']['SRCHHPGUSR'] = '&'.join(SRCHHPGUSR)
    logger.debug("cookie SRCHHPGUSR=%s", params['cookies']['SRCHHPGUSR'])
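
# For illustration, a call like set_bing_cookies(params, 'en', 'en-US', SID)
# produces cookies of the following shape (example values, not a captured
# session):
#
#   _EDGE_V:    1
#   _EDGE_S:    F=1&SID=<SID>&mkt=en-us&ui=en
#   _EDGE_CD:   m=en-us&u=en;
#   SRCHHPGUSR: SRCHLANG=en&ADLT=DEMOTE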


def request(query, params):
    """Assemble a Bing-Web request."""

    engine_region = traits.get_region(params['searxng_locale'], 'en-US')
    engine_language = traits.get_language(params['searxng_locale'], 'en')

    SID = uuid.uuid1().hex.upper()
    CVID = uuid.uuid1().hex.upper()

    set_bing_cookies(params, engine_language, engine_region, SID)

    # build URL query
    # ---------------

    # query term
    page = int(params.get('pageno', 1))
    query_params = {
        # fmt: off
        'q': query,
        'pq': query,
        'cvid': CVID,
        'qs': 'n',
        'sp': '-1'
        # fmt: on
    }
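    # Notes on the parameters above (based on observed Bing requests, not on a
    # documented API): 'pq' simply mirrors the typed query, 'cvid' is a
    # client-generated correlation id, and 'qs'/'sp' appear to relate to
    # query-suggestion handling.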

    # page
    if page > 1:
        referer = base_url + '?' + urlencode(query_params)
        params['headers']['Referer'] = referer
        logger.debug("headers.Referer --> %s", referer)

    query_params['first'] = _get_offset_from_pageno(page)

    if page == 2:
        query_params['FORM'] = 'PERE'
    elif page > 2:
        query_params['FORM'] = 'PERE%s' % (page - 2)

    filters = ''
    if params['time_range']:
        query_params['filt'] = 'custom'

        if params['time_range'] == 'day':
            filters = 'ex1:"ez1"'
        elif params['time_range'] == 'week':
            filters = 'ex1:"ez2"'
        elif params['time_range'] == 'month':
            filters = 'ex1:"ez3"'
        elif params['time_range'] == 'year':
            epoch_1970 = datetime.date(1970, 1, 1)
            today_no = (datetime.date.today() - epoch_1970).days
            filters = 'ex1:"ez5_%s_%s"' % (today_no - 365, today_no)
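            # e.g. with today = 2023-07-01 (day 19539 since 1970-01-01) the
            # one-year filter becomes: ex1:"ez5_19174_19539"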

    params['url'] = base_url + '?' + urlencode(query_params)
    if filters:
        params['url'] = params['url'] + '&filters=' + filters
    return params
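
# For orientation: a page-2 request for "searx" with time_range 'year' is
# assembled roughly like this (an illustration, not a captured request; the
# CVID is shortened):
#
#   https://www.bing.com/search?q=searx&pq=searx&cvid=...&qs=n&sp=-1
#       &first=11&FORM=PERE&filters=ex1:"ez5_19174_19539"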


def response(resp):
    results = []
    result_len = 0

    dom = html.fromstring(resp.text)

    # bookkeeping for the Bing redirect URLs that have to be resolved with
    # additional HTTP requests (see below)
    url_to_resolve = []
    url_to_resolve_index = []
    i = 0
    for result in eval_xpath_list(dom, '//ol[@id="b_results"]/li[contains(@class, "b_algo")]'):

        link = eval_xpath_getindex(result, './/h2/a', 0, None)
        if link is None:
            continue
        url = link.attrib.get('href')
        title = extract_text(link)

        content = eval_xpath(result, '(.//p)[1]')
        for p in content:
            # Make sure that the element is free of <a href> links
            for e in p.xpath('.//a'):
                e.getparent().remove(e)
        content = extract_text(content)

        # get the real URL, either using the URL shown to the user or by
        # following the Bing redirect
        if url.startswith('https://www.bing.com/ck/a?'):
            url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite'))
            # Bing can shorten the URL either at the end or in the middle of
            # the string
            if (
                url_cite
                and url_cite.startswith('https://')
                and '…' not in url_cite
                and '...' not in url_cite
                and '›' not in url_cite
            ):
                # no need for an additional HTTP request
                url = url_cite
            else:
                # resolve the URL with an additional HTTP request
                url_to_resolve.append(url.replace('&ntb=1', '&ntb=F'))
                url_to_resolve_index.append(i)
                url = None  # remove the result if the Bing redirect raises an exception

        # append result
        results.append({'url': url, 'title': title, 'content': content})
        # increment result pointer for the next iteration in this loop
        i += 1

    # resolve all Bing redirections in parallel
    request_list = [
        network.Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve
    ]
    response_list = network.multi_requests(request_list)
    for i, redirect_response in enumerate(response_list):
        if not isinstance(redirect_response, Exception):
            results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location']

    # get number_of_results
    try:
        result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
        if "-" in result_len_container:
            # Remove the part "from-to" of a paginated request ...
            result_len_container = result_len_container[result_len_container.find("-") * 2 + 2 :]
        result_len_container = re.sub('[^0-9]', '', result_len_container)

        if len(result_len_container) > 0:
            result_len = int(result_len_container)
    except Exception as e:  # pylint: disable=broad-except
        logger.debug('result error :\n%s', e)

    if result_len and _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len:
        return []

    results.append({'number_of_results': result_len})
    return results
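
# Note: the list returned by response() mixes regular result dicts
# ({'url': ..., 'title': ..., 'content': ...}) with one trailing
# {'number_of_results': ...} entry, which SearXNG's result container
# handles separately.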


def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages and regions from Bing-Web."""

    xpath_market_codes = '//table[1]/tbody/tr/td[3]'
    # xpath_country_codes = '//table[2]/tbody/tr/td[2]'
    xpath_language_codes = '//table[3]/tbody/tr/td[2]'

    _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)


def _fetch_traits(engine_traits: EngineTraits, url: str, xpath_language_codes: str, xpath_market_codes: str):

    # insert alias to map from a language (zh) to a language + script (zh_Hans)
    engine_traits.languages['zh'] = 'zh-hans'

    resp = network.get(url)
    if not resp.ok:
        print("ERROR: response from bing is not OK.")

    dom = html.fromstring(resp.text)

    map_lang = {'jp': 'ja'}
    for td in eval_xpath(dom, xpath_language_codes):
        eng_lang = td.text

        if eng_lang in ('en-gb', 'pt-br'):
            # language 'en' is already in the list and a language 'en-gb' can't
            # be handled in SearXNG, same with pt-br which is covered by pt-pt.
            continue

        babel_lang = map_lang.get(eng_lang, eng_lang).replace('-', '_')
        try:
            sxng_tag = language_tag(babel.Locale.parse(babel_lang))
        except babel.UnknownLocaleError:
            print("ERROR: language (%s) is unknown by babel" % (eng_lang))
            continue
        conflict = engine_traits.languages.get(sxng_tag)
        if conflict:
            if conflict != eng_lang:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang))
            continue
        engine_traits.languages[sxng_tag] = eng_lang

    map_region = {
        'en-ID': 'id_ID',
        'no-NO': 'nb_NO',
    }

    for td in eval_xpath(dom, xpath_market_codes):
        eng_region = td.text
        babel_region = map_region.get(eng_region, eng_region).replace('-', '_')

        if eng_region == 'en-WW':
            engine_traits.all_locale = eng_region
            continue

        try:
            sxng_tag = region_tag(babel.Locale.parse(babel_region))
        except babel.UnknownLocaleError:
            print("ERROR: region (%s) is unknown by babel" % (eng_region))
            continue
        conflict = engine_traits.regions.get(sxng_tag)
        if conflict:
            if conflict != eng_region:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_region))
            continue
        engine_traits.regions[sxng_tag] = eng_region
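
# The maps populated above are what request() consults via traits.get_region()
# and traits.get_language() to turn a SearXNG locale such as 'zh-TW' into
# Bing's market code (the mkt= value of the _EDGE_S cookie) and search
# language.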