# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This is the implementation of the Bing-WEB engine.  Parts of this
implementation are shared by other engines:

- :ref:`bing images engine`
- :ref:`bing news engine`
- :ref:`bing videos engine`

On the `preference page`_ Bing offers a lot of languages and regions (see
section 'Search results languages' and 'Country/region').  However, the
abundant choice does not correspond to reality, where Bing has a full-text
indexer only for a limited number of languages.  For example, you can select a
language like Māori, but you will never get a result in this language.

What comes a bit closer to the truth are the `search-APIs`_, but they don't
seem to be completely correct either (if you take a closer look you will find
some inaccuracies there too):

- :py:obj:`searx.engines.bing.bing_traits_url`
- :py:obj:`searx.engines.bing_videos.bing_traits_url`
- :py:obj:`searx.engines.bing_images.bing_traits_url`
- :py:obj:`searx.engines.bing_news.bing_traits_url`

.. _preference page: https://www.bing.com/account/general
.. _search-APIs: https://learn.microsoft.com/en-us/bing/search-apis/

"""
# pylint: disable=too-many-branches, invalid-name

from typing import TYPE_CHECKING
import datetime
import re
import uuid
from urllib.parse import urlencode

from lxml import html
import babel
import babel.languages

from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex
from searx import network
from searx.locales import language_tag, region_tag
from searx.enginelib.traits import EngineTraits

if TYPE_CHECKING:
    import logging

    logger: logging.Logger

traits: EngineTraits

about = {
    "website": 'https://www.bing.com',
    "wikidata_id": 'Q182496',
    "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-web-search-api',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

send_accept_language_header = True
"""Bing tries to guess the user's language and territory from the HTTP
Accept-Language header.  Optionally the user can select a search language
(which can differ from the UI language) and a region (market code)."""

# engine dependent config
categories = ['general', 'web']
paging = True
time_range_support = True
safesearch = True
safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'}  # cookie: ADLT=STRICT

base_url = 'https://www.bing.com/search'
"""Bing (Web) search URL"""

bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/reference/market-codes'
"""Bing (Web) search API description"""


def _get_offset_from_pageno(pageno):
    return (pageno - 1) * 10 + 1
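
# Bing counts results, not pages: the ``first`` URL argument is a 1-based
# result offset with a fixed page size of 10.  A doctest-style sketch of the
# mapping (illustrative only, not part of the engine):
#
#   >>> [_get_offset_from_pageno(page) for page in (1, 2, 3)]
#   [1, 11, 21]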


def set_bing_cookies(params, engine_language, engine_region, SID):

    # set cookies
    # -----------

    params['cookies']['_EDGE_V'] = '1'

    # _EDGE_S: F=1&SID=3A5253BD6BCA609509B741876AF961CA&mkt=zh-tw
    _EDGE_S = [
        'F=1',
        'SID=%s' % SID,
        'mkt=%s' % engine_region.lower(),
        'ui=%s' % engine_language.lower(),
    ]
    params['cookies']['_EDGE_S'] = '&'.join(_EDGE_S)
    logger.debug("cookie _EDGE_S=%s", params['cookies']['_EDGE_S'])

    # "_EDGE_CD": "m=zh-tw",
    _EDGE_CD = [  # pylint: disable=invalid-name
        'm=%s' % engine_region.lower(),  # search region: zh-cn
        'u=%s' % engine_language.lower(),  # UI: en-us
    ]
    params['cookies']['_EDGE_CD'] = '&'.join(_EDGE_CD) + ';'
    logger.debug("cookie _EDGE_CD=%s", params['cookies']['_EDGE_CD'])

    SRCHHPGUSR = [  # pylint: disable=invalid-name
        'SRCHLANG=%s' % engine_language,
        # Setting the ADLT cookie here seems to have no effect; presumably some
        # age verification by a cookie (and/or session ID) is needed to disable
        # SafeSearch.
        'ADLT=%s' % safesearch_types.get(params['safesearch'], 'DEMOTE'),
    ]
    params['cookies']['SRCHHPGUSR'] = '&'.join(SRCHHPGUSR)
    logger.debug("cookie SRCHHPGUSR=%s", params['cookies']['SRCHHPGUSR'])


def request(query, params):
    """Assemble a Bing-Web request."""

    engine_region = traits.get_region(params['searxng_locale'], 'en-US')
    engine_language = traits.get_language(params['searxng_locale'], 'en')

    SID = uuid.uuid1().hex.upper()
    CVID = uuid.uuid1().hex.upper()

    set_bing_cookies(params, engine_language, engine_region, SID)

    # build URL query
    # ---------------

    # query term
    page = int(params.get('pageno', 1))
    query_params = {
        # fmt: off
        'q': query,
        'pq': query,
        'cvid': CVID,
        'qs': 'n',
        'sp': '-1'
        # fmt: on
    }

    # page
    if page > 1:
        referer = base_url + '?' + urlencode(query_params)
        params['headers']['Referer'] = referer
        logger.debug("headers.Referer --> %s", referer)

    query_params['first'] = _get_offset_from_pageno(page)

    if page == 2:
        query_params['FORM'] = 'PERE'
    elif page > 2:
        query_params['FORM'] = 'PERE%s' % (page - 2)

    filters = ''
    if params['time_range']:
        query_params['filt'] = 'custom'

        if params['time_range'] == 'day':
            filters = 'ex1:"ez1"'
        elif params['time_range'] == 'week':
            filters = 'ex1:"ez2"'
        elif params['time_range'] == 'month':
            filters = 'ex1:"ez3"'
        elif params['time_range'] == 'year':
            epoch_1970 = datetime.date(1970, 1, 1)
            today_no = (datetime.date.today() - epoch_1970).days
            filters = 'ex1:"ez5_%s_%s"' % (today_no - 365, today_no)

    params['url'] = base_url + '?' + urlencode(query_params)
    if filters:
        params['url'] = params['url'] + '&filters=' + filters
    return params
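
# Minimal usage sketch (illustrative only; the real ``params`` dict is
# assembled by SearXNG's search core, the values below are assumptions):
#
#   params = {
#       'searxng_locale': 'de-DE', 'pageno': 2, 'time_range': 'week',
#       'safesearch': 1, 'headers': {}, 'cookies': {},
#   }
#   request('searxng', params)
#   # params['url'] is then something like (CVID shortened):
#   #   https://www.bing.com/search?q=searxng&pq=searxng&cvid=...&qs=n&sp=-1&first=11&FORM=PERE&filt=custom&filters=ex1:"ez2"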


def response(resp):
    results = []
    result_len = 0

    dom = html.fromstring(resp.text)

    # URLs of Bing redirects that still need to be resolved (in parallel, after
    # the loop below) and the positions of their results in ``results``
    url_to_resolve = []
    url_to_resolve_index = []
    i = 0
    for result in eval_xpath_list(dom, '//ol[@id="b_results"]/li[contains(@class, "b_algo")]'):

        link = eval_xpath_getindex(result, './/h2/a', 0, None)
        if link is None:
            continue
        url = link.attrib.get('href')
        title = extract_text(link)

        content = eval_xpath(result, '(.//p)[1]')
        for p in content:
            # Make sure that the element is free of <a href> links
            for e in p.xpath('.//a'):
                e.getparent().remove(e)
        content = extract_text(content)

        # get the real URL, either using the URL shown to the user or by
        # following the Bing redirect
        if url.startswith('https://www.bing.com/ck/a?'):
            url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite'))
            # Bing can shorten the URL either at the end or in the middle of the string
            if (
                url_cite
                and url_cite.startswith('https://')
                and '…' not in url_cite
                and '...' not in url_cite
                and '›' not in url_cite
            ):
                # no need for an additional HTTP request
                url = url_cite
            else:
                # resolve the URL with an additional HTTP request
                url_to_resolve.append(url.replace('&ntb=1', '&ntb=F'))
                url_to_resolve_index.append(i)
                url = None  # remove the result if the HTTP Bing redirect raises an exception

        # append result
        results.append({'url': url, 'title': title, 'content': content})
        # increment result pointer for the next iteration in this loop
        i += 1

    # resolve all Bing redirections in parallel
    request_list = [
        network.Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve
    ]
    response_list = network.multi_requests(request_list)
    for i, redirect_response in enumerate(response_list):
        if not isinstance(redirect_response, Exception):
            results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location']

    # get number_of_results
    try:
        result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
        if "-" in result_len_container:
            # remove the leading "from-to" part of a paginated request ...
            result_len_container = result_len_container[result_len_container.find("-") * 2 + 2 :]
        result_len_container = re.sub('[^0-9]', '', result_len_container)

        if len(result_len_container) > 0:
            result_len = int(result_len_container)

    except Exception as e:  # pylint: disable=broad-except
        logger.debug('result error :\n%s', e)

    if result_len and _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len:
        return []

    results.append({'number_of_results': result_len})
    return results
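
# Doctest-style sketch of the number-of-results extraction above.  The sample
# string is an assumption of Bing's typical ``sb_count`` text (here: page 2):
#
#   >>> s = '11-20 of 2,450,000 results'
#   >>> s = s[s.find('-') * 2 + 2 :]       # cuts the leading '11-20 '
#   >>> int(re.sub('[^0-9]', '', s))
#   2450000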


def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages and regions from Bing-Web."""

    xpath_market_codes = '//table[1]/tbody/tr/td[3]'
    # xpath_country_codes = '//table[2]/tbody/tr/td[2]'
    xpath_language_codes = '//table[3]/tbody/tr/td[2]'

    _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)


def _fetch_traits(engine_traits: EngineTraits, url: str, xpath_language_codes: str, xpath_market_codes: str):

    # insert alias to map from a language (zh) to a language + script (zh_Hans)
    engine_traits.languages['zh'] = 'zh-hans'

    resp = network.get(url)

    if not resp.ok:
        print("ERROR: response from Bing is not OK.")

    dom = html.fromstring(resp.text)

    map_lang = {'jp': 'ja'}
    for td in eval_xpath(dom, xpath_language_codes):
        eng_lang = td.text

        if eng_lang in ('en-gb', 'pt-br'):
            # language 'en' is already in the list and a language 'en-gb' can't
            # be handled in SearXNG, same with 'pt-br' which is covered by 'pt-pt'.
            continue

        babel_lang = map_lang.get(eng_lang, eng_lang).replace('-', '_')
        try:
            sxng_tag = language_tag(babel.Locale.parse(babel_lang))
        except babel.UnknownLocaleError:
            print("ERROR: language (%s) is unknown to babel" % (eng_lang))
            continue
        conflict = engine_traits.languages.get(sxng_tag)
        if conflict:
            if conflict != eng_lang:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang))
            continue
        engine_traits.languages[sxng_tag] = eng_lang

    map_region = {
        'en-ID': 'id_ID',
        'no-NO': 'nb_NO',
    }

    for td in eval_xpath(dom, xpath_market_codes):
        eng_region = td.text
        babel_region = map_region.get(eng_region, eng_region).replace('-', '_')

        if eng_region == 'en-WW':
            engine_traits.all_locale = eng_region
            continue

        try:
            sxng_tag = region_tag(babel.Locale.parse(babel_region))
        except babel.UnknownLocaleError:
            print("ERROR: region (%s) is unknown to babel" % (eng_region))
            continue
        conflict = engine_traits.regions.get(sxng_tag)
        if conflict:
            if conflict != eng_region:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_region))
            continue
        engine_traits.regions[sxng_tag] = eng_region
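
# Illustrative examples of the traits built above (derived from the mappings
# in this module, not from a live fetch of the Microsoft page):
#
#   engine_traits.all_locale            # 'en-WW' (Bing's world-wide market)
#   engine_traits.languages['zh']       # 'zh-hans' (alias inserted above)
#   engine_traits.regions['nb-NO']      # 'no-NO' (via ``map_region``)
#
# request() later turns a SearXNG locale back into Bing's codes, e.g.
# ``traits.get_region('nb-NO', 'en-US')`` should yield the market code 'no-NO'.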