# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""The XPath engine is a *generic* engine with which it is possible to configure
engines in the settings.

Here is a simple example of an XPath engine configured in the
:ref:`settings engine` section; for further reading see :ref:`engines-dev`.

.. code:: yaml

  - name : bitbucket
    engine : xpath
    paging : True
    search_url : https://bitbucket.org/repo/all/{pageno}?name={query}
    url_xpath : //article[@class="repo-summary"]//a[@class="repo-link"]/@href
    title_xpath : //article[@class="repo-summary"]//a[@class="repo-link"]
    content_xpath : //article[@class="repo-summary"]/p

"""

from urllib.parse import urlencode

from lxml import html
from searx.utils import extract_text, extract_url, eval_xpath, eval_xpath_list

# Note: ``categories`` and ``logger`` are not defined in this module; they are
# set on the engine module by the searx engine loader when the engine is
# initialized from the settings.

search_url = None
"""Search URL of the engine.  Example::

    https://example.org/?search={query}&page={pageno}{time_range}{safe_search}

Replacements are:

``{query}``:
  Search terms from user.

``{pageno}``:
  Page number if engine supports paging :py:obj:`paging`

``{lang}``:
  ISO 639-1 language code (en, de, fr ..)

``{time_range}``:
  :py:obj:`URL parameter <time_range_url>` if engine :py:obj:`supports time
  range <time_range_support>`.  The value for the parameter is taken from
  :py:obj:`time_range_map`.

``{safe_search}``:
  Safe-search :py:obj:`URL parameter <safe_search_map>` if engine
  :py:obj:`supports safe-search <safe_search_support>`.  The ``{safe_search}``
  replacement is taken from the :py:obj:`safe_search_map`.  Filter results::

      0: none, 1: moderate, 2: strict

  If not supported, the URL parameter is an empty string.
"""

lang_all = 'en'
'''Replacement ``{lang}`` in :py:obj:`search_url` if language ``all`` is
selected.
'''

soft_max_redirects = 0
'''Maximum redirects, soft limit.  Record an error but don't stop the engine.'''

results_xpath = ''
'''XPath selector for the list of result items.'''

url_xpath = None
'''XPath selector of result's ``url``.'''

content_xpath = None
'''XPath selector of result's ``content``.'''

title_xpath = None
'''XPath selector of result's ``title``.'''

thumbnail_xpath = False
'''XPath selector of result's ``img_src``.'''

suggestion_xpath = ''
'''XPath selector of result's ``suggestion``.'''

cached_xpath = ''
'''XPath selector of a result's link to a cached page (optional).'''

cached_url = ''
'''Prefix prepended to the text selected by :py:obj:`cached_xpath`.'''

paging = False
'''Engine supports paging [True or False].'''

page_size = 1
'''Number of results on each page.  Only needed if the site requires not a page
number, but an offset.'''

first_page_num = 1
'''Number of the first page (usually 0 or 1).'''

time_range_support = False
'''Engine supports search time range.'''

time_range_url = '&hours={time_range_val}'
'''Time range URL parameter in the :py:obj:`search_url`.  If no time range is
requested by the user, the URL parameter is an empty string.  The
``{time_range_val}`` replacement is taken from the :py:obj:`time_range_map`.

.. code:: yaml

    time_range_url : '&days={time_range_val}'
'''

time_range_map = {
    'day': 24,
    'week': 24 * 7,
    'month': 24 * 30,
    'year': 24 * 365,
}
'''Maps time range value from user to ``{time_range_val}`` in
:py:obj:`time_range_url`.

.. code:: yaml

    time_range_map:
      day: 1
      week: 7
      month: 30
      year: 365
'''

safe_search_support = False
'''Engine supports safe-search.'''

safe_search_map = {
    0: '&filter=none',
    1: '&filter=moderate',
    2: '&filter=strict'
}
'''Maps safe-search value to ``{safe_search}`` in :py:obj:`search_url`.

.. code:: yaml

    safesearch: true
    safe_search_map:
      0: '&filter=none'
      1: '&filter=moderate'
      2: '&filter=strict'

'''


def request(query, params):
    '''Build request parameters (see :ref:`engine request`).'''

    lang = lang_all
    if params['language'] != 'all':
        lang = params['language'][:2]

    time_range = ''
    if params.get('time_range'):
        time_range_val = time_range_map.get(params.get('time_range'))
        time_range = time_range_url.format(time_range_val=time_range_val)

    safe_search = ''
    if params['safesearch']:
        safe_search = safe_search_map[params['safesearch']]

    fargs = {
        # urlencode({'q': query}) returns 'q=...'; strip the leading 'q=' so
        # that only the encoded search terms replace ``{query}``
        'query': urlencode({'q': query})[2:],
        'lang': lang,
        # map searx's 1-based page number to the site's numbering (or to an
        # offset when page_size > 1)
        'pageno': (params['pageno'] - 1) * page_size + first_page_num,
        'time_range': time_range,
        'safe_search': safe_search,
    }

    params['url'] = search_url.format(**fargs)
    params['soft_max_redirects'] = soft_max_redirects

    return params


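# Rough, hypothetical illustration of what request() builds (not part of the
# engine, the site and URL layout are made up).  Assume a configured
#
#   search_url = 'https://example.org/search?q={query}&p={pageno}&hl={lang}{time_range}{safe_search}'
#
# With the defaults above (page_size, first_page_num, time_range_url and
# safe_search_map), a query ``foo bar`` with pageno 1, language ``de``,
# time_range ``day`` and safesearch ``1`` would result in
#
#   params['url'] == 'https://example.org/search?q=foo+bar&p=1&hl=de&hours=24&filter=moderate'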


def response(resp):
    '''Scrape *results* from the response (see :ref:`engine results`).'''

    results = []
    dom = html.fromstring(resp.text)
    is_onion = 'onions' in categories

    if results_xpath:
        # one selector per result item; the field selectors are evaluated
        # relative to each item
        for result in eval_xpath_list(dom, results_xpath):

            url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url)
            title = extract_text(eval_xpath_list(result, title_xpath, min_len=1))
            content = extract_text(eval_xpath_list(result, content_xpath))
            tmp_result = {'url': url, 'title': title, 'content': content}

            # add thumbnail if available
            if thumbnail_xpath:
                thumbnail_xpath_result = eval_xpath_list(result, thumbnail_xpath)
                if len(thumbnail_xpath_result) > 0:
                    tmp_result['img_src'] = extract_url(thumbnail_xpath_result, search_url)

            # add alternative cached url if available
            if cached_xpath:
                tmp_result['cached_url'] = (
                    cached_url
                    + extract_text(eval_xpath_list(result, cached_xpath, min_len=1))
                )

            if is_onion:
                tmp_result['is_onion'] = True

            results.append(tmp_result)

    else:
        # no results_xpath: the field selectors are evaluated against the
        # whole page and zipped together
        if cached_xpath:
            for url, title, content, cached in zip(
                (extract_url(x, search_url) for
                 x in eval_xpath_list(dom, url_xpath)),
                map(extract_text, eval_xpath_list(dom, title_xpath)),
                map(extract_text, eval_xpath_list(dom, content_xpath)),
                map(extract_text, eval_xpath_list(dom, cached_xpath))
            ):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'cached_url': cached_url + cached,
                    'is_onion': is_onion
                })
        else:
            for url, title, content in zip(
                (extract_url(x, search_url) for
                 x in eval_xpath_list(dom, url_xpath)),
                map(extract_text, eval_xpath_list(dom, title_xpath)),
                map(extract_text, eval_xpath_list(dom, content_xpath))
            ):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'is_onion': is_onion
                })

    if suggestion_xpath:
        for suggestion in eval_xpath(dom, suggestion_xpath):
            results.append({'suggestion': extract_text(suggestion)})

    logger.debug("found %s results", len(results))

    return results
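

# For reference, a fuller (hypothetical) settings.yml entry that exercises the
# optional features documented above: paging, time range and safe-search.
# The site URL and the XPath selectors below are made up for illustration.
#
#   - name: example xpath
#     engine: xpath
#     paging: true
#     search_url: https://example.org/search?q={query}&p={pageno}&hl={lang}{time_range}{safe_search}
#     results_xpath: //div[@class="result"]
#     url_xpath: ./h3/a/@href
#     title_xpath: ./h3/a
#     content_xpath: ./p[@class="snippet"]
#     time_range_support: true
#     time_range_url: '&days={time_range_val}'
#     time_range_map:
#       day: 1
#       week: 7
#       month: 30
#       year: 365
#     safesearch: true
#     safe_search_map:
#       0: '&filter=none'
#       1: '&filter=moderate'
#       2: '&filter=strict'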