| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265 | # SPDX-License-Identifier: AGPL-3.0-or-laterfrom urllib.parse import urlparsefrom time import timeimport threadingimport requests.exceptionsimport searx.poolrequests as poolrequestsfrom searx.engines import settingsfrom searx import loggerfrom searx.utils import gen_useragentfrom searx.exceptions import (SearxEngineAccessDeniedException, SearxEngineCaptchaException,                              SearxEngineTooManyRequestsException,)from searx.metrology.error_recorder import record_exception, record_errorfrom searx.search.processors.abstract import EngineProcessorlogger = logger.getChild('search.processor.online')def default_request_params():    return {        'method': 'GET',        'headers': {},        'data': {},        'url': '',        'cookies': {},        'verify': True,        'auth': None    }class OnlineProcessor(EngineProcessor):    engine_type = 'online'    def get_params(self, search_query, engine_category):        params = super().get_params(search_query, engine_category)        if params is None:            return None        # skip suspended engines        if self.engine.suspend_end_time >= time():            logger.debug('Engine currently suspended: %s', self.engine_name)            return None        # add default params        params.update(default_request_params())        # add an user agent        params['headers']['User-Agent'] = gen_useragent()        return params    def _send_http_request(self, params):        # create dictionary which contain all        # informations about the request        request_args = dict(            headers=params['headers'],            cookies=params['cookies'],            verify=params['verify'],            auth=params['auth']        )        # setting engine based proxies        if hasattr(self.engine, 'proxies'):            request_args['proxies'] = poolrequests.get_proxies(self.engine.proxies)        # max_redirects        max_redirects = params.get('max_redirects')        if max_redirects:            request_args['max_redirects'] = max_redirects        # allow_redirects        if 'allow_redirects' in params:            request_args['allow_redirects'] = params['allow_redirects']        # soft_max_redirects        soft_max_redirects = params.get('soft_max_redirects', max_redirects or 0)        # raise_for_status        request_args['raise_for_httperror'] = params.get('raise_for_httperror', True)        # specific type of request (GET or POST)        if params['method'] == 'GET':            req = poolrequests.get        else:            req = poolrequests.post        request_args['data'] = params['data']        # send the request        response = req(params['url'], **request_args)        # check soft limit of the redirect count        if len(response.history) > soft_max_redirects:            # unexpected redirect : record an error            # but the engine might still return valid results.            status_code = str(response.status_code or '')            reason = response.reason or ''            hostname = str(urlparse(response.url or '').netloc)            record_error(self.engine_name,                         '{} redirects, maximum: {}'.format(len(response.history), soft_max_redirects),                         (status_code, reason, hostname))        return response    def _search_basic(self, query, params):        # update request parameters dependent on        # search-engine (contained in engines folder)        self.engine.request(query, params)        # ignoring empty urls        if params['url'] is None:            return None        if not params['url']:            return None        # send request        response = self._send_http_request(params)        # parse the response        response.search_params = params        return self.engine.response(response)    def search(self, query, params, result_container, start_time, timeout_limit):        # set timeout for all HTTP requests        poolrequests.set_timeout_for_thread(timeout_limit, start_time=start_time)        # reset the HTTP total time        poolrequests.reset_time_for_thread()        # enable HTTP only if explicitly enabled        poolrequests.set_enable_http_protocol(self.engine.enable_http)        # suppose everything will be alright        requests_exception = False        suspended_time = None        try:            # send requests and parse the results            search_results = self._search_basic(query, params)            # check if the engine accepted the request            if search_results is not None:                # yes, so add results                result_container.extend(self.engine_name, search_results)                # update engine time when there is no exception                engine_time = time() - start_time                page_load_time = poolrequests.get_time_for_thread()                result_container.add_timing(self.engine_name, engine_time, page_load_time)                with threading.RLock():                    self.engine.stats['engine_time'] += engine_time                    self.engine.stats['engine_time_count'] += 1                    # update stats with the total HTTP time                    self.engine.stats['page_load_time'] += page_load_time                    self.engine.stats['page_load_count'] += 1        except Exception as e:            record_exception(self.engine_name, e)            # Timing            engine_time = time() - start_time            page_load_time = poolrequests.get_time_for_thread()            result_container.add_timing(self.engine_name, engine_time, page_load_time)            # Record the errors            with threading.RLock():                self.engine.stats['errors'] += 1            if (issubclass(e.__class__, requests.exceptions.Timeout)):                result_container.add_unresponsive_engine(self.engine_name, 'HTTP timeout')                # requests timeout (connect or read)                logger.error("engine {0} : HTTP requests timeout"                             "(search duration : {1} s, timeout: {2} s) : {3}"                             .format(self.engine_name, engine_time, timeout_limit, e.__class__.__name__))                requests_exception = True            elif (issubclass(e.__class__, requests.exceptions.RequestException)):                result_container.add_unresponsive_engine(self.engine_name, 'HTTP error')                # other requests exception                logger.exception("engine {0} : requests exception"                                 "(search duration : {1} s, timeout: {2} s) : {3}"                                 .format(self.engine_name, engine_time, timeout_limit, e))                requests_exception = True            elif (issubclass(e.__class__, SearxEngineCaptchaException)):                result_container.add_unresponsive_engine(self.engine_name, 'CAPTCHA required')                logger.exception('engine {0} : CAPTCHA'.format(self.engine_name))                suspended_time = e.suspended_time  # pylint: disable=no-member            elif (issubclass(e.__class__, SearxEngineTooManyRequestsException)):                result_container.add_unresponsive_engine(self.engine_name, 'too many requests')                logger.exception('engine {0} : Too many requests'.format(self.engine_name))                suspended_time = e.suspended_time  # pylint: disable=no-member            elif (issubclass(e.__class__, SearxEngineAccessDeniedException)):                result_container.add_unresponsive_engine(self.engine_name, 'blocked')                logger.exception('engine {0} : Searx is blocked'.format(self.engine_name))                suspended_time = e.suspended_time  # pylint: disable=no-member            else:                result_container.add_unresponsive_engine(self.engine_name, 'unexpected crash')                # others errors                logger.exception('engine {0} : exception : {1}'.format(self.engine_name, e))        else:            if getattr(threading.current_thread(), '_timeout', False):                record_error(self.engine_name, 'Timeout')        # suspend the engine if there is an HTTP error        # or suspended_time is defined        with threading.RLock():            if requests_exception or suspended_time:                # update continuous_errors / suspend_end_time                self.engine.continuous_errors += 1                if suspended_time is None:                    suspended_time = min(settings['search']['max_ban_time_on_fail'],                                         self.engine.continuous_errors * settings['search']['ban_time_on_fail'])                self.engine.suspend_end_time = time() + suspended_time            else:                # reset the suspend variables                self.engine.continuous_errors = 0                self.engine.suspend_end_time = 0    def get_default_tests(self):        tests = {}        tests['simple'] = {            'matrix': {'query': ('life', 'computer')},            'result_container': ['not_empty'],        }        if getattr(self.engine, 'paging', False):            tests['paging'] = {                'matrix': {'query': 'time',                           'pageno': (1, 2, 3)},                'result_container': ['not_empty'],                'test': ['unique_results']            }            if 'general' in self.engine.categories:                # avoid documentation about HTML tags (<time> and <input type="time">)                tests['paging']['matrix']['query'] = 'news'        if getattr(self.engine, 'time_range', False):            tests['time_range'] = {                'matrix': {'query': 'news',                           'time_range': (None, 'day')},                'result_container': ['not_empty'],                'test': ['unique_results']            }        if getattr(self.engine, 'supported_languages', []):            tests['lang_fr'] = {                'matrix': {'query': 'paris', 'lang': 'fr'},                'result_container': ['not_empty', ('has_language', 'fr')],            }            tests['lang_en'] = {                'matrix': {'query': 'paris', 'lang': 'en'},                'result_container': ['not_empty', ('has_language', 'en')],            }        if getattr(self.engine, 'safesearch', False):            tests['safesearch'] = {                'matrix': {'query': 'porn',                           'safesearch': (0, 2)},                'test': ['unique_results']            }        return tests
 |