'''searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with searx. If not, see < http://www.gnu.org/licenses/ >.

(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''

import threading

import searx.poolrequests as requests_lib
from time import time
from searx import settings
from searx.engines import (
    categories, engines
)
from searx.languages import language_codes
from searx.utils import gen_useragent
from searx.query import Query
from searx.results import ResultContainer
from searx import logger

logger = logger.getChild('search')

number_of_searches = 0

# One lock shared by every worker thread.  The previous code did
# `with threading.RLock():` at each use site, which creates a brand-new lock
# object per call and therefore synchronizes nothing — every thread locked
# its own private lock.  All engine-stats mutations now go through this one.
_stats_lock = threading.RLock()


def search_request_wrapper(fn, url, engine_name, **kwargs):
    """Run a single engine request `fn(url, **kwargs)` and track its health.

    On success the engine's error streak and suspension are cleared; on any
    failure the error counters are bumped and the engine is suspended for up
    to 60 seconds (one second per consecutive error).  Returns the response
    from `fn`, or None if the request raised.
    """
    ret = None
    engine = engines[engine_name]
    try:
        ret = fn(url, **kwargs)
        with _stats_lock:
            engine.continuous_errors = 0
            engine.suspend_end_time = 0
    except Exception:
        # increase errors stats; suspension grows linearly with the error
        # streak, capped at 60 seconds
        with _stats_lock:
            engine.stats['errors'] += 1
            engine.continuous_errors += 1
            engine.suspend_end_time = time() + min(60, engine.continuous_errors)
        # print engine name and specific error message
        logger.exception('engine crash: {0}'.format(engine_name))
    return ret


def threaded_requests(requests):
    """Fire all engine requests in parallel threads and wait for them.

    `requests` is a list of (fn, url, request_args, engine_name) tuples.
    Every request shares the largest configured per-engine timeout; threads
    still running once that budget is spent are logged as timed out (they
    are not killed — daemonized sockets time out on their own).
    """
    timeout_limit = max(r[2]['timeout'] for r in requests)
    search_start = time()
    for fn, url, request_args, engine_name in requests:
        request_args['timeout'] = timeout_limit
        th = threading.Thread(
            target=search_request_wrapper,
            args=(fn, url, engine_name),
            kwargs=request_args,
            name='search_request',
        )
        # stash the engine name on the thread so the join loop below can
        # report which engine timed out
        th._engine_name = engine_name
        th.start()

    for th in threading.enumerate():
        if th.name == 'search_request':
            remaining_time = max(0.0, timeout_limit - (time() - search_start))
            th.join(remaining_time)
            # is_alive() replaces the deprecated isAlive() alias
            # (removed in Python 3.9; available since 2.6)
            if th.is_alive():
                logger.warning('engine timeout: {0}'.format(th._engine_name))


# get default request parameter
def default_request_params():
    """Return a fresh dict of default parameters for an engine request."""
    return {
        'method': 'GET',
        'headers': {},
        'data': {},
        'url': '',
        'cookies': {},
        'verify': True
    }


# create a callback wrapper for the search engine results
def make_callback(engine_name, callback, params, result_container):
    """Build the response hook that parses an engine's HTTP response.

    The returned function records page-load time, discards redirects and
    over-deadline responses (counting the latter as errors), runs the
    engine's `callback` parser and feeds the results — each tagged with the
    engine name — into `result_container`.
    """

    def process_callback(response, **kwargs):
        # check if redirect comparing to the True value,
        # because resp can be a Mock object, and any attribute name
        # returns something.
        if response.is_redirect is True:
            logger.debug('{0} redirect on: {1}'.format(engine_name, response))
            return

        response.search_params = params

        search_duration = time() - params['started']
        # update stats with current page-load-time
        with _stats_lock:
            engines[engine_name].stats['page_load_time'] += search_duration

        timeout_overhead = 0.2  # seconds
        timeout_limit = engines[engine_name].timeout + timeout_overhead

        if search_duration > timeout_limit:
            # response arrived too late to be useful — count as an error
            with _stats_lock:
                engines[engine_name].stats['errors'] += 1
            return

        # callback
        search_results = callback(response)

        # add results
        for result in search_results:
            result['engine'] = engine_name

        result_container.extend(engine_name, search_results)

    return process_callback


class Search(object):
    """Search information container"""

    def __init__(self, request):
        """Parse the incoming HTTP `request` into a search description.

        Extracts the query string, page number, language, and the set of
        engines/categories to query (from explicit form fields, query-string
        modifiers, or the user's cookie preferences, in that order).
        Raises Exception('noquery') when no query was submitted.
        """
        # init vars
        super(Search, self).__init__()
        self.query = None
        self.engines = []
        self.categories = []
        self.paging = False
        self.pageno = 1
        self.lang = 'all'

        # set blocked engines
        self.disabled_engines = request.preferences.engines.get_disabled()

        self.result_container = ResultContainer()
        self.request_data = {}

        # set specific language if set
        self.lang = request.preferences.get_value('language')

        # set request method
        if request.method == 'POST':
            self.request_data = request.form
        else:
            self.request_data = request.args

        # TODO better exceptions
        if not self.request_data.get('q'):
            raise Exception('noquery')

        # set pagenumber; fall back to page 1 on anything non-numeric
        # or below one
        pageno_param = self.request_data.get('pageno', '1')
        if not pageno_param.isdigit() or int(pageno_param) < 1:
            pageno_param = 1

        self.pageno = int(pageno_param)

        # parse query, if tags are set, which change
        # the search engine or search-language
        query_obj = Query(self.request_data['q'], self.disabled_engines)
        query_obj.parse_query()

        # set query
        self.query = query_obj.getSearchQuery()

        # get last selected language in query, if possible
        # TODO support search with multiple languages
        if len(query_obj.languages):
            self.lang = query_obj.languages[-1]

        self.engines = query_obj.engines

        self.categories = []

        # if engines are calculated from query,
        # set categories by using that informations
        if self.engines and query_obj.specific:
            self.categories = list(set(engine['category']
                                       for engine in self.engines))

        # otherwise, using defined categories to
        # calculate which engines should be used
        else:
            # set categories/engines
            load_default_categories = True
            for pd_name, pd in self.request_data.items():
                if pd_name == 'categories':
                    # per-item .strip() instead of map(unicode.strip, ...):
                    # `unicode` does not exist on Python 3
                    self.categories.extend(categ.strip()
                                           for categ in pd.split(',')
                                           if categ.strip() in categories)
                elif pd_name == 'engines':
                    pd_engines = [{'category': engines[engine].categories[0],
                                   'name': engine}
                                  for engine in (e.strip()
                                                 for e in pd.split(','))
                                  if engine in engines]
                    if pd_engines:
                        self.engines.extend(pd_engines)
                        load_default_categories = False
                elif pd_name.startswith('category_'):
                    category = pd_name[9:]

                    # if category is not found in list, skip
                    if category not in categories:
                        continue

                    if pd != 'off':
                        # add category to list
                        self.categories.append(category)
                    elif category in self.categories:
                        # remove category from list if property is
                        # set to 'off'
                        self.categories.remove(category)

            if not load_default_categories:
                if not self.categories:
                    self.categories = list(set(engine['category']
                                               for engine in self.engines))
                return

            # if no category is specified for this search,
            # using user-defined default-configuration which
            # (is stored in cookie)
            if not self.categories:
                cookie_categories = request.preferences.get_value('categories')
                for ccateg in cookie_categories:
                    if ccateg in categories:
                        self.categories.append(ccateg)

            # if still no category is specified, using general
            # as default-category
            if not self.categories:
                self.categories = ['general']

            # using all engines for that search, which are
            # declared under the specific categories
            for categ in self.categories:
                self.engines.extend({'category': categ,
                                     'name': engine.name}
                                    for engine in categories[categ]
                                    if (engine.name, categ) not in self.disabled_engines)

        # remove suspended engines
        self.engines = [e for e in self.engines
                        if engines[e['name']].suspend_end_time <= time()]

    # do search-request
    def search(self, request):
        """Execute the search against all selected engines.

        Builds one HTTP request per eligible engine (skipping engines that
        do not support the requested page or language), dispatches them all
        in parallel via `threaded_requests`, and returns `self` so callers
        can read results, suggestions, answers and infoboxes from
        `self.result_container`.
        """
        global number_of_searches

        # init vars
        requests = []

        # increase number of searches
        number_of_searches += 1

        # set default useragent
        # user_agent = request.headers.get('User-Agent', '')
        user_agent = gen_useragent()

        # start search-request for all selected engines
        for selected_engine in self.engines:
            if selected_engine['name'] not in engines:
                continue

            engine = engines[selected_engine['name']]

            # if paging is not supported, skip
            if self.pageno > 1 and not engine.paging:
                continue

            # if search-language is set and engine does not
            # provide language-support, skip
            if self.lang != 'all' and not engine.language_support:
                continue

            # set default request parameters
            request_params = default_request_params()
            request_params['headers']['User-Agent'] = user_agent
            request_params['category'] = selected_engine['category']
            request_params['started'] = time()
            request_params['pageno'] = self.pageno

            # an engine-pinned language overrides the user's choice
            if hasattr(engine, 'language') and engine.language:
                request_params['language'] = engine.language
            else:
                request_params['language'] = self.lang

            # 0 = None, 1 = Moderate, 2 = Strict
            request_params['safesearch'] = request.preferences.get_value('safesearch')

            # update request parameters dependent on
            # search-engine (contained in engines folder)
            engine.request(self.query.encode('utf-8'), request_params)

            if request_params['url'] is None:
                # TODO add support of offline engines
                pass

            # create a callback wrapper for the search engine results
            callback = make_callback(
                selected_engine['name'],
                engine.response,
                request_params,
                self.result_container)

            # create dictionary which contain all
            # informations about the request
            request_args = dict(
                headers=request_params['headers'],
                hooks=dict(response=callback),
                cookies=request_params['cookies'],
                timeout=engine.timeout,
                verify=request_params['verify']
            )

            # specific type of request (GET or POST)
            if request_params['method'] == 'GET':
                req = requests_lib.get
            else:
                req = requests_lib.post
                request_args['data'] = request_params['data']

            # ignoring empty urls
            if not request_params['url']:
                continue

            # append request to list
            requests.append((req, request_params['url'],
                             request_args,
                             selected_engine['name']))

        if not requests:
            return self
        # send all search-request
        threaded_requests(requests)

        # return results, suggestions, answers and infoboxes
        return self