import re
from collections import defaultdict
from operator import itemgetter
from threading import RLock
from typing import List, NamedTuple, Set
from urllib.parse import urlparse, unquote

from searx import logger
from searx import utils
from searx.engines import engines
from searx.metrics import histogram_observe, counter_add, count_error

CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()-_]', re.M | re.U)
WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U)


def result_content_len(content):
    if isinstance(content, str):
        return len(CONTENT_LEN_IGNORED_CHARS_REGEX.sub('', content))
    else:
        return 0


def compare_urls(url_a, url_b):
    """Lazy compare between two URLs.

    "www.example.com" and "example.com" are equal.
    "www.example.com/path/" and "www.example.com/path" are equal.
    "https://www.example.com/" and "http://www.example.com/" are equal.

    Args:
        url_a (ParseResult): first URL
        url_b (ParseResult): second URL

    Returns:
        bool: True if url_a and url_b are equal
    """
    # ignore a leading "www." when comparing hosts
    if url_a.netloc.startswith('www.'):
        host_a = url_a.netloc.replace('www.', '', 1)
    else:
        host_a = url_a.netloc
    if url_b.netloc.startswith('www.'):
        host_b = url_b.netloc.replace('www.', '', 1)
    else:
        host_b = url_b.netloc

    if host_a != host_b or url_a.query != url_b.query or url_a.fragment != url_b.fragment:
        return False

    # ignore a trailing slash in the path
    path_a = url_a.path[:-1] if url_a.path.endswith('/') else url_a.path
    path_b = url_b.path[:-1] if url_b.path.endswith('/') else url_b.path

    return unquote(path_a) == unquote(path_b)
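
# Illustration of the comparison above (comment-only sketch so nothing runs at
# import time; the URLs are hypothetical):
#
#   >>> compare_urls(urlparse('https://www.example.com/path/'), urlparse('http://example.com/path'))
#   True
#   >>> compare_urls(urlparse('https://example.com/a?x=1'), urlparse('https://example.com/a?x=2'))
#   False
#
# Scheme, a leading "www." and a trailing slash are ignored; host, query and
# fragment must match.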


def merge_two_infoboxes(infobox1, infobox2):
    if hasattr(engines[infobox1['engine']], 'weight'):
        weight1 = engines[infobox1['engine']].weight
    else:
        weight1 = 1
    if hasattr(engines[infobox2['engine']], 'weight'):
        weight2 = engines[infobox2['engine']].weight
    else:
        weight2 = 1

    if weight2 > weight1:
        infobox1['engine'] = infobox2['engine']

    infobox1['engines'] |= infobox2['engines']

    if 'urls' in infobox2:
        urls1 = infobox1.get('urls', None)
        if urls1 is None:
            urls1 = []

        for url2 in infobox2.get('urls', []):
            unique_url = True
            parsed_url2 = urlparse(url2.get('url', ''))
            entity_url2 = url2.get('entity')
            for url1 in urls1:
                if (entity_url2 is not None and url1.get('entity') == entity_url2) or compare_urls(
                    urlparse(url1.get('url', '')), parsed_url2
                ):
                    unique_url = False
                    break
            if unique_url:
                urls1.append(url2)

        infobox1['urls'] = urls1

    if 'img_src' in infobox2:
        img1 = infobox1.get('img_src', None)
        img2 = infobox2.get('img_src')
        if img1 is None:
            infobox1['img_src'] = img2
        elif weight2 > weight1:
            infobox1['img_src'] = img2

    if 'attributes' in infobox2:
        attributes1 = infobox1.get('attributes')
        if attributes1 is None:
            infobox1['attributes'] = attributes1 = []

        attributeSet = set()
        for attribute in attributes1:
            label = attribute.get('label')
            if label not in attributeSet:
                attributeSet.add(label)
            entity = attribute.get('entity')
            if entity not in attributeSet:
                attributeSet.add(entity)

        for attribute in infobox2.get('attributes', []):
            if attribute.get('label') not in attributeSet and attribute.get('entity') not in attributeSet:
                attributes1.append(attribute)

    if 'content' in infobox2:
        content1 = infobox1.get('content', None)
        content2 = infobox2.get('content', '')
        if content1 is not None:
            if result_content_len(content2) > result_content_len(content1):
                infobox1['content'] = content2
        else:
            infobox1['content'] = content2


def result_score(result):
    weight = 1.0

    for result_engine in result['engines']:
        if hasattr(engines[result_engine], 'weight'):
            weight *= float(engines[result_engine].weight)

    occurrences = len(result['positions'])

    return sum((occurrences * weight) / position for position in result['positions'])
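
# Worked example for result_score() (illustrative numbers, not real engine
# weights): a result reported by two engines, each with weight 1.0, at
# positions 1 and 3 has occurrences = 2 and weight = 1.0, so its score is
# 2/1 + 2/3 ≈ 2.67. Earlier positions and more reporting engines both raise
# the score, and engine weights scale it multiplicatively.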


class Timing(NamedTuple):
    engine: str
    total: float
    load: float


class UnresponsiveEngine(NamedTuple):
    engine: str
    error_type: str
    suspended: bool


class ResultContainer:
    """Collect the results from all engines, merge duplicates and order them."""

    __slots__ = (
        '_merged_results',
        'infoboxes',
        'suggestions',
        'answers',
        'corrections',
        '_number_of_results',
        '_closed',
        'paging',
        'unresponsive_engines',
        'timings',
        'redirect_url',
        'engine_data',
        'on_result',
        '_lock',
    )

    def __init__(self):
        super().__init__()
        self._merged_results = []
        self.infoboxes = []
        self.suggestions = set()
        self.answers = {}
        self.corrections = set()
        self._number_of_results = []
        self.engine_data = defaultdict(dict)
        self._closed = False
        self.paging = False
        self.unresponsive_engines: Set[UnresponsiveEngine] = set()
        self.timings: List[Timing] = []
        self.redirect_url = None
        self.on_result = lambda _: True
        self._lock = RLock()

    def extend(self, engine_name, results):
        if self._closed:
            return

        standard_result_count = 0
        error_msgs = set()
        for result in list(results):
            result['engine'] = engine_name
            if 'suggestion' in result and self.on_result(result):
                self.suggestions.add(result['suggestion'])
            elif 'answer' in result and self.on_result(result):
                self.answers[result['answer']] = result
            elif 'correction' in result and self.on_result(result):
                self.corrections.add(result['correction'])
            elif 'infobox' in result and self.on_result(result):
                self._merge_infobox(result)
            elif 'number_of_results' in result and self.on_result(result):
                self._number_of_results.append(result['number_of_results'])
            elif 'engine_data' in result and self.on_result(result):
                self.engine_data[engine_name][result['key']] = result['engine_data']
            elif 'url' in result:
                # standard result (url, title, content)
                if not self._is_valid_url_result(result, error_msgs):
                    continue
                # normalize the result before handing it to on_result
                self._normalize_url_result(result)
                # on_result may reject the result; skip it in that case
                if not self.on_result(result):
                    continue
                self.__merge_url_result(result, standard_result_count + 1)
                standard_result_count += 1
            elif self.on_result(result):
                self.__merge_result_no_url(result, standard_result_count + 1)
                standard_result_count += 1

        if len(error_msgs) > 0:
            for msg in error_msgs:
                count_error(engine_name, 'some results are invalids: ' + msg, secondary=True)

        if engine_name in engines:
            histogram_observe(standard_result_count, 'engine', engine_name, 'result', 'count')

        if not self.paging and standard_result_count > 0 and engine_name in engines and engines[engine_name].paging:
            self.paging = True

    def _merge_infobox(self, infobox):
        add_infobox = True
        infobox_id = infobox.get('id', None)
        infobox['engines'] = set([infobox['engine']])
        if infobox_id is not None:
            parsed_url_infobox_id = urlparse(infobox_id)
            with self._lock:
                for existingIndex in self.infoboxes:
                    if compare_urls(urlparse(existingIndex.get('id', '')), parsed_url_infobox_id):
                        merge_two_infoboxes(existingIndex, infobox)
                        add_infobox = False

        if add_infobox:
            self.infoboxes.append(infobox)
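
    # For reference, extend() dispatches on the keys of each result dict
    # (the values below are hypothetical):
    #
    #   {'suggestion': 'free software'}           -> self.suggestions
    #   {'answer': '42'}                          -> self.answers
    #   {'correction': 'free software'}           -> self.corrections
    #   {'infobox': 'Example', 'id': '<url>'}     -> self.infoboxes (merged by id)
    #   {'number_of_results': 1000}               -> self._number_of_results
    #   {'engine_data': '<data>', 'key': '<key>'} -> self.engine_data[engine]
    #   {'url': '<url>', 'title': '<title>'}      -> merged URL results
    #   any other dict                            -> merged results without a URL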

    def _is_valid_url_result(self, result, error_msgs):
        if 'url' in result:
            if not isinstance(result['url'], str):
                logger.debug('result: invalid URL: %s', str(result))
                error_msgs.add('invalid URL')
                return False

        if 'title' in result and not isinstance(result['title'], str):
            logger.debug('result: invalid title: %s', str(result))
            error_msgs.add('invalid title')
            return False

        if 'content' in result:
            if not isinstance(result['content'], str):
                logger.debug('result: invalid content: %s', str(result))
                error_msgs.add('invalid content')
                return False

        return True

    def _normalize_url_result(self, result):
        """Normalize the URL result in place: parse the URL, default the scheme
        and template, and clean up the content field."""
        result['parsed_url'] = urlparse(result['url'])

        # if the result has no scheme, use http as default
        if not result['parsed_url'].scheme:
            result['parsed_url'] = result['parsed_url']._replace(scheme="http")
            result['url'] = result['parsed_url'].geturl()

        # avoid duplicated text in content and title
        if result.get('content') == result.get('title'):
            del result['content']

        # make sure there is a template
        if 'template' not in result:
            result['template'] = 'default.html'

        # collapse repeated whitespace in the content
        if result.get('content'):
            result['content'] = WHITESPACE_REGEX.sub(' ', result['content'])

    def __merge_url_result(self, result, position):
        result['engines'] = set([result['engine']])
        with self._lock:
            duplicated = self.__find_duplicated_http_result(result)
            if duplicated:
                self.__merge_duplicated_http_result(duplicated, result, position)
                return

            # if there is no duplicate found, append the result
            result['positions'] = [position]
            self._merged_results.append(result)

    def __find_duplicated_http_result(self, result):
        result_template = result.get('template')
        for merged_result in self._merged_results:
            if 'parsed_url' not in merged_result:
                continue
            if compare_urls(result['parsed_url'], merged_result['parsed_url']) and result_template == merged_result.get(
                'template'
            ):
                if result_template != 'images.html':
                    # not an image: same URL and same template means duplicate
                    return merged_result
                else:
                    # image results are duplicates only if img_src matches too
                    if result.get('img_src', '') == merged_result.get('img_src', ''):
                        return merged_result
        return None

    def __merge_duplicated_http_result(self, duplicated, result, position):
        # keep the content with more text
        if result_content_len(result.get('content', '')) > result_content_len(duplicated.get('content', '')):
            duplicated['content'] = result['content']

        # copy every field the duplicate does not have yet
        for key in result.keys():
            if not duplicated.get(key):
                duplicated[key] = result.get(key)

        # record the new position
        duplicated['positions'].append(position)

        # record the engine that also returned this result
        duplicated['engines'].add(result['engine'])

        # prefer https if one of the two URLs uses it
        if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https':
            duplicated['url'] = result['parsed_url'].geturl()
            duplicated['parsed_url'] = result['parsed_url']

    def __merge_result_no_url(self, result, position):
        result['engines'] = set([result['engine']])
        result['positions'] = [position]
        with self._lock:
            self._merged_results.append(result)
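
    # Duplicate handling in a nutshell (hypothetical example): a result for
    # 'http://example.com/page' and a later one for 'https://www.example.com/page/'
    # with the same template are treated as duplicates by
    # __find_duplicated_http_result(); __merge_duplicated_http_result() then
    # keeps the longer content, records both positions and engines, fills in
    # missing fields and prefers the https URL.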

    def close(self):
        self._closed = True

        for result in self._merged_results:
            score = result_score(result)
            result['score'] = score
            if result.get('content'):
                result['content'] = utils.html_to_text(result['content']).strip()
            # strip html tags and superfluous whitespace from the title
            result['title'] = ' '.join(utils.html_to_text(result['title']).strip().split())
            for result_engine in result['engines']:
                counter_add(score, 'engine', result_engine, 'score')

        results = sorted(self._merged_results, key=itemgetter('score'), reverse=True)

        # regroup the sorted results so that results sharing the same
        # category and template stay next to each other
        gresults = []
        categoryPositions = {}

        for res in results:
            # use the engine's first category as the result's category
            engine = engines[res['engine']]
            res['category'] = engine.categories[0] if len(engine.categories) > 0 else ''

            # group key: category, template and whether the result has an image
            category = (
                res['category']
                + ':'
                + res.get('template', '')
                + ':'
                + ('img_src' if 'img_src' in res or 'thumbnail' in res else '')
            )

            current = None if category not in categoryPositions else categoryPositions[category]

            # insert into an existing group if it still accepts results and is
            # not too far above the current position
            if current is not None and (current['count'] > 0) and (len(gresults) - current['index'] < 20):
                index = current['index']
                gresults.insert(index, res)

                # shift the recorded index of every group at or after the
                # insertion point
                for k in categoryPositions:
                    v = categoryPositions[k]['index']
                    if v >= index:
                        categoryPositions[k]['index'] = v + 1

                # one slot less left in this group
                current['count'] -= 1
            else:
                # start a new group for this category at the end
                gresults.append(res)

                # allow up to 8 more results to be grouped at this position
                categoryPositions[category] = {'index': len(gresults), 'count': 8}

        self._merged_results = gresults

    def get_ordered_results(self):
        if not self._closed:
            self.close()
        return self._merged_results

    def results_length(self):
        return len(self._merged_results)

    @property
    def number_of_results(self) -> int:
        """Return the average of the result counts reported by the engines, or
        zero when that average is smaller than the number of results actually
        collected."""
        resultnum_sum = sum(self._number_of_results)
        if not resultnum_sum or not self._number_of_results:
            return 0

        average = int(resultnum_sum / len(self._number_of_results))
        if average < self.results_length():
            average = 0
        return average

    def add_unresponsive_engine(self, engine_name: str, error_type: str, suspended: bool = False):
        if engines[engine_name].display_error_messages:
            self.unresponsive_engines.add(UnresponsiveEngine(engine_name, error_type, suspended))

    def add_timing(self, engine_name: str, engine_time: float, page_load_time: float):
        self.timings.append(Timing(engine_name, total=engine_time, load=page_load_time))

    def get_timings(self):
        return self.timings
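
# Minimal usage sketch, assuming 'example' is an engine registered in
# searx.engines.engines (in practice the container is driven by the search
# machinery rather than used directly):
#
#   >>> container = ResultContainer()
#   >>> container.extend('example', [
#   ...     {'url': 'https://example.com/', 'title': 'Example', 'content': 'An example result'},
#   ... ])
#   >>> results = container.get_ordered_results()  # closes, scores and groups
#   >>> results[0]['url']
#   'https://example.com/'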