# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
DuckDuckGo Instant Answer API
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The `DDG-API <https://duckduckgo.com/api>`__ is no longer documented, but from
reverse engineering we can see that some of its services (e.g. instant
answers) are still in use by the DDG search engine.

As far as we can tell, the *instant answers* API does not support languages,
or at least we could not find out how language support is supposed to work.
It seems that most of the features are based on English terms.

"""

from typing import TYPE_CHECKING

from urllib.parse import urlencode, urlparse, urljoin
from lxml import html

from searx.data import WIKIDATA_UNITS
from searx.utils import extract_text, html_to_text, get_string_replaces_function
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom

if TYPE_CHECKING:
    import logging

    logger: logging.Logger

# about
about = {
    "website": 'https://duckduckgo.com/',
    "wikidata_id": 'Q12805',
    "official_api_documentation": 'https://duckduckgo.com/api',
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

send_accept_language_header = True

URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1'

WIKIDATA_PREFIX = ['http://www.wikidata.org/entity/', 'https://www.wikidata.org/entity/']

replace_http_by_https = get_string_replaces_function({'http:': 'https:'})


def is_broken_text(text):
    """DuckDuckGo may return something like ``<a href="xxxx">http://somewhere Related website<a/>``

    The href URL is broken and the "Related website" text may contain some
    HTML.  The best solution seems to be to ignore these results.
    """
    return text.startswith('http') and ' ' in text


def result_to_text(text, htmlResult):
    # TODO : remove result ending with "Meaning" or "Category"  # pylint: disable=fixme
    result = None
    dom = html.fromstring(htmlResult)
    a = dom.xpath('//a')
    if len(a) >= 1:
        result = extract_text(a[0])
    else:
        result = text
    if not is_broken_text(result):
        return result
    return None


def request(query, params):
    params['url'] = URL.format(query=urlencode({'q': query}))
    return params
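

# A rough sketch of what ``request`` produces, assembled from the URL template
# above (hypothetical query ``searx``, not taken from a live capture):
#
#   request('searx', {})['url']
#   --> 'https://api.duckduckgo.com/?q=searx&format=json&pretty=0&no_redirect=1&d=1'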


def response(resp):
    # pylint: disable=too-many-locals, too-many-branches, too-many-statements
    results = []

    search_res = resp.json()

    # search_res.get('Entity') possible values (not exhaustive):
    # * continent / country / department / location / waterfall
    # * actor / musician / artist
    # * book / performing art / film / television / media franchise / concert tour / playwright
    # * prepared food
    # * website / software / os / programming language / file format / software engineer
    # * company

    content = ''
    heading = search_res.get('Heading', '')
    attributes = []
    urls = []
    infobox_id = None
    relatedTopics = []

    # add answer if there is one
    answer = search_res.get('Answer', '')
    if answer:
        logger.debug('AnswerType="%s" Answer="%s"', search_res.get('AnswerType'), answer)
        if search_res.get('AnswerType') not in ['calc', 'ip']:
            results.append({'answer': html_to_text(answer)})

    # add infobox
    if 'Definition' in search_res:
        content = content + search_res.get('Definition', '')

    if 'Abstract' in search_res:
        content = content + search_res.get('Abstract', '')

    # image
    image = search_res.get('Image')
    image = None if image == '' else image
    if image is not None and urlparse(image).netloc == '':
        image = urljoin('https://duckduckgo.com', image)

    # urls
    # Official website, Wikipedia page
    for ddg_result in search_res.get('Results', []):
        firstURL = ddg_result.get('FirstURL')
        text = ddg_result.get('Text')
        if firstURL is not None and text is not None:
            urls.append({'title': text, 'url': firstURL})
            results.append({'title': heading, 'url': firstURL})

    # related topics
    for ddg_result in search_res.get('RelatedTopics', []):
        if 'FirstURL' in ddg_result:
            firstURL = ddg_result.get('FirstURL')
            text = ddg_result.get('Text')
            if not is_broken_text(text):
                suggestion = result_to_text(text, ddg_result.get('Result'))
                if suggestion != heading and suggestion is not None:
                    results.append({'suggestion': suggestion})
        elif 'Topics' in ddg_result:
            suggestions = []
            relatedTopics.append({'name': ddg_result.get('Name', ''), 'suggestions': suggestions})
            for topic_result in ddg_result.get('Topics', []):
                suggestion = result_to_text(topic_result.get('Text'), topic_result.get('Result'))
                if suggestion != heading and suggestion is not None:
                    suggestions.append(suggestion)

    # abstract
    abstractURL = search_res.get('AbstractURL', '')
    if abstractURL != '':
        # add as result? problem: the abstract is always in English
        infobox_id = abstractURL
        urls.append({'title': search_res.get('AbstractSource'), 'url': abstractURL, 'official': True})
        results.append({'url': abstractURL, 'title': heading})

    # definition
    definitionURL = search_res.get('DefinitionURL', '')
    if definitionURL != '':
        # add as result? as answer? problem: the definition is always in English
        infobox_id = definitionURL
        urls.append({'title': search_res.get('DefinitionSource'), 'url': definitionURL})

    # normalize to https so the ID can be merged with Wikidata's infobox
    if infobox_id:
        infobox_id = replace_http_by_https(infobox_id)
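
    # Each entry in ``Infobox.content`` is a dict with ``data_type``, ``label``
    # and ``value``.  A hypothetical sketch of one ``area`` entry, inferred
    # from the handling below (the exact fields vary with the data_type):
    #
    #   {'data_type': 'area',
    #    'label': 'Area',
    #    'value': {'unit': 'http://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}}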

    # attributes
    # some of them will be converted to URLs
    if 'Infobox' in search_res:
        infobox = search_res.get('Infobox')
        if 'content' in infobox:
            osm_zoom = 17
            coordinates = None
            for info in infobox.get('content'):
                data_type = info.get('data_type')
                data_label = info.get('label')
                data_value = info.get('value')

                # Workaround: ddg may return a double quote
                if data_value == '""':
                    continue

                # Is it an external URL?
                # * imdb_id / facebook_profile / youtube_channel / youtube_video / twitter_profile
                # * instagram_profile / rotten_tomatoes / spotify_artist_id / itunes_artist_id / soundcloud_id
                # * netflix_id
                external_url = get_external_url(data_type, data_value)
                if external_url is not None:
                    urls.append({'title': data_label, 'url': external_url})
                elif data_type in ['instance', 'wiki_maps_trigger', 'google_play_artist_id']:
                    # ignore instance: Wikidata value from "Instance Of" (Qxxxx)
                    # ignore wiki_maps_trigger: reference to a JavaScript snippet
                    # ignore google_play_artist_id: the service has shut down
                    pass
                elif data_type == 'string' and data_label == 'Website':
                    # There is already a URL for the website
                    pass
                elif data_type == 'area':
                    attributes.append({'label': data_label, 'value': area_to_str(data_value), 'entity': 'P2046'})
                    osm_zoom = area_to_osm_zoom(data_value.get('amount'))
                elif data_type == 'coordinates':
                    if data_value.get('globe') == 'http://www.wikidata.org/entity/Q2':
                        # coordinate on Earth
                        # get the zoom information from the area
                        coordinates = info
                    else:
                        # coordinate NOT on Earth
                        attributes.append({'label': data_label, 'value': data_value, 'entity': 'P625'})
                elif data_type == 'string':
                    attributes.append({'label': data_label, 'value': data_value})

            if coordinates:
                data_label = coordinates.get('label')
                data_value = coordinates.get('value')
                latitude = data_value.get('latitude')
                longitude = data_value.get('longitude')
                url = get_earth_coordinates_url(latitude, longitude, osm_zoom)
                urls.append({'title': 'OpenStreetMap', 'url': url, 'entity': 'P625'})

    if len(heading) > 0:
        # TODO get infobox.meta.value where .label='article_title'  # pylint: disable=fixme
        if image is None and len(attributes) == 0 and len(urls) == 1 and len(relatedTopics) == 0 and len(content) == 0:
            results.append({'url': urls[0]['url'], 'title': heading, 'content': content})
        else:
            results.append(
                {
                    'infobox': heading,
                    'id': infobox_id,
                    'content': content,
                    'img_src': image,
                    'attributes': attributes,
                    'urls': urls,
                    'relatedTopics': relatedTopics,
                }
            )

    return results


def unit_to_str(unit):
    for prefix in WIKIDATA_PREFIX:
        if unit.startswith(prefix):
            wikidata_entity = unit[len(prefix) :]
            return WIKIDATA_UNITS.get(wikidata_entity, unit)
    return unit


def area_to_str(area):
    """parse ``{'unit': 'https://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}``"""
    unit = unit_to_str(area.get('unit'))
    if unit is not None:
        try:
            amount = float(area.get('amount'))
            return '{} {}'.format(amount, unit)
        except ValueError:
            pass
    return '{} {}'.format(area.get('amount', ''), area.get('unit', ''))
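

# For illustration only (hypothetical call; the exact unit symbol depends on
# what WIKIDATA_UNITS maps the Wikidata entity Q712226 to):
#
#   area_to_str({'unit': 'https://www.wikidata.org/entity/Q712226', 'amount': '+20.99'})
#   --> something like '20.99 km²'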