# SPDX-License-Identifier: AGPL-3.0-or-later
"""
DuckDuckGo Instant Answer API
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The `DDG-API <https://duckduckgo.com/api>`__ is no longer documented, but from
reverse engineering we can see that some services (e.g. instant answers) are
still in use by the DDG search engine.

As far as we can tell, the *instant answers* API does not support languages,
or at least we could not find out how language support should work.  It seems
that most of the features are based on English terms.

"""

from typing import TYPE_CHECKING

from urllib.parse import urlencode, urlparse, urljoin
from lxml import html

from searx.data import WIKIDATA_UNITS
from searx.utils import extract_text, html_to_text, get_string_replaces_function
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
from searx.result_types import Answer

if TYPE_CHECKING:
    import logging

    logger: logging.Logger

# about
about = {
    "website": 'https://duckduckgo.com/',
    "wikidata_id": 'Q12805',
    "official_api_documentation": 'https://duckduckgo.com/api',
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

send_accept_language_header = True

URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1'

WIKIDATA_PREFIX = ['http://www.wikidata.org/entity/', 'https://www.wikidata.org/entity/']

replace_http_by_https = get_string_replaces_function({'http:': 'https:'})


def is_broken_text(text):
    """duckduckgo may return something like ``<a href="xxxx">http://somewhere Related website</a>``

    The href URL is broken, and the "Related website" part may contain some
    HTML.  The best solution seems to be to ignore these results.
    """
    return text.startswith('http') and ' ' in text
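
# Illustrative behaviour of the check above (hypothetical inputs, not taken
# from a live API response):
#
#   is_broken_text('http://somewhere Related website')  # -> True (URL fused with text)
#   is_broken_text('Related website')                    # -> False
#   is_broken_text('http://somewhere')                   # -> False (a plain URL is fine)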


def result_to_text(text, htmlResult):
    # TODO : remove result ending with "Meaning" or "Category"  # pylint: disable=fixme
    result = None
    dom = html.fromstring(htmlResult)
    a = dom.xpath('//a')
    if len(a) >= 1:
        result = extract_text(a[0])
    else:
        result = text
    if not is_broken_text(result):
        return result
    return None
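
# A minimal sketch of result_to_text() (assumed inputs, for illustration only):
#
#   result_to_text('Paris', '<a href="https://duckduckgo.com/Paris">Paris</a>')
#   # -> 'Paris' (text of the first <a> element wins over `text`)
#   result_to_text('http://somewhere Related website', '<span>no anchor</span>')
#   # -> None (falls back to `text`, which is_broken_text() rejects)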


def request(query, params):
    params['url'] = URL.format(query=urlencode({'q': query}))
    return params
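
# What request() produces, given the URL template above (illustrative):
#
#   params = request('searx', {})
#   params['url']
#   # -> 'https://api.duckduckgo.com/?q=searx&format=json&pretty=0&no_redirect=1&d=1'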


def response(resp):
    # pylint: disable=too-many-locals, too-many-branches, too-many-statements
    results = []

    search_res = resp.json()

    # search_res.get('Entity') possible values (not exhaustive):
    # * continent / country / department / location / waterfall
    # * actor / musician / artist
    # * book / performing art / film / television / media franchise / concert tour / playwright
    # * prepared food
    # * website / software / os / programming language / file format / software engineer
    # * company

    content = ''
    heading = search_res.get('Heading', '')
    attributes = []
    urls = []
    infobox_id = None
    relatedTopics = []

    # add answer if there is one
    answer = search_res.get('Answer', '')
    if answer:
        answer_type = search_res.get('AnswerType')
        logger.debug('AnswerType="%s" Answer="%s"', answer_type, answer)
        if isinstance(answer, str) and answer_type not in ['calc', 'ip']:
            Answer(results=results, answer=html_to_text(answer), url=search_res.get('AbstractURL', ''))

    # add infobox
    if 'Definition' in search_res:
        content = content + search_res.get('Definition', '')

    if 'Abstract' in search_res:
        content = content + search_res.get('Abstract', '')

    # image
    image = search_res.get('Image')
    image = None if image == '' else image
    if image is not None and urlparse(image).netloc == '':
        image = urljoin('https://duckduckgo.com', image)

    # urls
    # Official website, Wikipedia page
    for ddg_result in search_res.get('Results', []):
        firstURL = ddg_result.get('FirstURL')
        text = ddg_result.get('Text')
        if firstURL is not None and text is not None:
            urls.append({'title': text, 'url': firstURL})
            results.append({'title': heading, 'url': firstURL})

    # related topics
    for ddg_result in search_res.get('RelatedTopics', []):
        if 'FirstURL' in ddg_result:
            firstURL = ddg_result.get('FirstURL')
            text = ddg_result.get('Text')
            if not is_broken_text(text):
                suggestion = result_to_text(text, ddg_result.get('Result'))
                if suggestion != heading and suggestion is not None:
                    results.append({'suggestion': suggestion})
        elif 'Topics' in ddg_result:
            suggestions = []
            relatedTopics.append({'name': ddg_result.get('Name', ''), 'suggestions': suggestions})
            for topic_result in ddg_result.get('Topics', []):
                suggestion = result_to_text(topic_result.get('Text'), topic_result.get('Result'))
                if suggestion != heading and suggestion is not None:
                    suggestions.append(suggestion)

    # abstract
    abstractURL = search_res.get('AbstractURL', '')
    if abstractURL != '':
        # add as result ? problem always in english
        infobox_id = abstractURL
        urls.append({'title': search_res.get('AbstractSource'), 'url': abstractURL, 'official': True})
        results.append({'url': abstractURL, 'title': heading})

    # definition
    definitionURL = search_res.get('DefinitionURL', '')
    if definitionURL != '':
        # add as result ? as answer ? problem always in english
        infobox_id = definitionURL
        urls.append({'title': search_res.get('DefinitionSource'), 'url': definitionURL})

    # to merge with wikidata's infobox
    if infobox_id:
        infobox_id = replace_http_by_https(infobox_id)

    # attributes
    # some will be converted to urls
    if 'Infobox' in search_res:
        infobox = search_res.get('Infobox')
        if 'content' in infobox:
            osm_zoom = 17
            coordinates = None
            for info in infobox.get('content'):
                data_type = info.get('data_type')
                data_label = info.get('label')
                data_value = info.get('value')

                # Workaround: ddg may return a double quote
                if data_value == '""':
                    continue

                # Is it an external URL ?
                # * imdb_id / facebook_profile / youtube_channel / youtube_video / twitter_profile
                # * instagram_profile / rotten_tomatoes / spotify_artist_id / itunes_artist_id / soundcloud_id
                # * netflix_id
                external_url = get_external_url(data_type, data_value)
                if external_url is not None:
                    urls.append({'title': data_label, 'url': external_url})
                elif data_type in ['instance', 'wiki_maps_trigger', 'google_play_artist_id']:
                    # ignore instance: Wikidata value from "Instance Of" (Qxxxx)
                    # ignore wiki_maps_trigger: reference to a javascript
                    # ignore google_play_artist_id: service shutdown
                    pass
                elif data_type == 'string' and data_label == 'Website':
                    # There is already an URL for the website
                    pass
                elif data_type == 'area':
                    attributes.append({'label': data_label, 'value': area_to_str(data_value), 'entity': 'P2046'})
                    osm_zoom = area_to_osm_zoom(data_value.get('amount'))
                elif data_type == 'coordinates':
                    if data_value.get('globe') == 'http://www.wikidata.org/entity/Q2':
                        # coordinate on Earth
                        # get the zoom information from the area
                        coordinates = info
                    else:
                        # coordinate NOT on Earth
                        attributes.append({'label': data_label, 'value': data_value, 'entity': 'P625'})
                elif data_type == 'string':
                    attributes.append({'label': data_label, 'value': data_value})

            if coordinates:
                data_label = coordinates.get('label')
                data_value = coordinates.get('value')
                latitude = data_value.get('latitude')
                longitude = data_value.get('longitude')
                url = get_earth_coordinates_url(latitude, longitude, osm_zoom)
                urls.append({'title': 'OpenStreetMap', 'url': url, 'entity': 'P625'})

    if len(heading) > 0:
        # TODO get infobox.meta.value where .label='article_title'  # pylint: disable=fixme
        if image is None and len(attributes) == 0 and len(urls) == 1 and len(relatedTopics) == 0 and len(content) == 0:
            results.append({'url': urls[0]['url'], 'title': heading, 'content': content})
        else:
            results.append(
                {
                    'infobox': heading,
                    'id': infobox_id,
                    'content': content,
                    'img_src': image,
                    'attributes': attributes,
                    'urls': urls,
                    'relatedTopics': relatedTopics,
                }
            )

    return results
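
# A rough sketch of driving this engine by hand, outside SearXNG (assumptions:
# the third-party `requests` package and network access; note that SearXNG
# injects `logger` into engine modules at load time, so a standalone call only
# works as-is for queries that do not return an instant answer):
#
#   import requests
#   params = request('searx', {})
#   resp = requests.get(params['url'], timeout=10)
#   for result in response(resp):
#       print(result)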


def unit_to_str(unit):
    for prefix in WIKIDATA_PREFIX:
        if unit.startswith(prefix):
            wikidata_entity = unit[len(prefix) :]
            real_unit = WIKIDATA_UNITS.get(wikidata_entity)
            if real_unit is None:
                return unit
            return real_unit['symbol']
    return unit
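
# Illustrative example (assuming the entity is present in WIKIDATA_UNITS;
# unknown entities and non-Wikidata strings are returned unchanged):
#
#   unit_to_str('http://www.wikidata.org/entity/Q11573')  # -> 'm' (metre)
#   unit_to_str('something-else')                         # -> 'something-else'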


def area_to_str(area):
    """parse ``{'unit': 'https://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}``"""
    unit = unit_to_str(area.get('unit'))
    if unit is not None:
        try:
            amount = float(area.get('amount'))
            return '{} {}'.format(amount, unit)
        except ValueError:
            pass
    return '{} {}'.format(area.get('amount', ''), area.get('unit', ''))
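
# Illustrative example (Q712226 is "square kilometre"; the rendered symbol
# depends on the WIKIDATA_UNITS entry, 'km²' is assumed here):
#
#   area_to_str({'unit': 'https://www.wikidata.org/entity/Q712226', 'amount': '+20.99'})
#   # -> '20.99 km²'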