# SPDX-License-Identifier: AGPL-3.0-or-later
"""
DuckDuckGo Instant Answer API
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The `DDG-API <https://duckduckgo.com/api>`__ is no longer documented, but from
reverse engineering we can see that some of its services (e.g. instant
answers) are still in use by the DDG search engine.

As far as we can tell, the *instant answers* API does not support languages,
or at least we could not find out how language support is supposed to work.
It seems that most of the features are based on English terms.

"""

from typing import TYPE_CHECKING
from urllib.parse import urlencode, urlparse, urljoin

from lxml import html

from searx.data import WIKIDATA_UNITS
from searx.utils import extract_text, html_to_text, get_string_replaces_function
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom

if TYPE_CHECKING:
    import logging

    # ``logger`` is injected into each engine module by the searx engine loader
    logger: logging.Logger
# about
about = {
    "website": 'https://duckduckgo.com/',
    "wikidata_id": 'Q12805',
    "official_api_documentation": 'https://duckduckgo.com/api',
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

send_accept_language_header = True

URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1'

WIKIDATA_PREFIX = ['http://www.wikidata.org/entity/', 'https://www.wikidata.org/entity/']

replace_http_by_https = get_string_replaces_function({'http:': 'https:'})


def is_broken_text(text):
    """duckduckgo may return something like ``<a href="xxxx">http://somewhere Related website<a/>``

    The href URL is broken and the "Related website" text may contain some
    HTML.  The best solution seems to be to ignore these results.
    """
    return text.startswith('http') and ' ' in text
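
# Example of is_broken_text() on such a payload (values are illustrative, not
# actual API output):
#
#   >>> is_broken_text('http://somewhere Related website')
#   True
#   >>> is_broken_text('Paris (disambiguation)')
#   False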


def result_to_text(text, htmlResult):
    # TODO : remove result ending with "Meaning" or "Category"  # pylint: disable=fixme
    result = None
    dom = html.fromstring(htmlResult)
    a = dom.xpath('//a')
    if len(a) >= 1:
        result = extract_text(a[0])
    else:
        result = text
    if not is_broken_text(result):
        return result
    return None
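
# A quick sketch of result_to_text() behaviour (hypothetical inputs): the text
# of the first anchor wins, the plain text is the fallback.
#
#   >>> result_to_text('fallback text', '<a href="https://example.org">Some topic</a>')
#   'Some topic'
#   >>> result_to_text('fallback text', '<span>no anchor here</span>')
#   'fallback text'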


def request(query, params):
    params['url'] = URL.format(query=urlencode({'q': query}))
    return params
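
# For illustration, the URL built for the query 'paris' (derived from the URL
# template above):
#
#   >>> request('paris', {})['url']
#   'https://api.duckduckgo.com/?q=paris&format=json&pretty=0&no_redirect=1&d=1'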


def response(resp):
    # pylint: disable=too-many-locals, too-many-branches, too-many-statements

    results = []

    search_res = resp.json()

    # search_res.get('Entity') possible values (not exhaustive):
    # * continent / country / department / location / waterfall
    # * actor / musician / artist
    # * book / performing art / film / television / media franchise / concert tour / playwright
    # * prepared food
    # * website / software / os / programming language / file format / software engineer
    # * company

    content = ''
    heading = search_res.get('Heading', '')
    attributes = []
    urls = []
    infobox_id = None
    relatedTopics = []

    # add answer if there is one
    answer = search_res.get('Answer', '')
    if answer:
        logger.debug('AnswerType="%s" Answer="%s"', search_res.get('AnswerType'), answer)
        if search_res.get('AnswerType') not in ['calc', 'ip']:
            results.append({'answer': html_to_text(answer), 'url': search_res.get('AbstractURL', '')})

    # add infobox
    if 'Definition' in search_res:
        content = content + search_res.get('Definition', '')

    if 'Abstract' in search_res:
        content = content + search_res.get('Abstract', '')

    # image
    image = search_res.get('Image')
    image = None if image == '' else image
    if image is not None and urlparse(image).netloc == '':
        image = urljoin('https://duckduckgo.com', image)

    # urls
    # Official website, Wikipedia page
    for ddg_result in search_res.get('Results', []):
        firstURL = ddg_result.get('FirstURL')
        text = ddg_result.get('Text')
        if firstURL is not None and text is not None:
            urls.append({'title': text, 'url': firstURL})
            results.append({'title': heading, 'url': firstURL})

    # related topics
    for ddg_result in search_res.get('RelatedTopics', []):
        if 'FirstURL' in ddg_result:
            firstURL = ddg_result.get('FirstURL')
            text = ddg_result.get('Text')
            if not is_broken_text(text):
                suggestion = result_to_text(text, ddg_result.get('Result'))
                if suggestion != heading and suggestion is not None:
                    results.append({'suggestion': suggestion})
        elif 'Topics' in ddg_result:
            suggestions = []
            relatedTopics.append({'name': ddg_result.get('Name', ''), 'suggestions': suggestions})
            for topic_result in ddg_result.get('Topics', []):
                suggestion = result_to_text(topic_result.get('Text'), topic_result.get('Result'))
                if suggestion != heading and suggestion is not None:
                    suggestions.append(suggestion)

    # abstract
    abstractURL = search_res.get('AbstractURL', '')
    if abstractURL != '':
        # add as result? problem: the abstract is always in English
        infobox_id = abstractURL
        urls.append({'title': search_res.get('AbstractSource'), 'url': abstractURL, 'official': True})
        results.append({'url': abstractURL, 'title': heading})

    # definition
    definitionURL = search_res.get('DefinitionURL', '')
    if definitionURL != '':
        # add as result? as answer? problem: the definition is always in English
        infobox_id = definitionURL
        urls.append({'title': search_res.get('DefinitionSource'), 'url': definitionURL})

    # to merge with Wikidata's infobox
    if infobox_id:
        infobox_id = replace_http_by_https(infobox_id)

    # attributes
    # some will be converted to urls
    if 'Infobox' in search_res:
        infobox = search_res.get('Infobox')
        if 'content' in infobox:
            osm_zoom = 17
            coordinates = None
            for info in infobox.get('content'):
                data_type = info.get('data_type')
                data_label = info.get('label')
                data_value = info.get('value')

                # Workaround: ddg may return a double quote
                if data_value == '""':
                    continue

                # Is it an external URL?
                # * imdb_id / facebook_profile / youtube_channel / youtube_video / twitter_profile
                # * instagram_profile / rotten_tomatoes / spotify_artist_id / itunes_artist_id / soundcloud_id
                # * netflix_id
                external_url = get_external_url(data_type, data_value)
                if external_url is not None:
                    urls.append({'title': data_label, 'url': external_url})
                elif data_type in ['instance', 'wiki_maps_trigger', 'google_play_artist_id']:
                    # ignore instance: Wikidata value from "Instance Of" (Qxxxx)
                    # ignore wiki_maps_trigger: reference to JavaScript
                    # ignore google_play_artist_id: service shutdown
                    pass
                elif data_type == 'string' and data_label == 'Website':
                    # There is already a URL for the website
                    pass
                elif data_type == 'area':
                    attributes.append({'label': data_label, 'value': area_to_str(data_value), 'entity': 'P2046'})
                    osm_zoom = area_to_osm_zoom(data_value.get('amount'))
                elif data_type == 'coordinates':
                    if data_value.get('globe') == 'http://www.wikidata.org/entity/Q2':
                        # coordinate on Earth
                        # get the zoom information from the area
                        coordinates = info
                    else:
                        # coordinate NOT on Earth
                        attributes.append({'label': data_label, 'value': data_value, 'entity': 'P625'})
                elif data_type == 'string':
                    attributes.append({'label': data_label, 'value': data_value})

            if coordinates:
                data_label = coordinates.get('label')
                data_value = coordinates.get('value')
                latitude = data_value.get('latitude')
                longitude = data_value.get('longitude')
                url = get_earth_coordinates_url(latitude, longitude, osm_zoom)
                urls.append({'title': 'OpenStreetMap', 'url': url, 'entity': 'P625'})

    if len(heading) > 0:
        # TODO get infobox.meta.value where .label='article_title'  # pylint: disable=fixme
        if image is None and len(attributes) == 0 and len(urls) == 1 and len(relatedTopics) == 0 and len(content) == 0:
            results.append({'url': urls[0]['url'], 'title': heading, 'content': content})
        else:
            results.append(
                {
                    'infobox': heading,
                    'id': infobox_id,
                    'content': content,
                    'img_src': image,
                    'attributes': attributes,
                    'urls': urls,
                    'relatedTopics': relatedTopics,
                }
            )

    return results
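
# A minimal sketch of the JSON shape that response() consumes.  The field
# names come from the code above; all values are hypothetical:
#
#   {
#       "Heading": "Paris",
#       "AnswerType": "",
#       "Answer": "",
#       "Abstract": "Paris is the capital of France ...",
#       "AbstractURL": "https://en.wikipedia.org/wiki/Paris",
#       "AbstractSource": "Wikipedia",
#       "Image": "/i/xxx.jpg",
#       "Results": [{"FirstURL": "https://example.org", "Text": "Official site"}],
#       "RelatedTopics": [{"FirstURL": "...", "Text": "...", "Result": "..."}],
#       "Infobox": {"content": [{"data_type": "string", "label": "...", "value": "..."}]},
#   }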


def unit_to_str(unit):
    for prefix in WIKIDATA_PREFIX:
        if unit.startswith(prefix):
            wikidata_entity = unit[len(prefix) :]
            real_unit = WIKIDATA_UNITS.get(wikidata_entity)
            if real_unit is None:
                return unit
            return real_unit['symbol']
    return unit
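
# Example: Q712226 is the Wikidata entity for square kilometre.  Assuming the
# entity is present in WIKIDATA_UNITS, its symbol is returned; unknown units
# fall back to the raw value:
#
#   >>> unit_to_str('https://www.wikidata.org/entity/Q712226')
#   'km²'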


def area_to_str(area):
    """parse ``{'unit': 'https://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}``"""
    unit = unit_to_str(area.get('unit'))
    if unit is not None:
        try:
            amount = float(area.get('amount'))
            return '{} {}'.format(amount, unit)
        except ValueError:
            pass
    return '{} {}'.format(area.get('amount', ''), area.get('unit', ''))
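
# Worked example, using the value from the docstring above (the 'km²' symbol
# assumes Q712226 resolves via WIKIDATA_UNITS):
#
#   >>> area_to_str({'unit': 'https://www.wikidata.org/entity/Q712226', 'amount': '+20.99'})
#   '20.99 km²'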