duckduckgo_definitions.py

# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
DuckDuckGo Instant Answer API
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The `DDG-API <https://duckduckgo.com/api>`__ is no longer documented, but from
reverse engineering we can see that some services (e.g. instant answers) are
still in use by the DDG search engine.

As far as we can tell, the *instant answers* API does not support languages, or
at least we could not find out how language support should work.  It seems that
most of the features are based on English terms.

"""
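
# For illustration, the top-level fields this engine reads from the JSON
# answer (field names are taken from ``response`` below, the values here are
# made up):
#
#   {"Heading": "...", "Answer": "...", "AnswerType": "...",
#    "Abstract": "...", "AbstractURL": "...", "AbstractSource": "...",
#    "Definition": "...", "DefinitionURL": "...", "DefinitionSource": "...",
#    "Image": "...", "Results": [...], "RelatedTopics": [...], "Infobox": {...}}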

from typing import TYPE_CHECKING

from urllib.parse import urlencode, urlparse, urljoin
from lxml import html

from searx.data import WIKIDATA_UNITS
from searx.utils import extract_text, html_to_text, get_string_replaces_function
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom

if TYPE_CHECKING:
    import logging

    logger: logging.Logger

# about
about = {
    "website": 'https://duckduckgo.com/',
    "wikidata_id": 'Q12805',
    "official_api_documentation": 'https://duckduckgo.com/api',
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

send_accept_language_header = True

URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1'

WIKIDATA_PREFIX = ['http://www.wikidata.org/entity/', 'https://www.wikidata.org/entity/']

replace_http_by_https = get_string_replaces_function({'http:': 'https:'})
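# e.g. 'http://www.wikidata.org/entity/Q42' --> 'https://www.wikidata.org/entity/Q42'
# (used below to normalize ``infobox_id`` before merging with wikidata's infobox)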


def is_broken_text(text):
    """duckduckgo may return something like ``<a href="xxxx">http://somewhere Related website<a/>``

    The href URL is broken and the "Related website" text may contain some
    HTML.  The best solution seems to be to ignore these results.
    """
    return text.startswith('http') and ' ' in text
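
# Illustration of the heuristic above (the inputs are made up):
#
#   >>> is_broken_text('http://somewhere Related website')
#   True
#   >>> is_broken_text('Ordinary suggestion text')
#   False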


def result_to_text(text, htmlResult):
    # TODO: remove result ending with "Meaning" or "Category"  # pylint: disable=fixme
    result = None
    dom = html.fromstring(htmlResult)
    a = dom.xpath('//a')
    if len(a) >= 1:
        result = extract_text(a[0])
    else:
        result = text
    if not is_broken_text(result):
        return result
    return None
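
# Illustration with made-up HTML: the text of the first ``<a>`` element wins,
# the plain ``text`` argument is only a fallback:
#
#   >>> result_to_text('fallback', '<a href="https://example.org">Example Domain</a>')
#   'Example Domain'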


def request(query, params):
    params['url'] = URL.format(query=urlencode({'q': query}))
    return params
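
# For a query like ``ipv6`` the generated URL reads:
#
#   https://api.duckduckgo.com/?q=ipv6&format=json&pretty=0&no_redirect=1&d=1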


def response(resp):
    # pylint: disable=too-many-locals, too-many-branches, too-many-statements
    results = []

    search_res = resp.json()

    # search_res.get('Entity') possible values (not exhaustive):
    # * continent / country / department / location / waterfall
    # * actor / musician / artist
    # * book / performing art / film / television / media franchise / concert tour / playwright
    # * prepared food
    # * website / software / os / programming language / file format / software engineer
    # * company

    content = ''
    heading = search_res.get('Heading', '')
    attributes = []
    urls = []
    infobox_id = None
    relatedTopics = []

    # add answer if there is one
    answer = search_res.get('Answer', '')
    if answer:
        logger.debug('AnswerType="%s" Answer="%s"', search_res.get('AnswerType'), answer)
        if search_res.get('AnswerType') not in ['calc', 'ip']:
            results.append({'answer': html_to_text(answer), 'url': search_res.get('AbstractURL', '')})

    # add infobox
    if 'Definition' in search_res:
        content = content + search_res.get('Definition', '')

    if 'Abstract' in search_res:
        content = content + search_res.get('Abstract', '')

    # image
    image = search_res.get('Image')
    image = None if image == '' else image
    if image is not None and urlparse(image).netloc == '':
        image = urljoin('https://duckduckgo.com', image)

    # urls
    # Official website, Wikipedia page
    for ddg_result in search_res.get('Results', []):
        firstURL = ddg_result.get('FirstURL')
        text = ddg_result.get('Text')
        if firstURL is not None and text is not None:
            urls.append({'title': text, 'url': firstURL})
            results.append({'title': heading, 'url': firstURL})
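
    # Illustrative shape of ``Results`` handled above (values are made up):
    #
    #   [{"FirstURL": "https://example.org/", "Text": "Official site"}]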

    # related topics
    for ddg_result in search_res.get('RelatedTopics', []):
        if 'FirstURL' in ddg_result:
            firstURL = ddg_result.get('FirstURL')
            text = ddg_result.get('Text')
            if not is_broken_text(text):
                suggestion = result_to_text(text, ddg_result.get('Result'))
                if suggestion != heading and suggestion is not None:
                    results.append({'suggestion': suggestion})
        elif 'Topics' in ddg_result:
            suggestions = []
            relatedTopics.append({'name': ddg_result.get('Name', ''), 'suggestions': suggestions})
            for topic_result in ddg_result.get('Topics', []):
                suggestion = result_to_text(topic_result.get('Text'), topic_result.get('Result'))
                if suggestion != heading and suggestion is not None:
                    suggestions.append(suggestion)
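
    # ``RelatedTopics`` mixes flat entries and named groups; an illustrative
    # (made-up) shape:
    #
    #   [{"FirstURL": "...", "Text": "...", "Result": "<a href=...>...</a>"},
    #    {"Name": "Internet protocols", "Topics": [{"Text": "...", "Result": "..."}]}]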

    # abstract
    abstractURL = search_res.get('AbstractURL', '')
    if abstractURL != '':
        # add as a result? problem: always in English
        infobox_id = abstractURL
        urls.append({'title': search_res.get('AbstractSource'), 'url': abstractURL, 'official': True})
        results.append({'url': abstractURL, 'title': heading})

    # definition
    definitionURL = search_res.get('DefinitionURL', '')
    if definitionURL != '':
        # add as a result? as an answer? problem: always in English
        infobox_id = definitionURL
        urls.append({'title': search_res.get('DefinitionSource'), 'url': definitionURL})

    # to merge with wikidata's infobox
    if infobox_id:
        infobox_id = replace_http_by_https(infobox_id)
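
    # Illustrative (made-up) shape of the ``Infobox`` content parsed below:
    #
    #   {"content": [{"data_type": "string", "label": "Genre", "value": "..."},
    #                {"data_type": "coordinates", "label": "Coordinates",
    #                 "value": {"latitude": 48.85, "longitude": 2.35,
    #                           "globe": "http://www.wikidata.org/entity/Q2"}}]}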

    # attributes
    # some will be converted to urls
    if 'Infobox' in search_res:
        infobox = search_res.get('Infobox')
        if 'content' in infobox:
            osm_zoom = 17
            coordinates = None
            for info in infobox.get('content'):
                data_type = info.get('data_type')
                data_label = info.get('label')
                data_value = info.get('value')

                # Workaround: ddg may return a double quote
                if data_value == '""':
                    continue

                # Is it an external URL?
                # * imdb_id / facebook_profile / youtube_channel / youtube_video / twitter_profile
                # * instagram_profile / rotten_tomatoes / spotify_artist_id / itunes_artist_id / soundcloud_id
                # * netflix_id
                external_url = get_external_url(data_type, data_value)
                if external_url is not None:
                    urls.append({'title': data_label, 'url': external_url})
                elif data_type in ['instance', 'wiki_maps_trigger', 'google_play_artist_id']:
                    # ignore instance: Wikidata value from "Instance Of" (Qxxxx)
                    # ignore wiki_maps_trigger: reference to a javascript
                    # ignore google_play_artist_id: service shutdown
                    pass
                elif data_type == 'string' and data_label == 'Website':
                    # There is already a URL for the website
                    pass
                elif data_type == 'area':
                    attributes.append({'label': data_label, 'value': area_to_str(data_value), 'entity': 'P2046'})
                    osm_zoom = area_to_osm_zoom(data_value.get('amount'))
                elif data_type == 'coordinates':
                    if data_value.get('globe') == 'http://www.wikidata.org/entity/Q2':
                        # coordinate on Earth
                        # get the zoom information from the area
                        coordinates = info
                    else:
                        # coordinate NOT on Earth
                        attributes.append({'label': data_label, 'value': data_value, 'entity': 'P625'})
                elif data_type == 'string':
                    attributes.append({'label': data_label, 'value': data_value})

            if coordinates:
                data_label = coordinates.get('label')
                data_value = coordinates.get('value')
                latitude = data_value.get('latitude')
                longitude = data_value.get('longitude')
                url = get_earth_coordinates_url(latitude, longitude, osm_zoom)
                urls.append({'title': 'OpenStreetMap', 'url': url, 'entity': 'P625'})

    if len(heading) > 0:
        # TODO get infobox.meta.value where .label='article_title'  # pylint: disable=fixme
        if image is None and len(attributes) == 0 and len(urls) == 1 and len(relatedTopics) == 0 and len(content) == 0:
            results.append({'url': urls[0]['url'], 'title': heading, 'content': content})
        else:
            results.append(
                {
                    'infobox': heading,
                    'id': infobox_id,
                    'content': content,
                    'img_src': image,
                    'attributes': attributes,
                    'urls': urls,
                    'relatedTopics': relatedTopics,
                }
            )

    return results


def unit_to_str(unit):
    for prefix in WIKIDATA_PREFIX:
        if unit.startswith(prefix):
            wikidata_entity = unit[len(prefix) :]
            return WIKIDATA_UNITS.get(wikidata_entity, unit)
    return unit
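
# Illustration, assuming WIKIDATA_UNITS maps Q712226 (square kilometre) to its
# unit symbol:
#
#   >>> unit_to_str('https://www.wikidata.org/entity/Q712226')
#   'km²'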


def area_to_str(area):
    """parse ``{'unit': 'https://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}``"""
    unit = unit_to_str(area.get('unit'))
    if unit is not None:
        try:
            amount = float(area.get('amount'))
            return '{} {}'.format(amount, unit)
        except ValueError:
            pass
    return '{} {}'.format(area.get('amount', ''), area.get('unit', ''))
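
# With the payload from the docstring above (the unit symbol depends on what
# WIKIDATA_UNITS actually contains):
#
#   >>> area_to_str({'unit': 'https://www.wikidata.org/entity/Q712226', 'amount': '+20.99'})
#   '20.99 km²'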