duckduckgo_definitions.py

# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""DuckDuckGo (Instant Answer API)
"""

import json
from urllib.parse import urlencode, urlparse, urljoin
from lxml import html

from searx.data import WIKIDATA_UNITS
from searx.engines.duckduckgo import language_aliases
from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url  # NOQA # pylint: disable=unused-import
from searx.utils import extract_text, html_to_text, match_language, get_string_replaces_function
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom

# about
about = {
    "website": 'https://duckduckgo.com/',
    "wikidata_id": 'Q12805',
    "official_api_documentation": 'https://duckduckgo.com/api',
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

URL = 'https://api.duckduckgo.com/'\
    + '?{query}&format=json&pretty=0&no_redirect=1&d=1'

WIKIDATA_PREFIX = [
    'http://www.wikidata.org/entity/',
    'https://www.wikidata.org/entity/'
]

replace_http_by_https = get_string_replaces_function({'http:': 'https:'})
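# A hedged sketch of what the helper above produces (behavior inferred from
# searx.utils.get_string_replaces_function, which returns a string-replacing
# closure); the sample value is illustrative:
#   replace_http_by_https('http://www.wikidata.org/entity/Q2')
#   -> 'https://www.wikidata.org/entity/Q2'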


def is_broken_text(text):
    """duckduckgo may return something like ``<a href="xxxx">http://somewhere Related website<a/>``

    The href URL is broken, and the "Related website" text may contain some HTML.
    The best solution seems to be to ignore these results.
    """
    return text.startswith('http') and ' ' in text
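# Illustrative values for the heuristic above (samples are hypothetical, not
# from the original source):
#   is_broken_text('http://somewhere Related website')  # -> True  (ignored)
#   is_broken_text('https://example.org/')              # -> False (kept)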


def result_to_text(text, htmlResult):
    # TODO : remove result ending with "Meaning" or "Category"  # pylint: disable=fixme
    result = None
    dom = html.fromstring(htmlResult)
    a = dom.xpath('//a')
    if len(a) >= 1:
        result = extract_text(a[0])
    else:
        result = text
    if not is_broken_text(result):
        return result
    return None
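# A minimal sketch of result_to_text() on a typical DDG "Result" snippet; the
# HTML sample is hypothetical:
#   result_to_text('fallback text', '<a href="https://example.org">Example site</a>')
#   -> 'Example site'  (text of the first <a>, unless is_broken_text() rejects it)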


def request(query, params):
    params['url'] = URL.format(query=urlencode({'q': query}))
    # NOTE: ``supported_languages`` is not defined in this module: like the
    # engine logger, it is set on the module by the searx engine loader (see
    # supported_languages_url / _fetch_supported_languages imported above).
    language = match_language(
        params['language'],
        supported_languages,
        language_aliases
    )
    language = language.split('-')[0]
    params['headers']['Accept-Language'] = language
    return params
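# Sketch of the URL built above for the query "searx" (shape follows the URL
# template at the top of this module; the query value is illustrative):
#   https://api.duckduckgo.com/?q=searx&format=json&pretty=0&no_redirect=1&d=1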


def response(resp):
    # pylint: disable=too-many-locals, too-many-branches, too-many-statements
    results = []

    search_res = json.loads(resp.text)

    # search_res.get('Entity') possible values (not exhaustive):
    # * continent / country / department / location / waterfall
    # * actor / musician / artist
    # * book / performing art / film / television / media franchise / concert tour / playwright
    # * prepared food
    # * website / software / os / programming language / file format / software engineer
    # * company

    content = ''
    heading = search_res.get('Heading', '')
    attributes = []
    urls = []
    infobox_id = None
    relatedTopics = []

    # add answer if there is one
    answer = search_res.get('Answer', '')
    if answer:
        # NOTE: ``logger`` is not imported here; the searx engine loader
        # injects a per-engine logger into each engine module at load time.
        logger.debug('AnswerType="%s" Answer="%s"', search_res.get('AnswerType'), answer)
        if search_res.get('AnswerType') not in ['calc', 'ip']:
            results.append({'answer': html_to_text(answer)})
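    # Hedged sample of the fields read above (values illustrative; field names
    # are those this module reads from the Instant Answer payload):
    #   {"AnswerType": "calc", "Answer": "42"}        -> skipped (filtered type)
    #   {"AnswerType": "color_code", "Answer": "..."} -> appended as an answer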

    # add infobox
    if 'Definition' in search_res:
        content = content + search_res.get('Definition', '')

    if 'Abstract' in search_res:
        content = content + search_res.get('Abstract', '')

    # image
    image = search_res.get('Image')
    image = None if image == '' else image
    if image is not None and urlparse(image).netloc == '':
        image = urljoin('https://duckduckgo.com', image)
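    # e.g. a relative Image value such as '/i/ea0d45e5.jpg' (hypothetical)
    # becomes 'https://duckduckgo.com/i/ea0d45e5.jpg' via urljoin() above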

    # urls
    # Official website, Wikipedia page
    for ddg_result in search_res.get('Results', []):
        firstURL = ddg_result.get('FirstURL')
        text = ddg_result.get('Text')
        if firstURL is not None and text is not None:
            urls.append({'title': text, 'url': firstURL})
            results.append({'title': heading, 'url': firstURL})

    # related topics
    for ddg_result in search_res.get('RelatedTopics', []):
        if 'FirstURL' in ddg_result:
            firstURL = ddg_result.get('FirstURL')
            text = ddg_result.get('Text')
            if not is_broken_text(text):
                suggestion = result_to_text(
                    text,
                    ddg_result.get('Result')
                )
                if suggestion != heading and suggestion is not None:
                    results.append({'suggestion': suggestion})
        elif 'Topics' in ddg_result:
            suggestions = []
            relatedTopics.append({
                'name': ddg_result.get('Name', ''),
                'suggestions': suggestions
            })
            for topic_result in ddg_result.get('Topics', []):
                suggestion = result_to_text(
                    topic_result.get('Text'),
                    topic_result.get('Result')
                )
                if suggestion != heading and suggestion is not None:
                    suggestions.append(suggestion)

    # abstract
    abstractURL = search_res.get('AbstractURL', '')
    if abstractURL != '':
        # add as result ? problem: it is always in English
        infobox_id = abstractURL
        urls.append({
            'title': search_res.get('AbstractSource'),
            'url': abstractURL,
            'official': True
        })
        results.append({
            'url': abstractURL,
            'title': heading
        })

    # definition
    definitionURL = search_res.get('DefinitionURL', '')
    if definitionURL != '':
        # add as result ? as answer ? problem: it is always in English
        infobox_id = definitionURL
        urls.append({
            'title': search_res.get('DefinitionSource'),
            'url': definitionURL
        })

    # to merge with wikidata's infobox
    if infobox_id:
        infobox_id = replace_http_by_https(infobox_id)

    # attributes
    # some will be converted to urls
    if 'Infobox' in search_res:
        infobox = search_res.get('Infobox')
        if 'content' in infobox:
            osm_zoom = 17
            coordinates = None
            for info in infobox.get('content'):
                data_type = info.get('data_type')
                data_label = info.get('label')
                data_value = info.get('value')

                # Workaround: ddg may return a double quote
                if data_value == '""':
                    continue

                # Is it an external URL ?
                # * imdb_id / facebook_profile / youtube_channel / youtube_video / twitter_profile
                # * instagram_profile / rotten_tomatoes / spotify_artist_id / itunes_artist_id / soundcloud_id
                # * netflix_id
                external_url = get_external_url(data_type, data_value)
                if external_url is not None:
                    urls.append({
                        'title': data_label,
                        'url': external_url
                    })
                elif data_type in ['instance', 'wiki_maps_trigger', 'google_play_artist_id']:
                    # ignore instance: Wikidata value from "Instance Of" (Qxxxx)
                    # ignore wiki_maps_trigger: reference to a javascript
                    # ignore google_play_artist_id: service shutdown
                    pass
                elif data_type == 'string' and data_label == 'Website':
                    # There is already an URL for the website
                    pass
                elif data_type == 'area':
                    attributes.append({
                        'label': data_label,
                        'value': area_to_str(data_value),
                        'entity': 'P2046'
                    })
                    osm_zoom = area_to_osm_zoom(data_value.get('amount'))
                elif data_type == 'coordinates':
                    if data_value.get('globe') == 'http://www.wikidata.org/entity/Q2':
                        # coordinate on Earth
                        # get the zoom information from the area
                        coordinates = info
                    else:
                        # coordinate NOT on Earth
                        attributes.append({
                            'label': data_label,
                            'value': data_value,
                            'entity': 'P625'
                        })
                elif data_type == 'string':
                    attributes.append({
                        'label': data_label,
                        'value': data_value
                    })

            if coordinates:
                data_label = coordinates.get('label')
                data_value = coordinates.get('value')
                latitude = data_value.get('latitude')
                longitude = data_value.get('longitude')
                url = get_earth_coordinates_url(latitude, longitude, osm_zoom)
                urls.append({
                    'title': 'OpenStreetMap',
                    'url': url,
                    'entity': 'P625'
                })
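    # Hedged sketch: for latitude 48.85, longitude 2.35 and the default zoom,
    # get_earth_coordinates_url() yields an openstreetmap.org link such as
    # 'https://www.openstreetmap.org/?lat=48.85&lon=2.35&zoom=17&layers=M'
    # (the exact URL shape depends on searx.external_urls)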

    if len(heading) > 0:
        # TODO get infobox.meta.value where .label='article_title'  # pylint: disable=fixme
        if image is None and len(attributes) == 0 and len(urls) == 1 and\
           len(relatedTopics) == 0 and len(content) == 0:
            results.append({
                'url': urls[0]['url'],
                'title': heading,
                'content': content
            })
        else:
            results.append({
                'infobox': heading,
                'id': infobox_id,
                'content': content,
                'img_src': image,
                'attributes': attributes,
                'urls': urls,
                'relatedTopics': relatedTopics
            })

    return results


def unit_to_str(unit):
    for prefix in WIKIDATA_PREFIX:
        if unit.startswith(prefix):
            wikidata_entity = unit[len(prefix):]
            return WIKIDATA_UNITS.get(wikidata_entity, unit)
    return unit
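# Hedged example, assuming the Wikidata entity Q11573 ('metre') is present in
# WIKIDATA_UNITS with the symbol 'm' (values illustrative):
#   unit_to_str('http://www.wikidata.org/entity/Q11573')  # -> 'm'
#   unit_to_str('kg')                                     # -> 'kg' (no prefix match)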


def area_to_str(area):
    """parse ``{'unit': 'http://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}``"""
    unit = unit_to_str(area.get('unit'))
    if unit is not None:
        try:
            amount = float(area.get('amount'))
            return '{} {}'.format(amount, unit)
        except ValueError:
            pass
    return '{} {}'.format(area.get('amount', ''), area.get('unit', ''))
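# Hedged example, assuming Q712226 ('square kilometre') maps to 'km²' in
# WIKIDATA_UNITS (input taken from the docstring's sample):
#   area_to_str({'unit': 'http://www.wikidata.org/entity/Q712226', 'amount': '+20.99'})
#   -> '20.99 km²'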