# duckduckgo_definitions.py
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. # lint: pylint
  3. # pylint: disable=missing-function-docstring
  4. """DuckDuckGo (Instant Answer API)
  5. """
  6. import json
  7. from urllib.parse import urlencode, urlparse, urljoin
  8. from lxml import html
  9. from searx import logger
  10. from searx.data import WIKIDATA_UNITS
  11. from searx.engines.duckduckgo import language_aliases
  12. from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url # NOQA # pylint: disable=unused-import
  13. from searx.utils import extract_text, html_to_text, match_language, get_string_replaces_function
  14. from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
  15. logger = logger.getChild('duckduckgo_definitions')
# about — engine metadata shown by searx (standard engine `about` dict)
about = {
    "website": 'https://duckduckgo.com/',
    "wikidata_id": 'Q12805',
    "official_api_documentation": 'https://duckduckgo.com/api',
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

# Instant Answer API endpoint template; {query} is filled by request().
# no_redirect=1 asks the API not to redirect !bang queries.
# NOTE(review): the meaning of d=1 is not documented here — confirm against
# the DuckDuckGo API documentation before changing it.
URL = 'https://api.duckduckgo.com/' \
    + '?{query}&format=json&pretty=0&no_redirect=1&d=1'

# URL prefixes identifying a Wikidata entity reference; stripped by
# unit_to_str() to obtain the bare Qxxxx identifier.
WIKIDATA_PREFIX = [
    'http://www.wikidata.org/entity/',
    'https://www.wikidata.org/entity/'
]

# Normalizes http:// ids to https:// so the infobox id can be merged with
# wikidata's infobox (see response()).
replace_http_by_https = get_string_replaces_function({'http:': 'https:'})
  32. def is_broken_text(text):
  33. """ duckduckgo may return something like "<a href="xxxx">http://somewhere Related website<a/>"
  34. The href URL is broken, the "Related website" may contains some HTML.
  35. The best solution seems to ignore these results.
  36. """
  37. return text.startswith('http') and ' ' in text
  38. def result_to_text(text, htmlResult):
  39. # TODO : remove result ending with "Meaning" or "Category" # pylint: disable=fixme
  40. result = None
  41. dom = html.fromstring(htmlResult)
  42. a = dom.xpath('//a')
  43. if len(a) >= 1:
  44. result = extract_text(a[0])
  45. else:
  46. result = text
  47. if not is_broken_text(result):
  48. return result
  49. return None
  50. def request(query, params):
  51. params['url'] = URL.format(query=urlencode({'q': query}))
  52. language = match_language(
  53. params['language'],
  54. supported_languages, # pylint: disable=undefined-variable
  55. language_aliases
  56. )
  57. language = language.split('-')[0]
  58. params['headers']['Accept-Language'] = language
  59. return params
def response(resp):
    """Parse the Instant Answer JSON into searx results.

    Produces (depending on what the API returned): an 'answer' result, plain
    link results, 'suggestion' results, and/or one 'infobox' result merging
    image, attributes, urls and related topics.
    """
    # pylint: disable=too-many-locals, too-many-branches, too-many-statements
    results = []

    search_res = json.loads(resp.text)

    # search_res.get('Entity') possible values (not exhaustive) :
    # * continent / country / department / location / waterfall
    # * actor / musician / artist
    # * book / performing art / film / television / media franchise / concert tour / playwright
    # * prepared food
    # * website / software / os / programming language / file format / software engineer
    # * compagny
    content = ''
    heading = search_res.get('Heading', '')
    attributes = []
    urls = []
    infobox_id = None
    relatedTopics = []

    # add answer if there is one
    answer = search_res.get('Answer', '')
    if answer:
        logger.debug('AnswerType="%s" Answer="%s"', search_res.get('AnswerType'), answer)
        # 'calc' and 'ip' answers are skipped — presumably handled by other
        # searx answerers; confirm before changing this list.
        if search_res.get('AnswerType') not in ['calc', 'ip']:
            results.append({'answer': html_to_text(answer)})

    # add infobox — Definition and Abstract are concatenated into one content
    if 'Definition' in search_res:
        content = content + search_res.get('Definition', '')

    if 'Abstract' in search_res:
        content = content + search_res.get('Abstract', '')

    # image: empty string means no image; a relative path is resolved
    # against duckduckgo.com
    image = search_res.get('Image')
    image = None if image == '' else image
    if image is not None and urlparse(image).netloc == '':
        image = urljoin('https://duckduckgo.com', image)

    # urls
    # Official website, Wikipedia page
    for ddg_result in search_res.get('Results', []):
        firstURL = ddg_result.get('FirstURL')
        text = ddg_result.get('Text')
        if firstURL is not None and text is not None:
            urls.append({'title': text, 'url': firstURL})
            results.append({'title': heading, 'url': firstURL})

    # related topics: either a flat entry ('FirstURL') or a named group
    # of sub-topics ('Topics')
    for ddg_result in search_res.get('RelatedTopics', []):
        if 'FirstURL' in ddg_result:
            firstURL = ddg_result.get('FirstURL')
            text = ddg_result.get('Text')
            if not is_broken_text(text):
                # no suggestion when the value equals the heading
                suggestion = result_to_text(
                    text,
                    ddg_result.get('Result')
                )
                if suggestion != heading and suggestion is not None:
                    results.append({'suggestion': suggestion})
        elif 'Topics' in ddg_result:
            suggestions = []
            # the list is appended first and filled in place below
            relatedTopics.append({
                'name': ddg_result.get('Name', ''),
                'suggestions': suggestions
            })
            for topic_result in ddg_result.get('Topics', []):
                suggestion = result_to_text(
                    topic_result.get('Text'),
                    topic_result.get('Result')
                )
                if suggestion != heading and suggestion is not None:
                    suggestions.append(suggestion)

    # abstract
    abstractURL = search_res.get('AbstractURL', '')
    if abstractURL != '':
        # add as result ? problem always in english
        infobox_id = abstractURL
        urls.append({
            'title': search_res.get('AbstractSource'),
            'url': abstractURL,
            'official': True
        })
        results.append({
            'url': abstractURL,
            'title': heading
        })

    # definition
    definitionURL = search_res.get('DefinitionURL', '')
    if definitionURL != '':
        # add as result ? as answer ? problem always in english
        # NOTE: overwrites an infobox_id set from AbstractURL above
        infobox_id = definitionURL
        urls.append({
            'title': search_res.get('DefinitionSource'),
            'url': definitionURL
        })

    # to merge with wikidata's infobox
    if infobox_id:
        infobox_id = replace_http_by_https(infobox_id)

    # attributes
    # some will be converted to urls
    if 'Infobox' in search_res:
        infobox = search_res.get('Infobox')
        if 'content' in infobox:
            osm_zoom = 17  # default zoom; refined from the 'area' attribute
            coordinates = None
            for info in infobox.get('content'):
                data_type = info.get('data_type')
                data_label = info.get('label')
                data_value = info.get('value')

                # Workaround: ddg may return a double quote
                if data_value == '""':
                    continue

                # Is it an external URL ?
                # * imdb_id / facebook_profile / youtube_channel / youtube_video / twitter_profile
                # * instagram_profile / rotten_tomatoes / spotify_artist_id / itunes_artist_id / soundcloud_id
                # * netflix_id
                external_url = get_external_url(data_type, data_value)
                if external_url is not None:
                    urls.append({
                        'title': data_label,
                        'url': external_url
                    })
                elif data_type in ['instance', 'wiki_maps_trigger', 'google_play_artist_id']:
                    # ignore instance: Wikidata value from "Instance Of" (Qxxxx)
                    # ignore wiki_maps_trigger: reference to a javascript
                    # ignore google_play_artist_id: service shutdown
                    pass
                elif data_type == 'string' and data_label == 'Website':
                    # There is already an URL for the website
                    pass
                elif data_type == 'area':
                    # P2046 = Wikidata "area" property
                    attributes.append({
                        'label': data_label,
                        'value': area_to_str(data_value),
                        'entity': 'P2046'
                    })
                    osm_zoom = area_to_osm_zoom(data_value.get('amount'))
                elif data_type == 'coordinates':
                    if data_value.get('globe') == 'http://www.wikidata.org/entity/Q2':
                        # coordinate on Earth (Q2)
                        # get the zoom information from the area
                        coordinates = info
                    else:
                        # coordinate NOT on Earth — keep raw value; P625 =
                        # Wikidata "coordinate location" property
                        attributes.append({
                            'label': data_label,
                            'value': data_value,
                            'entity': 'P625'
                        })
                elif data_type == 'string':
                    attributes.append({
                        'label': data_label,
                        'value': data_value
                    })

            # Earth coordinates become an OpenStreetMap link, using the zoom
            # derived from the area attribute (if any)
            if coordinates:
                data_label = coordinates.get('label')
                data_value = coordinates.get('value')
                latitude = data_value.get('latitude')
                longitude = data_value.get('longitude')
                url = get_earth_coordinates_url(latitude, longitude, osm_zoom)
                urls.append({
                    'title': 'OpenStreetMap',
                    'url': url,
                    'entity': 'P625'
                })

    if len(heading) > 0:
        # TODO get infobox.meta.value where .label='article_title' # pylint: disable=fixme
        # A single URL with no image/attributes/topics/content is too thin
        # for an infobox — emit a plain link result instead.
        if image is None and len(attributes) == 0 and len(urls) == 1 and\
           len(relatedTopics) == 0 and len(content) == 0:
            results.append({
                'url': urls[0]['url'],
                'title': heading,
                'content': content
            })
        else:
            results.append({
                'infobox': heading,
                'id': infobox_id,
                'content': content,
                'img_src': image,
                'attributes': attributes,
                'urls': urls,
                'relatedTopics': relatedTopics
            })

    return results
  239. def unit_to_str(unit):
  240. for prefix in WIKIDATA_PREFIX:
  241. if unit.startswith(prefix):
  242. wikidata_entity = unit[len(prefix):]
  243. return WIKIDATA_UNITS.get(wikidata_entity, unit)
  244. return unit
  245. def area_to_str(area):
  246. """parse {'unit': 'http://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}"""
  247. unit = unit_to_str(area.get('unit'))
  248. if unit is not None:
  249. try:
  250. amount = float(area.get('amount'))
  251. return '{} {}'.format(amount, unit)
  252. except ValueError:
  253. pass
  254. return '{} {}'.format(area.get('amount', ''), area.get('unit', ''))