# duckduckgo_definitions.py
  1. """
  2. DuckDuckGo (definitions)
  3. - `Instant Answer API`_
  4. - `DuckDuckGo query`_
  5. .. _Instant Answer API: https://duckduckgo.com/api
  6. .. _DuckDuckGo query: https://api.duckduckgo.com/?q=DuckDuckGo&format=json&pretty=1
  7. """
  8. import json
  9. from urllib.parse import urlencode
  10. from lxml import html
  11. from re import compile
  12. from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, language_aliases
  13. from searx.utils import extract_text, html_to_text, match_language
  14. url = 'https://api.duckduckgo.com/'\
  15. + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
  16. http_regex = compile(r'^http:')
  17. def result_to_text(url, text, htmlResult):
  18. # TODO : remove result ending with "Meaning" or "Category"
  19. dom = html.fromstring(htmlResult)
  20. a = dom.xpath('//a')
  21. if len(a) >= 1:
  22. return extract_text(a[0])
  23. else:
  24. return text
  25. def request(query, params):
  26. params['url'] = url.format(query=urlencode({'q': query}))
  27. language = match_language(params['language'], supported_languages, language_aliases)
  28. language = language.split('-')[0]
  29. params['headers']['Accept-Language'] = language
  30. return params
  31. def response(resp):
  32. results = []
  33. search_res = json.loads(resp.text)
  34. content = ''
  35. heading = search_res.get('Heading', '')
  36. attributes = []
  37. urls = []
  38. infobox_id = None
  39. relatedTopics = []
  40. # add answer if there is one
  41. answer = search_res.get('Answer', '')
  42. if answer:
  43. if search_res.get('AnswerType', '') not in ['calc']:
  44. results.append({'answer': html_to_text(answer)})
  45. # add infobox
  46. if 'Definition' in search_res:
  47. content = content + search_res.get('Definition', '')
  48. if 'Abstract' in search_res:
  49. content = content + search_res.get('Abstract', '')
  50. # image
  51. image = search_res.get('Image', '')
  52. image = None if image == '' else image
  53. # attributes
  54. if 'Infobox' in search_res:
  55. infobox = search_res.get('Infobox', None)
  56. if 'content' in infobox:
  57. for info in infobox.get('content'):
  58. attributes.append({'label': info.get('label'),
  59. 'value': info.get('value')})
  60. # urls
  61. for ddg_result in search_res.get('Results', []):
  62. if 'FirstURL' in ddg_result:
  63. firstURL = ddg_result.get('FirstURL', '')
  64. text = ddg_result.get('Text', '')
  65. urls.append({'title': text, 'url': firstURL})
  66. results.append({'title': heading, 'url': firstURL})
  67. # related topics
  68. for ddg_result in search_res.get('RelatedTopics', []):
  69. if 'FirstURL' in ddg_result:
  70. suggestion = result_to_text(ddg_result.get('FirstURL', None),
  71. ddg_result.get('Text', None),
  72. ddg_result.get('Result', None))
  73. if suggestion != heading:
  74. results.append({'suggestion': suggestion})
  75. elif 'Topics' in ddg_result:
  76. suggestions = []
  77. relatedTopics.append({'name': ddg_result.get('Name', ''),
  78. 'suggestions': suggestions})
  79. for topic_result in ddg_result.get('Topics', []):
  80. suggestion = result_to_text(topic_result.get('FirstURL', None),
  81. topic_result.get('Text', None),
  82. topic_result.get('Result', None))
  83. if suggestion != heading:
  84. suggestions.append(suggestion)
  85. # abstract
  86. abstractURL = search_res.get('AbstractURL', '')
  87. if abstractURL != '':
  88. # add as result ? problem always in english
  89. infobox_id = abstractURL
  90. urls.append({'title': search_res.get('AbstractSource'),
  91. 'url': abstractURL})
  92. # definition
  93. definitionURL = search_res.get('DefinitionURL', '')
  94. if definitionURL != '':
  95. # add as result ? as answer ? problem always in english
  96. infobox_id = definitionURL
  97. urls.append({'title': search_res.get('DefinitionSource'),
  98. 'url': definitionURL})
  99. # to merge with wikidata's infobox
  100. if infobox_id:
  101. infobox_id = http_regex.sub('https:', infobox_id)
  102. # entity
  103. entity = search_res.get('Entity', None)
  104. # TODO continent / country / department / location / waterfall /
  105. # mountain range :
  106. # link to map search, get weather, near by locations
  107. # TODO musician : link to music search
  108. # TODO concert tour : ??
  109. # TODO film / actor / television / media franchise :
  110. # links to IMDB / rottentomatoes (or scrap result)
  111. # TODO music : link tu musicbrainz / last.fm
  112. # TODO book : ??
  113. # TODO artist / playwright : ??
  114. # TODO compagny : ??
  115. # TODO software / os : ??
  116. # TODO software engineer : ??
  117. # TODO prepared food : ??
  118. # TODO website : ??
  119. # TODO performing art : ??
  120. # TODO prepared food : ??
  121. # TODO programming language : ??
  122. # TODO file format : ??
  123. if len(heading) > 0:
  124. # TODO get infobox.meta.value where .label='article_title'
  125. if image is None and len(attributes) == 0 and len(urls) == 1 and\
  126. len(relatedTopics) == 0 and len(content) == 0:
  127. results.append({
  128. 'url': urls[0]['url'],
  129. 'title': heading,
  130. 'content': content
  131. })
  132. else:
  133. results.append({
  134. 'infobox': heading,
  135. 'id': infobox_id,
  136. 'entity': entity,
  137. 'content': content,
  138. 'img_src': image,
  139. 'attributes': attributes,
  140. 'urls': urls,
  141. 'relatedTopics': relatedTopics
  142. })
  143. return results