# duckduckgo_definitions.py
"""
 DuckDuckGo (definitions)

 - `Instant Answer API`_
 - `DuckDuckGo query`_

 .. _Instant Answer API: https://duckduckgo.com/api
 .. _DuckDuckGo query: https://api.duckduckgo.com/?q=DuckDuckGo&format=json&pretty=1
"""
  8. import json
  9. from lxml import html
  10. from re import compile
  11. import logging
  12. from searx.engines.xpath import extract_text
  13. from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, language_aliases
  14. from searx.url_utils import urlencode
  15. from searx.utils import html_to_text, match_language
  16. logger = logging.getLogger('searx.engines.'+ __name__)
  17. url = 'https://api.duckduckgo.com/'\
  18. + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
  19. http_regex = compile(r'^http:')
  20. def result_to_text(url, text, htmlResult):
  21. # TODO : remove result ending with "Meaning" or "Category"
  22. dom = html.fromstring(htmlResult)
  23. a = dom.xpath('//a')
  24. if len(a) >= 1:
  25. return extract_text(a[0])
  26. else:
  27. return text
  28. def request(query, params):
  29. params['url'] = url.format(query=urlencode({'q': query}))
  30. language = match_language(params['language'], supported_languages, language_aliases)
  31. language = language.split('-')[0]
  32. params['headers']['Accept-Language'] = language
  33. logger.debug("query %s: // headers: %s", params['url'], params['headers'])
  34. return params
def response(resp):
    """Parse the Instant Answer JSON payload into searx result dicts.

    Depending on which fields the API returned, emits: a plain answer,
    title/url results, suggestions, and at most one infobox (which may
    later be merged with wikidata's infobox via ``infobox_id``).
    """
    results = []

    search_res = json.loads(resp.text)

    # accumulators for the infobox assembled at the end
    content = ''
    heading = search_res.get('Heading', '')
    attributes = []
    urls = []
    infobox_id = None
    relatedTopics = []

    # add answer if there is one
    answer = search_res.get('Answer', '')
    if answer:
        # 'calc' answers are skipped here — presumably handled elsewhere;
        # NOTE(review): confirm against the searx answerers
        if search_res.get('AnswerType', '') not in ['calc']:
            results.append({'answer': html_to_text(answer)})

    # add infobox — Definition and Abstract are concatenated (no separator)
    if 'Definition' in search_res:
        content = content + search_res.get('Definition', '')

    if 'Abstract' in search_res:
        content = content + search_res.get('Abstract', '')

    # image ('' from the API means "no image")
    image = search_res.get('Image', '')
    image = None if image == '' else image

    # attributes: label/value pairs shown in the infobox
    if 'Infobox' in search_res:
        infobox = search_res.get('Infobox', None)
        if 'content' in infobox:
            for info in infobox.get('content'):
                attributes.append({'label': info.get('label'),
                                   'value': info.get('value')})

    # urls: each API result becomes an infobox url AND a title/url result
    for ddg_result in search_res.get('Results', []):
        if 'FirstURL' in ddg_result:
            firstURL = ddg_result.get('FirstURL', '')
            text = ddg_result.get('Text', '')
            urls.append({'title': text, 'url': firstURL})
            results.append({'title': heading, 'url': firstURL})

    # related topics: flat entries become suggestions, grouped 'Topics'
    # entries become named suggestion lists attached to the infobox
    for ddg_result in search_res.get('RelatedTopics', []):
        if 'FirstURL' in ddg_result:
            suggestion = result_to_text(ddg_result.get('FirstURL', None),
                                        ddg_result.get('Text', None),
                                        ddg_result.get('Result', None))
            # don't suggest the query's own heading back to the user
            if suggestion != heading:
                results.append({'suggestion': suggestion})
        elif 'Topics' in ddg_result:
            suggestions = []
            # note: 'suggestions' is appended to in-place below
            relatedTopics.append({'name': ddg_result.get('Name', ''),
                                  'suggestions': suggestions})
            for topic_result in ddg_result.get('Topics', []):
                suggestion = result_to_text(topic_result.get('FirstURL', None),
                                            topic_result.get('Text', None),
                                            topic_result.get('Result', None))
                if suggestion != heading:
                    suggestions.append(suggestion)

    # abstract
    abstractURL = search_res.get('AbstractURL', '')
    if abstractURL != '':
        # add as result ? problem always in english
        infobox_id = abstractURL
        urls.append({'title': search_res.get('AbstractSource'),
                     'url': abstractURL})

    # definition (overrides the abstract as infobox id when both exist)
    definitionURL = search_res.get('DefinitionURL', '')
    if definitionURL != '':
        # add as result ? as answer ? problem always in english
        infobox_id = definitionURL
        urls.append({'title': search_res.get('DefinitionSource'),
                     'url': definitionURL})

    # to merge with wikidata's infobox: normalize the id to https
    if infobox_id:
        infobox_id = http_regex.sub('https:', infobox_id)

    # entity
    entity = search_res.get('Entity', None)
    # TODO continent / country / department / location / waterfall /
    #      mountain range :
    #      link to map search, get weather, near by locations
    # TODO musician : link to music search
    # TODO concert tour : ??
    # TODO film / actor / television / media franchise :
    #      links to IMDB / rottentomatoes (or scrap result)
    # TODO music : link tu musicbrainz / last.fm
    # TODO book : ??
    # TODO artist / playwright : ??
    # TODO compagny : ??
    # TODO software / os : ??
    # TODO software engineer : ??
    # TODO prepared food : ??
    # TODO website : ??
    # TODO performing art : ??
    # TODO prepared food : ??
    # TODO programming language : ??
    # TODO file format : ??

    if len(heading) > 0:
        # TODO get infobox.meta.value where .label='article_title'
        # a bare infobox (no image/attributes/topics/content, single url)
        # collapses into an ordinary title/url result instead
        if image is None and len(attributes) == 0 and len(urls) == 1 and\
                len(relatedTopics) == 0 and len(content) == 0:
            results.append({
                'url': urls[0]['url'],
                'title': heading,
                'content': content
            })
        else:
            results.append({
                'infobox': heading,
                'id': infobox_id,
                'entity': entity,
                'content': content,
                'img_src': image,
                'attributes': attributes,
                'urls': urls,
                'relatedTopics': relatedTopics
            })

    return results