duckduckgo_definitions.py 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130
  1. import json
  2. from urllib import urlencode
  3. from lxml import html
  4. from searx.engines.xpath import extract_text
  5. url = 'https://api.duckduckgo.com/?{query}&format=json&pretty=0&no_redirect=1&d=1'
  6. def result_to_text(url, text, htmlResult):
  7. # TODO : remove result ending with "Meaning" or "Category"
  8. dom = html.fromstring(htmlResult)
  9. a = dom.xpath('//a')
  10. if len(a)>=1:
  11. return extract_text(a[0])
  12. else:
  13. return text
  14. def html_to_text(htmlFragment):
  15. dom = html.fromstring(htmlFragment)
  16. return extract_text(dom)
  17. def request(query, params):
  18. # TODO add kl={locale}
  19. params['url'] = url.format(query=urlencode({'q': query}))
  20. return params
  21. def response(resp):
  22. search_res = json.loads(resp.text)
  23. results = []
  24. content = ''
  25. heading = search_res.get('Heading', '')
  26. attributes = []
  27. urls = []
  28. infobox_id = None
  29. relatedTopics = []
  30. # add answer if there is one
  31. answer = search_res.get('Answer', '')
  32. if answer != '':
  33. results.append({ 'answer' : html_to_text(answer) })
  34. # add infobox
  35. if 'Definition' in search_res:
  36. content = content + search_res.get('Definition', '')
  37. if 'Abstract' in search_res:
  38. content = content + search_res.get('Abstract', '')
  39. # image
  40. image = search_res.get('Image', '')
  41. image = None if image == '' else image
  42. # attributes
  43. if 'Infobox' in search_res:
  44. infobox = search_res.get('Infobox', None)
  45. if 'content' in infobox:
  46. for info in infobox.get('content'):
  47. attributes.append({'label': info.get('label'), 'value': info.get('value')})
  48. # urls
  49. for ddg_result in search_res.get('Results', []):
  50. if 'FirstURL' in ddg_result:
  51. firstURL = ddg_result.get('FirstURL', '')
  52. text = ddg_result.get('Text', '')
  53. urls.append({'title':text, 'url':firstURL})
  54. results.append({'title':heading, 'url': firstURL})
  55. # related topics
  56. for ddg_result in search_res.get('RelatedTopics', None):
  57. if 'FirstURL' in ddg_result:
  58. suggestion = result_to_text(ddg_result.get('FirstURL', None), ddg_result.get('Text', None), ddg_result.get('Result', None))
  59. if suggestion != heading:
  60. results.append({'suggestion': suggestion})
  61. elif 'Topics' in ddg_result:
  62. suggestions = []
  63. relatedTopics.append({ 'name' : ddg_result.get('Name', ''), 'suggestions': suggestions })
  64. for topic_result in ddg_result.get('Topics', []):
  65. suggestion = result_to_text(topic_result.get('FirstURL', None), topic_result.get('Text', None), topic_result.get('Result', None))
  66. if suggestion != heading:
  67. suggestions.append(suggestion)
  68. # abstract
  69. abstractURL = search_res.get('AbstractURL', '')
  70. if abstractURL != '':
  71. # add as result ? problem always in english
  72. infobox_id = abstractURL
  73. urls.append({'title': search_res.get('AbstractSource'), 'url': abstractURL})
  74. # definition
  75. definitionURL = search_res.get('DefinitionURL', '')
  76. if definitionURL != '':
  77. # add as result ? as answer ? problem always in english
  78. infobox_id = definitionURL
  79. urls.append({'title': search_res.get('DefinitionSource'), 'url': definitionURL})
  80. # entity
  81. entity = search_res.get('Entity', None)
  82. # TODO continent / country / department / location / waterfall / mountain range : link to map search, get weather, near by locations
  83. # TODO musician : link to music search
  84. # TODO concert tour : ??
  85. # TODO film / actor / television / media franchise : links to IMDB / rottentomatoes (or scrap result)
  86. # TODO music : link tu musicbrainz / last.fm
  87. # TODO book : ??
  88. # TODO artist / playwright : ??
  89. # TODO compagny : ??
  90. # TODO software / os : ??
  91. # TODO software engineer : ??
  92. # TODO prepared food : ??
  93. # TODO website : ??
  94. # TODO performing art : ??
  95. # TODO prepared food : ??
  96. # TODO programming language : ??
  97. # TODO file format : ??
  98. if len(heading)>0:
  99. # TODO get infobox.meta.value where .label='article_title'
  100. results.append({
  101. 'infobox': heading,
  102. 'id': infobox_id,
  103. 'entity': entity,
  104. 'content': content,
  105. 'img_src' : image,
  106. 'attributes': attributes,
  107. 'urls': urls,
  108. 'relatedTopics': relatedTopics
  109. })
  110. return results