  1. """
  2. Wikipedia (Web)
  3. @website https://{language}.wikipedia.org
  4. @provide-api yes
  5. @using-api yes
  6. @results JSON
  7. @stable yes
  8. @parse url, infobox
  9. """

from json import loads
from lxml.html import fromstring
from searx.url_utils import quote, urlencode
from searx.utils import match_language

# search-url
base_url = u'https://{language}.wikipedia.org/'
search_url = base_url + u'w/api.php?'\
    'action=query'\
    '&format=json'\
    '&{query}'\
    '&prop=extracts|pageimages|pageprops'\
    '&ppprop=disambiguation'\
    '&exintro'\
    '&explaintext'\
    '&pithumbsize=300'\
    '&redirects'
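
# API parameters, roughly:
#   prop=extracts|pageimages|pageprops -> plain-text intro, thumbnail, page props
#   ppprop=disambiguation              -> only the disambiguation page prop
#   exintro & explaintext              -> intro section only, as plain text
#   pithumbsize=300                    -> thumbnail size in pixels
#   redirects                          -> resolve redirects to the target page
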
supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
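
# NOTE (assumption): `supported_languages` and `language_aliases` are not defined
# in this module; searx is expected to attach them to the engine at load time,
# built from the data returned by _fetch_supported_languages() below.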


# set language in base_url
def url_lang(lang):
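    # Map a searx locale such as "pt-BR" to the Wikipedia subdomain ("pt");
    # fall back to English for the special "all" value or unsupported languages.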
    lang_pre = lang.split('-')[0]
    if lang_pre == 'all' or lang_pre not in supported_languages and lang_pre not in language_aliases:
        return 'en'
    return match_language(lang, supported_languages, language_aliases).split('-')[0]


# do search-request
def request(query, params):
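    # For an all-lowercase query, also ask for the Title-Cased form
    # ("paris" becomes "paris|Paris") so the API can match the article title.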
    if query.islower():
        query = u'{0}|{1}'.format(query.decode('utf-8'), query.decode('utf-8').title()).encode('utf-8')

    params['url'] = search_url.format(query=urlencode({'titles': query}),
                                      language=url_lang(params['language']))

    return params


# get response from search-request
def response(resp):
    results = []

    search_result = loads(resp.text)

    # wikipedia article's unique id
    # first valid id is assumed to be the requested article
    if 'pages' not in search_result['query']:
        return results

    for article_id in search_result['query']['pages']:
        page = search_result['query']['pages'][article_id]
        if int(article_id) > 0:
            break
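
    # the MediaWiki API reports missing titles with negative page ids,
    # so skip those, as well as disambiguation pages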
    if int(article_id) < 0 or 'disambiguation' in page.get('pageprops', {}):
        return []

    title = page.get('title')

    image = page.get('thumbnail')
    if image:
        image = image.get('source')

    summary = page.get('extract', '').split('\n')[0].replace('()', '')

    # link to wikipedia article
    wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \
        + 'wiki/' + quote(title.replace(' ', '_').encode('utf8'))
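
    # one plain link result plus one infobox entry (title, summary, thumbnail, link)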
    results.append({'url': wikipedia_link, 'title': title})

    results.append({'infobox': title,
                    'id': wikipedia_link,
                    'content': summary,
                    'img_src': image,
                    'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})

    return results


# get supported languages from their site
def _fetch_supported_languages(resp):
    supported_languages = {}
    dom = fromstring(resp.text)
    tables = dom.xpath('//table[contains(@class,"sortable")]')
    for table in tables:
        # exclude header row
        trs = table.xpath('.//tr')[1:]
        for tr in trs:
            td = tr.xpath('./td')
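            # column layout assumed for the List_of_Wikipedias table:
            # td[1] English name, td[2] local name, td[3] language code, td[4] article count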
            code = td[3].xpath('./a')[0].text
            name = td[2].xpath('./a')[0].text
            english_name = td[1].xpath('./a')[0].text
            articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
            # exclude languages with too few articles
            if articles >= 100:
                supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles}

    return supported_languages