wikipedia.py

  1. """
  2. Wikipedia (Web)
  3. @website https://{language}.wikipedia.org
  4. @provide-api yes
  5. @using-api yes
  6. @results JSON
  7. @stable yes
  8. @parse url, infobox
  9. """

from json import loads
from urllib import urlencode, quote
from lxml.html import fromstring

# search-url
base_url = 'https://{language}.wikipedia.org/'
search_postfix = 'w/api.php?'\
    'action=query'\
    '&format=json'\
    '&{query}'\
    '&prop=extracts|pageimages'\
    '&exintro'\
    '&explaintext'\
    '&pithumbsize=300'\
    '&redirects'

supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'


# set language in base_url
def url_lang(lang):
    lang = lang.split('-')[0]
    if lang == 'all' or lang not in supported_languages:
        language = 'en'
    else:
        language = lang

    return base_url.format(language=language)
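

# Note: supported_languages is not defined in this file; it is expected to be
# set on the module by the engine framework before url_lang() is called.
# Illustrative behaviour (assuming 'de' is listed in supported_languages):
#   url_lang('de-DE')  ->  'https://de.wikipedia.org/'
#   url_lang('all')    ->  'https://en.wikipedia.org/'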


# do search-request
def request(query, params):
    if query.islower():
        query += '|' + query.title()

    params['url'] = url_lang(params['language']) \
        + search_postfix.format(query=urlencode({'titles': query}))

    return params
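

# Illustrative request URL for the query 'paris' on the English Wikipedia
# (urlencode percent-encodes the titles value, so '|' becomes '%7C'):
#   https://en.wikipedia.org/w/api.php?action=query&format=json
#       &titles=paris%7CParis&prop=extracts|pageimages&exintro&explaintext
#       &pithumbsize=300&redirects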


# get first meaningful paragraph
# this should filter out disambiguation pages and notes above first paragraph
# "magic numbers" were obtained by fine tuning
def extract_first_paragraph(content, title, image):
    first_paragraph = None

    failed_attempts = 0
    for paragraph in content.split('\n'):

        starts_with_title = paragraph.lower().find(title.lower(), 0, len(title) + 35)
        length = len(paragraph)

        if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)):
            first_paragraph = paragraph
            break

        failed_attempts += 1
        if failed_attempts > 3:
            return None

    return first_paragraph
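

# Behaviour sketch (hypothetical input): for title='Paris' with a thumbnail, a
# paragraph like 'Paris is the capital and most populous city of France ...' is
# accepted because it mentions the title near its start and is long enough,
# while short hatnotes and disambiguation lines above it are skipped; None is
# returned once four candidate paragraphs in a row have been rejected.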


# get response from search-request
def response(resp):
    results = []

    search_result = loads(resp.content)

    # wikipedia article's unique id
    # first valid id is assumed to be the requested article
    for article_id in search_result['query']['pages']:
        page = search_result['query']['pages'][article_id]
        if int(article_id) > 0:
            break

    if int(article_id) < 0:
        return []

    title = page.get('title')

    image = page.get('thumbnail')
    if image:
        image = image.get('source')

    extract = page.get('extract')

    summary = extract_first_paragraph(extract, title, image)
    if not summary:
        return []

    # link to wikipedia article
    wikipedia_link = url_lang(resp.search_params['language']) \
        + 'wiki/' + quote(title.replace(' ', '_').encode('utf8'))

    results.append({'url': wikipedia_link, 'title': title})

    results.append({'infobox': title,
                    'id': wikipedia_link,
                    'content': summary,
                    'img_src': image,
                    'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})

    return results
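

# Illustrative return value for a hit on 'Paris' (shape only; actual values
# come from the API response): a plain link result followed by an infobox:
#   [{'url': 'https://en.wikipedia.org/wiki/Paris', 'title': 'Paris'},
#    {'infobox': 'Paris', 'id': 'https://en.wikipedia.org/wiki/Paris',
#     'content': '<first paragraph>', 'img_src': '<thumbnail url>',
#     'urls': [{'title': 'Wikipedia', 'url': 'https://en.wikipedia.org/wiki/Paris'}]}]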


# get supported languages from their site
def _fetch_supported_languages(resp):
    supported_languages = {}
    dom = fromstring(resp.text)
    tables = dom.xpath('//table[contains(@class,"sortable")]')
    for table in tables:
        # exclude header row
        trs = table.xpath('.//tr')[1:]
        for tr in trs:
            td = tr.xpath('./td')
            code = td[3].xpath('./a')[0].text
            name = td[2].xpath('./a')[0].text
            english_name = td[1].xpath('./a')[0].text
            articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
            # exclude languages with too few articles
            if articles >= 100:
                supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles}

    return supported_languages
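

# Resulting shape (illustrative; the article count is a placeholder, not real data):
#   {'en': {'name': 'English', 'english_name': 'English', 'articles': 1000000}, ...}
# The td indices above assume the column order of the List_of_Wikipedias table;
# if that table's layout changes, they have to be updated.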