wikipedia.py

  1. """
  2. Wikipedia (Web)
  3. @website https://{language}.wikipedia.org
  4. @provide-api yes
  5. @using-api yes
  6. @results JSON
  7. @stable yes
  8. @parse url, infobox
  9. """
from json import loads
from urllib.parse import urlencode, quote
from requests import get
from lxml.html import fromstring

# search-url
base_url = 'https://{language}.wikipedia.org/'

# MediaWiki query API: fetch the intro extract as plain text plus a 300px
# page thumbnail, following redirects
search_postfix = 'w/api.php?'\
    'action=query'\
    '&format=json'\
    '&{query}'\
    '&prop=extracts|pageimages'\
    '&exintro'\
    '&explaintext'\
    '&pithumbsize=300'\
    '&redirects'

supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'

# language code -> metadata; populated at startup by the engine loader
# calling fetch_supported_languages() below (url_lang() reads it)
supported_languages = {}
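
# For illustration (assumed query 'paris', language 'en'), the final request
# URL built by request() below would look roughly like:
#   https://en.wikipedia.org/w/api.php?action=query&format=json
#       &titles=paris%7CParis&prop=extracts|pageimages&exintro
#       &explaintext&pithumbsize=300&redirects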

# set language in base_url
def url_lang(lang):
    lang = lang.split('-')[0]
    if lang == 'all' or lang not in supported_languages:
        language = 'en'
    else:
        language = lang

    return base_url.format(language=language)
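
# Usage sketch (assumed values; 'fr' resolves only if it is present in
# supported_languages, otherwise the function falls back to English):
#   url_lang('fr-FR')  # -> 'https://fr.wikipedia.org/'
#   url_lang('all')    # -> 'https://en.wikipedia.org/'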

# do search-request
def request(query, params):
    # an all-lowercase query is also searched in its title-cased form,
    # so e.g. 'paris' still matches the article 'Paris'
    if query.islower():
        query += '|' + query.title()

    params['url'] = url_lang(params['language']) \
        + search_postfix.format(query=urlencode({'titles': query}))

    return params
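
# Minimal call sketch (assumed params shape; only 'language' is read here):
#   params = request('paris', {'language': 'en-US'})
#   params['url']  # the fully formatted API URL shown above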

# get first meaningful paragraph
# this should filter out disambiguation pages and notes above first paragraph
# "magic numbers" were obtained by fine tuning
def extract_first_paragraph(content, title, image):
    first_paragraph = None

    failed_attempts = 0
    for paragraph in content.split('\n'):

        # does the paragraph mention the title near its start?
        starts_with_title = paragraph.lower().find(title.lower(), 0, len(title) + 35)
        length = len(paragraph)

        # accept long paragraphs, or shorter ones that open with the title
        if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)):
            first_paragraph = paragraph
            break

        failed_attempts += 1
        if failed_attempts > 3:
            return None

    return first_paragraph
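
# Worked example (invented input, for illustration only):
#   content = 'Paris (disambiguation)\n' + 'Paris is the capital of France. ' * 8
#   extract_first_paragraph(content, 'Paris', image=None)
# The short first line opens with the title but stays under 150 characters, so
# it is skipped; the second paragraph exceeds 200 characters and is returned.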

# get response from search-request
def response(resp):
    results = []

    search_result = loads(resp.content)

    # wikipedia article's unique id
    # first valid id is assumed to be the requested article
    for article_id in search_result['query']['pages']:
        page = search_result['query']['pages'][article_id]
        if int(article_id) > 0:
            break

    # negative page ids mark missing articles -> no result
    if int(article_id) < 0:
        return []

    title = page.get('title')

    image = page.get('thumbnail')
    if image:
        image = image.get('source')

    extract = page.get('extract')

    summary = extract_first_paragraph(extract, title, image)
    if not summary:
        return []

    # link to wikipedia article
    wikipedia_link = url_lang(resp.search_params['language']) \
        + 'wiki/' + quote(title.replace(' ', '_').encode('utf8'))

    results.append({'url': wikipedia_link, 'title': title})

    results.append({'infobox': title,
                    'id': wikipedia_link,
                    'content': summary,
                    'img_src': image,
                    'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})

    return results
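
# Hand-driven harness sketch (assumed setup; in searx the framework attaches
# search_params to the response object itself before calling response()):
#   import requests
#   params = request('paris', {'language': 'en'})
#   resp = requests.get(params['url'])
#   resp.search_params = {'language': 'en'}  # mimic the framework's bookkeeping
#   response(resp)
#   # -> [{'url': ..., 'title': 'Paris'}, {'infobox': 'Paris', ...}]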

# get supported languages from their site
def fetch_supported_languages():
    languages = {}
    resp = get(supported_languages_url)
    dom = fromstring(resp.text)
    tables = dom.xpath('//table[contains(@class,"sortable")]')
    for table in tables:
        # exclude header row
        trs = table.xpath('.//tr')[1:]
        for tr in trs:
            td = tr.xpath('./td')
            code = td[3].xpath('./a')[0].text
            name = td[2].xpath('./a')[0].text
            english_name = td[1].xpath('./a')[0].text
            articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
            # keep only wikis with a substantial article count
            if articles >= 10000:
                languages[code] = {"name": name, "english_name": english_name, "articles": articles}

    return languages
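
# Note: the scraper is tied to the current column layout of the Meta page's
# sortable wikitables (td[1]..td[4]); if that table is reordered, the indices
# above must be re-checked.  Typical use:
#   supported_languages = fetch_supported_languages()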