wikipedia.py

  1. """
  2. Wikipedia (Web)
  3. @website https://{language}.wikipedia.org
  4. @provide-api yes
  5. @using-api yes
  6. @results JSON
  7. @stable yes
  8. @parse url, infobox
  9. """

from json import loads
from urllib import urlencode, quote

# search-url
base_url = 'https://{language}.wikipedia.org/'
search_postfix = 'w/api.php?'\
    'action=query'\
    '&format=json'\
    '&{query}'\
    '&prop=extracts|pageimages'\
    '&exintro'\
    '&explaintext'\
    '&pithumbsize=300'\
    '&redirects'


# set language in base_url
def url_lang(lang):
    if lang == 'all':
        language = 'en'
    else:
        language = lang.split('_')[0]

    return base_url.format(language=language)
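
# Illustrative examples of what url_lang() produces (these values are
# assumptions for demonstration, not taken from the original source):
#   url_lang('de_DE')  ->  'https://de.wikipedia.org/'
#   url_lang('all')    ->  'https://en.wikipedia.org/'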


# do search-request
def request(query, params):
    if query.islower():
        query += '|' + query.title()

    params['url'] = url_lang(params['language']) \
        + search_postfix.format(query=urlencode({'titles': query}))

    return params
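
# Example of a URL this request builds (illustrative values, assuming the
# language 'en' and the query 'paris'; wrapped here only for readability):
#   https://en.wikipedia.org/w/api.php?action=query&format=json
#       &titles=paris%7CParis&prop=extracts|pageimages&exintro&explaintext
#       &pithumbsize=300&redirects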


# get first meaningful paragraph
# this should filter out disambiguation pages and notes above first paragraph
# "magic numbers" were obtained by fine tuning
def extract_first_paragraph(content, title, image):
    first_paragraph = None

    failed_attempts = 0
    for paragraph in content.split('\n'):

        starts_with_title = paragraph.lower().find(title.lower(), 0, len(title) + 35)
        length = len(paragraph)

        if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)):
            first_paragraph = paragraph
            break

        failed_attempts += 1
        if failed_attempts > 3:
            return None

    return first_paragraph
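
# Illustrative behaviour of the heuristic above (example inputs are
# assumptions, not from the original source): a short disambiguation line
# such as "Paris may refer to:" is rejected, while a paragraph of 200+
# characters, or one that starts with the article title and has 150+
# characters when no thumbnail is available, is accepted. After more than
# 3 rejected paragraphs the function gives up and returns None.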


# get response from search-request
def response(resp):
    results = []

    search_result = loads(resp.content)

    # wikipedia article's unique id
    # first valid id is assumed to be the requested article
    for article_id in search_result['query']['pages']:
        page = search_result['query']['pages'][article_id]
        if int(article_id) > 0:
            break

    if int(article_id) < 0:
        return []

    title = page.get('title')

    image = page.get('thumbnail')
    if image:
        image = image.get('source')

    extract = page.get('extract')

    summary = extract_first_paragraph(extract, title, image)
    if not summary:
        return []

    # link to wikipedia article
    wikipedia_link = url_lang(resp.search_params['language']) \
        + 'wiki/' + quote(title.replace(' ', '_').encode('utf8'))

    results.append({'url': wikipedia_link, 'title': title})

    results.append({'infobox': title,
                    'id': wikipedia_link,
                    'content': summary,
                    'img_src': image,
                    'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})

    return results
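

# A minimal smoke-test sketch showing how request() and response() fit
# together. This is not part of the searx engine interface: FakeResponse and
# the sample payload below are illustrative assumptions, and the block runs
# under Python 2 to match the urllib import above.
if __name__ == '__main__':
    class FakeResponse(object):
        def __init__(self, content, search_params):
            self.content = content
            self.search_params = search_params

    # build a request URL for a sample query
    params = request('earth', {'language': 'en_US'})
    print(params['url'])

    # feed a canned API payload through response()
    sample = ('{"query": {"pages": {"9228": {"title": "Earth", "extract": '
              '"Earth is the third planet from the Sun and the only '
              'astronomical object known to harbor life. About 29 percent '
              'of its surface is land, and the remaining 71 percent is '
              'covered with water."}}}}')
    for result in response(FakeResponse(sample, {'language': 'en_US'})):
        print(result)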