							- """
 
-  Wikipedia (Web)
 
-  @website     https://{language}.wikipedia.org
 
-  @provide-api yes
 
-  @using-api   yes
 
-  @results     JSON
 
-  @stable      yes
 
-  @parse       url, infobox
 
- """
 
from json import loads
from lxml.html import fromstring
from searx.url_utils import quote, urlencode
from searx.utils import match_language
 
# search-url
base_url = u'https://{language}.wikipedia.org/'

# MediaWiki action API query:
#   prop=extracts|pageimages  return the article extract and a page thumbnail
#   exintro / explaintext     only the lead section, as plain text instead of HTML
#   pithumbsize=300           thumbnail scaled to at most 300px wide
#   redirects                 resolve redirects to the target article
search_url = base_url + u'w/api.php?'\
    'action=query'\
    '&format=json'\
    '&{query}'\
    '&prop=extracts|pageimages'\
    '&exintro'\
    '&explaintext'\
    '&pithumbsize=300'\
    '&redirects'
 
supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
 
# set the language in base_url
def url_lang(lang):
    # map the locale selected in searx (e.g. 'en-US') to the language code of a
    # supported Wikipedia edition (e.g. 'en'); supported_languages is set on this
    # module by searx at load time, originally via _fetch_supported_languages()
    return match_language(lang, supported_languages).split('-')[0]
 
# do search-request
def request(query, params):
    # Wikipedia titles are case-sensitive except for the first letter, so for an
    # all-lowercase query also ask for the title-cased variant in the same call
    # (the API accepts several titles separated by '|', e.g. "paris|Paris")
    if query.islower():
        query = u'{0}|{1}'.format(query.decode('utf-8'), query.decode('utf-8').title()).encode('utf-8')

    params['url'] = search_url.format(query=urlencode({'titles': query}),
                                      language=url_lang(params['language']))

    return params
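
# Illustrative example (not part of the original engine): for the query b'paris'
# with the UI language 'en-US', request() ends up with a URL along the lines of
#   https://en.wikipedia.org/w/api.php?action=query&format=json
#       &titles=paris%7CParis&prop=extracts|pageimages
#       &exintro&explaintext&pithumbsize=300&redirects
# urlencode() percent-encodes the '|' between the two title variants as %7C.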
 
# get the first meaningful paragraph
# this should filter out disambiguation pages and notes above the first paragraph
# ("magic numbers" were obtained by fine-tuning)
def extract_first_paragraph(content, title, image):
    first_paragraph = None

    failed_attempts = 0
    for paragraph in content.split('\n'):

        # does the paragraph mention the title within its first characters?
        starts_with_title = paragraph.lower().find(title.lower(), 0, len(title) + 35)
        length = len(paragraph)

        # accept long paragraphs, or shorter ones that mention the title early
        # and either come with a thumbnail or are at least 150 characters long
        if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)):
            first_paragraph = paragraph
            break

        # give up after a few candidates; the page is probably not a real article
        failed_attempts += 1
        if failed_attempts > 3:
            return None

    return first_paragraph
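
# Illustrative behaviour (assumption, not taken from the original code): for an
# ordinary article the lead paragraph is usually well over 200 characters and is
# returned on the first iteration; on a disambiguation page every line is a short
# "... may refer to:" entry, so the loop gives up after four candidates and the
# infobox is built without a summary.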
 
# get response from search-request
def response(resp):
    results = []

    search_result = loads(resp.text)

    # wikipedia article's unique id
    # the first valid (positive) id is assumed to be the requested article;
    # a negative id means the title does not exist
    for article_id in search_result['query']['pages']:
        page = search_result['query']['pages'][article_id]
        if int(article_id) > 0:
            break

    if int(article_id) < 0:
        return []

    title = page.get('title')

    image = page.get('thumbnail')
    if image:
        image = image.get('source')

    extract = page.get('extract')

    summary = extract_first_paragraph(extract, title, image)

    # link to the wikipedia article
    wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \
        + 'wiki/' + quote(title.replace(' ', '_').encode('utf8'))

    # a plain link result plus an infobox result with the summary and thumbnail
    results.append({'url': wikipedia_link, 'title': title})

    results.append({'infobox': title,
                    'id': wikipedia_link,
                    'content': summary,
                    'img_src': image,
                    'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})

    return results
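
# Illustrative result shape (assumption, values abridged): for the query 'Paris'
# on the English Wikipedia, response() returns roughly
#   [{'url': 'https://en.wikipedia.org/wiki/Paris', 'title': 'Paris'},
#    {'infobox': 'Paris',
#     'id': 'https://en.wikipedia.org/wiki/Paris',
#     'content': 'Paris is the capital ...',
#     'img_src': 'https://upload.wikimedia.org/...',
#     'urls': [{'title': 'Wikipedia', 'url': 'https://en.wikipedia.org/wiki/Paris'}]}]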
 
# get the list of supported languages by scraping the "List of Wikipedias" page
def _fetch_supported_languages(resp):
    supported_languages = {}
    dom = fromstring(resp.text)
    tables = dom.xpath('//table[contains(@class,"sortable")]')
    for table in tables:
        # exclude the header row
        trs = table.xpath('.//tr')[1:]
        for tr in trs:
            td = tr.xpath('./td')
            # cells used below: td[1] English name, td[2] local name,
            # td[3] language code, td[4] article count
            code = td[3].xpath('./a')[0].text
            name = td[2].xpath('./a')[0].text
            english_name = td[1].xpath('./a')[0].text
            articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
            # exclude languages with too few articles
            if articles >= 100:
                supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles}

    return supported_languages
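
# Illustrative entry (assumption; the real article counts change constantly):
#   supported_languages['en'] == {"name": "English", "english_name": "English",
#                                 "articles": 5000000}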
 
 