"""
 Wikipedia (Web)

 @website     https://{language}.wikipedia.org
 @provide-api yes

 @using-api   yes
 @results     JSON
 @stable      yes
 @parse       url, infobox
"""

from json import loads

try:  # Python 2
    from urllib import urlencode, quote
except ImportError:  # Python 3 moved these helpers into urllib.parse
    from urllib.parse import urlencode, quote

# search-url
base_url = 'https://{language}.wikipedia.org/'
search_postfix = 'w/api.php?'\
    'action=query'\
    '&format=json'\
    '&{query}'\
    '&prop=extracts|pageimages'\
    '&exintro'\
    '&explaintext'\
    '&pithumbsize=300'\
    '&redirects'

supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'


# set language in base_url
def url_lang(lang):
    """Return the Wikipedia base URL for the given language tag.

    Only the part before the first '-' is used (e.g. 'de-DE' -> 'de').
    'all' and any code missing from ``supported_languages`` fall back to
    English.

    NOTE: ``supported_languages`` is not defined in this file; it has to be
    available as a module global before this function is called.
    """
    lang = lang.split('-')[0]
    if lang == 'all' or lang not in supported_languages:
        lang = 'en'
    return base_url.format(language=lang)


# do search-request
def request(query, params):
    """Fill ``params['url']`` with the API URL for ``query`` and return params.

    For an all-lowercase query the title-cased variant is requested as well
    (joined with '|'), so both spellings are looked up in one call.
    """
    if query.islower():
        query += '|' + query.title()

    params['url'] = url_lang(params['language']) \
        + search_postfix.format(query=urlencode({'titles': query}))

    return params


# get first meaningful paragraph
# this should filter out disambiguation pages and notes above first paragraph
# "magic numbers" were obtained by fine tuning
def extract_first_paragraph(content, title, image):
    """Return the first paragraph of ``content`` that looks like a summary.

    A paragraph is accepted when it is at least 200 characters long, or when
    the title occurs near its beginning and either an image is available or
    the paragraph is at least 150 characters long.  Only the first four
    paragraphs are examined.

    Returns None when nothing qualifies or when ``content`` is empty/None.
    """
    if not content:
        return None

    # loop invariants: lowered title and the window in which it must start
    title_lower = title.lower()
    search_end = len(title) + 35

    # giving up after the fourth rejected paragraph == looking at four only
    for paragraph in content.split('\n')[:4]:
        length = len(paragraph)
        starts_with_title = paragraph.lower().find(title_lower, 0, search_end)
        if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)):
            return paragraph

    return None


# get response from search-request
def response(resp):
    """Turn the API answer into a result list (plain link + infobox).

    ``resp.content`` must hold the JSON body and ``resp.search_params`` the
    request parameters (its 'language' entry is read).  Returns an empty list
    when the answer contains no existing article, when title or extract are
    missing, or when no usable summary paragraph is found.
    """
    search_result = loads(resp.content)

    # an answer without 'query'/'pages' is treated like "nothing found"
    pages = search_result.get('query', {}).get('pages', {})

    # wikipedia article's unique id
    # first valid id is assumed to be the requested article
    page = None
    for article_id, candidate in pages.items():
        if int(article_id) > 0:
            page = candidate
            break

    if page is None:
        return []

    title = page.get('title')
    extract = page.get('extract')
    if not title or not extract:
        return []

    image = page.get('thumbnail')
    if image:
        image = image.get('source')

    summary = extract_first_paragraph(extract, title, image)
    if not summary:
        return []

    # link to wikipedia article
    wikipedia_link = url_lang(resp.search_params['language']) \
        + 'wiki/' + quote(title.replace(' ', '_').encode('utf8'))

    return [
        {'url': wikipedia_link, 'title': title},
        {'infobox': title,
         'id': wikipedia_link,
         'content': summary,
         'img_src': image,
         'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]},
    ]


# get supported languages from their site
def _fetch_supported_languages(resp):
    """Parse the language tables in ``resp.text`` into a dict keyed by code.

    Every row after the first of each table with a "sortable" class is read:
    cell 1 is the English name, cell 2 the name, cell 3 the code and cell 4
    the article count.  Languages with fewer than 100 articles are dropped.
    """
    # only this function needs the HTML parser, so it is imported here
    from lxml.html import fromstring

    supported_languages = {}
    dom = fromstring(resp.text)
    for table in dom.xpath('//table[contains(@class,"sortable")]'):
        # exclude header row
        for row in table.xpath('.//tr')[1:]:
            cells = row.xpath('./td')
            code = cells[3].xpath('./a')[0].text
            name = cells[2].xpath('./a')[0].text
            english_name = cells[1].xpath('./a')[0].text
            articles = int(cells[4].xpath('./a/b')[0].text.replace(',', ''))
            # exclude languages with too few articles
            if articles >= 100:
                supported_languages[code] = {"name": name,
                                             "english_name": english_name,
                                             "articles": articles}

    return supported_languages