
[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fixes an issue in wikipedia: SearXNG's locales zh-TW and zh-HK are now
using language `zh-classical` from wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Markus Heiser · 2 years ago
commit 858aa3e604
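
For orientation, a hand-reduced sketch of what the regenerated ``traits_v1`` data
provides to the engines below.  The dictionaries are stand-ins for the generated
searx/data/engine_traits.json (values taken from the commit message and the
``fetch_traits()`` docstring), not the real ``EngineTraits`` class::

    # Hand-reduced stand-in for the wikipedia entry in engine_traits.json
    # (data_type: traits_v1); the real file is generated and much larger.
    languages = {
        "en": "en",
        "zh": "zh",
        "zh_Hans": "zh",            # alias inserted by wikipedia.fetch_traits()
        "zh_Hant": "zh-classical",  # why zh-TW / zh-HK now use zh-classical
    }
    wiki_netloc = {
        "en": "en.wikipedia.org",
        "zh": "zh.wikipedia.org",
        "zh-classical": "zh-classical.wikipedia.org",
    }

    # EngineTraits.get_language() resolves a SearXNG locale such as 'zh-TW' to a
    # script-based tag ('zh_Hant') before this lookup (see searx.locales).
    engine_lang = languages.get("zh_Hant", "en")        # -> 'zh-classical'
    print(engine_lang, "->", wiki_netloc[engine_lang])  # -> zh-classical.wikipedia.org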

+ 27 - 0
docs/src/searx.engines.wikipedia.rst

@@ -0,0 +1,27 @@
+.. _wikimedia engines:
+
+=========
+Wikimedia
+=========
+
+.. contents:: Contents
+   :depth: 2
+   :local:
+   :backlinks: entry
+
+
+.. _wikipedia engine:
+
+Wikipedia
+=========
+
+.. automodule:: searx.engines.wikipedia
+  :members:
+
+.. _wikidata engine:
+
+Wikidata
+=========
+
+.. automodule:: searx.engines.wikidata
+  :members:

+ 24 - 7
searx/autocomplete.py

@@ -143,14 +143,31 @@ def qwant(query, sxng_locale):
     return results
 
 
-def wikipedia(query, lang):
-    # wikipedia autocompleter
-    url = 'https://' + lang + '.wikipedia.org/w/api.php?action=opensearch&{0}&limit=10&namespace=0&format=json'
+def wikipedia(query, sxng_locale):
+    """Autocomplete from Wikipedia. Supports Wikipedia's languages (aka netloc)."""
+    results = []
+    eng_traits = engines['wikipedia'].traits
+    wiki_lang = eng_traits.get_language(sxng_locale, 'en')
+    wiki_netloc = eng_traits.custom['wiki_netloc'].get(wiki_lang, 'en.wikipedia.org')
+
+    url = 'https://{wiki_netloc}/w/api.php?{args}'
+    args = urlencode(
+        {
+            'action': 'opensearch',
+            'format': 'json',
+            'formatversion': '2',
+            'search': query,
+            'namespace': '0',
+            'limit': '10',
+        }
+    )
+    resp = get(url.format(args=args, wiki_netloc=wiki_netloc))
+    if resp.ok:
+        data = resp.json()
+        if len(data) > 1:
+            results = data[1]
 
-    resp = loads(get(url.format(urlencode(dict(search=query)))).text)
-    if len(resp) > 1:
-        return resp[1]
-    return []
+    return results
 
 
 def yandex(query, _lang):
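
To see what the rewritten autocompleter sends, here is a standalone sketch of the
same opensearch request using only the standard library.  The engine itself goes
through searx.network.get and resolves the netloc from traits.custom['wiki_netloc'];
the hard-coded zh-classical.wikipedia.org below is only for illustration::

    import json
    import urllib.parse
    import urllib.request

    wiki_netloc = 'zh-classical.wikipedia.org'  # stand-in for the traits lookup
    args = urllib.parse.urlencode({
        'action': 'opensearch',
        'format': 'json',
        'formatversion': '2',
        'search': '日',
        'namespace': '0',
        'limit': '10',
    })
    url = 'https://{wiki_netloc}/w/api.php?{args}'.format(wiki_netloc=wiki_netloc, args=args)

    with urllib.request.urlopen(url, timeout=10) as resp:
        data = json.load(resp)

    # the opensearch response is a list; index 1 holds the completion terms
    print(data[1] if len(data) > 1 else [])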

+ 174 - 2695
searx/data/engine_traits.json
File diff suppressed because it is too large


+ 36 - 15
searx/engines/wikidata.py

@@ -1,9 +1,12 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # lint: pylint
-"""Wikidata
+"""This module implements the Wikidata engine.  Some implementations are shared
+from :ref:`wikipedia engine`.
+
 """
 """
 # pylint: disable=missing-class-docstring
 # pylint: disable=missing-class-docstring
 
 
+from typing import TYPE_CHECKING
 from hashlib import md5
 from urllib.parse import urlencode, unquote
 from json import loads
@@ -13,13 +16,17 @@ from babel.dates import format_datetime, format_date, format_time, get_datetime_
 
 from searx.data import WIKIDATA_UNITS
 from searx.network import post, get
-from searx.utils import match_language, searx_useragent, get_string_replaces_function
+from searx.utils import searx_useragent, get_string_replaces_function
 from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
-from searx.engines.wikipedia import (  # pylint: disable=unused-import
-    fetch_traits,
-    _fetch_supported_languages,
-    supported_languages_url,
-)
+from searx.engines.wikipedia import fetch_traits as _fetch_traits
+from searx.enginelib.traits import EngineTraits
+
+if TYPE_CHECKING:
+    import logging
+
+    logger: logging.Logger
+
+traits: EngineTraits
 
 # about
 about = {
@@ -155,33 +162,35 @@ def send_wikidata_query(query, method='GET'):
 
 
 def request(query, params):
-    language = params['language'].split('-')[0]
-    if language == 'all':
-        language = 'en'
-    else:
-        language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
+
+    # wikidata does not support zh-classical (zh_Hans) / zh-TW, zh-HK and zh-CN
+    # mapped to zh
+    sxng_lang = params['searxng_locale'].split('-')[0]
+    language = traits.get_language(sxng_lang, 'en')
 
 
     query, attributes = get_query(query, language)
     query, attributes = get_query(query, language)
+    logger.debug("request --> language %s // len(attributes): %s", language, len(attributes))
 
 
     params['method'] = 'POST'
     params['method'] = 'POST'
     params['url'] = SPARQL_ENDPOINT_URL
     params['url'] = SPARQL_ENDPOINT_URL
     params['data'] = {'query': query}
     params['data'] = {'query': query}
     params['headers'] = get_headers()
     params['headers'] = get_headers()
-
     params['language'] = language
     params['attributes'] = attributes
+
     return params
 
 
 def response(resp):
+
     results = []
     jsonresponse = loads(resp.content.decode())
 
-    language = resp.search_params['language'].lower()
+    language = resp.search_params['language']
     attributes = resp.search_params['attributes']
+    logger.debug("request --> language %s // len(attributes): %s", language, len(attributes))
 
 
     seen_entities = set()
     seen_entities = set()
-
     for result in jsonresponse.get('results', {}).get('bindings', []):
         attribute_result = {key: value['value'] for key, value in result.items()}
         entity_url = attribute_result['item']
@@ -757,3 +766,15 @@ def init(engine_settings=None):  # pylint: disable=unused-argument
         lang = result['name']['xml:lang']
         entity_id = result['item']['value'].replace('http://www.wikidata.org/entity/', '')
         WIKIDATA_PROPERTIES[(entity_id, lang)] = name.capitalize()
+
+
+def fetch_traits(engine_traits: EngineTraits):
+    """Use languages evaluated from :py:obj:`wikipedia.fetch_traits
+    <searx.engines.wikipedia.fetch_traits>` except zh-classical (zh_Hans), which
+    is not supported by wikidata."""
+
+    _fetch_traits(engine_traits)
+    # wikidata does not support zh-classical (zh_Hans)
+    engine_traits.languages.pop('zh_Hans')
+    # wikidata does not have net-locations for the languages
+    engine_traits.custom['wiki_netloc'] = {}
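
For context, request() above POSTs a SPARQL query to Wikidata.  A minimal standalone
sketch of such a request, assuming the public endpoint https://query.wikidata.org/sparql
(the engine's SPARQL_ENDPOINT_URL constant is not part of this diff) and a trivial
hand-written query instead of the output of get_query()/get_headers()::

    import json
    import urllib.parse
    import urllib.request

    endpoint = 'https://query.wikidata.org/sparql'   # assumed public endpoint
    sparql = 'SELECT ?item WHERE { ?item wdt:P31 wd:Q146 } LIMIT 3'

    req = urllib.request.Request(
        endpoint,
        data=urllib.parse.urlencode({'query': sparql}).encode(),  # POST body
        headers={
            'Accept': 'application/sparql-results+json',
            'User-Agent': 'sparql-sketch/0.1',
        },
    )
    with urllib.request.urlopen(req, timeout=30) as resp:
        bindings = json.load(resp)['results']['bindings']

    # same shape response() iterates over: bindings of {key: {'value': ...}}
    for row in bindings:
        print({key: value['value'] for key, value in row.items()})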

+ 96 - 154
searx/engines/wikipedia.py

@@ -1,16 +1,26 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""
- Wikipedia (Web)
+# lint: pylint
+"""This module implements the Wikipedia engine.  Some of this implementations
+are shared by other engines:
+
+- :ref:`wikidata engine`
+
+The list of supported languages is fetched from the article linked by
+:py:obj:`wikipedia_article_depth`.  Unlike traditional search engines, there is
+not one Wikipedia for all languages; instead there is a separate Wikipedia for
+every language (:py:obj:`fetch_traits`).
 """
 """
 
 
-from urllib.parse import quote
-from json import loads
+import urllib.parse
+import babel
+
 from lxml import html
-from searx.utils import match_language, searx_useragent
+
 from searx import network
+from searx.locales import language_tag
 from searx.enginelib.traits import EngineTraits
 
-engine_traits: EngineTraits
+traits: EngineTraits
 
 # about
 about = {
@@ -22,32 +32,40 @@ about = {
     "results": 'JSON',
     "results": 'JSON',
 }
 }
 
 
-
 send_accept_language_header = True
 
-# search-url
-search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
-supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
-language_variants = {"zh": ("zh-cn", "zh-hk", "zh-mo", "zh-my", "zh-sg", "zh-tw")}
+wikipedia_article_depth = 'https://meta.wikimedia.org/wiki/Wikipedia_article_depth'
+"""The *editing depth* of Wikipedia is one of several possible rough indicators
+of the encyclopedia's collaborative quality, showing how frequently its articles
+are updated.  The measurement of depth was introduced after some limitations of
+the classic measurement of article count were realized.
+"""
+
+# example: https://zh-classical.wikipedia.org/api/rest_v1/page/summary/日
+rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}'
+"""`wikipedia rest_v1 summary API`_: The summary response includes an extract of
+the first paragraph of the page in plain text and HTML as well as the type of
+page. This is useful for page previews (fka. Hovercards, aka. Popups) on the web
+and link previews in the apps.
 
+.. _wikipedia rest_v1 summary API: https://en.wikipedia.org/api/rest_v1/#/Page%20content/get_page_summary__title_
 
-# set language in base_url
-def url_lang(lang):
-    lang_pre = lang.split('-')[0]
-    if lang_pre == 'all' or lang_pre not in supported_languages and lang_pre not in language_aliases:
-        return 'en'
-    return match_language(lang, supported_languages, language_aliases).split('-')[0]
+"""
 
 
 
 
-# do search-request
 def request(query, params):
+    """Assemble a request (`wikipedia rest_v1 summary API`_)."""
     if query.islower():
         query = query.title()
 
-    language = url_lang(params['language'])
-    params['url'] = search_url.format(title=quote(query), language=language)
+    engine_language = traits.get_language(params['searxng_locale'], 'en')
+    wiki_netloc = traits.custom['wiki_netloc'].get(engine_language, 'en.wikipedia.org')
+    title = urllib.parse.quote(query)
+
+    # '!wikipedia 日 :zh-TW' --> https://zh-classical.wikipedia.org/
+    # '!wikipedia 日 :zh' --> https://zh.wikipedia.org/
+    params['url'] = rest_v1_summary_url.format(wiki_netloc=wiki_netloc, title=title)
 
-    params['headers']['User-Agent'] = searx_useragent()
     params['raise_for_httperror'] = False
     params['soft_max_redirects'] = 2
 
@@ -56,13 +74,14 @@ def request(query, params):
 
 # get response from search-request
 def response(resp):
+
+    results = []
     if resp.status_code == 404:
         return []
-
     if resp.status_code == 400:
         try:
-            api_result = loads(resp.text)
-        except:
+            api_result = resp.json()
+        except Exception:  # pylint: disable=broad-except
             pass
         else:
             if (
@@ -73,52 +92,25 @@ def response(resp):
 
     network.raise_for_httperror(resp)
 
-    results = []
-    api_result = loads(resp.text)
-
-    # skip disambiguation pages
-    if api_result.get('type') != 'standard':
-        return []
-
+    api_result = resp.json()
     title = api_result['title']
     wikipedia_link = api_result['content_urls']['desktop']['page']
-
-    results.append({'url': wikipedia_link, 'title': title})
-
-    results.append(
-        {
-            'infobox': title,
-            'id': wikipedia_link,
-            'content': api_result.get('extract', ''),
-            'img_src': api_result.get('thumbnail', {}).get('source'),
-            'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}],
-        }
-    )
+    results.append({'url': wikipedia_link, 'title': title, 'content': api_result.get('description', '')})
+
+    if api_result.get('type') == 'standard':
+        results.append(
+            {
+                'infobox': title,
+                'id': wikipedia_link,
+                'content': api_result.get('extract', ''),
+                'img_src': api_result.get('thumbnail', {}).get('source'),
+                'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}],
+            }
+        )
 
     return results
 
 
-# get supported languages from their site
-def _fetch_supported_languages(resp):
-    supported_languages = {}
-    dom = html.fromstring(resp.text)
-    tables = dom.xpath('//table[contains(@class,"sortable")]')
-    for table in tables:
-        # exclude header row
-        trs = table.xpath('.//tr')[1:]
-        for tr in trs:
-            td = tr.xpath('./td')
-            code = td[3].xpath('./a')[0].text
-            name = td[1].xpath('./a')[0].text
-            english_name = td[1].xpath('./a')[0].text
-            articles = int(td[4].xpath('./a')[0].text.replace(',', ''))
-            # exclude languages with too few articles
-            if articles >= 100:
-                supported_languages[code] = {"name": name, "english_name": english_name}
-
-    return supported_languages
-
-
 # Nonstandard language codes
 #
 # These Wikipedias use language codes that do not conform to the ISO 639
@@ -135,104 +127,57 @@ lang_map = {
     'nrm': 'nrf',
     'roa-rup': 'rup',
     'nds-nl': 'nds',
-    #'roa-tara: – invented code used for the Tarantino Wikipedia (again, roa is the standard code for the large family of Romance languages that the Tarantino dialect falls within)
     #'simple: – invented code used for the Simple English Wikipedia (not the official IETF code en-simple)
-    'zh-classical': 'zh_Hant',
     'zh-min-nan': 'nan',
     'zh-yue': 'yue',
     'an': 'arg',
+    'zh-classical': 'zh-Hant',  # babel maps classical to zh-Hans (for whatever reason)
 }
 
 unknown_langs = [
-    'ab',  # Abkhazian
-    'alt',  # Southern Altai
     'an',  # Aragonese
-    'ang',  # Anglo-Saxon
-    'arc',  # Aramaic
-    'ary',  # Moroccan Arabic
-    'av',  # Avar
     'ba',  # Bashkir
     'ba',  # Bashkir
     'bar',  # Bavarian
     'bar',  # Bavarian
     'bcl',  # Central Bicolano
-    'bi',  # Bislama
-    'bjn',  # Banjar
-    'blk',  # Pa'O
-    'bpy',  # Bishnupriya Manipuri
-    'bxr',  # Buryat
-    'cbk-zam',  # Zamboanga Chavacano
-    'co',  # Corsican
-    'cu',  # Old Church Slavonic
-    'dty',  # Doteli
-    'dv',  # Divehi
-    'ext',  # Extremaduran
-    'fj',  # Fijian
-    'frp',  # Franco-Provençal
-    'gan',  # Gan
-    'gom',  # Goan Konkani
+    'be-tarask',  # Belarusian variant / Belarusian is already covered by 'be'
+    'bpy',  # Bishnupriya Manipuri is unknown by babel
     'hif',  # Fiji Hindi
     'ilo',  # Ilokano
-    'inh',  # Ingush
-    'jbo',  # Lojban
-    'kaa',  # Karakalpak
-    'kbd',  # Kabardian Circassian
-    'kg',  # Kongo
-    'koi',  # Komi-Permyak
-    'krc',  # Karachay-Balkar
-    'kv',  # Komi
-    'lad',  # Ladino
-    'lbe',  # Lak
-    'lez',  # Lezgian
     'li',  # Limburgish
-    'ltg',  # Latgalian
-    'mdf',  # Moksha
-    'mnw',  # Mon
-    'mwl',  # Mirandese
-    'myv',  # Erzya
-    'na',  # Nauruan
-    'nah',  # Nahuatl
-    'nov',  # Novial
-    'nrm',  # Norman
-    'pag',  # Pangasinan
-    'pam',  # Kapampangan
-    'pap',  # Papiamentu
-    'pdc',  # Pennsylvania German
-    'pfl',  # Palatinate German
-    'roa-rup',  # Aromanian
-    'sco',  # Scots
-    'sco',  # Scots (https://sco.wikipedia.org) is not known by babel, Scottish Gaelic (https://gd.wikipedia.org) is known by babel
+    'sco',  # Scots (sco) is not known by babel, Scottish Gaelic (gd) is known by babel
     'sh',  # Serbo-Croatian
     'simple',  # Simple English is not known to babel as a natural language distinct from English
-    'sm',  # Samoan
-    'srn',  # Sranan
-    'stq',  # Saterland Frisian
-    'szy',  # Sakizaya
-    'tcy',  # Tulu
-    'tet',  # Tetum
-    'tpi',  # Tok Pisin
-    'trv',  # Seediq
-    'ty',  # Tahitian
-    'tyv',  # Tuvan
-    'udm',  # Udmurt
-    'vep',  # Vepsian
-    'vls',  # West Flemish
     'vo',  # Volapük
     'vo',  # Volapük
     'wa',  # Walloon
 ]
 ]
 
 
 def fetch_traits(engine_traits: EngineTraits):
-    """Fetch languages from Wikipedia"""
-    # pylint: disable=import-outside-toplevel
+    """Fetch languages from Wikipedia.
+
+    The Wikipedia address (netloc) of each language is mapped in a
+    :py:obj:`custom field <searx.enginelib.traits.EngineTraits.custom>`
+    (``wiki_netloc``).  Here is a reduced example:
+
+    .. code:: python
 
-    engine_traits.data_type = 'supported_languages'  # deprecated
+       traits.custom['wiki_netloc'] = {
+           "en": "en.wikipedia.org",
+           ..
+           "gsw": "als.wikipedia.org",
+           ..
+           "zh": "zh.wikipedia.org",
+           "zh-classical": "zh-classical.wikipedia.org"
+       }
 
-    import babel
-    from searx.locales import language_tag
+    """
 
 
-    resp = network.get('https://meta.wikimedia.org/wiki/List_of_Wikipedias')
+    engine_traits.custom['wiki_netloc'] = {}
+
+    # insert alias to map from a region like zh-CN to a language zh_Hans
+    engine_traits.languages['zh_Hans'] = 'zh'
+
+    resp = network.get(wikipedia_article_depth)
     if not resp.ok:
         print("ERROR: response from Wikipedia is not OK.")
 
@@ -242,34 +187,31 @@ def fetch_traits(engine_traits: EngineTraits):
         cols = row.xpath('./td')
         if not cols:
             continue
-
         cols = [c.text_content().strip() for c in cols]
-        articles = int(cols[4].replace(',', '').replace('-', '0'))
-        users = int(cols[8].replace(',', '').replace('-', '0'))
-        depth = cols[11].strip('-')
 
-        if articles < 1000:
+        depth = float(cols[3].replace('-', '0').replace(',', ''))
+        articles = int(cols[4].replace(',', ''))
+
+        if articles < 10000:
             # exclude languages with too few articles
             continue
 
-        # depth: rough indicator of a Wikipedia’s quality, showing how
-        #        frequently its articles are updated.
-        if depth == '':
-            if users < 1000:
-                # depth is not calculated --> at least 1000 user should registered
-                continue
-        elif int(depth) < 20:
+        if int(depth) < 20:
+            # Rough indicator of a Wikipedia’s quality, showing how frequently
+            # its articles are updated.
             continue
 
-        eng_tag = cols[3]
+        eng_tag = cols[2]
+        wiki_url = row.xpath('./td[3]/a/@href')[0]
+        wiki_url = urllib.parse.urlparse(wiki_url)
 
         if eng_tag in unknown_langs:
             continue
 
         try:
-            sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag)))
+            sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag), sep='-'))
         except babel.UnknownLocaleError:
-            print("ERROR: %s -> %s is unknown by babel" % (cols[1], eng_tag))
+            print("ERROR: %s [%s] is unknown by babel" % (cols[0], eng_tag))
             continue
 
         conflict = engine_traits.languages.get(sxng_tag)
@@ -277,6 +219,6 @@ def fetch_traits(engine_traits: EngineTraits):
             if conflict != eng_tag:
                 print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
             continue
-        engine_traits.languages[sxng_tag] = eng_tag
 
-    engine_traits.languages['zh_Hans'] = 'zh'
+        engine_traits.languages[sxng_tag] = eng_tag
+        engine_traits.custom['wiki_netloc'][eng_tag] = wiki_url.netloc
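
As a quick check of the request/response shape the engine now relies on, a
stdlib-only sketch that fetches the rest_v1 summary from the example comment
above.  The netloc and title are hard-coded here; the engine resolves them from
the traits and the query::

    import json
    import urllib.parse
    import urllib.request

    wiki_netloc = 'zh-classical.wikipedia.org'   # stand-in for the traits lookup
    title = urllib.parse.quote('日')
    url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}'.format(
        wiki_netloc=wiki_netloc, title=title
    )

    req = urllib.request.Request(url, headers={'User-Agent': 'rest-v1-sketch/0.1'})
    with urllib.request.urlopen(req, timeout=10) as resp:
        api_result = json.load(resp)

    # fields read by response() above
    print(api_result['title'])
    print(api_result.get('description', ''))
    print(api_result['content_urls']['desktop']['page'])
    if api_result.get('type') == 'standard':
        print(api_result.get('extract', ''))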

Some files were not shown because too many files changed in this diff