@@ -5,9 +5,12 @@
 from urllib.parse import quote
 from json import loads
-from lxml.html import fromstring
+from lxml import html
 from searx.utils import match_language, searx_useragent
-from searx.network import raise_for_httperror
+from searx import network
+from searx.enginelib.traits import EngineTraits
+
+engine_traits: EngineTraits
 
 # about
 about = {
@@ -68,7 +71,7 @@ def response(resp):
             ):
                 return []
 
-    raise_for_httperror(resp)
+    network.raise_for_httperror(resp)
 
     results = []
     api_result = loads(resp.text)
@@ -98,7 +101,7 @@ def response(resp):
 # get supported languages from their site
 def _fetch_supported_languages(resp):
     supported_languages = {}
-    dom = fromstring(resp.text)
+    dom = html.fromstring(resp.text)
     tables = dom.xpath('//table[contains(@class,"sortable")]')
     for table in tables:
         # exclude header row
@@ -114,3 +117,166 @@ def _fetch_supported_languages(resp):
                 supported_languages[code] = {"name": name, "english_name": english_name}
 
     return supported_languages
+
+
+# Nonstandard language codes
+#
+# These Wikipedias use language codes that do not conform to the ISO 639
+# standard (which is how wiki subdomains are chosen nowadays).
+
+lang_map = {
+    'be-tarask': 'bel',
+    'ak': 'aka',
+    'als': 'gsw',
+    'bat-smg': 'sgs',
+    'cbk-zam': 'cbk',
+    'fiu-vro': 'vro',
+    'map-bms': 'map',
+    'nrm': 'nrf',
+    'roa-rup': 'rup',
+    'nds-nl': 'nds',
+    # 'roa-tara': invented code used for the Tarantino Wikipedia (again, roa is the standard code for the large family of Romance languages that the Tarantino dialect falls within)
+    # 'simple': invented code used for the Simple English Wikipedia (not the official IETF code en-simple)
+    'zh-classical': 'zh_Hant',
+    'zh-min-nan': 'nan',
+    'zh-yue': 'yue',
+    'an': 'arg',
+}
+
+unknown_langs = [
+    'ab',  # Abkhazian
+    'alt',  # Southern Altai
+    'an',  # Aragonese
+    'ang',  # Anglo-Saxon
+    'arc',  # Aramaic
+    'ary',  # Moroccan Arabic
+    'av',  # Avar
+    'ba',  # Bashkir
+    'be-tarask',
+    'bar',  # Bavarian
+    'bcl',  # Central Bicolano
+    'bh',  # Bhojpuri
+    'bi',  # Bislama
+    'bjn',  # Banjar
+    'blk',  # Pa'O
+    'bpy',  # Bishnupriya Manipuri
+    'bxr',  # Buryat
+    'cbk-zam',  # Zamboanga Chavacano
+    'co',  # Corsican
+    'cu',  # Old Church Slavonic
+    'dty',  # Doteli
+    'dv',  # Divehi
+    'ext',  # Extremaduran
+    'fj',  # Fijian
+    'frp',  # Franco-Provençal
+    'gan',  # Gan
+    'gom',  # Goan Konkani
+    'hif',  # Fiji Hindi
+    'ilo',  # Ilokano
+    'inh',  # Ingush
+    'jbo',  # Lojban
+    'kaa',  # Karakalpak
+    'kbd',  # Kabardian Circassian
+    'kg',  # Kongo
+    'koi',  # Komi-Permyak
+    'krc',  # Karachay-Balkar
+    'kv',  # Komi
+    'lad',  # Ladino
+    'lbe',  # Lak
+    'lez',  # Lezgian
+    'li',  # Limburgish
+    'ltg',  # Latgalian
+    'mdf',  # Moksha
+    'mnw',  # Mon
+    'mwl',  # Mirandese
+    'myv',  # Erzya
+    'na',  # Nauruan
+    'nah',  # Nahuatl
+    'nov',  # Novial
+    'nrm',  # Norman
+    'pag',  # Pangasinan
+    'pam',  # Kapampangan
+    'pap',  # Papiamentu
+    'pdc',  # Pennsylvania German
+    'pfl',  # Palatinate German
+    'roa-rup',  # Aromanian
+    'sco',  # Scots
+    'sco',  # Scots (https://sco.wikipedia.org) is not known by babel, Scottish Gaelic (https://gd.wikipedia.org) is known by babel
+    'sh',  # Serbo-Croatian
+    'simple',  # Simple English is not known to babel as a natural language distinct from English
+    'sm',  # Samoan
+    'srn',  # Sranan
+    'stq',  # Saterland Frisian
+    'szy',  # Sakizaya
+    'tcy',  # Tulu
+    'tet',  # Tetum
+    'tpi',  # Tok Pisin
+    'trv',  # Seediq
+    'ty',  # Tahitian
+    'tyv',  # Tuvan
+    'udm',  # Udmurt
+    'vep',  # Vepsian
+    'vls',  # West Flemish
+    'vo',  # Volapük
+    'wa',  # Walloon
+    'xal',  # Kalmyk
+]
+
+
+def fetch_traits(engine_traits: EngineTraits):
+    """Fetch languages from Wikipedia"""
+    # pylint: disable=import-outside-toplevel
+
+    engine_traits.data_type = 'supported_languages'  # deprecated
+
+    import babel
+    from searx.locales import language_tag
+
+    resp = network.get('https://meta.wikimedia.org/wiki/List_of_Wikipedias')
+    if not resp.ok:
+        print("ERROR: response from Wikipedia is not OK.")
+
+    dom = html.fromstring(resp.text)
+    for row in dom.xpath('//table[contains(@class,"sortable")]//tbody/tr'):
+
+        cols = row.xpath('./td')
+        if not cols:
+            continue
+
+        cols = [c.text_content().strip() for c in cols]
+        articles = int(cols[4].replace(',', '').replace('-', '0'))
+        users = int(cols[8].replace(',', '').replace('-', '0'))
+        depth = cols[11].strip('-')
+
+        if articles < 1000:
+            # exclude languages with too few articles
+            continue
+
+        # depth: rough indicator of a Wikipedia’s quality, showing how
+        #        frequently its articles are updated.
+        if depth == '':
+            if users < 1000:
+                # depth is not calculated --> require at least 1000 registered users
+                continue
+        elif int(depth) < 20:
+            continue
+
+        eng_tag = cols[3]
+
+        if eng_tag in unknown_langs:
+            continue
+
+        try:
+            sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag)))
+        except babel.UnknownLocaleError:
+            print("ERROR: %s -> %s is unknown by babel" % (cols[1], eng_tag))
+            continue
+
+        conflict = engine_traits.languages.get(sxng_tag)
+        if conflict:
+            if conflict != eng_tag:
+                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
+            continue
+        engine_traits.languages[sxng_tag] = eng_tag
+
+    engine_traits.languages['zh_Hans'] = 'zh'
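
A minimal sketch of how the new fetch_traits() entry point could be exercised by hand. It assumes the hunks above land in searx/engines/wikipedia.py and that EngineTraits() can be constructed standalone with empty languages/regions dicts; in practice the traits data is typically regenerated through SearXNG's update tooling rather than by calling the engine module directly.

    # hedged sketch, not part of the diff above
    from searx.enginelib.traits import EngineTraits
    from searx.engines import wikipedia  # assumed module path for this engine

    traits = EngineTraits()
    wikipedia.fetch_traits(traits)          # fetches and parses List_of_Wikipedias
    print(len(traits.languages))            # number of accepted Wikipedia editions
    print(traits.languages.get('zh_Hans'))  # 'zh', set explicitly at the end of fetch_traits()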