wikipedia.py

# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Wikipedia (Web)
"""

from urllib.parse import quote
from json import loads
from lxml import html

from searx.utils import match_language, searx_useragent
from searx import network
from searx.enginelib.traits import EngineTraits

engine_traits: EngineTraits

# about
about = {
    "website": 'https://www.wikipedia.org/',
    "wikidata_id": 'Q52',
    "official_api_documentation": 'https://en.wikipedia.org/api/',
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

send_accept_language_header = True

# search-url
search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
language_variants = {"zh": ("zh-cn", "zh-hk", "zh-mo", "zh-my", "zh-sg", "zh-tw")}


# set language in base_url
def url_lang(lang):
    lang_pre = lang.split('-')[0]
    if lang_pre == 'all' or lang_pre not in supported_languages and lang_pre not in language_aliases:
        return 'en'
    return match_language(lang, supported_languages, language_aliases).split('-')[0]
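
# Illustration (not exhaustive): a locale such as 'fr-CA' resolves to the wiki
# subdomain 'fr', while the catch-all value 'all' (or any unsupported code)
# falls back to 'en'. Note that `supported_languages` and `language_aliases`
# are not defined in this file; they are assumed to be set on the module by
# the engine loader at runtime.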


# do search-request
def request(query, params):
    if query.islower():
        query = query.title()

    language = url_lang(params['language'])
    params['url'] = search_url.format(title=quote(query), language=language)

    params['headers']['User-Agent'] = searx_useragent()
    params['raise_for_httperror'] = False
    params['soft_max_redirects'] = 2

    return params
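
# Example (illustrative): for the query "paris" with params['language'] == 'en-US',
# the query is title-cased to "Paris" and params['url'] becomes
# 'https://en.wikipedia.org/api/rest_v1/page/summary/Paris'.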


# get response from search-request
def response(resp):
    if resp.status_code == 404:
        return []

    if resp.status_code == 400:
        try:
            api_result = loads(resp.text)
        except ValueError:
            # body is not JSON --> fall through to raise_for_httperror
            pass
        else:
            if (
                api_result['type'] == 'https://mediawiki.org/wiki/HyperSwitch/errors/bad_request'
                and api_result['detail'] == 'title-invalid-characters'
            ):
                return []

    network.raise_for_httperror(resp)

    results = []
    api_result = loads(resp.text)

    # skip disambiguation pages
    if api_result.get('type') != 'standard':
        return []

    title = api_result['title']
    wikipedia_link = api_result['content_urls']['desktop']['page']
    results.append({'url': wikipedia_link, 'title': title})

    results.append(
        {
            'infobox': title,
            'id': wikipedia_link,
            'content': api_result.get('extract', ''),
            'img_src': api_result.get('thumbnail', {}).get('source'),
            'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}],
        }
    )

    return results
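
# The page-summary endpoint returns JSON roughly shaped like the following
# (trimmed to the fields read above; real payloads carry more keys):
#
#   {
#       "type": "standard",
#       "title": "Paris",
#       "extract": "Paris is the capital and largest city of France. ...",
#       "thumbnail": {"source": "https://upload.wikimedia.org/..."},
#       "content_urls": {"desktop": {"page": "https://en.wikipedia.org/wiki/Paris"}}
#   }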


# get supported languages from their site
def _fetch_supported_languages(resp):
    supported_languages = {}
    dom = html.fromstring(resp.text)
    tables = dom.xpath('//table[contains(@class,"sortable")]')
    for table in tables:
        # exclude header row
        trs = table.xpath('.//tr')[1:]
        for tr in trs:
            td = tr.xpath('./td')
            code = td[3].xpath('./a')[0].text
            name = td[1].xpath('./a')[0].text
            english_name = td[1].xpath('./a')[0].text
            articles = int(td[4].xpath('./a')[0].text.replace(',', ''))
            # exclude languages with too few articles
            if articles >= 100:
                supported_languages[code] = {"name": name, "english_name": english_name}

    return supported_languages
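
# The returned mapping is keyed by wiki subdomain, e.g. (illustrative excerpt):
#   {'en': {'name': 'English', 'english_name': 'English'}, ...}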


# Nonstandard language codes
#
# These Wikipedias use language codes that do not conform to the ISO 639
# standard (which is how wiki subdomains are chosen nowadays).
lang_map = {
    'be-tarask': 'bel',
    'ak': 'aka',
    'als': 'gsw',
    'bat-smg': 'sgs',
    'cbk-zam': 'cbk',
    'fiu-vro': 'vro',
    'map-bms': 'map',
    'nrm': 'nrf',
    'roa-rup': 'rup',
    'nds-nl': 'nds',
    # 'roa-tara': invented code used for the Tarantino Wikipedia (again, roa is
    #             the standard code for the large family of Romance languages
    #             that the Tarantino dialect falls within)
    # 'simple': invented code used for the Simple English Wikipedia (not the
    #           official IETF code en-simple)
    'zh-classical': 'zh_Hant',
    'zh-min-nan': 'nan',
    'zh-yue': 'yue',
    'an': 'arg',
}

unknown_langs = [
    'ab',  # Abkhazian
    'alt',  # Southern Altai
    'an',  # Aragonese
    'ang',  # Anglo-Saxon
    'arc',  # Aramaic
    'ary',  # Moroccan Arabic
    'av',  # Avar
    'ba',  # Bashkir
    'be-tarask',
    'bar',  # Bavarian
    'bcl',  # Central Bicolano
    'bh',  # Bhojpuri
    'bi',  # Bislama
    'bjn',  # Banjar
    'blk',  # Pa'O
    'bpy',  # Bishnupriya Manipuri
    'bxr',  # Buryat
    'cbk-zam',  # Zamboanga Chavacano
    'co',  # Corsican
    'cu',  # Old Church Slavonic
    'dty',  # Doteli
    'dv',  # Divehi
    'ext',  # Extremaduran
    'fj',  # Fijian
    'frp',  # Franco-Provençal
    'gan',  # Gan
    'gom',  # Goan Konkani
    'hif',  # Fiji Hindi
    'ilo',  # Ilokano
    'inh',  # Ingush
    'jbo',  # Lojban
    'kaa',  # Karakalpak
    'kbd',  # Kabardian Circassian
    'kg',  # Kongo
    'koi',  # Komi-Permyak
    'krc',  # Karachay-Balkar
    'kv',  # Komi
    'lad',  # Ladino
    'lbe',  # Lak
    'lez',  # Lezgian
    'li',  # Limburgish
    'ltg',  # Latgalian
    'mdf',  # Moksha
    'mnw',  # Mon
    'mwl',  # Mirandese
    'myv',  # Erzya
    'na',  # Nauruan
    'nah',  # Nahuatl
    'nov',  # Novial
    'nrm',  # Norman
    'pag',  # Pangasinan
    'pam',  # Kapampangan
    'pap',  # Papiamentu
    'pdc',  # Pennsylvania German
    'pfl',  # Palatinate German
    'roa-rup',  # Aromanian
    'sco',  # Scots (https://sco.wikipedia.org) is not known by babel; Scottish
            # Gaelic (https://gd.wikipedia.org) is known by babel
    'sh',  # Serbo-Croatian
    'simple',  # Simple English is not known to babel as a natural language
               # distinct from English
    'sm',  # Samoan
    'srn',  # Sranan
    'stq',  # Saterland Frisian
    'szy',  # Sakizaya
    'tcy',  # Tulu
    'tet',  # Tetum
    'tpi',  # Tok Pisin
    'trv',  # Seediq
    'ty',  # Tahitian
    'tyv',  # Tuvan
    'udm',  # Udmurt
    'vep',  # Vepsian
    'vls',  # West Flemish
    'vo',  # Volapük
    'wa',  # Walloon
    'xal',  # Kalmyk
]


def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages from Wikipedia"""
    # pylint: disable=import-outside-toplevel
    engine_traits.data_type = 'supported_languages'  # deprecated

    import babel
    from searx.locales import language_tag

    resp = network.get('https://meta.wikimedia.org/wiki/List_of_Wikipedias')
    if not resp.ok:
        print("ERROR: response from Wikipedia is not OK.")

    dom = html.fromstring(resp.text)
    for row in dom.xpath('//table[contains(@class,"sortable")]//tbody/tr'):

        cols = row.xpath('./td')
        if not cols:
            continue
        cols = [c.text_content().strip() for c in cols]

        articles = int(cols[4].replace(',', '').replace('-', '0'))
        users = int(cols[8].replace(',', '').replace('-', '0'))
        depth = cols[11].strip('-')

        if articles < 1000:
            # exclude languages with too few articles
            continue

        # depth: rough indicator of a Wikipedia's quality, showing how
        # frequently its articles are updated.
        if depth == '':
            if users < 1000:
                # depth is not calculated --> require at least 1000 registered users
                continue
        elif int(depth) < 20:
            continue

        eng_tag = cols[3]
        if eng_tag in unknown_langs:
            continue

        try:
            sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag)))
        except babel.UnknownLocaleError:
            print("ERROR: %s -> %s is unknown by babel" % (cols[1], eng_tag))
            continue

        conflict = engine_traits.languages.get(sxng_tag)
        if conflict:
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue

        engine_traits.languages[sxng_tag] = eng_tag

    engine_traits.languages['zh_Hans'] = 'zh'
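
# After a successful run, engine_traits.languages maps SearXNG locale tags to
# wiki subdomains, e.g. (illustrative excerpt):
#   {'en': 'en', 'nan': 'zh-min-nan', 'zh_Hans': 'zh', ...}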