# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This module implements the Wikipedia engine.  Some of these implementations
are shared by other engines:

- :ref:`wikidata engine`

The list of supported languages is :py:obj:`fetched <fetch_wikimedia_traits>` from
the article linked by :py:obj:`list_of_wikipedias`.

Unlike traditional search engines, there is not one Wikipedia for all
languages, but one Wikipedia for each supported language.  Some of these
Wikipedias have a LanguageConverter_ enabled
(:py:obj:`rest_v1_summary_url`).

A LanguageConverter_ (LC) is a system based on language variants that
automatically converts the content of a page into a different variant.  A
variant is mostly the same language in a different script.

- `Wikipedias in multiple writing systems`_
- `Automatic conversion between traditional and simplified Chinese characters`_

PR-2554_:
  The Wikipedia link returned by the API is still the same in all cases
  (`https://zh.wikipedia.org/wiki/出租車`_), but if your browser's
  ``Accept-Language`` is set to any of ``zh``, ``zh-CN``, ``zh-TW``, ``zh-HK``
  or ``zh-SG``, Wikipedia's LC automatically returns the desired script in its
  web page.

  - You can test the API here: https://reqbin.com/gesg2kvx

.. _https://zh.wikipedia.org/wiki/出租車:
   https://zh.wikipedia.org/wiki/%E5%87%BA%E7%A7%9F%E8%BB%8A

To support Wikipedia's LanguageConverter_, a SearXNG request to Wikipedia uses
:py:obj:`get_wiki_params` and :py:obj:`wiki_lc_locale_variants` in the
:py:obj:`fetch_wikimedia_traits` function.

To test in SearXNG, query for ``!wp 出租車`` with each of the available Chinese
options:

- ``!wp 出租車 :zh`` should show 出租車
- ``!wp 出租車 :zh-CN`` should show 出租车
- ``!wp 出租車 :zh-TW`` should show 計程車
- ``!wp 出租車 :zh-HK`` should show 的士
- ``!wp 出租車 :zh-SG`` should show 德士

.. _LanguageConverter:
   https://www.mediawiki.org/wiki/Writing_systems#LanguageConverter
.. _Wikipedias in multiple writing systems:
   https://meta.wikimedia.org/wiki/Wikipedias_in_multiple_writing_systems
.. _Automatic conversion between traditional and simplified Chinese characters:
   https://en.wikipedia.org/wiki/Chinese_Wikipedia#Automatic_conversion_between_traditional_and_simplified_Chinese_characters
.. _PR-2554: https://github.com/searx/searx/pull/2554

"""

import urllib.parse

import babel
from lxml import html

from searx import utils
from searx import network as _network
from searx import locales
from searx.enginelib.traits import EngineTraits

traits: EngineTraits

# about
about = {
    "website": 'https://www.wikipedia.org/',
    "wikidata_id": 'Q52',
    "official_api_documentation": 'https://en.wikipedia.org/api/',
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

display_type = ["infobox"]
"""A list of display types composed from ``infobox`` and ``list``.  The
``list`` type adds a hit to the result list, the ``infobox`` type shows a hit
in the info box.  Either one or both values can be set."""
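
# For example, to show a hit in the result list *and* in the info box (a
# sketch of a module override; the default above shows the info box only):
#
#   display_type = ["infobox", "list"]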

send_accept_language_header = True
"""The HTTP ``Accept-Language`` header is needed for wikis where
LanguageConverter_ is enabled."""

list_of_wikipedias = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
"""`List of all wikipedias <https://meta.wikimedia.org/wiki/List_of_Wikipedias>`_
"""

wikipedia_article_depth = 'https://meta.wikimedia.org/wiki/Wikipedia_article_depth'
"""The *editing depth* of Wikipedia is one of several possible rough indicators
of the encyclopedia's collaborative quality, showing how frequently its articles
are updated.  The measurement of depth was introduced after some limitations of
the classic measurement of article count were realized.
"""

rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}'
"""
`wikipedia rest_v1 summary API`_:
  The summary response includes an extract of the first paragraph of the page
  in plain text and HTML as well as the type of page.  This is useful for page
  previews (fka. Hovercards, aka. Popups) on the web and link previews in the
  apps.

HTTP ``Accept-Language`` header (:py:obj:`send_accept_language_header`):
  The desired language variant code for wikis where LanguageConverter_ is
  enabled.

.. _wikipedia rest_v1 summary API:
   https://en.wikipedia.org/api/rest_v1/#/Page%20content/get_page_summary__title_

"""
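
# A resolved example of the summary URL (a sketch, not fetched here): for the
# title 出租車 on zh.wikipedia.org the request goes to
#
#   https://zh.wikipedia.org/api/rest_v1/page/summary/%E5%87%BA%E7%A7%9F%E8%BB%8A
#
# response() below consumes the JSON fields 'titles'/'title', 'type',
# 'description', 'extract', 'thumbnail' and 'content_urls'.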

wiki_lc_locale_variants = {
    "zh": (
        "zh-CN",
        "zh-HK",
        "zh-MO",
        "zh-MY",
        "zh-SG",
        "zh-TW",
    ),
    "zh-classical": ("zh-classical",),
}
"""Mapping rule of the LanguageConverter_ to map a language and its variants to
a Locale (used in the HTTP ``Accept-Language`` header).  For example see `LC
Chinese`_.

.. _LC Chinese:
   https://meta.wikimedia.org/wiki/Wikipedias_in_multiple_writing_systems#Chinese
"""
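
# Script variants of the (one) 'zh' Wikipedia: both 'zh_Hant' and 'zh_Hans'
# are registered as language aliases of the 'zh' Wikipedia in
# fetch_wikimedia_traits().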
wikipedia_script_variants = {
    "zh": (
        "zh_Hant",
        "zh_Hans",
    )
}


def get_wiki_params(sxng_locale, eng_traits):
    """Returns the Wikipedia language tag and the netloc that fits to the
    ``sxng_locale``.  To support LanguageConverter_ this function rates a
    locale (region) higher than a language (compare
    :py:obj:`wiki_lc_locale_variants`).
    """
    eng_tag = eng_traits.get_region(sxng_locale, eng_traits.get_language(sxng_locale, 'en'))
    wiki_netloc = eng_traits.custom['wiki_netloc'].get(eng_tag, 'en.wikipedia.org')
    return eng_tag, wiki_netloc
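
# A minimal usage sketch, assuming the traits have been fetched: the SearXNG
# locale 'zh-TW' resolves (via the region alias registered in
# fetch_wikimedia_traits) to the engine tag 'zh', so the request is sent to
# zh.wikipedia.org while the HTTP Accept-Language header still carries the
# 'zh-TW' variant for the LanguageConverter:
#
#   eng_tag, wiki_netloc = get_wiki_params('zh-TW', traits)
#   # eng_tag == 'zh', wiki_netloc == 'zh.wikipedia.org'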


def request(query, params):
    """Assemble a request (`wikipedia rest_v1 summary API`_)."""
    if query.islower():
        query = query.title()

    _eng_tag, wiki_netloc = get_wiki_params(params['searxng_locale'], traits)
    title = urllib.parse.quote(query)
    params['url'] = rest_v1_summary_url.format(wiki_netloc=wiki_netloc, title=title)

    params['raise_for_httperror'] = False
    params['soft_max_redirects'] = 2

    return params
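
# A minimal usage sketch, assuming fetched traits: request() only fills in the
# summary URL and the redirect/HTTP-error handling flags, e.g.
#
#   params = request('出租車', {'searxng_locale': 'zh-TW'})
#   # params['url'] --> the summary URL on zh.wikipedia.org (see above)
#   # params['raise_for_httperror'] is False; errors are handled in response()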


# get response from search-request
def response(resp):

    results = []
    if resp.status_code == 404:
        return []

    if resp.status_code == 400:
        try:
            api_result = resp.json()
        except Exception:  # pylint: disable=broad-except
            pass
        else:
            if (
                api_result['type'] == 'https://mediawiki.org/wiki/HyperSwitch/errors/bad_request'
                and api_result['detail'] == 'title-invalid-characters'
            ):
                return []

    _network.raise_for_httperror(resp)

    api_result = resp.json()
    title = utils.html_to_text(api_result.get('titles', {}).get('display') or api_result.get('title'))
    wikipedia_link = api_result['content_urls']['desktop']['page']

    if "list" in display_type or api_result.get('type') != 'standard':
        # show the item in the result list if 'list' is in the display options
        # or if it is an item that can't be displayed in an infobox
        results.append({'url': wikipedia_link, 'title': title, 'content': api_result.get('description', '')})

    if "infobox" in display_type:
        if api_result.get('type') == 'standard':
            results.append(
                {
                    'infobox': title,
                    'id': wikipedia_link,
                    'content': api_result.get('extract', ''),
                    'img_src': api_result.get('thumbnail', {}).get('source'),
                    'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}],
                }
            )

    return results


# Nonstandard language codes
#
# These Wikipedias use language codes that do not conform to the ISO 639
# standard (which is how wiki subdomains are chosen nowadays).

lang_map = locales.LOCALE_BEST_MATCH.copy()
lang_map.update(
    {
        'be-tarask': 'bel',
        'ak': 'aka',
        'als': 'gsw',
        'bat-smg': 'sgs',
        'cbk-zam': 'cbk',
        'fiu-vro': 'vro',
        'map-bms': 'map',
        'no': 'nb-NO',
        'nrm': 'nrf',
        'roa-rup': 'rup',
        'nds-nl': 'nds',
        # 'simple' – invented code used for the Simple English Wikipedia (not
        # the official IETF code en-simple)
        'zh-min-nan': 'nan',
        'zh-yue': 'yue',
        'an': 'arg',
    }
)
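
# For example, the nonstandard subdomain code 'no' maps to 'nb-NO', which
# babel can parse:
#
#   babel.Locale.parse(lang_map.get('no', 'no'), sep='-')
#   # --> Locale('nb', territory='NO')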


def fetch_traits(engine_traits: EngineTraits):
    fetch_wikimedia_traits(engine_traits)
    print("WIKIPEDIA_LANGUAGES: %s" % len(engine_traits.custom['WIKIPEDIA_LANGUAGES']))


def fetch_wikimedia_traits(engine_traits: EngineTraits):
    """Fetch languages from Wikipedia.  Not all languages from the
    :py:obj:`list_of_wikipedias` are supported by SearXNG locales, only those
    known from :py:obj:`searx.locales.LOCALE_NAMES` or those with a minimal
    :py:obj:`editing depth <wikipedia_article_depth>`.

    The location of the Wikipedia address of a language is mapped in a
    :py:obj:`custom field <searx.enginelib.traits.EngineTraits.custom>`
    (``wiki_netloc``).  Here is a reduced example:

    .. code:: python

       traits.custom['wiki_netloc'] = {
           "en": "en.wikipedia.org",
           ..
           "gsw": "als.wikipedia.org",
           ..
           "zh": "zh.wikipedia.org",
           "zh-classical": "zh-classical.wikipedia.org"
       }

    """
    # pylint: disable=too-many-branches
    engine_traits.custom['wiki_netloc'] = {}
    engine_traits.custom['WIKIPEDIA_LANGUAGES'] = []

    # insert alias to map from a script or region to a wikipedia variant
    for eng_tag, sxng_tag_list in wikipedia_script_variants.items():
        for sxng_tag in sxng_tag_list:
            engine_traits.languages[sxng_tag] = eng_tag
    for eng_tag, sxng_tag_list in wiki_lc_locale_variants.items():
        for sxng_tag in sxng_tag_list:
            engine_traits.regions[sxng_tag] = eng_tag

    resp = _network.get(list_of_wikipedias)
    if not resp.ok:
        print("ERROR: response from Wikipedia is not OK.")

    dom = html.fromstring(resp.text)
    for row in dom.xpath('//table[contains(@class,"sortable")]//tbody/tr'):

        cols = row.xpath('./td')
        if not cols:
            continue
        cols = [c.text_content().strip() for c in cols]

        depth = float(cols[11].replace('-', '0').replace(',', ''))
        articles = int(cols[4].replace(',', ''))

        eng_tag = cols[3]
        wiki_url = row.xpath('./td[4]/a/@href')[0]
        wiki_url = urllib.parse.urlparse(wiki_url)

        try:
            sxng_tag = locales.language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag), sep='-'))
        except babel.UnknownLocaleError:
            # print("ERROR: %s [%s] is unknown by babel" % (cols[0], eng_tag))
            continue
        finally:
            engine_traits.custom['WIKIPEDIA_LANGUAGES'].append(eng_tag)

        if sxng_tag not in locales.LOCALE_NAMES:
            if articles < 10000:
                # exclude languages with too few articles
                continue
            if int(depth) < 20:
                # Rough indicator of a Wikipedia's quality, showing how
                # frequently its articles are updated.
                continue

        conflict = engine_traits.languages.get(sxng_tag)
        if conflict:
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue

        engine_traits.languages[sxng_tag] = eng_tag
        engine_traits.custom['wiki_netloc'][eng_tag] = wiki_url.netloc

    engine_traits.custom['WIKIPEDIA_LANGUAGES'].sort()
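
# A minimal sketch of a standalone traits update, assuming network access to
# meta.wikimedia.org and that EngineTraits() can be constructed with its
# defaults:
#
#   engine_traits = EngineTraits()
#   fetch_traits(engine_traits)
#   engine_traits.custom['wiki_netloc']['zh']  # e.g. 'zh.wikipedia.org'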