# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This module implements the Wikipedia engine.  Parts of this implementation
are shared by other engines:

- :ref:`wikidata engine`

The list of supported languages is fetched from the article linked by
:py:obj:`wikipedia_article_depth`.  Unlike traditional search engines, there is
not one Wikipedia for all languages; there is one Wikipedia per language
(:py:obj:`fetch_traits`).
"""

import urllib.parse

import babel
from lxml import html

from searx import network
from searx.locales import language_tag
from searx.enginelib.traits import EngineTraits

traits: EngineTraits

# about
about = {
    "website": 'https://www.wikipedia.org/',
    "wikidata_id": 'Q52',
    "official_api_documentation": 'https://en.wikipedia.org/api/',
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

send_accept_language_header = True

wikipedia_article_depth = 'https://meta.wikimedia.org/wiki/Wikipedia_article_depth'
"""The *editing depth* of Wikipedia is one of several possible rough indicators
of the encyclopedia's collaborative quality, showing how frequently its articles
are updated.  The measurement of depth was introduced after some limitations of
the classic measurement of article count were realized.
"""

# example: https://zh-classical.wikipedia.org/api/rest_v1/page/summary/日
rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}'
"""`wikipedia rest_v1 summary API`_: The summary response includes an extract of
the first paragraph of the page in plain text and HTML as well as the type of
page.  This is useful for page previews (fka. Hovercards, aka. Popups) on the web
and link previews in the apps.

.. _wikipedia rest_v1 summary API: https://en.wikipedia.org/api/rest_v1/#/Page%20content/get_page_summary__title_
"""


def request(query, params):
    """Assemble a request (`wikipedia rest_v1 summary API`_)."""
    if query.islower():
        query = query.title()

    engine_language = traits.get_language(params['searxng_locale'], 'en')
    wiki_netloc = traits.custom['wiki_netloc'].get(engine_language, 'en.wikipedia.org')
    title = urllib.parse.quote(query)

    # '!wikipedia 日 :zh-TW' --> https://zh-classical.wikipedia.org/
    # '!wikipedia 日 :zh' --> https://zh.wikipedia.org/
    params['url'] = rest_v1_summary_url.format(wiki_netloc=wiki_netloc, title=title)

    params['raise_for_httperror'] = False
    params['soft_max_redirects'] = 2

    return params
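
# Illustrative use only (assumes ``traits`` has been loaded and maps the locale
# 'zh' to the netloc 'zh.wikipedia.org'):
#
#   params = request('日', {'searxng_locale': 'zh'})
#   params['url']
#   # --> 'https://zh.wikipedia.org/api/rest_v1/page/summary/%E6%97%A5'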


# get response from search-request
def response(resp):

    results = []
    if resp.status_code == 404:
        return []

    if resp.status_code == 400:
        try:
            api_result = resp.json()
        except Exception:  # pylint: disable=broad-except
            pass
        else:
            if (
                api_result['type'] == 'https://mediawiki.org/wiki/HyperSwitch/errors/bad_request'
                and api_result['detail'] == 'title-invalid-characters'
            ):
                return []

    network.raise_for_httperror(resp)

    api_result = resp.json()
    title = api_result['title']
    wikipedia_link = api_result['content_urls']['desktop']['page']
    results.append({'url': wikipedia_link, 'title': title, 'content': api_result.get('description', '')})

    if api_result.get('type') == 'standard':
        results.append(
            {
                'infobox': title,
                'id': wikipedia_link,
                'content': api_result.get('extract', ''),
                'img_src': api_result.get('thumbnail', {}).get('source'),
                'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}],
            }
        )

    return results
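
# The parser above relies on only a few fields of the summary response.  A
# heavily reduced, illustrative payload looks roughly like:
#
#   {
#       "type": "standard",
#       "title": "Earth",
#       "description": "Third planet from the Sun",
#       "extract": "Earth is the third planet from the Sun ...",
#       "thumbnail": {"source": "https://upload.wikimedia.org/..."},
#       "content_urls": {"desktop": {"page": "https://en.wikipedia.org/wiki/Earth"}},
#   }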


# Nonstandard language codes
#
# These Wikipedias use language codes that do not conform to the ISO 639
# standard (which is how wiki subdomains are chosen nowadays).

lang_map = {
    'be-tarask': 'bel',
    'ak': 'aka',
    'als': 'gsw',
    'bat-smg': 'sgs',
    'cbk-zam': 'cbk',
    'fiu-vro': 'vro',
    'map-bms': 'map',
    'nrm': 'nrf',
    'roa-rup': 'rup',
    'nds-nl': 'nds',
    # 'simple' – invented code used for the Simple English Wikipedia (not the official IETF code en-simple)
    'zh-min-nan': 'nan',
    'zh-yue': 'yue',
    'an': 'arg',
    'zh-classical': 'zh-Hant',  # babel maps classical to zh-Hans (for whatever reason)
}
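
# For illustration: in :py:obj:`fetch_traits` below, the subdomain code 'als'
# is first mapped to 'gsw' via ``lang_map`` and then parsed by babel, so the
# resulting SearXNG tag is 'gsw' while the netloc stays 'als.wikipedia.org'.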

unknown_langs = [
    'an',  # Aragonese
    'ba',  # Bashkir
    'bar',  # Bavarian
    'bcl',  # Central Bicolano
    'be-tarask',  # Belarusian variant / Belarusian is already covered by 'be'
    'bpy',  # Bishnupriya Manipuri is unknown to babel
    'hif',  # Fiji Hindi
    'ilo',  # Ilokano
    'li',  # Limburgish
    'sco',  # Scots (sco) is unknown to babel, Scottish Gaelic (gd) is known
    'sh',  # Serbo-Croatian
    'simple',  # Simple English is not known to babel as a language distinct from English
    'vo',  # Volapük
    'wa',  # Walloon
]


def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages from Wikipedia.

    The location of the Wikipedia address of a language is mapped in a
    :py:obj:`custom field <searx.enginelib.traits.EngineTraits.custom>`
    (``wiki_netloc``).  Here is a reduced example:

    .. code:: python

       traits.custom['wiki_netloc'] = {
           "en": "en.wikipedia.org",
           ..
           "gsw": "als.wikipedia.org",
           ..
           "zh": "zh.wikipedia.org",
           "zh-classical": "zh-classical.wikipedia.org"
       }
    """

    engine_traits.custom['wiki_netloc'] = {}

    # insert alias to map from a region like zh-CN to a language zh_Hans
    engine_traits.languages['zh_Hans'] = 'zh'

    resp = network.get(wikipedia_article_depth)
    if not resp.ok:
        print("ERROR: response from Wikipedia is not OK.")

    dom = html.fromstring(resp.text)
    for row in dom.xpath('//table[contains(@class,"sortable")]//tbody/tr'):

        cols = row.xpath('./td')
        if not cols:
            continue
        cols = [c.text_content().strip() for c in cols]

        depth = float(cols[3].replace('-', '0').replace(',', ''))
        articles = int(cols[4].replace(',', ''))

        if articles < 10000:
            # exclude languages with too few articles
            continue

        if int(depth) < 20:
            # Rough indicator of a Wikipedia’s quality, showing how frequently
            # its articles are updated.
            continue

        eng_tag = cols[2]
        wiki_url = row.xpath('./td[3]/a/@href')[0]
        wiki_url = urllib.parse.urlparse(wiki_url)

        if eng_tag in unknown_langs:
            continue

        try:
            sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag), sep='-'))
        except babel.UnknownLocaleError:
            print("ERROR: %s [%s] is unknown by babel" % (cols[0], eng_tag))
            continue

        conflict = engine_traits.languages.get(sxng_tag)
        if conflict:
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue

        engine_traits.languages[sxng_tag] = eng_tag
        engine_traits.custom['wiki_netloc'][eng_tag] = wiki_url.netloc
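

# After a successful fetch, ``engine_traits.languages`` maps SearXNG language
# tags to the codes Wikipedia uses in its subdomains.  Illustrative excerpt
# (mirroring the docstring example above):
#
#   engine_traits.languages = {
#       'en': 'en',
#       'gsw': 'als',
#       'zh': 'zh',
#       'zh_Hans': 'zh',
#       'zh_Hant': 'zh-classical',
#       ...
#   }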