mediawiki.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. # lint: pylint
  3. """The MediaWiki engine is a *generic* engine to **query** Wikimedia wikis by
the `MediaWiki Action API`_.  For a `query action`_ all Wikimedia wikis have
endpoints that follow this pattern::

    https://{base_url}/w/api.php?action=query&list=search&format=json

.. note::

   In its current state, this engine is implemented to parse JSON result
   (`format=json`_) from a search query (`list=search`_).  If you need other
   ``action`` and ``list`` types ask SearXNG developers to extend the
   implementation according to your needs.

.. _MediaWiki Action API: https://www.mediawiki.org/wiki/API:Main_page
.. _query action: https://www.mediawiki.org/w/api.php?action=help&modules=query
.. _`list=search`: https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bsearch
.. _`format=json`: https://www.mediawiki.org/w/api.php?action=help&modules=json

Configuration
=============

Request:

- :py:obj:`base_url`
- :py:obj:`search_type`
- :py:obj:`srenablerewrites`
- :py:obj:`srsort`
- :py:obj:`srprop`

Implementations
===============

  26. """
  27. from __future__ import annotations
  28. from typing import TYPE_CHECKING
  29. from datetime import datetime
  30. from urllib.parse import urlencode, quote
  31. from searx.utils import html_to_text
  32. from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
    # Imported for static type checking only; this branch is skipped at runtime.
    import logging

    # Annotation without assignment: ``logger`` is not created in this module.
    # NOTE(review): presumably injected by the SearXNG engine loader -- confirm.
    logger: logging.Logger

# Annotation without assignment; no value is bound to ``traits`` here.
# NOTE(review): presumably also provided by the engine loader -- confirm.
traits: EngineTraits
  37. # about
  38. about = {
  39. "website": None,
  40. "wikidata_id": None,
  41. "official_api_documentation": 'https://www.mediawiki.org/w/api.php?action=help&modules=query',
  42. "use_official_api": True,
  43. "require_api_key": False,
  44. "results": 'JSON',
  45. }
  46. # engine dependent config
  47. categories = ['general']
  48. paging = True
  49. number_of_results = 5
  50. search_type: str = 'nearmatch'
  51. """Which type of search to perform. One of the following values: ``nearmatch``,
  52. ``text`` or ``title``.
  53. See ``srwhat`` argument in `list=search`_ documentation.
  54. """
  55. srenablerewrites: bool = True
  56. """Enable internal query rewriting (Type: boolean). Some search backends can
  57. rewrite the query into another which is thought to provide better results, for
  58. instance by correcting spelling errors.
  59. See ``srenablerewrites`` argument in `list=search`_ documentation.
  60. """
  61. srsort: str = 'relevance'
  62. """Set the sort order of returned results. One of the following values:
  63. ``create_timestamp_asc``, ``create_timestamp_desc``, ``incoming_links_asc``,
  64. ``incoming_links_desc``, ``just_match``, ``last_edit_asc``, ``last_edit_desc``,
  65. ``none``, ``random``, ``relevance``, ``user_random``.
  66. See ``srenablerewrites`` argument in `list=search`_ documentation.
  67. """
  68. srprop: str = 'sectiontitle|snippet|timestamp|categorysnippet'
  69. """Which properties to return.
  70. See ``srprop`` argument in `list=search`_ documentation.
  71. """
  72. base_url: str = 'https://{language}.wikipedia.org/'
  73. """Base URL of the Wikimedia wiki.
  74. ``{language}``:
  75. ISO 639-1 language code (en, de, fr ..) of the search language.
  76. """
  77. timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
  78. """The longhand version of MediaWiki time strings."""
  79. def request(query, params):
  80. # write search-language back to params, required in response
  81. if params['language'] == 'all':
  82. params['language'] = 'en'
  83. else:
  84. params['language'] = params['language'].split('-')[0]
  85. if base_url.endswith('/'):
  86. api_url = base_url + 'w/api.php?'
  87. else:
  88. api_url = base_url + '/w/api.php?'
  89. api_url = api_url.format(language=params['language'])
  90. offset = (params['pageno'] - 1) * number_of_results
  91. args = {
  92. 'action': 'query',
  93. 'list': 'search',
  94. 'format': 'json',
  95. 'srsearch': query,
  96. 'sroffset': offset,
  97. 'srlimit': number_of_results,
  98. 'srwhat': search_type,
  99. 'srprop': srprop,
  100. 'srsort': srsort,
  101. }
  102. if srenablerewrites:
  103. args['srenablerewrites'] = '1'
  104. params['url'] = api_url + urlencode(args)
  105. return params
  106. # get response from search-request
  107. def response(resp):
  108. results = []
  109. search_results = resp.json()
  110. # return empty array if there are no results
  111. if not search_results.get('query', {}).get('search'):
  112. return []
  113. for result in search_results['query']['search']:
  114. if result.get('snippet', '').startswith('#REDIRECT'):
  115. continue
  116. title = result['title']
  117. sectiontitle = result.get('sectiontitle')
  118. content = html_to_text(result.get('snippet', ''))
  119. metadata = html_to_text(result.get('categorysnippet', ''))
  120. timestamp = result.get('timestamp')
  121. url = (
  122. base_url.format(language=resp.search_params['language']) + 'wiki/' + quote(title.replace(' ', '_').encode())
  123. )
  124. if sectiontitle:
  125. # in case of sectiontitle create a link to the section in the wiki page
  126. url += '#' + quote(sectiontitle.replace(' ', '_').encode())
  127. title += ' / ' + sectiontitle
  128. item = {'url': url, 'title': title, 'content': content, 'metadata': metadata}
  129. if timestamp:
  130. item['publishedDate'] = datetime.strptime(timestamp, timestamp_format)
  131. results.append(item)
  132. # return results
  133. return results