@@ -1,18 +1,59 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""The MediaWiki engine is a *generic* engine to **query** Wikimedia wikis by
|
|
|
+the `MediaWiki Action API`_. For a `query action`_ all Wikimedia wikis have
|
|
|
+endpoints that follow this pattern::
|
|
|
+
|
|
|
+ https://{base_url}/w/api.php?action=query&list=search&format=json
|
|
|
+
|
|
|
+.. note::
|
|
|
+
|
|
|
+   In its current state, this engine is implemented to parse the JSON result
+   (`format=json`_) of a search query (`list=search`_). If you need other
+   ``action`` and ``list`` types, ask the SearXNG developers to extend the
+   implementation according to your needs.
+
+.. _MediaWiki Action API: https://www.mediawiki.org/wiki/API:Main_page
+.. _query action: https://www.mediawiki.org/w/api.php?action=help&modules=query
+.. _`list=search`: https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bsearch
+.. _`format=json`: https://www.mediawiki.org/w/api.php?action=help&modules=json
+
+Configuration
+=============
+
+Request:
+
+- :py:obj:`base_url`
+- :py:obj:`search_type`
+- :py:obj:`srenablerewrites`
+- :py:obj:`srsort`
+- :py:obj:`srprop`
+
+Implementations
+===============
+
"""
|
|
|
- General mediawiki-engine (Web)
|
|
|
-"""
|
|
|
+from __future__ import annotations
|
|
|
+from typing import TYPE_CHECKING
|
|
|
|
|
|
-from string import Formatter
|
|
|
+from datetime import datetime
|
|
|
from urllib.parse import urlencode, quote
|
|
|
|
|
|
from searx.utils import html_to_text
|
|
|
+from searx.enginelib.traits import EngineTraits
|
|
|
+
|
|
|
+if TYPE_CHECKING:
|
|
|
+ import logging
|
|
|
+
|
|
|
+ logger: logging.Logger
|
|
|
+
|
|
|
+traits: EngineTraits
|
|
|
|
|
|
# about
|
|
|
about = {
|
|
|
"website": None,
|
|
|
"wikidata_id": None,
|
|
|
- "official_api_documentation": 'http://www.mediawiki.org/wiki/API:Search',
|
|
|
+ "official_api_documentation": 'https://www.mediawiki.org/w/api.php?action=help&modules=query',
|
|
|
"use_official_api": True,
|
|
|
"require_api_key": False,
|
|
|
"results": 'JSON',
|
|
@@ -21,73 +62,119 @@ about = {
 # engine dependent config
 categories = ['general']
 paging = True
-number_of_results = 1
-search_type = 'nearmatch'  # possible values: title, text, nearmatch
-
-# search-url
-base_url = 'https://{language}.wikipedia.org/'
-search_postfix = (
-    'w/api.php?action=query'
-    '&list=search'
-    '&{query}'
-    '&format=json'
-    '&sroffset={offset}'
-    '&srlimit={limit}'
-    '&srwhat={searchtype}'
-)
-
-
-# do search-request
-def request(query, params):
-    offset = (params['pageno'] - 1) * number_of_results
+number_of_results = 5
 
-    string_args = dict(
-        query=urlencode({'srsearch': query}), offset=offset, limit=number_of_results, searchtype=search_type
-    )
+search_type: str = 'nearmatch'
+"""Which type of search to perform. One of the following values: ``nearmatch``,
+``text`` or ``title``.
 
-    format_strings = list(Formatter().parse(base_url))
+See ``srwhat`` argument in `list=search`_ documentation.
+"""
 
-    if params['language'] == 'all':
-        language = 'en'
-    else:
-        language = params['language'].split('-')[0]
+srenablerewrites: bool = True
+"""Enable internal query rewriting (Type: boolean). Some search backends can
+rewrite the query into another which is thought to provide better results, for
+instance by correcting spelling errors.
+
+See ``srenablerewrites`` argument in `list=search`_ documentation.
+"""
+
+srsort: str = 'relevance'
+"""Set the sort order of returned results. One of the following values:
+``create_timestamp_asc``, ``create_timestamp_desc``, ``incoming_links_asc``,
+``incoming_links_desc``, ``just_match``, ``last_edit_asc``, ``last_edit_desc``,
+``none``, ``random``, ``relevance``, ``user_random``.
+
+See ``srsort`` argument in `list=search`_ documentation.
+"""
+
+srprop: str = 'sectiontitle|snippet|timestamp|categorysnippet'
+"""Which properties to return.
+
+See ``srprop`` argument in `list=search`_ documentation.
+"""
+
+base_url: str = 'https://{language}.wikipedia.org/'
+"""Base URL of the Wikimedia wiki.
 
-    # format_string [('https://', 'language', '', None), ('.wikipedia.org/', None, None, None)]
-    if any(x[1] == 'language' for x in format_strings):
-        string_args['language'] = language
+``{language}``:
+  ISO 639-1 language code (en, de, fr, ...) of the search language.
+"""
+
+timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
+"""Format of the (longhand) timestamp strings returned by the MediaWiki API."""
+
+
+def request(query, params):
 
     # write search-language back to params, required in response
-    params['language'] = language
 
-    search_url = base_url + search_postfix
+    if params['language'] == 'all':
+        params['language'] = 'en'
+    else:
+        params['language'] = params['language'].split('-')[0]
+
+    if base_url.endswith('/'):
+        api_url = base_url + 'w/api.php?'
+    else:
+        api_url = base_url + '/w/api.php?'
+    api_url = api_url.format(language=params['language'])
 
-    params['url'] = search_url.format(**string_args)
+    offset = (params['pageno'] - 1) * number_of_results
 
+    args = {
+        'action': 'query',
+        'list': 'search',
+        'format': 'json',
+        'srsearch': query,
+        'sroffset': offset,
+        'srlimit': number_of_results,
+        'srwhat': search_type,
+        'srprop': srprop,
+        'srsort': srsort,
+    }
+    if srenablerewrites:
+        args['srenablerewrites'] = '1'
+
+    params['url'] = api_url + urlencode(args)
     return params
 
 
 # get response from search-request
 def response(resp):
-    results = []
 
+    results = []
     search_results = resp.json()
 
     # return empty array if there are no results
     if not search_results.get('query', {}).get('search'):
         return []
 
-    # parse results
     for result in search_results['query']['search']:
+
         if result.get('snippet', '').startswith('#REDIRECT'):
             continue
+
+        title = result['title']
+        sectiontitle = result.get('sectiontitle')
+        content = html_to_text(result.get('snippet', ''))
+        metadata = html_to_text(result.get('categorysnippet', ''))
+        timestamp = result.get('timestamp')
+
         url = (
-            base_url.format(language=resp.search_params['language'])
-            + 'wiki/'
-            + quote(result['title'].replace(' ', '_').encode())
+            base_url.format(language=resp.search_params['language']) + 'wiki/' + quote(title.replace(' ', '_').encode())
         )
+        if sectiontitle:
+            # in case of a sectiontitle, create a link to the section in the wiki page
+            url += '#' + quote(sectiontitle.replace(' ', '_').encode())
+            title += ' / ' + sectiontitle
+
+        item = {'url': url, 'title': title, 'content': content, 'metadata': metadata}
+
+        if timestamp:
+            item['publishedDate'] = datetime.strptime(timestamp, timestamp_format)
 
-        # append result
-        results.append({'url': url, 'title': result['title'], 'content': html_to_text(result.get('snippet', ''))})
+        results.append(item)
 
     # return results
     return results
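
The following standalone sketch mirrors the URL construction performed by the
new ``request()`` above; it assumes the defaults from this diff
(``search_type = 'nearmatch'``, ``srsort = 'relevance'``,
``number_of_results = 5``, ``srenablerewrites = True``), English Wikipedia as
an example ``base_url``, and a made-up query string::

    # build the MediaWiki Action API search URL the same way request() does
    from urllib.parse import urlencode

    base_url = 'https://en.wikipedia.org/'    # example wiki (assumption)
    pageno = 1
    offset = (pageno - 1) * 5                 # (pageno - 1) * number_of_results

    args = {
        'action': 'query',
        'list': 'search',
        'format': 'json',
        'srsearch': 'information retrieval',  # example query (assumption)
        'sroffset': offset,
        'srlimit': 5,
        'srwhat': 'nearmatch',
        'srprop': 'sectiontitle|snippet|timestamp|categorysnippet',
        'srsort': 'relevance',
        'srenablerewrites': '1',
    }
    print(base_url + 'w/api.php?' + urlencode(args))
    # -> https://en.wikipedia.org/w/api.php?action=query&list=search&format=json&srsearch=... 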