Browse Source

[mod] engine: wikimedia - improve results, add additional settings & doc

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Markus Heiser 1 year ago
parent
commit
db522cf76d
4 changed files with 151 additions and 64 deletions
  1. 6 0
      docs/dev/engines/index.rst
  2. 13 0
      docs/dev/engines/mediawiki.rst
  3. 131 44
      searx/engines/mediawiki.py
  4. 1 20
      searx/settings.yml

+ 6 - 0
docs/dev/engines/index.rst

@@ -40,6 +40,12 @@ Online Engines
 
    demo/demo_online
    xpath
+   mediawiki
+
+.. toctree::
+   :maxdepth: 1
+   :glob:
+
    online/*
 
 .. _offline engines:

+ 13 - 0
docs/dev/engines/mediawiki.rst

@@ -0,0 +1,13 @@
+.. _mediawiki engine:
+
+================
+MediaWiki Engine
+================
+
+.. contents::
+   :depth: 2
+   :local:
+   :backlinks: entry
+
+.. automodule:: searx.engines.mediawiki
+  :members:

+ 131 - 44
searx/engines/mediawiki.py

@@ -1,18 +1,59 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""The MediaWiki engine is a *generic* engine to **query** Wikimedia wikis by
+the `MediaWiki Action API`_.  For a `query action`_ all Wikimedia wikis have
+endpoints that follow this pattern::
+
+    https://{base_url}/w/api.php?action=query&list=search&format=json
+
+.. note::
+
+   In its current state, this engine is implemented to parse JSON results
+   (`format=json`_) from a search query (`list=search`_).  If you need other
+   ``action`` and ``list`` types, ask the SearXNG developers to extend the
+   implementation according to your needs.
+
+.. _MediaWiki Action API: https://www.mediawiki.org/wiki/API:Main_page
+.. _query action: https://www.mediawiki.org/w/api.php?action=help&modules=query
+.. _`list=search`: https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bsearch
+.. _`format=json`: https://www.mediawiki.org/w/api.php?action=help&modules=json
+
+Configuration
+=============
+
+Request:
+
+- :py:obj:`base_url`
+- :py:obj:`search_type`
+- :py:obj:`srenablerewrites`
+- :py:obj:`srsort`
+- :py:obj:`srprop`
+
+Implementations
+===============
+
 """
- General mediawiki-engine (Web)
-"""
+from __future__ import annotations
+from typing import TYPE_CHECKING
 
-from string import Formatter
+from datetime import datetime
 from urllib.parse import urlencode, quote
 
 from searx.utils import html_to_text
+from searx.enginelib.traits import EngineTraits
+
+if TYPE_CHECKING:
+    import logging
+
+    logger: logging.Logger
+
+traits: EngineTraits
 
 # about
 about = {
     "website": None,
     "wikidata_id": None,
-    "official_api_documentation": 'http://www.mediawiki.org/wiki/API:Search',
+    "official_api_documentation": 'https://www.mediawiki.org/w/api.php?action=help&modules=query',
     "use_official_api": True,
     "require_api_key": False,
     "results": 'JSON',
@@ -21,73 +62,119 @@ about = {
 # engine dependent config
 categories = ['general']
 paging = True
-number_of_results = 1
-search_type = 'nearmatch'  # possible values: title, text, nearmatch
-
-# search-url
-base_url = 'https://{language}.wikipedia.org/'
-search_postfix = (
-    'w/api.php?action=query'
-    '&list=search'
-    '&{query}'
-    '&format=json'
-    '&sroffset={offset}'
-    '&srlimit={limit}'
-    '&srwhat={searchtype}'
-)
-
-
-# do search-request
-def request(query, params):
-    offset = (params['pageno'] - 1) * number_of_results
+number_of_results = 5
 
-    string_args = dict(
-        query=urlencode({'srsearch': query}), offset=offset, limit=number_of_results, searchtype=search_type
-    )
+search_type: str = 'nearmatch'
+"""Which type of search to perform.  One of the following values: ``nearmatch``,
+``text`` or ``title``.
 
-    format_strings = list(Formatter().parse(base_url))
+See ``srwhat`` argument in `list=search`_ documentation.
+"""
 
-    if params['language'] == 'all':
-        language = 'en'
-    else:
-        language = params['language'].split('-')[0]
+srenablerewrites: bool = True
+"""Enable internal query rewriting (Type: boolean).  Some search backends can
+rewrite the query into another which is thought to provide better results, for
+instance by correcting spelling errors.
+
+See ``srenablerewrites`` argument in `list=search`_ documentation.
+"""
+
+srsort: str = 'relevance'
+"""Set the sort order of returned results.  One of the following values:
+``create_timestamp_asc``, ``create_timestamp_desc``, ``incoming_links_asc``,
+``incoming_links_desc``, ``just_match``, ``last_edit_asc``, ``last_edit_desc``,
+``none``, ``random``, ``relevance``, ``user_random``.
+
+See ``srsort`` argument in `list=search`_ documentation.
+"""
+
+srprop: str = 'sectiontitle|snippet|timestamp|categorysnippet'
+"""Which properties to return.
+
+See ``srprop`` argument in `list=search`_ documentation.
+"""
+
+base_url: str = 'https://{language}.wikipedia.org/'
+"""Base URL of the Wikimedia wiki.
 
-    # format_string [('https://', 'language', '', None), ('.wikipedia.org/', None, None, None)]
-    if any(x[1] == 'language' for x in format_strings):
-        string_args['language'] = language
+``{language}``:
+  ISO 639-1 language code (en, de, fr, ...) of the search language.
+"""
+
+timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
+"""The longhand version of MediaWiki time strings."""
+
+
+def request(query, params):
 
     # write search-language back to params, required in response
-    params['language'] = language
 
-    search_url = base_url + search_postfix
+    if params['language'] == 'all':
+        params['language'] = 'en'
+    else:
+        params['language'] = params['language'].split('-')[0]
+
+    if base_url.endswith('/'):
+        api_url = base_url + 'w/api.php?'
+    else:
+        api_url = base_url + '/w/api.php?'
+    api_url = api_url.format(language=params['language'])
 
-    params['url'] = search_url.format(**string_args)
+    offset = (params['pageno'] - 1) * number_of_results
 
+    args = {
+        'action': 'query',
+        'list': 'search',
+        'format': 'json',
+        'srsearch': query,
+        'sroffset': offset,
+        'srlimit': number_of_results,
+        'srwhat': search_type,
+        'srprop': srprop,
+        'srsort': srsort,
+    }
+    if srenablerewrites:
+        args['srenablerewrites'] = '1'
+
+    params['url'] = api_url + urlencode(args)
     return params
 
 
 # get response from search-request
 def response(resp):
-    results = []
 
+    results = []
     search_results = resp.json()
 
     # return empty array if there are no results
     if not search_results.get('query', {}).get('search'):
         return []
 
-    # parse results
     for result in search_results['query']['search']:
+
         if result.get('snippet', '').startswith('#REDIRECT'):
             continue
+
+        title = result['title']
+        sectiontitle = result.get('sectiontitle')
+        content = html_to_text(result.get('snippet', ''))
+        metadata = html_to_text(result.get('categorysnippet', ''))
+        timestamp = result.get('timestamp')
+
         url = (
-            base_url.format(language=resp.search_params['language'])
-            + 'wiki/'
-            + quote(result['title'].replace(' ', '_').encode())
+            base_url.format(language=resp.search_params['language']) + 'wiki/' + quote(title.replace(' ', '_').encode())
         )
+        if sectiontitle:
+            # in case of sectiontitle create a link to the section in the wiki page
+            url += '#' + quote(sectiontitle.replace(' ', '_').encode())
+            title += ' / ' + sectiontitle
+
+        item = {'url': url, 'title': title, 'content': content, 'metadata': metadata}
+
+        if timestamp:
+            item['publishedDate'] = datetime.strptime(timestamp, timestamp_format)
 
-        # append result
-        results.append({'url': url, 'title': result['title'], 'content': html_to_text(result.get('snippet', ''))})
+        results.append(item)
 
     # return results
     return results

+ 1 - 20
searx/settings.yml

@@ -667,11 +667,6 @@ engines:
     shortcut: fsd
     categories: [it, software wikis]
     base_url: https://directory.fsf.org/
-    number_of_results: 5
-    # what part of a page matches the query string: title, text, nearmatch
-    # * title     - query matches title
-    # * text      - query matches the text of page
-    # * nearmatch - nearmatch in title
     search_type: title
     timeout: 5.0
     disabled: true
@@ -1449,13 +1444,6 @@ engines:
     engine: twitter
     disabled: true
 
-  # maybe in a fun category
-  #  - name: uncyclopedia
-  #    engine: mediawiki
-  #    shortcut: unc
-  #    base_url: https://uncyclopedia.wikia.com/
-  #    number_of_results: 5
-
   # tmp suspended - too slow, too many errors
   #  - name: urbandictionary
   #    engine      : xpath
@@ -1534,7 +1522,6 @@ engines:
     shortcut: wb
     categories: general
     base_url: "https://{language}.wikibooks.org/"
-    number_of_results: 5
     search_type: text
     disabled: true
     about:
@@ -1546,9 +1533,9 @@ engines:
     shortcut: wn
     categories: news
     base_url: "https://{language}.wikinews.org/"
-    number_of_results: 5
     search_type: text
     disabled: true
+    srsort: create_timestamp_desc
     about:
       website: https://www.wikinews.org/
       wikidata_id: Q964
@@ -1558,7 +1545,6 @@ engines:
     shortcut: wq
     categories: general
     base_url: "https://{language}.wikiquote.org/"
-    number_of_results: 5
     search_type: text
     disabled: true
     additional_tests:
@@ -1572,7 +1558,6 @@ engines:
     shortcut: ws
     categories: general
     base_url: "https://{language}.wikisource.org/"
-    number_of_results: 5
     search_type: text
     disabled: true
     about:
@@ -1584,7 +1569,6 @@ engines:
     shortcut: wsp
     categories: [general, science]
     base_url: "https://species.wikimedia.org/"
-    number_of_results: 5
     search_type: text
     disabled: true
     about:
@@ -1596,7 +1580,6 @@ engines:
     shortcut: wt
     categories: [dictionaries]
     base_url: "https://{language}.wiktionary.org/"
-    number_of_results: 5
     search_type: text
     about:
       website: https://www.wiktionary.org/
@@ -1607,7 +1590,6 @@ engines:
     shortcut: wv
     categories: general
     base_url: "https://{language}.wikiversity.org/"
-    number_of_results: 5
     search_type: text
     disabled: true
     about:
@@ -1619,7 +1601,6 @@ engines:
     shortcut: wy
     categories: general
     base_url: "https://{language}.wikivoyage.org/"
-    number_of_results: 5
     search_type: text
     disabled: true
     about: