
[mod] limiter: add config file /etc/searxng/limiter.toml

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Markus Heiser, 1 year ago
commit 66fdec0eb9

+ 1 - 0
requirements.txt

@@ -16,3 +16,4 @@ redis==4.5.5
 markdown-it-py==2.2.0
 typing_extensions==4.6.2
 fasttext-predict==0.9.2.1
+pytomlpp==1.0.13

+ 4 - 1
searx/botdetection/http_accept.py

@@ -13,12 +13,15 @@ Accept_ header ..
    https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept
 
 """
+# pylint: disable=unused-argument
 
 from typing import Optional, Tuple
 import flask
 
+from searx.tools import config
 
-def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
+
+def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
     if 'text/html' not in request.accept_mimetypes:
         return 429, "bot detected, HTTP header Accept did not contain text/html"
     return None

+ 4 - 1
searx/botdetection/http_accept_encoding.py

@@ -14,12 +14,15 @@ bot if the Accept-Encoding_ header ..
    https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Encoding
 
 """
+# pylint: disable=unused-argument
 
 from typing import Optional, Tuple
 import flask
 
+from searx.tools import config
 
-def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
+
+def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
     accept_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')]
     if not ('gzip' in accept_list or 'deflate' in accept_list):
         return 429, "bot detected, HTTP header Accept-Encoding did not contain gzip nor deflate"

+ 4 - 2
searx/botdetection/http_accept_language.py

@@ -11,13 +11,15 @@ if the Accept-Language_ header is unset.
    https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent
 
 """
-
+# pylint: disable=unused-argument
 
 from typing import Optional, Tuple
 import flask
 
+from searx.tools import config
+
 
-def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
+def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
     if request.headers.get('Accept-Language', '').strip() == '':
         return 429, "bot detected, missing HTTP header Accept-Language"
     return None

+ 4 - 2
searx/botdetection/http_connection.py

@@ -11,13 +11,15 @@ the Connection_ header is set to ``close``.
    https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Connection
 
 """
-
+# pylint: disable=unused-argument
 
 from typing import Optional, Tuple
 import flask
 
+from searx.tools import config
+
 
-def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
+def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
     if request.headers.get('Connection', '').strip() == 'close':
         return 429, "bot detected, HTTP header 'Connection=close'"
     return None

+ 5 - 1
searx/botdetection/http_user_agent.py

@@ -12,11 +12,15 @@ the User-Agent_ header is unset or matches the regular expression
    https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent
 
 """
+# pylint: disable=unused-argument
 
 from typing import Optional, Tuple
 import re
 import flask
 
+from searx.tools import config
+
+
 USER_AGENT = (
     r'('
     + r'unknown'
@@ -44,7 +48,7 @@ def regexp_user_agent():
     return _regexp
 
 
-def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
+def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
     user_agent = request.headers.get('User-Agent', 'unknown')
     if regexp_user_agent().match(user_agent):
         return (

+ 8 - 3
searx/botdetection/ip_limit.py

@@ -1,4 +1,5 @@
-"""
+""".. _botdetection.ip_limit:
+
 Method ``ip_limit``
 -------------------
 
@@ -22,6 +23,8 @@ The :py:obj:`link_token` method is used to investigate whether a request is
 
 from typing import Optional, Tuple
 import flask
+from searx.tools import config
+
 
 from searx import redisdb
 from searx import logger
@@ -56,7 +59,7 @@ API_MAX = 4
 """Maximum requests from one IP in the :py:obj:`API_WONDOW`"""
 
 
-def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
+def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
     redis_client = redisdb.client()
 
     x_forwarded_for = request.headers.get('X-Forwarded-For', '')
@@ -68,7 +71,9 @@ def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
         if c > API_MAX:
             return 429, "BLOCK %s: API limit exceeded"
 
-    suspicious = link_token.is_suspicious(request)
+    suspicious = False
+    if cfg['botdetection.ip_limit.link_token']:
+        suspicious = link_token.is_suspicious(request)
 
     if suspicious:
         c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW)

+ 41 - 2
searx/botdetection/limiter.py

@@ -38,8 +38,11 @@ and set the redis-url connection. Check the value, it depends on your redis DB
 """
 
 from typing import Optional, Tuple
+from pathlib import Path
 import flask
+import pytomlpp as toml
 
+from searx.tools import config
 from searx.botdetection import (
     http_accept,
     http_accept_encoding,
@@ -49,6 +52,42 @@ from searx.botdetection import (
     ip_limit,
 )
 
+LIMITER_CFG_SCHEMA = Path(__file__).parent / "limiter.toml"
+"""Base configuration (schema) of the botdetection."""
+
+LIMITER_CFG = Path('/etc/searxng/limiter.toml')
+"""Lokal Limiter configuration."""
+
+CFG_DEPRECATED = {
+    # "dummy.old.foo": "config 'dummy.old.foo' exists only for tests.  Don't use it in your real project config."
+}
+
+CFG = config.Config({}, {})
+
+
+def init_cfg(log):
+    global CFG  # pylint: disable=global-statement
+    CFG = config.Config(cfg_schema=toml.load(LIMITER_CFG_SCHEMA), deprecated=CFG_DEPRECATED)
+
+    if not LIMITER_CFG.exists():
+        log.warning("missing config file: %s", LIMITER_CFG)
+        return
+
+    log.warning("load config file: %s", LIMITER_CFG)
+    try:
+        upd_cfg = toml.load(LIMITER_CFG)
+    except toml.DecodeError as exc:
+        msg = str(exc).replace('\t', '').replace('\n', ' ')
+        log.error("%s: %s", LIMITER_CFG, msg)
+        raise
+
+    is_valid, issue_list = CFG.validate(upd_cfg)
+    for msg in issue_list:
+        log.error(str(msg))
+    if not is_valid:
+        raise TypeError(f"schema of {LIMITER_CFG} is invalid, can't cutomize limiter configuration from!")
+    CFG.update(upd_cfg)
+
 
 def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
 
@@ -58,7 +97,7 @@ def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
     for func in [
         http_user_agent,
     ]:
-        val = func.filter_request(request)
+        val = func.filter_request(request, CFG)
         if val is not None:
             return val
 
@@ -72,7 +111,7 @@ def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
             http_user_agent,
             ip_limit,
         ]:
-            val = func.filter_request(request)
+            val = func.filter_request(request, CFG)
             if val is not None:
                 return val
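
For context, a minimal sketch of how the loaded configuration is meant to be consumed (illustrative only, not part of this commit): init_cfg() fills the module-level CFG from the built-in schema plus the optional /etc/searxng/limiter.toml, and every filter_request() now receives it as cfg, so a method can look up its toggles by dotted key, as ip_limit does with 'botdetection.ip_limit.link_token'.

    # illustrative sketch; assumes the generic 'searx.logger' is an acceptable logger here
    from searx import logger
    from searx.botdetection import limiter

    limiter.init_cfg(logger)   # load schema + optional /etc/searxng/limiter.toml
    cfg = limiter.CFG

    # dotted-key lookup, as used by ip_limit.filter_request()
    if cfg['botdetection.ip_limit.link_token']:
        ...                    # run the link_token check

    # the same lookup with an explicit fallback (see Config.get in searx/tools/config.py)
    cfg.get('botdetection.ip_limit.link_token', default=True)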
 

+ 3 - 0
searx/botdetection/limiter.toml

@@ -0,0 +1,3 @@
+[botdetection.ip_limit]
+
+link_token = true
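
A local /etc/searxng/limiter.toml only needs to contain the keys it overrides; everything else falls back to the schema defaults above. As an illustrative example (not part of this commit), an instance that wants to switch off the link_token method of ip_limit could use:

    [botdetection.ip_limit]

    # disable the link_token method of the ip_limit bot detection
    link_token = false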

+ 1 - 0
searx/plugins/limiter.py

@@ -38,5 +38,6 @@ def init(app: flask.Flask, settings) -> bool:
     if not redisdb.client():
         logger.error("The limiter requires Redis")
         return False
+    limiter.init_cfg(logger)
     app.before_request(pre_request)
     return True

+ 8 - 0
searx/tools/__init__.py

@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+""".. _tools src:
+
+A collection of *utilities* used by SearXNG, but without SearXNG-specific
+peculiarities.
+
+"""

+ 376 - 0
searx/tools/config.py

@@ -0,0 +1,376 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""Configuration class :py:class:`Config` with deep-update, schema validation
+and deprecated names.
+
+The :py:class:`Config` class implements a configuration that is based on
+structured dictionaries.  The configuration schema is defined in a dictionary
+structure and the configuration data is given in a dictionary structure.
+"""
+from __future__ import annotations
+
+import copy
+import typing
+import logging
+import pathlib
+import pytomlpp as toml
+
+__all__ = ['Config', 'UNSET', 'SchemaIssue']
+
+log = logging.getLogger(__name__)
+
+
+class FALSE:
+    """Class of ``False`` singelton"""
+
+    # pylint: disable=multiple-statements
+    def __init__(self, msg):
+        self.msg = msg
+
+    def __bool__(self):
+        return False
+
+    def __str__(self):
+        return self.msg
+
+    __repr__ = __str__
+
+
+UNSET = FALSE('<UNSET>')
+
+
+class SchemaIssue(ValueError):
+    """Exception to store and/or raise a message from a schema issue."""
+
+    def __init__(self, level: typing.Literal['warn', 'invalid'], msg: str):
+        self.level = level
+        super().__init__(msg)
+
+    def __str__(self):
+        return f"[cfg schema {self.level}] {self.args[0]}"
+
+
+class Config:
+    """Base class used for configuration"""
+
+    UNSET = UNSET
+
+    @classmethod
+    def from_toml(cls, schema_file: pathlib.Path, cfg_file: pathlib.Path, deprecated: dict) -> Config:
+
+        # init schema
+
+        log.debug("load schema file: %s", schema_file)
+        cfg = cls(cfg_schema=toml.load(schema_file), deprecated=deprecated)
+        if not cfg_file.exists():
+            log.warning("missing config file: %s", cfg_file)
+            return cfg
+
+        # load configuration
+
+        log.debug("load config file: %s", cfg_file)
+        try:
+            upd_cfg = toml.load(cfg_file)
+        except toml.DecodeError as exc:
+            msg = str(exc).replace('\t', '').replace('\n', ' ')
+            log.error("%s: %s", cfg_file, msg)
+            raise
+
+        is_valid, issue_list = cfg.validate(upd_cfg)
+        for msg in issue_list:
+            log.error(str(msg))
+        if not is_valid:
+            raise TypeError(f"schema of {cfg_file} is invalid!")
+        cfg.update(upd_cfg)
+        return cfg
+
+    def __init__(self, cfg_schema: typing.Dict, deprecated: typing.Dict[str, str]):
+        """Construtor of class Config.
+
+        :param cfg_schema: Schema of the configuration
+        :param deprecated: dictionary that maps deprecated configuration names to a messages
+
+        These values are needed for validation, see :py:obj:`validate`.
+
+        """
+        self.cfg_schema = cfg_schema
+        self.deprecated = deprecated
+        self.cfg = copy.deepcopy(cfg_schema)
+
+    def __getitem__(self, key: str):
+        return self.get(key)
+
+    def validate(self, cfg: dict):
+        """Validation of dictionary ``cfg`` on :py:obj:`Config.SCHEMA`.
+        Validation is done by :py:obj:`validate`."""
+
+        return validate(self.cfg_schema, cfg, self.deprecated)
+
+    def update(self, upd_cfg: dict):
+        """Update this configuration by ``upd_cfg``."""
+
+        dict_deepupdate(self.cfg, upd_cfg)
+
+    def default(self, name: str):
+        """Returns default value of field ``name`` in ``self.cfg_schema``."""
+        return value(name, self.cfg_schema)
+
+    def get(self, name: str, default=UNSET, replace=True):
+        """Returns the value to which ``name`` points in the configuration.
+
+        If there is no such ``name`` in the config and the ``default`` is
+        :py:obj:`UNSET`, a :py:obj:`KeyError` is raised.
+        """
+
+        parent = self._get_parent_dict(name)
+        val = parent.get(name.split('.')[-1], UNSET)
+        if val is UNSET:
+            if default is UNSET:
+                raise KeyError(name)
+            val = default
+
+        if replace and isinstance(val, str):
+            val = val % self
+        return val
+
+    def set(self, name: str, val):
+        """Set the value to which ``name`` points in the configuration.
+
+        If there is no such ``name`` in the config, a :py:obj:`KeyError` is
+        raised.
+        """
+        parent = self._get_parent_dict(name)
+        parent[name.split('.')[-1]] = val
+
+    def _get_parent_dict(self, name):
+        parent_name = '.'.join(name.split('.')[:-1])
+        if parent_name:
+            parent = value(parent_name, self.cfg)
+        else:
+            parent = self.cfg
+        if (parent is UNSET) or (not isinstance(parent, dict)):
+            raise KeyError(parent_name)
+        return parent
+
+    def path(self, name: str, default=UNSET):
+        """Get a :py:class:`pathlib.Path` object from a config string."""
+
+        val = self.get(name, default)
+        if val is UNSET:
+            if default is UNSET:
+                raise KeyError(name)
+            return default
+        return pathlib.Path(str(val))
+
+    def pyobj(self, name, default=UNSET):
+        """Get python object refered by full qualiffied name (FQN) in the config
+        string."""
+
+        fqn = self.get(name, default)
+        if fqn is UNSET:
+            if default is UNSET:
+                raise KeyError(name)
+            return default
+        (modulename, name) = str(fqn).rsplit('.', 1)
+        m = __import__(modulename, {}, {}, [name], 0)
+        return getattr(m, name)
+
+
+# working with dictionaries
+
+
+def value(name: str, data_dict: dict):
+    """Returns the value to which ``name`` points in the ``dat_dict``.
+
+    .. code:: python
+
+        >>> data_dict = {
+                "foo": {"bar": 1 },
+                "bar": {"foo": 2 },
+                "foobar": [1, 2, 3],
+            }
+        >>> value('foobar', data_dict)
+        [1, 2, 3]
+        >>> value('foo.bar', data_dict)
+        1
+        >>> value('foo.bar.xxx', data_dict)
+        <UNSET>
+
+    """
+
+    ret_val = data_dict
+    for part in name.split('.'):
+        if isinstance(ret_val, dict):
+            ret_val = ret_val.get(part, UNSET)
+        if ret_val is UNSET:
+            break
+    return ret_val
+
+
+def validate(
+    schema_dict: typing.Dict, data_dict: typing.Dict, deprecated: typing.Dict[str, str]
+) -> typing.Tuple[bool, list]:
+
+    """Deep validation of dictionary in ``data_dict`` against dictionary in
+    ``schema_dict``.  Argument deprecated is a dictionary that maps deprecated
+    configuration names to messages::
+
+        deprecated = {
+            "foo.bar" : "config 'foo.bar' is deprecated, use 'bar.foo'",
+            "..."     : "..."
+        }
+
+    The function returns a python tuple ``(is_valid, issue_list)``:
+
+    ``is_valid``:
+      A bool value indicating ``data_dict`` is valid or not.
+
+    ``issue_list``:
+      A list of messages (:py:obj:`SchemaIssue`) from the validation::
+
+          [schema warn] data_dict: deprecated 'fontlib.foo': <DEPRECATED['foo.bar']>
+          [schema invalid] data_dict: key unknown 'fontlib.foo'
+          [schema invalid] data_dict: type mismatch 'fontlib.foo': expected ..., is ...
+
+    If ``schema_dict`` or ``data_dict`` is not a dictionary type a
+    :py:obj:`SchemaIssue` is raised.
+
+    """
+    names = []
+    is_valid = True
+    issue_list = []
+
+    if not isinstance(schema_dict, dict):
+        raise SchemaIssue('invalid', "schema_dict is not a dict type")
+    if not isinstance(data_dict, dict):
+        raise SchemaIssue('invalid', f"data_dict issue{'.'.join(names)} is not a dict type")
+
+    is_valid, issue_list = _validate(names, issue_list, schema_dict, data_dict, deprecated)
+    return is_valid, issue_list
+
+
+def _validate(
+    names: typing.List,
+    issue_list: typing.List,
+    schema_dict: typing.Dict,
+    data_dict: typing.Dict,
+    deprecated: typing.Dict[str, str],
+) -> typing.Tuple[bool, typing.List]:
+
+    is_valid = True
+
+    for key, data_value in data_dict.items():
+
+        names.append(key)
+        name = '.'.join(names)
+
+        deprecated_msg = deprecated.get(name)
+        # print("XXX %s: key %s //   data_value: %s" % (name, key, data_value))
+        if deprecated_msg:
+            issue_list.append(SchemaIssue('warn', f"data_dict '{name}': deprecated - {deprecated_msg}"))
+
+        schema_value = value(name, schema_dict)
+        # print("YYY %s: key %s // schema_value: %s" % (name, key, schema_value))
+        if schema_value is UNSET:
+            if not deprecated_msg:
+                issue_list.append(SchemaIssue('invalid', f"data_dict '{name}': key unknown in schema_dict"))
+                is_valid = False
+
+        elif type(schema_value) != type(data_value):  # pylint: disable=unidiomatic-typecheck
+            issue_list.append(
+                SchemaIssue(
+                    'invalid',
+                    (f"data_dict: type mismatch '{name}':" f" expected {type(schema_value)}, is: {type(data_value)}"),
+                )
+            )
+            is_valid = False
+
+        elif isinstance(data_value, dict):
+            _valid, _ = _validate(names, issue_list, schema_dict, data_value, deprecated)
+            is_valid = is_valid and _valid
+        names.pop()
+
+    return is_valid, issue_list
+
+
+def dict_deepupdate(base_dict: dict, upd_dict: dict, names=None):
+    """Deep-update of dictionary in ``base_dict`` by dictionary in ``upd_dict``.
+
+    For each ``upd_key`` & ``upd_val`` pair in ``upd_dict``:
+
+    0. If types of ``base_dict[upd_key]`` and ``upd_val`` do not match raise a
+       :py:obj:`TypeError`.
+
+    1. If ``base_dict[upd_key]`` is a dict: recursively deep-update it by ``upd_val``.
+
+    2. If ``base_dict[upd_key]`` does not exist: set ``base_dict[upd_key]`` from a
+       (deep-) copy of ``upd_val``.
+
+    3. If ``upd_val`` is a list, extend list in ``base_dict[upd_key]`` by the
+       list in ``upd_val``.
+
+    4. If ``upd_val`` is a set, update set in ``base_dict[upd_key]`` by set in
+       ``upd_val``.
+    """
+    # pylint: disable=too-many-branches
+    if not isinstance(base_dict, dict):
+        raise TypeError("argument 'base_dict' is not a ditionary type")
+    if not isinstance(upd_dict, dict):
+        raise TypeError("argument 'upd_dict' is not a ditionary type")
+
+    if names is None:
+        names = []
+
+    for upd_key, upd_val in upd_dict.items():
+        # For each upd_key & upd_val pair in upd_dict:
+
+        if isinstance(upd_val, dict):
+
+            if upd_key in base_dict:
+                # if base_dict[upd_key] exists, recursively deep-update it
+                if not isinstance(base_dict[upd_key], dict):
+                    raise TypeError(f"type mismatch {'.'.join(names)}: is not a dict type in base_dict")
+                dict_deepupdate(
+                    base_dict[upd_key],
+                    upd_val,
+                    names
+                    + [
+                        upd_key,
+                    ],
+                )
+
+            else:
+                # if base_dict[upd_key] does not exist, set base_dict[upd_key] from a deep copy of upd_val
+                base_dict[upd_key] = copy.deepcopy(upd_val)
+
+        elif isinstance(upd_val, list):
+
+            if upd_key in base_dict:
+                # if base_dict[upd_key] exists, base_dict[upd_key] is extended by
+                # the list from upd_val
+                if not isinstance(base_dict[upd_key], list):
+                    raise TypeError(f"type mismatch {'.'.join(names)}: is not a list type in base_dict")
+                base_dict[upd_key].extend(upd_val)
+
+            else:
+                # if base_dict[upd_key] doesn't exist, set base_dict[upd_key] from a deepcopy of the
+                # list in upd_val.
+                base_dict[upd_key] = copy.deepcopy(upd_val)
+
+        elif isinstance(upd_val, set):
+
+            if upd_key in base_dict:
+                # if base_dict[upd_key] exists, base_dict[upd_key] is updated by the set in upd_val
+                if not isinstance(base_dict[upd_key], set):
+                    raise TypeError(f"type mismatch {'.'.join(names)}: is not a set type in base_dict")
+                base_dict[upd_key].update(upd_val.copy())
+
+            else:
+                # if base_dict[upd_key] doesn't exist, set base_dict[upd_key] from a copy of the
+                # set in upd_val
+                base_dict[upd_key] = upd_val.copy()
+
+        else:
+            # for any other type of upd_val replace or add base_dict[upd_key] by a copy
+            # of upd_val
+            base_dict[upd_key] = copy.copy(upd_val)
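
A short usage sketch of the new Config class (illustrative only, with a made-up schema, not part of this commit):

    from searx.tools import config

    # hypothetical schema with one boolean option
    schema = {'botdetection': {'ip_limit': {'link_token': True}}}
    cfg = config.Config(cfg_schema=schema, deprecated={})

    # validate a (partial) user configuration, then deep-update the defaults
    user_cfg = {'botdetection': {'ip_limit': {'link_token': False}}}
    is_valid, issues = cfg.validate(user_cfg)
    if is_valid:
        cfg.update(user_cfg)

    cfg['botdetection.ip_limit.link_token']             # --> False (updated value)
    cfg.default('botdetection.ip_limit.link_token')     # --> True  (schema default)
    cfg.get('botdetection.ip_limit.foo', default=None)  # --> None instead of KeyError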