Browse Source

[fix] limiter: replace real_ip by IPv4/v6 network

Closes: https://github.com/searxng/searxng/issues/2477
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Markus Heiser 1 year ago
parent
commit
281e36f4b7

+ 1 - 0
searx/botdetection/__init__.py

@@ -24,3 +24,4 @@ X-Forwarded-For
 
 from ._helpers import dump_request
 from ._helpers import get_real_ip
+from ._helpers import too_many_requests

+ 36 - 8
searx/botdetection/_helpers.py

@@ -1,11 +1,19 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # lint: pylint
 # pylint: disable=missing-module-docstring, invalid-name
-
-from typing import Optional
+from __future__ import annotations
+
+from ipaddress import (
+    IPv4Network,
+    IPv6Network,
+    IPv6Address,
+    ip_address,
+    ip_network,
+)
 import flask
 import werkzeug
 
+from searx.tools import config
 from searx import logger
 
 logger = logger.getChild('botdetection')
@@ -13,7 +21,7 @@ logger = logger.getChild('botdetection')
 
 def dump_request(request: flask.Request):
     return (
-        "%s: %s" % (get_real_ip(request), request.path)
+        request.path
         + " || X-Forwarded-For: %s" % request.headers.get('X-Forwarded-For')
         + " || X-Real-IP: %s" % request.headers.get('X-Real-IP')
         + " || form: %s" % request.form
@@ -27,12 +35,30 @@ def dump_request(request: flask.Request):
     )
 
 
-def too_many_requests(request: flask.Request, log_msg: str) -> Optional[werkzeug.Response]:
-    log_prefix = 'BLOCK %s: ' % get_real_ip(request)
-    logger.debug(log_prefix + log_msg)
+def too_many_requests(network: IPv4Network | IPv6Network, log_msg: str) -> werkzeug.Response | None:
+    """Returns an HTTP 429 response object and writes a DEBUG message to the
+    'botdetection' logger.  This function is used in part by the filter methods
+    to return the default ``Too Many Requests`` response.
+
+    """
+
+    logger.debug("BLOCK %s: %s", network.compressed, log_msg)
     return flask.make_response(('Too Many Requests', 429))
 
 
+def get_network(real_ip: str, cfg: config.Config) -> IPv4Network | IPv6Network:
+    """Returns the (client) network that the real_ip is part of."""
+
+    ip = ip_address(real_ip)
+    if isinstance(ip, IPv6Address):
+        prefix = cfg['real_ip.ipv6_prefix']
+    else:
+        prefix = cfg['real_ip.ipv4_prefix']
+    network = ip_network(f"{real_ip}/{prefix}", strict=False)
+    # logger.debug("get_network(): %s", network.compressed)
+    return network
+
+
 def get_real_ip(request: flask.Request) -> str:
     """Returns real IP of the request.  Since not all proxies set all the HTTP
     headers and incoming headers can be faked it may happen that the IP cannot
@@ -63,7 +89,9 @@ def get_real_ip(request: flask.Request) -> str:
     forwarded_for = request.headers.get("X-Forwarded-For")
     real_ip = request.headers.get('X-Real-IP')
     remote_addr = request.remote_addr
-    logger.debug("X-Forwarded-For: %s || X-Real-IP: %s || request.remote_addr: %s", forwarded_for, real_ip, remote_addr)
+    # logger.debug(
+    #     "X-Forwarded-For: %s || X-Real-IP: %s || request.remote_addr: %s", forwarded_for, real_ip, remote_addr
+    # )
 
     if not forwarded_for:
         logger.error("X-Forwarded-For header is not set!")
@@ -89,5 +117,5 @@ def get_real_ip(request: flask.Request) -> str:
         logger.warning("IP from WSGI environment (%s) is not equal to IP from X-Real-IP (%s)", remote_addr, real_ip)
 
     request_ip = forwarded_for or real_ip or remote_addr or '0.0.0.0'
-    logger.debug("get_real_ip() -> %s", request_ip)
+    # logger.debug("get_real_ip() -> %s", request_ip)
     return request_ip

+ 13 - 3
searx/botdetection/http_accept.py

@@ -15,7 +15,12 @@ Accept_ header ..
 """
 # pylint: disable=unused-argument
 
-from typing import Optional
+from __future__ import annotations
+from ipaddress import (
+    IPv4Network,
+    IPv6Network,
+)
+
 import flask
 import werkzeug
 
@@ -23,7 +28,12 @@ from searx.tools import config
 from ._helpers import too_many_requests
 
 
-def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]:
+def filter_request(
+    network: IPv4Network | IPv6Network,
+    request: flask.Request,
+    cfg: config.Config,
+) -> werkzeug.Response | None:
+
     if 'text/html' not in request.accept_mimetypes:
-        return too_many_requests(request, "HTTP header Accept did not contain text/html")
+        return too_many_requests(network, "HTTP header Accept did not contain text/html")
     return None

+ 13 - 3
searx/botdetection/http_accept_encoding.py

@@ -16,7 +16,12 @@ bot if the Accept-Encoding_ header ..
 """
 # pylint: disable=unused-argument
 
-from typing import Optional
+from __future__ import annotations
+from ipaddress import (
+    IPv4Network,
+    IPv6Network,
+)
+
 import flask
 import werkzeug
 
@@ -24,8 +29,13 @@ from searx.tools import config
 from ._helpers import too_many_requests
 
 
-def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]:
+def filter_request(
+    network: IPv4Network | IPv6Network,
+    request: flask.Request,
+    cfg: config.Config,
+) -> werkzeug.Response | None:
+
     accept_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')]
     if not ('gzip' in accept_list or 'deflate' in accept_list):
-        return too_many_requests(request, "HTTP header Accept-Encoding did not contain gzip nor deflate")
+        return too_many_requests(network, "HTTP header Accept-Encoding did not contain gzip nor deflate")
     return None

+ 11 - 3
searx/botdetection/http_accept_language.py

@@ -12,8 +12,12 @@ if the Accept-Language_ header is unset.
 
 """
 # pylint: disable=unused-argument
+from __future__ import annotations
+from ipaddress import (
+    IPv4Network,
+    IPv6Network,
+)
 
-from typing import Optional
 import flask
 import werkzeug
 
@@ -21,7 +25,11 @@ from searx.tools import config
 from ._helpers import too_many_requests
 
 
-def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]:
+def filter_request(
+    network: IPv4Network | IPv6Network,
+    request: flask.Request,
+    cfg: config.Config,
+) -> werkzeug.Response | None:
     if request.headers.get('Accept-Language', '').strip() == '':
-        return too_many_requests(request, "missing HTTP header Accept-Language")
+        return too_many_requests(network, "missing HTTP header Accept-Language")
     return None

+ 13 - 3
searx/botdetection/http_connection.py

@@ -13,7 +13,12 @@ the Connection_ header is set to ``close``.
 """
 # pylint: disable=unused-argument
 
-from typing import Optional
+from __future__ import annotations
+from ipaddress import (
+    IPv4Network,
+    IPv6Network,
+)
+
 import flask
 import werkzeug
 
@@ -21,7 +26,12 @@ from searx.tools import config
 from ._helpers import too_many_requests
 
 
-def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]:
+def filter_request(
+    network: IPv4Network | IPv6Network,
+    request: flask.Request,
+    cfg: config.Config,
+) -> werkzeug.Response | None:
+
     if request.headers.get('Connection', '').strip() == 'close':
-        return too_many_requests(request, "HTTP header 'Connection=close")
+        return too_many_requests(network, "HTTP header 'Connection=close")
     return None

+ 13 - 3
searx/botdetection/http_user_agent.py

@@ -14,8 +14,13 @@ the User-Agent_ header is unset or matches the regular expression
 """
 # pylint: disable=unused-argument
 
-from typing import Optional
+from __future__ import annotations
 import re
+from ipaddress import (
+    IPv4Network,
+    IPv6Network,
+)
+
 import flask
 import werkzeug
 
@@ -50,8 +55,13 @@ def regexp_user_agent():
     return _regexp
 
 
-def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]:
+def filter_request(
+    network: IPv4Network | IPv6Network,
+    request: flask.Request,
+    cfg: config.Config,
+) -> werkzeug.Response | None:
+
     user_agent = request.headers.get('User-Agent', 'unknown')
     if regexp_user_agent().match(user_agent):
-        return too_many_requests(request, f"bot detected, HTTP header User-Agent: {user_agent}")
+        return too_many_requests(network, f"bot detected, HTTP header User-Agent: {user_agent}")
     return None

+ 31 - 18
searx/botdetection/ip_limit.py

@@ -38,8 +38,12 @@ droped.
    https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For
 
 """
+from __future__ import annotations
+from ipaddress import (
+    IPv4Network,
+    IPv6Network,
+)
 
-from typing import Optional
 import flask
 import werkzeug
 from searx.tools import config
@@ -49,7 +53,7 @@ from searx import logger
 from searx.redislib import incr_sliding_window, drop_counter
 
 from . import link_token
-from ._helpers import too_many_requests, get_real_ip
+from ._helpers import too_many_requests
 
 
 logger = logger.getChild('botdetection.ip_limit')
@@ -85,49 +89,58 @@ SUSPICIOUS_IP_MAX = 3
 """Maximum requests from one suspicious IP in the :py:obj:`SUSPICIOUS_IP_WINDOW`."""
 
 
-def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]:
+def filter_request(
+    network: IPv4Network | IPv6Network,
+    request: flask.Request,
+    cfg: config.Config,
+) -> werkzeug.Response | None:
+
     # pylint: disable=too-many-return-statements
     redis_client = redisdb.client()
 
-    client_ip = get_real_ip(request)
+    if network.is_link_local and not cfg['botdetection.ip_limit.filter_link_local']:
+        logger.debug("network %s is link-local -> not monitored by ip_limit method", network.compressed)
+        return None
 
     if request.args.get('format', 'html') != 'html':
-        c = incr_sliding_window(redis_client, 'ip_limit.API_WONDOW:' + client_ip, API_WONDOW)
+        c = incr_sliding_window(redis_client, 'ip_limit.API_WONDOW:' + network.compressed, API_WONDOW)
         if c > API_MAX:
-            return too_many_requests(request, "too many request in API_WINDOW")
+            return too_many_requests(network, "too many request in API_WINDOW")
 
     if cfg['botdetection.ip_limit.link_token']:
 
-        suspicious = link_token.is_suspicious(request, True)
+        suspicious = link_token.is_suspicious(network, request, True)
 
         if not suspicious:
             # this IP is no longer suspicious: release ip again / delete the counter of this IP
-            drop_counter(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + client_ip)
+            drop_counter(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + network.compressed)
             return None
 
         # this IP is suspicious: count requests from this IP
-        c = incr_sliding_window(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + client_ip, SUSPICIOUS_IP_WINDOW)
+        c = incr_sliding_window(
+            redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + network.compressed, SUSPICIOUS_IP_WINDOW
+        )
         if c > SUSPICIOUS_IP_MAX:
-            logger.error("BLOCK: too many request from %s in SUSPICIOUS_IP_WINDOW (redirect to /)", client_ip)
+            logger.error("BLOCK: too many request from %s in SUSPICIOUS_IP_WINDOW (redirect to /)", network)
             return flask.redirect(flask.url_for('index'), code=302)
 
-        c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + client_ip, BURST_WINDOW)
+        c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + network.compressed, BURST_WINDOW)
         if c > BURST_MAX_SUSPICIOUS:
-            return too_many_requests(request, "too many request in BURST_WINDOW (BURST_MAX_SUSPICIOUS)")
+            return too_many_requests(network, "too many request in BURST_WINDOW (BURST_MAX_SUSPICIOUS)")
 
-        c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + client_ip, LONG_WINDOW)
+        c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + network.compressed, LONG_WINDOW)
         if c > LONG_MAX_SUSPICIOUS:
-            return too_many_requests(request, "too many request in LONG_WINDOW (LONG_MAX_SUSPICIOUS)")
+            return too_many_requests(network, "too many request in LONG_WINDOW (LONG_MAX_SUSPICIOUS)")
 
         return None
 
     # vanilla limiter without extensions counts BURST_MAX and LONG_MAX
-    c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + client_ip, BURST_WINDOW)
+    c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + network.compressed, BURST_WINDOW)
     if c > BURST_MAX:
-        return too_many_requests(request, "too many request in BURST_WINDOW (BURST_MAX)")
+        return too_many_requests(network, "too many request in BURST_WINDOW (BURST_MAX)")
 
-    c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + client_ip, LONG_WINDOW)
+    c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + network.compressed, LONG_WINDOW)
     if c > LONG_MAX:
-        return too_many_requests(request, "too many request in LONG_WINDOW (LONG_MAX)")
+        return too_many_requests(network, "too many request in LONG_WINDOW (LONG_MAX)")
 
     return None

+ 27 - 34
searx/botdetection/limiter.py

@@ -37,14 +37,16 @@ and set the redis-url connection. Check the value, it depends on your redis DB
 
 """
 
-from typing import Optional, Tuple
+from __future__ import annotations
+
 from pathlib import Path
 import flask
-import pytomlpp as toml
+import werkzeug
 
-from searx import logger
 from searx.tools import config
-from searx.botdetection import (
+from searx import logger
+
+from . import (
     http_accept,
     http_accept_encoding,
     http_accept_language,
@@ -53,6 +55,16 @@ from searx.botdetection import (
     ip_limit,
 )
 
+from ._helpers import (
+    get_network,
+    get_real_ip,
+    dump_request,
+)
+
+logger = logger.getChild('botdetection.limiter')
+
+CFG: config.Config = None  # type: ignore
+
 LIMITER_CFG_SCHEMA = Path(__file__).parent / "limiter.toml"
 """Base configuration (schema) of the botdetection."""
 
@@ -63,40 +75,21 @@ CFG_DEPRECATED = {
     # "dummy.old.foo": "config 'dummy.old.foo' exists only for tests.  Don't use it in your real project config."
 }
 
-CFG = None
-
 
 def get_cfg() -> config.Config:
+    global CFG  # pylint: disable=global-statement
     if CFG is None:
-        init_cfg(logger)
+        CFG = config.Config.from_toml(LIMITER_CFG_SCHEMA, LIMITER_CFG, CFG_DEPRECATED)
     return CFG
 
 
-def init_cfg(log):
-    global CFG  # pylint: disable=global-statement
-    CFG = config.Config(cfg_schema=toml.load(LIMITER_CFG_SCHEMA), deprecated=CFG_DEPRECATED)
-
-    if not LIMITER_CFG.exists():
-        log.warning("missing config file: %s", LIMITER_CFG)
-        return
-
-    log.info("load config file: %s", LIMITER_CFG)
-    try:
-        upd_cfg = toml.load(LIMITER_CFG)
-    except toml.DecodeError as exc:
-        msg = str(exc).replace('\t', '').replace('\n', ' ')
-        log.error("%s: %s", LIMITER_CFG, msg)
-        raise
+def filter_request(request: flask.Request) -> werkzeug.Response | None:
 
-    is_valid, issue_list = CFG.validate(upd_cfg)
-    for msg in issue_list:
-        log.error(str(msg))
-    if not is_valid:
-        raise TypeError(f"schema of {LIMITER_CFG} is invalid, can't cutomize limiter configuration from!")
-    CFG.update(upd_cfg)
-
-
-def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
+    cfg = get_cfg()
+    real_ip = get_real_ip(request)
+    network = get_network(real_ip, cfg)
+    if network.is_link_local:
+        return None
 
     if request.path == '/healthz':
         return None
@@ -104,7 +97,7 @@ def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
     for func in [
         http_user_agent,
     ]:
-        val = func.filter_request(request, CFG)
+        val = func.filter_request(network, request, cfg)
         if val is not None:
             return val
 
@@ -118,8 +111,8 @@ def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
             http_user_agent,
             ip_limit,
         ]:
-            val = func.filter_request(request, CFG)
+            val = func.filter_request(network, request, cfg)
             if val is not None:
                 return val
-
+    logger.debug(f"OK {network}: %s", dump_request(flask.request))
     return None

+ 17 - 3
searx/botdetection/limiter.toml

@@ -1,8 +1,22 @@
+[real_ip]
+
+# Number of values to trust for X-Forwarded-For.
+
+x_for = 1
+
+# The prefix defines the number of leading bits in an address that are compared
+# to determine whether or not an address is part of a (client) network.
+
+ipv4_prefix = 32
+ipv6_prefix = 48
+
 [botdetection.ip_limit]
 
+# To get unlimited access in a local network, by default link-local addresses
+# (networks) are not monitored by the ip_limit method
+filter_link_local = false
+
+# activate the link_token method in the ip_limit method
 link_token = false
 
-[real_ip]
 
-# Number of values to trust for X-Forwarded-For.
-x_for = 1

+ 32 - 22
searx/botdetection/link_token.py

@@ -6,7 +6,7 @@ Method ``link_token``
 
 The ``link_token`` method evaluates a request as :py:obj:`suspicious
 <is_suspicious>` if the URL ``/client<token>.css`` is not requested by the
-client.  By adding a random component (the token) in the URL a bot can not send
+client.  By adding a random component (the token) in the URL, a bot can not send
 a ping by request a static URL.
 
 .. note::
@@ -35,6 +35,11 @@ And in the HTML template from flask a stylesheet link is needed (the value of
    https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For
 
 """
+from __future__ import annotations
+from ipaddress import (
+    IPv4Network,
+    IPv6Network,
+)
 
 import string
 import random
@@ -43,7 +48,11 @@ import flask
 from searx import logger
 from searx import redisdb
 from searx.redislib import secret_hash
-from ._helpers import get_real_ip
+
+from ._helpers import (
+    get_network,
+    get_real_ip,
+)
 
 TOKEN_LIVE_TIME = 600
 """Livetime (sec) of limiter's CSS token."""
@@ -60,29 +69,26 @@ TOKEN_KEY = 'SearXNG_limiter.token'
 logger = logger.getChild('botdetection.link_token')
 
 
-def is_suspicious(request: flask.Request, renew: bool = False):
-    """Checks if there is a valid ping for this request, if not this request is
-    rated as *suspicious*.  If a valid ping exists and argument ``renew`` is
-    ``True`` the expire time of this ping is reset to :py:obj:`PING_LIVE_TIME`.
+def is_suspicious(network: IPv4Network | IPv6Network, request: flask.Request, renew: bool = False):
+    """Checks whether a valid ping exists for this (client) network, if not
+    this request is rated as *suspicious*.  If a valid ping exists and argument
+    ``renew`` is ``True`` the expire time of this ping is reset to
+    :py:obj:`PING_LIVE_TIME`.
 
     """
     redis_client = redisdb.client()
     if not redis_client:
         return False
 
-    ping_key = get_ping_key(request)
+    ping_key = get_ping_key(network, request)
     if not redis_client.get(ping_key):
-        logger.warning(
-            "missing ping (IP: %s) / request: %s",
-            get_real_ip(request),
-            ping_key,
-        )
+        logger.warning("missing ping (IP: %s) / request: %s", network.compressed, ping_key)
         return True
 
     if renew:
         redis_client.set(ping_key, 1, ex=PING_LIVE_TIME)
 
-    logger.debug("found ping for client request: %s", ping_key)
+    logger.debug("found ping for (client) network %s -> %s", network.compressed, ping_key)
     return False
 
 
@@ -92,27 +98,31 @@ def ping(request: flask.Request, token: str):
     The expire time of this ping-key is :py:obj:`PING_LIVE_TIME`.
 
     """
+    from . import limiter  # pylint: disable=import-outside-toplevel, cyclic-import
+
     redis_client = redisdb.client()
     if not redis_client:
         return
     if not token_is_valid(token):
         return
-    ping_key = get_ping_key(request)
-    logger.debug("store ping for: %s", ping_key)
-    redis_client.set(ping_key, 1, ex=PING_LIVE_TIME)
 
+    cfg = limiter.get_cfg()
+    real_ip = get_real_ip(request)
+    network = get_network(real_ip, cfg)
 
-def get_ping_key(request: flask.Request):
-    """Generates a hashed key that fits (more or less) to a client (request).
-    At least X-Forwarded-For_ is needed to be able to assign the request to an
-    IP.
+    ping_key = get_ping_key(network, request)
+    logger.debug("store ping_key for (client) network %s (IP %s) -> %s", network.compressed, real_ip, ping_key)
+    redis_client.set(ping_key, 1, ex=PING_LIVE_TIME)
 
-    """
+
+def get_ping_key(network: IPv4Network | IPv6Network, request: flask.Request) -> str:
+    """Generates a hashed key that fits (more or less) to a *WEB-browser
+    session* in a network."""
     return (
         PING_KEY
         + "["
         + secret_hash(
-            get_real_ip(request) + request.headers.get('Accept-Language', '') + request.headers.get('User-Agent', '')
+            network.compressed + request.headers.get('Accept-Language', '') + request.headers.get('User-Agent', '')
         )
         + "]"
     )

+ 1 - 6
searx/plugins/limiter.py

@@ -8,7 +8,6 @@ import flask
 from searx import redisdb
 from searx.plugins import logger
 from searx.botdetection import limiter
-from searx.botdetection import dump_request
 
 name = "Request limiter"
 description = "Limit the number of request"
@@ -20,10 +19,7 @@ logger = logger.getChild('limiter')
 
 def pre_request():
     """See :ref:`flask.Flask.before_request`"""
-    ret_val = limiter.filter_request(flask.request)
-    if ret_val is None:
-        logger.debug("OK: %s" % dump_request(flask.request))
-    return ret_val
+    return limiter.filter_request(flask.request)
 
 
 def init(app: flask.Flask, settings) -> bool:
@@ -32,6 +28,5 @@ def init(app: flask.Flask, settings) -> bool:
     if not redisdb.client():
         logger.error("The limiter requires Redis")
         return False
-    limiter.init_cfg(logger)
     app.before_request(pre_request)
     return True