Browse Source

[mod] botdetection - improve ip_limit and link_token methods

- counting requests in LONG_WINDOW and BURST_WINDOW is not needed when the
  request is validated by the link_token method [1]

- renew the ping-key on validation [2]; this is needed for infinite scrolling,
  where no new token (CSS) is loaded.  Note: this does not fix the BURST_MAX
  issue in the vanilla limiter

- normalize the counter names of the ip_limit method to 'ip_limit.*'

- integrate the ip_limit method straightforwardly in the limiter plugin, without
  intermediate code: ip_limit now returns None or a werkzeug.Response object
  that the plugin can pass on to the flask application (before, intermediate
  code was needed to handle a returned tuple)

[1] https://github.com/searxng/searxng/pull/2357#issuecomment-1566113277
[2] https://github.com/searxng/searxng/pull/2357#discussion_r1208542206
[3] https://github.com/searxng/searxng/pull/2357#issuecomment-1566125979

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Markus Heiser 1 year ago
parent
commit
b8c7c2c9aa

+ 1 - 15
searx/botdetection/__init__.py

@@ -9,18 +9,4 @@ The methods implemented in this python package are use by the :ref:`limiter src`
 
 """
 
-import flask
-
-
-def dump_request(request: flask.Request):
-    return (
-        "%s: '%s'" % (request.headers.get('X-Forwarded-For'), request.path)
-        + " || form: %s" % request.form
-        + " || Accept: %s" % request.headers.get('Accept')
-        + " || Accept-Language: %s" % request.headers.get('Accept-Language')
-        + " || Accept-Encoding: %s" % request.headers.get('Accept-Encoding')
-        + " || Content-Type: %s" % request.headers.get('Content-Type')
-        + " || Content-Length: %s" % request.headers.get('Content-Length')
-        + " || Connection: %s" % request.headers.get('Connection')
-        + " || User-Agent: %s" % request.headers.get('User-Agent')
-    )
+from ._helpers import dump_request

+ 93 - 0
searx/botdetection/_helpers.py

@@ -0,0 +1,93 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+# pylint: disable=missing-module-docstring, invalid-name
+
+from typing import Optional
+import flask
+import werkzeug
+
+from searx import logger
+
+logger = logger.getChild('botdetection')
+
+
+def dump_request(request: flask.Request) -> str:
+    """Return a one-line summary of ``request`` for log messages.
+
+    The summary starts with the IP determined by :py:obj:`get_real_ip` and the
+    request path, followed by the form data and a selection of HTTP headers,
+    each separated by ``||``.
+    """
+    return (
+        "%s: %s" % (get_real_ip(request), request.path)
+        + " || X-Forwarded-For: %s" % request.headers.get('X-Forwarded-For')
+        + " || X-Real-IP: %s" % request.headers.get('X-Real-IP')
+        + " || form: %s" % request.form
+        + " || Accept: %s" % request.headers.get('Accept')
+        + " || Accept-Language: %s" % request.headers.get('Accept-Language')
+        + " || Accept-Encoding: %s" % request.headers.get('Accept-Encoding')
+        + " || Content-Type: %s" % request.headers.get('Content-Type')
+        + " || Content-Length: %s" % request.headers.get('Content-Length')
+        + " || Connection: %s" % request.headers.get('Connection')
+        + " || User-Agent: %s" % request.headers.get('User-Agent')
+    )
+
+
+def too_many_requests(request: flask.Request, log_msg: str) -> Optional[werkzeug.Response]:
+    """Log ``log_msg`` at debug level, prefixed with ``BLOCK <ip>:``, and return
+    a ``429 Too Many Requests`` response.
+
+    The IP in the log prefix is the one determined by :py:obj:`get_real_ip`.
+    """
+    log_prefix = 'BLOCK %s: ' % get_real_ip(request)
+    logger.debug(log_prefix + log_msg)
+    return flask.make_response(('Too Many Requests', 429))
+
+
+def get_real_ip(request: flask.Request) -> str:
+    """Returns real IP of the request.  Since not all proxies set all the HTTP
+    headers and incoming headers can be faked it may happen that the IP cannot
+    be determined correctly.
+
+    .. sidebar:: :py:obj:`flask.Request.remote_addr`
+
+       SearXNG uses Werkzeug's ProxyFix_ (with its default ``x_for=1``).
+
+    This function tries to get the remote IP in the order listed below.
+    Additionally, some tests are done and if inconsistencies or errors are
+    detected, they are logged.
+
+    The remote IP of the request is taken from (first match):
+
+    - X-Forwarded-For_ header
+    - `X-Real-IP header <https://github.com/searxng/searxng/issues/1237#issuecomment-1147564516>`__
+    - :py:obj:`flask.Request.remote_addr`
+
+    If none of them yields a value, ``'0.0.0.0'`` is returned.
+
+    .. _ProxyFix:
+       https://werkzeug.palletsprojects.com/middleware/proxy_fix/
+
+    .. _X-Forwarded-For:
+      https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For
+
+    """
+
+    forwarded_for = request.headers.get("X-Forwarded-For")
+    real_ip = request.headers.get('X-Real-IP')
+    remote_addr = request.remote_addr
+    logger.debug("X-Forwarded-For: %s || X-Real-IP: %s || request.remote_addr: %s", forwarded_for, real_ip, remote_addr)
+
+    if not forwarded_for:
+        logger.error("X-Forwarded-For header is not set!")
+    else:
+        # imported here and not at module level to avoid a cyclic import
+        from .limiter import get_cfg  # pylint: disable=import-outside-toplevel, cyclic-import
+
+        # NOTE(review): assumes get_cfg() returns a loaded config here -- confirm it cannot be None
+        forwarded_for = [x.strip() for x in forwarded_for.split(',')]
+        x_for: int = get_cfg()['real_ip.x_for']
+        # pick the x_for-th entry counted from the right, clamped to the first entry of the list
+        forwarded_for = forwarded_for[-min(len(forwarded_for), x_for)]
+
+    if not real_ip:
+        logger.error("X-Real-IP header is not set!")
+
+    # the checks below only log inconsistencies, they do not change the result
+    if forwarded_for and real_ip and forwarded_for != real_ip:
+        logger.warning("IP from X-Real-IP (%s) is not equal to IP from X-Forwarded-For (%s)", real_ip, forwarded_for)
+
+    if forwarded_for and remote_addr and forwarded_for != remote_addr:
+        logger.warning(
+            "IP from WSGI environment (%s) is not equal to IP from X-Forwarded-For (%s)", remote_addr, forwarded_for
+        )
+
+    if real_ip and remote_addr and real_ip != remote_addr:
+        logger.warning("IP from WSGI environment (%s) is not equal to IP from X-Real-IP (%s)", remote_addr, real_ip)
+
+    # first non-empty value wins, '0.0.0.0' is the last resort
+    request_ip = forwarded_for or real_ip or remote_addr or '0.0.0.0'
+    logger.debug("get_real_ip() -> %s", request_ip)
+    return request_ip

+ 5 - 3
searx/botdetection/http_accept.py

@@ -15,13 +15,15 @@ Accept_ header ..
 """
 # pylint: disable=unused-argument
 
-from typing import Optional, Tuple
+from typing import Optional
 import flask
+import werkzeug
 
 from searx.tools import config
+from ._helpers import too_many_requests
 
 
-def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
+def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]:
     if 'text/html' not in request.accept_mimetypes:
-        return 429, "bot detected, HTTP header Accept did not contain text/html"
+        return too_many_requests(request, "HTTP header Accept did not contain text/html")
     return None

+ 5 - 3
searx/botdetection/http_accept_encoding.py

@@ -16,14 +16,16 @@ bot if the Accept-Encoding_ header ..
 """
 # pylint: disable=unused-argument
 
-from typing import Optional, Tuple
+from typing import Optional
 import flask
+import werkzeug
 
 from searx.tools import config
+from ._helpers import too_many_requests
 
 
-def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
+def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]:
     accept_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')]
     if not ('gzip' in accept_list or 'deflate' in accept_list):
-        return 429, "bot detected, HTTP header Accept-Encoding did not contain gzip nor deflate"
+        return too_many_requests(request, "HTTP header Accept-Encoding did not contain gzip nor deflate")
     return None

+ 5 - 3
searx/botdetection/http_accept_language.py

@@ -13,13 +13,15 @@ if the Accept-Language_ header is unset.
 """
 # pylint: disable=unused-argument
 
-from typing import Optional, Tuple
+from typing import Optional
 import flask
+import werkzeug
 
 from searx.tools import config
+from ._helpers import too_many_requests
 
 
-def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
+def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]:
     if request.headers.get('Accept-Language', '').strip() == '':
-        return 429, "bot detected, missing HTTP header Accept-Language"
+        return too_many_requests(request, "missing HTTP header Accept-Language")
     return None

+ 5 - 3
searx/botdetection/http_connection.py

@@ -13,13 +13,15 @@ the Connection_ header is set to ``close``.
 """
 # pylint: disable=unused-argument
 
-from typing import Optional, Tuple
+from typing import Optional
 import flask
+import werkzeug
 
 from searx.tools import config
+from ._helpers import too_many_requests
 
 
-def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
+def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]:
+    """Return a ``429`` response if the ``Connection`` header of ``request`` is
+    ``close``, otherwise ``None``.  Argument ``cfg`` is not used by this method."""
     if request.headers.get('Connection', '').strip() == 'close':
-        return 429, "bot detected, HTTP header 'Connection=close'"
+        return too_many_requests(request, "HTTP header 'Connection=close'")
     return None

+ 5 - 6
searx/botdetection/http_user_agent.py

@@ -14,11 +14,13 @@ the User-Agent_ header is unset or matches the regular expression
 """
 # pylint: disable=unused-argument
 
-from typing import Optional, Tuple
+from typing import Optional
 import re
 import flask
+import werkzeug
 
 from searx.tools import config
+from ._helpers import too_many_requests
 
 
 USER_AGENT = (
@@ -48,11 +50,8 @@ def regexp_user_agent():
     return _regexp
 
 
-def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
+def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]:
     user_agent = request.headers.get('User-Agent', 'unknown')
     if regexp_user_agent().match(user_agent):
-        return (
-            429,
-            f"bot detected, HTTP header User-Agent: {user_agent}",
-        )
+        return too_many_requests(request, f"bot detected, HTTP header User-Agent: {user_agent}")
     return None

+ 33 - 28
searx/botdetection/ip_limit.py

@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
 """.. _botdetection.ip_limit:
 
 Method ``ip_limit``
@@ -37,16 +39,18 @@ droped.
 
 """
 
-from typing import Optional, Tuple
+from typing import Optional
 import flask
+import werkzeug
 from searx.tools import config
 
-
 from searx import redisdb
 from searx import logger
 from searx.redislib import incr_sliding_window, drop_counter
 
 from . import link_token
+from ._helpers import too_many_requests
+
 
 logger = logger.getChild('botdetection.ip_limit')
 
@@ -81,50 +85,51 @@ SUSPICIOUS_IP_MAX = 3
 """Maximum requests from one suspicious IP in the :py:obj:`SUSPICIOUS_IP_WINDOW`."""
 
 
-def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
+def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]:
+    """Rate-limit ``request`` with sliding-window counters stored in redis.
+
+    The counters are keyed by ``ip_limit.<WINDOW>:<X-Forwarded-For header>``.
+    Returns ``None`` if the request passes.  Otherwise a response is returned:
+    ``429`` when the API, BURST or LONG limit is exceeded, or a redirect to the
+    index page when a suspicious IP exceeds :py:obj:`SUSPICIOUS_IP_MAX`.
+
+    If ``botdetection.ip_limit.link_token`` is enabled and the request is not
+    rated as suspicious, the BURST and LONG windows are not counted.
+    """
+    # pylint: disable=too-many-return-statements
     redis_client = redisdb.client()
 
-    x_forwarded_for = request.headers.get('X-Forwarded-For', '')
-    if not x_forwarded_for:
+    client_ip = request.headers.get('X-Forwarded-For', '')
+    if not client_ip:
         logger.error("missing HTTP header X-Forwarded-For")
 
     if request.args.get('format', 'html') != 'html':
-        c = incr_sliding_window(redis_client, 'IP limit - API_WONDOW:' + x_forwarded_for, API_WONDOW)
+        c = incr_sliding_window(redis_client, 'ip_limit.API_WONDOW:' + client_ip, API_WONDOW)
         if c > API_MAX:
-            return 429, "BLOCK %s: API limit exceeded"
-
-    suspicious = False
-    suspicious_ip_counter = 'IP limit - SUSPICIOUS_IP_WINDOW:' + x_forwarded_for
+            return too_many_requests(request, "too many request in API_WINDOW")
 
     if cfg['botdetection.ip_limit.link_token']:
-        suspicious = link_token.is_suspicious(request)
 
-    if suspicious:
+        suspicious = link_token.is_suspicious(request, True)
+
+        if not suspicious:
+            # this IP is no longer suspicious: release ip again / delete the counter of this IP
+            drop_counter(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW:' + client_ip)
+            return None
 
         # this IP is suspicious: count requests from this IP
-        c = incr_sliding_window(redis_client, suspicious_ip_counter, SUSPICIOUS_IP_WINDOW)
+        c = incr_sliding_window(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW:' + client_ip, SUSPICIOUS_IP_WINDOW)
         if c > SUSPICIOUS_IP_MAX:
-            return 429, f"bot detected, too many request from {x_forwarded_for} in SUSPICIOUS_IP_WINDOW"
+            logger.error("BLOCK: too many request from %s in SUSPICIOUS_IP_WINDOW (redirect to /)", client_ip)
+            return flask.redirect(flask.url_for('index'), code=302)
 
-        c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW)
+        c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW:' + client_ip, BURST_WINDOW)
         if c > BURST_MAX_SUSPICIOUS:
-            return 429, f"bot detected, too many request from {x_forwarded_for} in BURST_MAX_SUSPICIOUS"
+            return too_many_requests(request, "too many request in BURST_WINDOW (BURST_MAX_SUSPICIOUS)")
 
-        c = incr_sliding_window(redis_client, 'IP limit - LONG_WINDOW:' + x_forwarded_for, LONG_WINDOW)
+        c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW:' + client_ip, LONG_WINDOW)
         if c > LONG_MAX_SUSPICIOUS:
-            return 429, f"bot detected, too many request from {x_forwarded_for} in LONG_MAX_SUSPICIOUS"
+            return too_many_requests(request, "too many request in LONG_WINDOW (LONG_MAX_SUSPICIOUS)")
 
-    else:
+        return None
 
-        if cfg['botdetection.ip_limit.link_token']:
-            # this IP is no longer suspicious: release ip again / delete the counter of this IP
-            drop_counter(redis_client, suspicious_ip_counter)
+    # vanilla limiter without extensions counts BURST_MAX and LONG_MAX
+    c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW:' + client_ip, BURST_WINDOW)
+    if c > BURST_MAX:
+        return too_many_requests(request, "too many request in BURST_WINDOW (BURST_MAX)")
 
-        c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW)
-        if c > BURST_MAX:
-            return 429, f"bot detected, too many request from {x_forwarded_for} in BURST_MAX"
+    c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW:' + client_ip, LONG_WINDOW)
+    if c > LONG_MAX:
+        return too_many_requests(request, "too many request in LONG_WINDOW (LONG_MAX)")
 
-        c = incr_sliding_window(redis_client, 'IP limit - LONG_WINDOW:' + x_forwarded_for, LONG_WINDOW)
-        if c > LONG_MAX:
-            return 429, f"bot detected, too many request from {x_forwarded_for} in LONG_MAX"
     return None

+ 9 - 2
searx/botdetection/limiter.py

@@ -42,6 +42,7 @@ from pathlib import Path
 import flask
 import pytomlpp as toml
 
+from searx import logger
 from searx.tools import config
 from searx.botdetection import (
     http_accept,
@@ -62,7 +63,13 @@ CFG_DEPRECATED = {
     # "dummy.old.foo": "config 'dummy.old.foo' exists only for tests.  Don't use it in your real project config."
 }
 
-CFG = config.Config({}, {})
+CFG = None
+
+
+def get_cfg() -> config.Config:
+    """Return the module-level limiter configuration ``CFG``; if it is still
+    ``None``, :py:obj:`init_cfg` is called first."""
+    if CFG is None:
+        init_cfg(logger)
+    # NOTE(review): this still returns None if init_cfg() leaves CFG unset -- confirm against init_cfg
+    return CFG
 
 
 def init_cfg(log):
@@ -73,7 +80,7 @@ def init_cfg(log):
         log.warning("missing config file: %s", LIMITER_CFG)
         return
 
-    log.warning("load config file: %s", LIMITER_CFG)
+    log.info("load config file: %s", LIMITER_CFG)
     try:
         upd_cfg = toml.load(LIMITER_CFG)
     except toml.DecodeError as exc:

+ 32 - 11
searx/botdetection/link_token.py

@@ -47,15 +47,24 @@ from searx.redislib import secret_hash
 TOKEN_LIVE_TIME = 600
 """Livetime (sec) of limiter's CSS token."""
 
+PING_LIVE_TIME = 3600
+"""Livetime (sec) of the ping-key from a client (request)"""
+
 PING_KEY = 'SearXNG_limiter.ping'
+"""Prefix of all ping-keys generated by :py:obj:`get_ping_key`"""
+
 TOKEN_KEY = 'SearXNG_limiter.token'
+"""Key for which the current token is stored in the DB"""
 
 logger = logger.getChild('botdetection.link_token')
 
 
-def is_suspicious(request: flask.Request):
+def is_suspicious(request: flask.Request, renew: bool = False):
     """Checks if there is a valid ping for this request, if not this request is
-    rated as *suspicious*"""
+    rated as *suspicious*.  If a valid ping exists and argument ``renew`` is
+    ``True`` the expire time of this ping is reset to :py:obj:`PING_LIVE_TIME`.
+
+    """
     redis_client = redisdb.client()
     if not redis_client:
         return False
@@ -69,12 +78,19 @@ def is_suspicious(request: flask.Request):
         )
         return True
 
-    logger.debug("found ping for this request: %s", ping_key)
+    if renew:
+        redis_client.set(ping_key, 1, ex=PING_LIVE_TIME)
+
+    logger.debug("found ping for client request: %s", ping_key)
     return False
 
 
 def ping(request: flask.Request, token: str):
-    """This function is called by a request to URL ``/client<token>.css``"""
+    """This function is called by a request to URL ``/client<token>.css``.  If
+    ``token`` is valid a :py:obj:`PING_KEY` for the client is stored in the DB.
+    The expire time of this ping-key is :py:obj:`PING_LIVE_TIME`.
+
+    """
     redis_client = redisdb.client()
     if not redis_client:
         return
@@ -82,19 +98,24 @@ def ping(request: flask.Request, token: str):
         return
     ping_key = get_ping_key(request)
     logger.debug("store ping for: %s", ping_key)
-    redis_client.set(ping_key, 1, ex=TOKEN_LIVE_TIME)
+    redis_client.set(ping_key, 1, ex=PING_LIVE_TIME)
 
 
 def get_ping_key(request: flask.Request):
-    """Generates a hashed key that fits (more or less) to a request.  At least
-    X-Forwarded-For_ is needed to be able to assign the request to an IP.
+    """Generates a hashed key that fits (more or less) to a client (request).
+    At least X-Forwarded-For_ is needed to be able to assign the request to an
+    IP.
 
     """
-    return secret_hash(
+    return (
         PING_KEY
-        + request.headers.get('X-Forwarded-For', '')
-        + request.headers.get('Accept-Language', '')
-        + request.headers.get('User-Agent', '')
+        + "["
+        + secret_hash(
+            request.headers.get('X-Forwarded-For', '')
+            + request.headers.get('Accept-Language', '')
+            + request.headers.get('User-Agent', '')
+        )
+        + "]"
     )
 
 

+ 4 - 10
searx/plugins/limiter.py

@@ -20,16 +20,10 @@ logger = logger.getChild('limiter')
 
 def pre_request():
     """See :ref:`flask.Flask.before_request`"""
-
-    val = limiter.filter_request(flask.request)
-    if val is not None:
-        http_status, msg = val
-        client_ip = flask.request.headers.get('X-Forwarded-For', '<unknown>')
-        logger.error("BLOCK (IP %s): %s" % (client_ip, msg))
-        return 'Too Many Requests', http_status
-
-    logger.debug("OK: %s" % dump_request(flask.request))
-    return None
+    ret_val = limiter.filter_request(flask.request)
+    if ret_val is None:
+        logger.debug("OK: %s" % dump_request(flask.request))
+    return ret_val
 
 
 def init(app: flask.Flask, settings) -> bool: