http_user_agent.py 1.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. # lint: pylint
  3. """
  4. Method ``http_user_agent``
  5. --------------------------
  6. The ``http_user_agent`` method evaluates a request as the request of a bot if
  7. the User-Agent_ header is unset or matches the regular expression
  8. :py:obj:`USER_AGENT`.
  9. .. _User-Agent:
  10. https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent
  11. """
  12. # pylint: disable=unused-argument
  13. from typing import Optional
  14. import re
  15. import flask
  16. import werkzeug
  17. from searx.tools import config
  18. from ._helpers import too_many_requests
  19. USER_AGENT = (
  20. r'('
  21. + r'unknown'
  22. + r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp'
  23. + r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy'
  24. + r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! Slurp|MJ12bot|AhrefsBot|archive.org_bot|msnbot'
  25. + r'|MJ12bot|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot'
  26. + r'|ZmEu|BLEXBot|bitlybot'
  27. # unmaintained Farside instances
  28. + r'|'
  29. + re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)')
  30. # other bots and client to block
  31. + '|.*PetalBot.*'
  32. + r')'
  33. )
  34. """Regular expression that matches to User-Agent_ from known *bots*"""
  35. _regexp = None
  36. def regexp_user_agent():
  37. global _regexp # pylint: disable=global-statement
  38. if not _regexp:
  39. _regexp = re.compile(USER_AGENT)
  40. return _regexp
  41. def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]:
  42. user_agent = request.headers.get('User-Agent', 'unknown')
  43. if regexp_user_agent().match(user_agent):
  44. return too_many_requests(request, f"bot detected, HTTP header User-Agent: {user_agent}")
  45. return None