hostnames.py 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. # pylint: disable=too-many-branches
  3. """In addition to rewriting/replacing result URLs, the *hostnames* plugin offers
  4. other features.
  5. .. attention::
  6. The 'Hostnames plugin' from `PR-3463
  7. <https://github.com/searxng/searxng/pull/3463>`_ is a rewrite of the
  8. 'Hostname replace' plugin. Backwards compatibility is guaranteed for a
  9. transitional period, but this will end soon.
  10. **To maintainers of SearXNG instances, please modify your old plugin config
  11. to the new.**
  12. - ``hostnames.replace``: A mapping of regular expressions to hostnames to be
  13. replaced by other hostnames.
  14. - ``hostnames.remove``: A list of regular expressions of the hostnames whose
  15. results should be removed from the results list.
  16. - ``hostnames.high_priority``: A list of regular expressions for hostnames whose
  17. result should be given higher priority. The results from these hosts are
  18. arranged higher in the results list.
  19. - ``hostnames.low_priority``: A list of regular expressions for hostnames
  20. whose result should be given lower priority. The results from these hosts are
  21. arranged lower in the results list.
  22. Alternatively, a file name can also be specified for the mappings or lists:
  23. .. code:: yaml
  24. hostnames:
  25. replace: 'rewrite-hosts.yml'
  26. remove:
  27. - '(.*\\.)?facebook.com$'
  28. ...
  29. low_priority:
  30. - '(.*\\.)?google(\\..*)?$'
  31. ...
  32. high_priority:
  33. - '(.*\\.)?wikipedia.org$'
  34. ...
  35. The ``rewrite-hosts.yml`` from the example above must be in the folder in which
  36. the ``settings.yml`` file is already located (``/etc/searxng``). The file then
  37. only contains the lists or the mapping tables without further information on the
  38. namespaces. In the example above, this would be a mapping table that looks
  39. something like this:
  40. .. code:: yaml
  41. '(.*\\.)?youtube\\.com$': 'invidious.example.com'
  42. '(.*\\.)?youtu\\.be$': 'invidious.example.com'
  43. """
  44. import re
  45. from urllib.parse import urlunparse, urlparse
  46. from flask_babel import gettext
  47. from searx import settings
  48. from searx.plugins import logger
  49. from searx.settings_loader import get_yaml_file
  50. name = gettext('Hostnames plugin')
  51. description = gettext('Rewrite hostnames, remove results or prioritize them based on the hostname')
  52. default_on = False
  53. preference_section = 'general'
  54. plugin_id = 'hostnames'
  55. logger = logger.getChild(plugin_id)
  56. parsed = 'parsed_url'
  57. _url_fields = ['iframe_src', 'audio_src']
  58. def _load_regular_expressions(settings_key):
  59. setting_value = settings.get(plugin_id, {}).get(settings_key)
  60. if not setting_value:
  61. return {}
  62. # load external file with configuration
  63. if isinstance(setting_value, str):
  64. setting_value = get_yaml_file(setting_value)
  65. if isinstance(setting_value, list):
  66. return {re.compile(r) for r in setting_value}
  67. if isinstance(setting_value, dict):
  68. return {re.compile(p): r for (p, r) in setting_value.items()}
  69. return {}
  70. # compatibility fallback for old hostname replace plugin
  71. # TODO: remove in the future once most/all instance maintainers finished migrating # pylint: disable=fixme
  72. def _load_regular_expressions_with_fallback(settings_key):
  73. expressions = _load_regular_expressions(settings_key)
  74. if expressions:
  75. return expressions
  76. # fallback to the old `hostname_replace` settings format
  77. # pylint: disable=import-outside-toplevel, cyclic-import
  78. hostname_replace_config = settings.get('hostname_replace', {})
  79. if hostname_replace_config:
  80. from searx.plugins.hostname_replace import deprecated_msg
  81. deprecated_msg()
  82. if settings_key == 'replace':
  83. return {re.compile(p): r for (p, r) in hostname_replace_config.items() if r}
  84. return {re.compile(p) for (p, r) in hostname_replace_config.items() if not r}
  85. replacements = _load_regular_expressions_with_fallback('replace')
  86. removables = _load_regular_expressions_with_fallback('remove')
  87. high_priority = _load_regular_expressions('high_priority')
  88. low_priority = _load_regular_expressions('low_priority')
  89. def _matches_parsed_url(result, pattern):
  90. return parsed in result and pattern.search(result[parsed].netloc)
  91. def on_result(_request, _search, result):
  92. for pattern, replacement in replacements.items():
  93. if _matches_parsed_url(result, pattern):
  94. logger.debug(result['url'])
  95. result[parsed] = result[parsed]._replace(netloc=pattern.sub(replacement, result[parsed].netloc))
  96. result['url'] = urlunparse(result[parsed])
  97. logger.debug(result['url'])
  98. for url_field in _url_fields:
  99. if not result.get(url_field):
  100. continue
  101. url_src = urlparse(result[url_field])
  102. if pattern.search(url_src.netloc):
  103. url_src = url_src._replace(netloc=pattern.sub(replacement, url_src.netloc))
  104. result[url_field] = urlunparse(url_src)
  105. for pattern in removables:
  106. if _matches_parsed_url(result, pattern):
  107. return False
  108. for url_field in _url_fields:
  109. if not result.get(url_field):
  110. continue
  111. url_src = urlparse(result[url_field])
  112. if pattern.search(url_src.netloc):
  113. del result[url_field]
  114. for pattern in low_priority:
  115. if _matches_parsed_url(result, pattern):
  116. result['priority'] = 'low'
  117. for pattern in high_priority:
  118. if _matches_parsed_url(result, pattern):
  119. result['priority'] = 'high'
  120. return True