hostnames.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. # pylint: disable=too-many-branches
  3. """
  4. .. attention::
  5. The **"Hostname replace"** plugin has been replaced by **"Hostnames
  6. plugin"**, see :pull:`3463` & :pull:`3552`.
  7. The **Hostnames plugin** can be enabled by adding it to the
  8. ``enabled_plugins`` **list** in the ``settings.yml`` like so.
  9. .. code:: yaml
  10. enabled_plugins:
  11. - 'Hostnames plugin'
  12. ...
  13. - ``hostnames.replace``: A **mapping** of regular expressions to hostnames to be
  14. replaced by other hostnames.
  15. .. code:: yaml
  16. hostnames:
  17. replace:
  18. '(.*\\.)?youtube\\.com$': 'invidious.example.com'
  19. '(.*\\.)?youtu\\.be$': 'invidious.example.com'
  20. ...
  21. - ``hostnames.remove``: A **list** of regular expressions of the hostnames whose
  22. results should be removed from the results list.
  23. .. code:: yaml
  24. hostnames:
  25. remove:
  26. - '(.*\\.)?facebook.com$'
  27. - ...
  28. - ``hostnames.high_priority``: A **list** of regular expressions for hostnames
  29. whose result should be given higher priority. The results from these hosts are
  30. arranged higher in the results list.
  31. .. code:: yaml
  32. hostnames:
  33. high_priority:
  34. - '(.*\\.)?wikipedia.org$'
  35. - ...
  36. - ``hostnames.low_priority``: A **list** of regular expressions for hostnames
  37. whose result should be given lower priority. The results from these hosts are
  38. arranged lower in the results list.
  39. .. code:: yaml
  40. hostnames:
  41. low_priority:
  42. - '(.*\\.)?google(\\..*)?$'
  43. - ...
  44. If the URL matches the pattern of ``high_priority`` AND ``low_priority``, the
  45. higher priority wins over the lower priority.
  46. Alternatively, you can also specify a file name for the **mappings** or
  47. **lists** to load these from an external file:
  48. .. code:: yaml
  49. hostnames:
  50. replace: 'rewrite-hosts.yml'
  51. remove:
  52. - '(.*\\.)?facebook.com$'
  53. - ...
  54. low_priority:
  55. - '(.*\\.)?google(\\..*)?$'
  56. - ...
  57. high_priority:
  58. - '(.*\\.)?wikipedia.org$'
  59. - ...
  60. The ``rewrite-hosts.yml`` from the example above must be in the folder in which
  61. the ``settings.yml`` file is already located (``/etc/searxng``). The file then
  62. only contains the lists or the mapping tables without further information on the
  63. namespaces. In the example above, this would be a mapping table that looks
  64. something like this:
  65. .. code:: yaml
  66. '(.*\\.)?youtube\\.com$': 'invidious.example.com'
  67. '(.*\\.)?youtu\\.be$': 'invidious.example.com'
  68. """
  69. import re
  70. from urllib.parse import urlunparse, urlparse
  71. from flask_babel import gettext
  72. from searx import settings
  73. from searx.plugins import logger
  74. from searx.settings_loader import get_yaml_cfg
  75. name = gettext('Hostnames plugin')  # display name; same text as the ``enabled_plugins`` entry shown in the module docstring
  76. description = gettext('Rewrite hostnames, remove results or prioritize them based on the hostname')  # short summary, wrapped in gettext
  77. default_on = False  # NOTE(review): presumably read by the plugin loader (plugin stays off unless enabled) -- confirm
  78. preference_section = 'general'  # NOTE(review): presumably the preferences section this plugin is listed under -- confirm
  79. plugin_id = 'hostnames'  # also used as the settings namespace and as the child logger name below
  80. logger = logger.getChild(plugin_id)  # rebinds the imported ``searx.plugins`` logger to a child named after this plugin
  81. parsed = 'parsed_url'  # result key under which the parsed URL of a result is looked up
  82. _url_fields = ['iframe_src', 'audio_src']  # additional result fields holding URLs that get the replace / remove treatment
  83. def _load_regular_expressions(settings_key):
  84. setting_value = settings.get(plugin_id, {}).get(settings_key)
  85. if not setting_value:
  86. return {}
  87. # load external file with configuration
  88. if isinstance(setting_value, str):
  89. setting_value = get_yaml_cfg(setting_value)
  90. if isinstance(setting_value, list):
  91. return {re.compile(r) for r in setting_value}
  92. if isinstance(setting_value, dict):
  93. return {re.compile(p): r for (p, r) in setting_value.items()}
  94. return {}
  95. replacements = _load_regular_expressions('replace')  # compiled pattern -> replacement hostname; iterated with ``.items()`` in on_result
  96. removables = _load_regular_expressions('remove')  # compiled patterns; a result whose host matches is dropped by on_result
  97. high_priority = _load_regular_expressions('high_priority')  # compiled patterns; matching results get priority 'high'
  98. low_priority = _load_regular_expressions('low_priority')  # compiled patterns; matching results get priority 'low'
  99. def _matches_parsed_url(result, pattern):
  100. return parsed in result and pattern.search(result[parsed].netloc)
  101. def on_result(_request, _search, result):
  102. for pattern, replacement in replacements.items():
  103. if _matches_parsed_url(result, pattern):
  104. # logger.debug(result['url'])
  105. result[parsed] = result[parsed]._replace(netloc=pattern.sub(replacement, result[parsed].netloc))
  106. result['url'] = urlunparse(result[parsed])
  107. # logger.debug(result['url'])
  108. for url_field in _url_fields:
  109. if not result.get(url_field):
  110. continue
  111. url_src = urlparse(result[url_field])
  112. if pattern.search(url_src.netloc):
  113. url_src = url_src._replace(netloc=pattern.sub(replacement, url_src.netloc))
  114. result[url_field] = urlunparse(url_src)
  115. for pattern in removables:
  116. if _matches_parsed_url(result, pattern):
  117. return False
  118. for url_field in _url_fields:
  119. if not result.get(url_field):
  120. continue
  121. url_src = urlparse(result[url_field])
  122. if pattern.search(url_src.netloc):
  123. del result[url_field]
  124. for pattern in low_priority:
  125. if _matches_parsed_url(result, pattern):
  126. result['priority'] = 'low'
  127. for pattern in high_priority:
  128. if _matches_parsed_url(result, pattern):
  129. result['priority'] = 'high'
  130. return True