hostnames.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. # pylint: disable=too-many-branches
"""In addition to rewriting/replacing result URLs, the *hostnames* plugin offers
other features.

.. attention::

   The 'Hostnames plugin' from `PR-3463
   <https://github.com/searxng/searxng/pull/3463>`_ is a rewrite of the
   'Hostname replace' plugin.  Backwards compatibility is guaranteed for a
   transitional period, but this will end soon.

   **To maintainers of SearXNG instances, please modify your old plugin config
   to the new format.**

- ``hostnames.replace``: A **mapping** of regular expressions to hostnames to be
  replaced by other hostnames.

  .. code:: yaml

     hostnames:
       replace:
         '(.*\\.)?youtube\\.com$': 'invidious.example.com'
         '(.*\\.)?youtu\\.be$': 'invidious.example.com'
         ...

- ``hostnames.remove``: A **list** of regular expressions of the hostnames whose
  results should be removed from the results list.

  .. code:: yaml

     hostnames:
       remove:
         - '(.*\\.)?facebook.com$'
         - ...

- ``hostnames.high_priority``: A **list** of regular expressions for hostnames
  whose results should be given higher priority.  The results from these hosts
  are arranged higher in the results list.

  .. code:: yaml

     hostnames:
       high_priority:
         - '(.*\\.)?wikipedia.org$'
         - ...

- ``hostnames.low_priority``: A **list** of regular expressions for hostnames
  whose results should be given lower priority.  The results from these hosts
  are arranged lower in the results list.

  .. code:: yaml

     hostnames:
       low_priority:
         - '(.*\\.)?google(\\..*)?$'
         - ...

If the URL matches the pattern of ``high_priority`` AND ``low_priority``, the
higher priority wins over the lower priority.

Alternatively, you can also specify a file name for the **mappings** or
**lists** to load these from an external file:

.. code:: yaml

   hostnames:
     replace: 'rewrite-hosts.yml'
     remove:
       - '(.*\\.)?facebook.com$'
       - ...
     low_priority:
       - '(.*\\.)?google(\\..*)?$'
       - ...
     high_priority:
       - '(.*\\.)?wikipedia.org$'
       - ...

The ``rewrite-hosts.yml`` from the example above must be in the folder in which
the ``settings.yml`` file is already located (``/etc/searxng``).  The file then
only contains the lists or the mapping tables without further information on the
namespaces.  In the example above, this would be a mapping table that looks
something like this:

.. code:: yaml

   '(.*\\.)?youtube\\.com$': 'invidious.example.com'
   '(.*\\.)?youtu\\.be$': 'invidious.example.com'

"""
  68. import re
  69. from urllib.parse import urlunparse, urlparse
  70. from flask_babel import gettext
  71. from searx import settings
  72. from searx.plugins import logger
  73. from searx.settings_loader import get_yaml_file
  74. name = gettext('Hostnames plugin')
  75. description = gettext('Rewrite hostnames, remove results or prioritize them based on the hostname')
  76. default_on = False
  77. preference_section = 'general'
  78. plugin_id = 'hostnames'
  79. logger = logger.getChild(plugin_id)
  80. parsed = 'parsed_url'
  81. _url_fields = ['iframe_src', 'audio_src']
  82. def _load_regular_expressions(settings_key):
  83. setting_value = settings.get(plugin_id, {}).get(settings_key)
  84. if not setting_value:
  85. return {}
  86. # load external file with configuration
  87. if isinstance(setting_value, str):
  88. setting_value = get_yaml_file(setting_value)
  89. if isinstance(setting_value, list):
  90. return {re.compile(r) for r in setting_value}
  91. if isinstance(setting_value, dict):
  92. return {re.compile(p): r for (p, r) in setting_value.items()}
  93. return {}
  94. # compatibility fallback for old hostname replace plugin
  95. # TODO: remove in the future once most/all instance maintainers finished migrating # pylint: disable=fixme
  96. def _load_regular_expressions_with_fallback(settings_key):
  97. expressions = _load_regular_expressions(settings_key)
  98. if expressions:
  99. return expressions
  100. # fallback to the old `hostname_replace` settings format
  101. # pylint: disable=import-outside-toplevel, cyclic-import
  102. hostname_replace_config = settings.get('hostname_replace', {})
  103. if hostname_replace_config:
  104. from searx.plugins.hostname_replace import deprecated_msg
  105. deprecated_msg()
  106. if settings_key == 'replace':
  107. return {re.compile(p): r for (p, r) in hostname_replace_config.items() if r}
  108. return {re.compile(p) for (p, r) in hostname_replace_config.items() if not r}
  109. replacements = _load_regular_expressions_with_fallback('replace')
  110. removables = _load_regular_expressions_with_fallback('remove')
  111. high_priority = _load_regular_expressions('high_priority')
  112. low_priority = _load_regular_expressions('low_priority')
  113. def _matches_parsed_url(result, pattern):
  114. return parsed in result and pattern.search(result[parsed].netloc)
  115. def on_result(_request, _search, result):
  116. for pattern, replacement in replacements.items():
  117. if _matches_parsed_url(result, pattern):
  118. logger.debug(result['url'])
  119. result[parsed] = result[parsed]._replace(netloc=pattern.sub(replacement, result[parsed].netloc))
  120. result['url'] = urlunparse(result[parsed])
  121. logger.debug(result['url'])
  122. for url_field in _url_fields:
  123. if not result.get(url_field):
  124. continue
  125. url_src = urlparse(result[url_field])
  126. if pattern.search(url_src.netloc):
  127. url_src = url_src._replace(netloc=pattern.sub(replacement, url_src.netloc))
  128. result[url_field] = urlunparse(url_src)
  129. for pattern in removables:
  130. if _matches_parsed_url(result, pattern):
  131. return False
  132. for url_field in _url_fields:
  133. if not result.get(url_field):
  134. continue
  135. url_src = urlparse(result[url_field])
  136. if pattern.search(url_src.netloc):
  137. del result[url_field]
  138. for pattern in low_priority:
  139. if _matches_parsed_url(result, pattern):
  140. result['priority'] = 'low'
  141. for pattern in high_priority:
  142. if _matches_parsed_url(result, pattern):
  143. result['priority'] = 'high'
  144. return True