hostnames.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. # pylint: disable=too-many-branches
  3. """In addition to rewriting/replacing result URLs, the *hostnames* plugin offers
  4. other features.
  5. - ``hostnames.replace``: A **mapping** of regular expressions to hostnames to be
  6. replaced by other hostnames.
  7. .. code:: yaml
  8. hostnames:
  9. replace:
  10. '(.*\\.)?youtube\\.com$': 'invidious.example.com'
  11. '(.*\\.)?youtu\\.be$': 'invidious.example.com'
  12. ...
  13. - ``hostnames.remove``: A **list** of regular expressions of the hostnames whose
  14. results should be removed from the results list.
  15. .. code:: yaml
  16. hostnames:
  17. remove:
  18. - '(.*\\.)?facebook.com$'
  19. - ...
  20. - ``hostnames.high_priority``: A **list** of regular expressions for hostnames
  21. whose result should be given higher priority. The results from these hosts are
  22. arranged higher in the results list.
  23. .. code:: yaml
  24. hostnames:
  25. high_priority:
  26. - '(.*\\.)?wikipedia.org$'
  27. - ...
  28. - ``hostnames.low_priority``: A **list** of regular expressions for hostnames
  29. whose result should be given lower priority. The results from these hosts are
  30. arranged lower in the results list.
  31. .. code:: yaml
  32. hostnames:
  33. low_priority:
  34. - '(.*\\.)?google(\\..*)?$'
  35. - ...
  36. If the URL matches the pattern of ``high_priority`` AND ``low_priority``, the
  37. higher priority wins over the lower priority.
  38. Alternatively, you can also specify a file name for the **mappings** or
  39. **lists** to load these from an external file:
  40. .. code:: yaml
  41. hostnames:
  42. replace: 'rewrite-hosts.yml'
  43. remove:
  44. - '(.*\\.)?facebook.com$'
  45. - ...
  46. low_priority:
  47. - '(.*\\.)?google(\\..*)?$'
  48. - ...
  49. high_priority:
  50. - '(.*\\.)?wikipedia.org$'
  51. - ...
  52. The ``rewrite-hosts.yml`` from the example above must be in the folder in which
  53. the ``settings.yml`` file is already located (``/etc/searxng``). The file then
  54. only contains the lists or the mapping tables without further information on the
  55. namespaces. In the example above, this would be a mapping table that looks
  56. something like this:
  57. .. code:: yaml
  58. '(.*\\.)?youtube\\.com$': 'invidious.example.com'
  59. '(.*\\.)?youtu\\.be$': 'invidious.example.com'
  60. """
  61. import re
  62. from urllib.parse import urlunparse, urlparse
  63. from flask_babel import gettext
  64. from searx import settings
  65. from searx.plugins import logger
  66. from searx.settings_loader import get_yaml_cfg
  67. name = gettext('Hostnames plugin')
  68. description = gettext('Rewrite hostnames, remove results or prioritize them based on the hostname')
  69. default_on = False
  70. preference_section = 'general'
  71. plugin_id = 'hostnames'
  72. logger = logger.getChild(plugin_id)
  73. parsed = 'parsed_url'
  74. _url_fields = ['iframe_src', 'audio_src']
  75. def _load_regular_expressions(settings_key):
  76. setting_value = settings.get(plugin_id, {}).get(settings_key)
  77. if not setting_value:
  78. return {}
  79. # load external file with configuration
  80. if isinstance(setting_value, str):
  81. setting_value = get_yaml_cfg(setting_value)
  82. if isinstance(setting_value, list):
  83. return {re.compile(r) for r in setting_value}
  84. if isinstance(setting_value, dict):
  85. return {re.compile(p): r for (p, r) in setting_value.items()}
  86. return {}
  87. replacements = _load_regular_expressions('replace')
  88. removables = _load_regular_expressions('remove')
  89. high_priority = _load_regular_expressions('high_priority')
  90. low_priority = _load_regular_expressions('low_priority')
  91. def _matches_parsed_url(result, pattern):
  92. return parsed in result and pattern.search(result[parsed].netloc)
  93. def on_result(_request, _search, result):
  94. for pattern, replacement in replacements.items():
  95. if _matches_parsed_url(result, pattern):
  96. # logger.debug(result['url'])
  97. result[parsed] = result[parsed]._replace(netloc=pattern.sub(replacement, result[parsed].netloc))
  98. result['url'] = urlunparse(result[parsed])
  99. # logger.debug(result['url'])
  100. for url_field in _url_fields:
  101. if not result.get(url_field):
  102. continue
  103. url_src = urlparse(result[url_field])
  104. if pattern.search(url_src.netloc):
  105. url_src = url_src._replace(netloc=pattern.sub(replacement, url_src.netloc))
  106. result[url_field] = urlunparse(url_src)
  107. for pattern in removables:
  108. if _matches_parsed_url(result, pattern):
  109. return False
  110. for url_field in _url_fields:
  111. if not result.get(url_field):
  112. continue
  113. url_src = urlparse(result[url_field])
  114. if pattern.search(url_src.netloc):
  115. del result[url_field]
  116. for pattern in low_priority:
  117. if _matches_parsed_url(result, pattern):
  118. result['priority'] = 'low'
  119. for pattern in high_priority:
  120. if _matches_parsed_url(result, pattern):
  121. result['priority'] = 'high'
  122. return True