# https_rewrite.py
'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with searx. If not, see < http://www.gnu.org/licenses/ >.

(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''
  14. import re
  15. import sys
  16. from lxml import etree
  17. from os import listdir, environ
  18. from os.path import isfile, isdir, join
  19. from searx.plugins import logger
  20. from flask_babel import gettext
  21. from searx import searx_dir
  22. from searx.url_utils import urlparse
  23. if sys.version_info[0] == 3:
  24. unicode = str
  25. name = "HTTPS rewrite"
  26. description = gettext('Rewrite HTTP links to HTTPS if possible')
  27. default_on = True
  28. if 'SEARX_HTTPS_REWRITE_PATH' in environ:
  29. rules_path = environ['SEARX_rules_path']
  30. else:
  31. rules_path = join(searx_dir, 'plugins/https_rules')
  32. logger = logger.getChild("https_rewrite")
  33. # https://gitweb.torproject.org/\
  34. # pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules
  35. # HTTPS rewrite rules
  36. https_rules = []
  37. # load single ruleset from a xml file
  38. def load_single_https_ruleset(rules_path):
  39. ruleset = ()
  40. # init parser
  41. parser = etree.XMLParser()
  42. # load and parse xml-file
  43. try:
  44. tree = etree.parse(rules_path, parser)
  45. except:
  46. # TODO, error message
  47. return ()
  48. # get root node
  49. root = tree.getroot()
  50. # check if root is a node with the name ruleset
  51. # TODO improve parsing
  52. if root.tag != 'ruleset':
  53. return ()
  54. # check if rule is deactivated by default
  55. if root.attrib.get('default_off'):
  56. return ()
  57. # check if rule does only work for specific platforms
  58. if root.attrib.get('platform'):
  59. return ()
  60. hosts = []
  61. rules = []
  62. exclusions = []
  63. # parse childs from ruleset
  64. for ruleset in root:
  65. # this child define a target
  66. if ruleset.tag == 'target':
  67. # check if required tags available
  68. if not ruleset.attrib.get('host'):
  69. continue
  70. # convert host-rule to valid regex
  71. host = ruleset.attrib.get('host')\
  72. .replace('.', r'\.').replace('*', '.*')
  73. # append to host list
  74. hosts.append(host)
  75. # this child define a rule
  76. elif ruleset.tag == 'rule':
  77. # check if required tags available
  78. if not ruleset.attrib.get('from')\
  79. or not ruleset.attrib.get('to'):
  80. continue
  81. # TODO hack, which convert a javascript regex group
  82. # into a valid python regex group
  83. rule_from = ruleset.attrib['from'].replace('$', '\\')
  84. if rule_from.endswith('\\'):
  85. rule_from = rule_from[:-1] + '$'
  86. rule_to = ruleset.attrib['to'].replace('$', '\\')
  87. if rule_to.endswith('\\'):
  88. rule_to = rule_to[:-1] + '$'
  89. # TODO, not working yet because of the hack above,
  90. # currently doing that in webapp.py
  91. # rule_from_rgx = re.compile(rule_from, re.I)
  92. # append rule
  93. try:
  94. rules.append((re.compile(rule_from, re.I | re.U), rule_to))
  95. except:
  96. # TODO log regex error
  97. continue
  98. # this child define an exclusion
  99. elif ruleset.tag == 'exclusion':
  100. # check if required tags available
  101. if not ruleset.attrib.get('pattern'):
  102. continue
  103. exclusion_rgx = re.compile(ruleset.attrib.get('pattern'))
  104. # append exclusion
  105. exclusions.append(exclusion_rgx)
  106. # convert list of possible hosts to a simple regex
  107. # TODO compress regex to improve performance
  108. try:
  109. target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)
  110. except:
  111. return ()
  112. # return ruleset
  113. return (target_hosts, rules, exclusions)
  114. # load all https rewrite rules
  115. def load_https_rules(rules_path):
  116. # check if directory exists
  117. if not isdir(rules_path):
  118. logger.error("directory not found: '" + rules_path + "'")
  119. return
  120. # search all xml files which are stored in the https rule directory
  121. xml_files = [join(rules_path, f)
  122. for f in listdir(rules_path)
  123. if isfile(join(rules_path, f)) and f[-4:] == '.xml']
  124. # load xml-files
  125. for ruleset_file in xml_files:
  126. # calculate rewrite-rules
  127. ruleset = load_single_https_ruleset(ruleset_file)
  128. # skip if no ruleset returned
  129. if not ruleset:
  130. continue
  131. # append ruleset
  132. https_rules.append(ruleset)
  133. logger.info('{n} rules loaded'.format(n=len(https_rules)))
  134. def https_url_rewrite(result):
  135. skip_https_rewrite = False
  136. # check if HTTPS rewrite is possible
  137. for target, rules, exclusions in https_rules:
  138. # check if target regex match with url
  139. if target.match(result['parsed_url'].netloc):
  140. # process exclusions
  141. for exclusion in exclusions:
  142. # check if exclusion match with url
  143. if exclusion.match(result['url']):
  144. skip_https_rewrite = True
  145. break
  146. # skip https rewrite if required
  147. if skip_https_rewrite:
  148. break
  149. # process rules
  150. for rule in rules:
  151. try:
  152. new_result_url = rule[0].sub(rule[1], result['url'])
  153. except:
  154. break
  155. # parse new url
  156. new_parsed_url = urlparse(new_result_url)
  157. # continiue if nothing was rewritten
  158. if result['url'] == new_result_url:
  159. continue
  160. # get domainname from result
  161. # TODO, does only work correct with TLD's like
  162. # asdf.com, not for asdf.com.de
  163. # TODO, using publicsuffix instead of this rewrite rule
  164. old_result_domainname = '.'.join(
  165. result['parsed_url'].hostname.split('.')[-2:])
  166. new_result_domainname = '.'.join(
  167. new_parsed_url.hostname.split('.')[-2:])
  168. # check if rewritten hostname is the same,
  169. # to protect against wrong or malicious rewrite rules
  170. if old_result_domainname == new_result_domainname:
  171. # set new url
  172. result['url'] = new_result_url
  173. # target has matched, do not search over the other rules
  174. break
  175. return result
  176. def on_result(request, search, result):
  177. if result['parsed_url'].scheme == 'http':
  178. https_url_rewrite(result)
  179. return True
# build the rule list once at import time so the plugin is ready immediately
load_https_rules(rules_path)