https_rewrite.py 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230
  1. '''
  2. searx is free software: you can redistribute it and/or modify
  3. it under the terms of the GNU Affero General Public License as published by
  4. the Free Software Foundation, either version 3 of the License, or
  5. (at your option) any later version.
  6. searx is distributed in the hope that it will be useful,
  7. but WITHOUT ANY WARRANTY; without even the implied warranty of
  8. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  9. GNU Affero General Public License for more details.
  10. You should have received a copy of the GNU Affero General Public License
  11. along with searx. If not, see < http://www.gnu.org/licenses/ >.
  12. (C) 2013- by Adam Tauber, <asciimoo@gmail.com>
  13. '''
  14. import re
  15. from urlparse import urlparse
  16. from lxml import etree
  17. from os import listdir, environ
  18. from os.path import isfile, isdir, join
  19. from searx.plugins import logger
  20. from flask_babel import gettext
  21. from searx import searx_dir
  22. name = "HTTPS rewrite"
  23. description = gettext('Rewrite HTTP links to HTTPS if possible')
  24. default_on = True
  25. if 'SEARX_HTTPS_REWRITE_PATH' in environ:
  26. rules_path = environ['SEARX_rules_path']
  27. else:
  28. rules_path = join(searx_dir, 'plugins/https_rules')
  29. logger = logger.getChild("https_rewrite")
  30. # https://gitweb.torproject.org/\
  31. # pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules
  32. # HTTPS rewrite rules
  33. https_rules = []
  34. # load single ruleset from a xml file
  35. def load_single_https_ruleset(rules_path):
  36. ruleset = ()
  37. # init parser
  38. parser = etree.XMLParser()
  39. # load and parse xml-file
  40. try:
  41. tree = etree.parse(rules_path, parser)
  42. except:
  43. # TODO, error message
  44. return ()
  45. # get root node
  46. root = tree.getroot()
  47. # check if root is a node with the name ruleset
  48. # TODO improve parsing
  49. if root.tag != 'ruleset':
  50. return ()
  51. # check if rule is deactivated by default
  52. if root.attrib.get('default_off'):
  53. return ()
  54. # check if rule does only work for specific platforms
  55. if root.attrib.get('platform'):
  56. return ()
  57. hosts = []
  58. rules = []
  59. exclusions = []
  60. # parse childs from ruleset
  61. for ruleset in root:
  62. # this child define a target
  63. if ruleset.tag == 'target':
  64. # check if required tags available
  65. if not ruleset.attrib.get('host'):
  66. continue
  67. # convert host-rule to valid regex
  68. host = ruleset.attrib.get('host')\
  69. .replace('.', '\.').replace('*', '.*')
  70. # append to host list
  71. hosts.append(host)
  72. # this child define a rule
  73. elif ruleset.tag == 'rule':
  74. # check if required tags available
  75. if not ruleset.attrib.get('from')\
  76. or not ruleset.attrib.get('to'):
  77. continue
  78. # TODO hack, which convert a javascript regex group
  79. # into a valid python regex group
  80. rule_from = ruleset.attrib['from'].replace('$', '\\')
  81. if rule_from.endswith('\\'):
  82. rule_from = rule_from[:-1] + '$'
  83. rule_to = ruleset.attrib['to'].replace('$', '\\')
  84. if rule_to.endswith('\\'):
  85. rule_to = rule_to[:-1] + '$'
  86. # TODO, not working yet because of the hack above,
  87. # currently doing that in webapp.py
  88. # rule_from_rgx = re.compile(rule_from, re.I)
  89. # append rule
  90. try:
  91. rules.append((re.compile(rule_from, re.I | re.U), rule_to))
  92. except:
  93. # TODO log regex error
  94. continue
  95. # this child define an exclusion
  96. elif ruleset.tag == 'exclusion':
  97. # check if required tags available
  98. if not ruleset.attrib.get('pattern'):
  99. continue
  100. exclusion_rgx = re.compile(ruleset.attrib.get('pattern'))
  101. # append exclusion
  102. exclusions.append(exclusion_rgx)
  103. # convert list of possible hosts to a simple regex
  104. # TODO compress regex to improve performance
  105. try:
  106. target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)
  107. except:
  108. return ()
  109. # return ruleset
  110. return (target_hosts, rules, exclusions)
  111. # load all https rewrite rules
  112. def load_https_rules(rules_path):
  113. # check if directory exists
  114. if not isdir(rules_path):
  115. logger.error("directory not found: '" + rules_path + "'")
  116. return
  117. # search all xml files which are stored in the https rule directory
  118. xml_files = [join(rules_path, f)
  119. for f in listdir(rules_path)
  120. if isfile(join(rules_path, f)) and f[-4:] == '.xml']
  121. # load xml-files
  122. for ruleset_file in xml_files:
  123. # calculate rewrite-rules
  124. ruleset = load_single_https_ruleset(ruleset_file)
  125. # skip if no ruleset returned
  126. if not ruleset:
  127. continue
  128. # append ruleset
  129. https_rules.append(ruleset)
  130. logger.info('{n} rules loaded'.format(n=len(https_rules)))
  131. def https_url_rewrite(result):
  132. skip_https_rewrite = False
  133. # check if HTTPS rewrite is possible
  134. for target, rules, exclusions in https_rules:
  135. # check if target regex match with url
  136. if target.match(result['parsed_url'].netloc):
  137. # process exclusions
  138. for exclusion in exclusions:
  139. # check if exclusion match with url
  140. if exclusion.match(result['url']):
  141. skip_https_rewrite = True
  142. break
  143. # skip https rewrite if required
  144. if skip_https_rewrite:
  145. break
  146. # process rules
  147. for rule in rules:
  148. try:
  149. new_result_url = rule[0].sub(rule[1], result['url'])
  150. except:
  151. break
  152. # parse new url
  153. new_parsed_url = urlparse(new_result_url)
  154. # continiue if nothing was rewritten
  155. if result['url'] == new_result_url:
  156. continue
  157. # get domainname from result
  158. # TODO, does only work correct with TLD's like
  159. # asdf.com, not for asdf.com.de
  160. # TODO, using publicsuffix instead of this rewrite rule
  161. old_result_domainname = '.'.join(
  162. result['parsed_url'].hostname.split('.')[-2:])
  163. new_result_domainname = '.'.join(
  164. new_parsed_url.hostname.split('.')[-2:])
  165. # check if rewritten hostname is the same,
  166. # to protect against wrong or malicious rewrite rules
  167. if old_result_domainname == new_result_domainname:
  168. # set new url
  169. result['url'] = new_result_url
  170. # target has matched, do not search over the other rules
  171. break
  172. return result
  173. def on_result(request, ctx):
  174. result = ctx['result']
  175. if result['parsed_url'].scheme == 'http':
  176. https_url_rewrite(result)
  177. return True
# eagerly load all rulesets at import time so the plugin is ready to use
load_https_rules(rules_path)