https_rewrite.py

'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with searx. If not, see < http://www.gnu.org/licenses/ >.

(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''

import re
from urlparse import urlparse
from lxml import etree
from os import listdir
from os.path import isfile, isdir, join


# https://gitweb.torproject.org/\
# pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules

# HTTPS rewrite rules
https_rules = []
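# each entry of https_rules is a ruleset tuple as returned by
# load_single_https_ruleset() below:
#     (compiled target-host regex,
#      [(from-regex, to-substitution), ...],
#      [compiled exclusion regex, ...])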


# load a single ruleset from an xml file
def load_single_https_ruleset(filepath):
    ruleset = ()

    # init parser
    parser = etree.XMLParser()

    # load and parse xml-file
    try:
        tree = etree.parse(filepath, parser)
    except:
        # TODO, error message
        return ()

    # get root node
    root = tree.getroot()

    # check if the root node is named 'ruleset'
    # TODO improve parsing
    if root.tag != 'ruleset':
        return ()

    # check if the rule is deactivated by default
    if root.attrib.get('default_off'):
        return ()

    # check if the rule only works on specific platforms
    if root.attrib.get('platform'):
        return ()

    hosts = []
    rules = []
    exclusions = []

    # parse the children of the ruleset
    for ruleset in root:
        # this child defines a target
        if ruleset.tag == 'target':
            # check if the required attributes are available
            if not ruleset.attrib.get('host'):
                continue

            # convert the host wildcard pattern to a valid regex
            host = ruleset.attrib.get('host')\
                .replace('.', '\.').replace('*', '.*')
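            # e.g. the wildcard target '*.example.com' becomes
            # the regex '.*\.example\.com'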

            # append to host list
            hosts.append(host)

        # this child defines a rule
        elif ruleset.tag == 'rule':
            # check if the required attributes are available
            if not ruleset.attrib.get('from')\
               or not ruleset.attrib.get('to'):
                continue

            # TODO: hack that converts a javascript regex group
            # into a valid python regex group
            rule_from = ruleset.attrib['from'].replace('$', '\\')
            if rule_from.endswith('\\'):
                rule_from = rule_from[:-1]+'$'

            rule_to = ruleset.attrib['to'].replace('$', '\\')
            if rule_to.endswith('\\'):
                rule_to = rule_to[:-1]+'$'
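            # e.g. the javascript substitution 'https://$1example.com/'
            # becomes the python substitution 'https://\1example.com/';
            # a trailing '$' anchor, turned into '\' by the replace,
            # is restored by the endswith() checks above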

            # TODO, not working yet because of the hack above,
            # currently doing that in webapp.py
            # rule_from_rgx = re.compile(rule_from, re.I)

            # append rule
            try:
                rules.append((re.compile(rule_from, re.I | re.U), rule_to))
            except:
                # TODO log regex error
                continue

        # this child defines an exclusion
        elif ruleset.tag == 'exclusion':
            # check if the required attributes are available
            if not ruleset.attrib.get('pattern'):
                continue

            exclusion_rgx = re.compile(ruleset.attrib.get('pattern'))

            # append exclusion
            exclusions.append(exclusion_rgx)

    # convert the list of possible hosts to a single regex
    # TODO compress regex to improve performance
    try:
        target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)
    except:
        return ()

    # return ruleset
    return (target_hosts, rules, exclusions)
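

# a minimal, illustrative example of the ruleset xml format parsed above
# (a hypothetical rule, not taken from the HTTPS Everywhere repository):
#
#   <ruleset name="Example">
#       <target host="example.com" />
#       <target host="*.example.com" />
#       <exclusion pattern="^http://(www\.)?example\.com/insecure/" />
#       <rule from="^http://(www\.)?example\.com/"
#             to="https://www.example.com/" />
#   </ruleset>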


# load all https rewrite rules
def load_https_rules(rules_path):
    # check if the directory exists
    if not isdir(rules_path):
        print("[E] directory not found: '" + rules_path + "'")
        return

    # search all xml files stored in the https rule directory
    xml_files = [join(rules_path, f)
                 for f in listdir(rules_path)
                 if isfile(join(rules_path, f)) and f[-4:] == '.xml']

    # load xml-files
    for ruleset_file in xml_files:
        # parse the rewrite rules
        ruleset = load_single_https_ruleset(ruleset_file)

        # skip if no ruleset was returned
        if not ruleset:
            continue

        # append ruleset
        https_rules.append(ruleset)

    print(' * {n} https-rules loaded'.format(n=len(https_rules)))


def https_url_rewrite(result):
    skip_https_rewrite = False

    # check if an HTTPS rewrite is possible
    for target, rules, exclusions in https_rules:

        # check if the target regex matches the url
        if target.match(result['parsed_url'].netloc):

            # process exclusions
            for exclusion in exclusions:

                # check if the exclusion matches the url
                if exclusion.match(result['url']):
                    skip_https_rewrite = True
                    break

            # skip the https rewrite if required
            if skip_https_rewrite:
                break

            # process rules
            for rule in rules:
                try:
                    new_result_url = rule[0].sub(rule[1], result['url'])
                except:
                    break

                # parse the new url
                new_parsed_url = urlparse(new_result_url)

                # continue if nothing was rewritten
                if result['url'] == new_result_url:
                    continue

                # get the domain name from the result
                # TODO, only works correctly with TLDs like
                # asdf.com, not for asdf.com.de
                # TODO, use publicsuffix instead of this rewrite rule
                old_result_domainname = '.'.join(
                    result['parsed_url'].hostname.split('.')[-2:])
                new_result_domainname = '.'.join(
                    new_parsed_url.hostname.split('.')[-2:])
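                # e.g. both 'www.example.com' and 'example.com' reduce
                # to 'example.com' here, so a rewrite between the two
                # passes the check below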

                # check if the rewritten hostname is the same,
                # to protect against wrong or malicious rewrite rules
                if old_result_domainname == new_result_domainname:
                    # set the new url
                    result['url'] = new_result_url

                # the target has matched, do not search the other rules
                break

    return result
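

# minimal usage sketch; the 'https_rules' directory name and the result
# dict below are illustrative assumptions, not taken from searx's callers
if __name__ == '__main__':
    load_https_rules('https_rules')

    url = 'http://www.example.com/'
    result = {'url': url, 'parsed_url': urlparse(url)}

    # if a loaded ruleset matches www.example.com, the url is rewritten
    # to its https variant; otherwise it is returned unchanged
    print(https_url_rewrite(result)['url'])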