https_rewrite.py 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139
  1. '''
  2. searx is free software: you can redistribute it and/or modify
  3. it under the terms of the GNU Affero General Public License as published by
  4. the Free Software Foundation, either version 3 of the License, or
  5. (at your option) any later version.
  6. searx is distributed in the hope that it will be useful,
  7. but WITHOUT ANY WARRANTY; without even the implied warranty of
  8. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  9. GNU Affero General Public License for more details.
  10. You should have received a copy of the GNU Affero General Public License
  11. along with searx. If not, see < http://www.gnu.org/licenses/ >.
  12. (C) 2013- by Adam Tauber, <asciimoo@gmail.com>
  13. '''
  14. import re
  15. from lxml import etree
  16. from os import listdir
  17. from os.path import isfile, join
  18. # https://gitweb.torproject.org/\
  19. # pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules
  20. # HTTPS rewrite rules
  21. https_rules = []
  22. # load single ruleset from a xml file
  23. def load_single_https_ruleset(filepath):
  24. ruleset = ()
  25. # init parser
  26. parser = etree.XMLParser()
  27. # load and parse xml-file
  28. try:
  29. tree = etree.parse(filepath, parser)
  30. except:
  31. # TODO, error message
  32. return ()
  33. # get root node
  34. root = tree.getroot()
  35. #print(etree.tostring(tree))
  36. # check if root is a node with the name ruleset
  37. # TODO improve parsing
  38. if root.tag != 'ruleset':
  39. return ()
  40. # check if rule is deactivated by default
  41. if root.attrib.get('default_off'):
  42. return ()
  43. # check if rule does only work for specific platforms
  44. if root.attrib.get('platform'):
  45. return ()
  46. hosts = []
  47. rules = []
  48. exclusions = []
  49. # parse childs from ruleset
  50. for ruleset in root:
  51. # this child define a target
  52. if ruleset.tag == 'target':
  53. # check if required tags available
  54. if not ruleset.attrib.get('host'):
  55. continue
  56. # convert host-rule to valid regex
  57. host = ruleset.attrib.get('host').replace('.', '\.').replace('*', '.*')
  58. # append to host list
  59. hosts.append(host)
  60. # this child define a rule
  61. elif ruleset.tag == 'rule':
  62. # check if required tags available
  63. if not ruleset.attrib.get('from')\
  64. or not ruleset.attrib.get('to'):
  65. continue
  66. # TODO hack, which convert a javascript regex group into a valid python regex group
  67. rule_from = ruleset.attrib.get('from').replace('$', '\\')
  68. rule_to = ruleset.attrib.get('to').replace('$', '\\')
  69. # TODO, not working yet because of the hack above, currently doing that in webapp.py
  70. #rule_from_rgx = re.compile(rule_from, re.I)
  71. # append rule
  72. rules.append((rule_from, rule_to))
  73. # this child define an exclusion
  74. elif ruleset.tag == 'exclusion':
  75. # check if required tags available
  76. if not ruleset.attrib.get('pattern'):
  77. continue
  78. exclusion_rgx = re.compile(ruleset.attrib.get('pattern'))
  79. # append exclusion
  80. exclusions.append(exclusion_rgx)
  81. # convert list of possible hosts to a simple regex
  82. # TODO compress regex to improve performance
  83. try:
  84. target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)
  85. except:
  86. return ()
  87. # return ruleset
  88. return (target_hosts, rules, exclusions)
  89. # load all https rewrite rules
  90. def load_https_rules(rules_path):
  91. # add / to path if not set yet
  92. if rules_path[-1:] != '/':
  93. rules_path += '/'
  94. # search all xml files which are stored in the https rule directory
  95. xml_files = [ join(rules_path,f) for f in listdir(rules_path) if isfile(join(rules_path,f)) and f[-4:] == '.xml' ]
  96. # load xml-files
  97. for ruleset_file in xml_files:
  98. # calculate rewrite-rules
  99. ruleset = load_single_https_ruleset(ruleset_file)
  100. # skip if no ruleset returned
  101. if not ruleset:
  102. continue
  103. # append ruleset
  104. https_rules.append(ruleset)