https_rewrite.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144
  1. '''
  2. searx is free software: you can redistribute it and/or modify
  3. it under the terms of the GNU Affero General Public License as published by
  4. the Free Software Foundation, either version 3 of the License, or
  5. (at your option) any later version.
  6. searx is distributed in the hope that it will be useful,
  7. but WITHOUT ANY WARRANTY; without even the implied warranty of
  8. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  9. GNU Affero General Public License for more details.
  10. You should have received a copy of the GNU Affero General Public License
  11. along with searx. If not, see < http://www.gnu.org/licenses/ >.
  12. (C) 2013- by Adam Tauber, <asciimoo@gmail.com>
  13. '''
  14. import re
  15. from lxml import etree
  16. from os import listdir
  17. from os.path import isfile, join
  18. # https://gitweb.torproject.org/\
  19. # pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules
# HTTPS rewrite rules: one (target_hosts, rules, exclusions) tuple per
# loaded ruleset, appended by load_https_rules()
https_rules = []
  22. # load single ruleset from a xml file
  23. def load_single_https_ruleset(filepath):
  24. ruleset = ()
  25. # init parser
  26. parser = etree.XMLParser()
  27. # load and parse xml-file
  28. try:
  29. tree = etree.parse(filepath, parser)
  30. except:
  31. # TODO, error message
  32. return ()
  33. # get root node
  34. root = tree.getroot()
  35. # check if root is a node with the name ruleset
  36. # TODO improve parsing
  37. if root.tag != 'ruleset':
  38. return ()
  39. # check if rule is deactivated by default
  40. if root.attrib.get('default_off'):
  41. return ()
  42. # check if rule does only work for specific platforms
  43. if root.attrib.get('platform'):
  44. return ()
  45. hosts = []
  46. rules = []
  47. exclusions = []
  48. # parse childs from ruleset
  49. for ruleset in root:
  50. # this child define a target
  51. if ruleset.tag == 'target':
  52. # check if required tags available
  53. if not ruleset.attrib.get('host'):
  54. continue
  55. # convert host-rule to valid regex
  56. host = ruleset.attrib.get('host')\
  57. .replace('.', '\.').replace('*', '.*')
  58. # append to host list
  59. hosts.append(host)
  60. # this child define a rule
  61. elif ruleset.tag == 'rule':
  62. # check if required tags available
  63. if not ruleset.attrib.get('from')\
  64. or not ruleset.attrib.get('to'):
  65. continue
  66. # TODO hack, which convert a javascript regex group
  67. # into a valid python regex group
  68. rule_from = ruleset.attrib.get('from').replace('$', '\\')
  69. rule_to = ruleset.attrib.get('to').replace('$', '\\')
  70. # TODO, not working yet because of the hack above,
  71. # currently doing that in webapp.py
  72. # rule_from_rgx = re.compile(rule_from, re.I)
  73. # append rule
  74. rules.append((rule_from, rule_to))
  75. # this child define an exclusion
  76. elif ruleset.tag == 'exclusion':
  77. # check if required tags available
  78. if not ruleset.attrib.get('pattern'):
  79. continue
  80. exclusion_rgx = re.compile(ruleset.attrib.get('pattern'))
  81. # append exclusion
  82. exclusions.append(exclusion_rgx)
  83. # convert list of possible hosts to a simple regex
  84. # TODO compress regex to improve performance
  85. try:
  86. target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)
  87. except:
  88. return ()
  89. # return ruleset
  90. return (target_hosts, rules, exclusions)
  91. # load all https rewrite rules
  92. def load_https_rules(rules_path):
  93. # add / to path if not set yet
  94. if rules_path[-1:] != '/':
  95. rules_path += '/'
  96. # search all xml files which are stored in the https rule directory
  97. xml_files = [join(rules_path, f)
  98. for f in listdir(rules_path)
  99. if isfile(join(rules_path, f)) and f[-4:] == '.xml']
  100. # load xml-files
  101. for ruleset_file in xml_files:
  102. # calculate rewrite-rules
  103. ruleset = load_single_https_ruleset(ruleset_file)
  104. # skip if no ruleset returned
  105. if not ruleset:
  106. continue
  107. # append ruleset
  108. https_rules.append(ruleset)
  109. print(' * {n} https-rules loaded'.format(n=len(https_rules)))