'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with searx. If not, see < http://www.gnu.org/licenses/ >.

(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''

import re
import sys
from os import listdir, environ
from os.path import isfile, isdir, join

from lxml import etree
from flask_babel import gettext

from searx import searx_dir
from searx.plugins import logger
from searx.url_utils import urlparse

if sys.version_info[0] == 3:
    unicode = str

name = "HTTPS rewrite"
description = gettext('Rewrite HTTP links to HTTPS if possible')
default_on = True
preference_section = 'privacy'

if 'SEARX_HTTPS_REWRITE_PATH' in environ:
    rules_path = environ['SEARX_HTTPS_REWRITE_PATH']
else:
    rules_path = join(searx_dir, 'plugins/https_rules')

logger = logger.getChild("https_rewrite")

# https://gitweb.torproject.org/\
# pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules

# HTTPS rewrite rules
https_rules = []


# load a single ruleset from an xml file
def load_single_https_ruleset(rules_path):
    # init parser
    parser = etree.XMLParser()

    # load and parse the xml file
    try:
        tree = etree.parse(rules_path, parser)
    except Exception:
        logger.exception('unable to parse ruleset: %s', rules_path)
        return ()

    # get the root node
    root = tree.getroot()

    # check that the root node is named 'ruleset'
    # TODO improve parsing
    if root.tag != 'ruleset':
        return ()

    # skip rulesets which are deactivated by default
    if root.attrib.get('default_off'):
        return ()

    # skip rulesets which only work on specific platforms
    if root.attrib.get('platform'):
        return ()

    hosts = []
    rules = []
    exclusions = []

    # parse the children of the ruleset
    for ruleset in root:
        # this child defines a target
        if ruleset.tag == 'target':
            # check if the required attribute is available
            if not ruleset.attrib.get('host'):
                continue

            # convert the host rule to a valid regex
            host = ruleset.attrib.get('host')\
                .replace('.', r'\.').replace('*', '.*')

            # append it to the host list
            hosts.append(host)

        # this child defines a rule
        elif ruleset.tag == 'rule':
            # check if the required attributes are available
            if not ruleset.attrib.get('from')\
               or not ruleset.attrib.get('to'):
                continue

            # TODO hack which converts a JavaScript regex group
            # into a valid Python regex group
            rule_from = ruleset.attrib['from'].replace('$', '\\')
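            # a hedged illustration (example.com is a made-up host, not taken
            # from the shipped rules): in a 'to' attribute such as
            # to="https://$1example.com/" the swap turns the JavaScript group
            # reference '$1' into the Python form '\1' that re.sub() expects;
            # in a 'from' pattern a trailing '$' is an end-of-string anchor,
            # so the swap leaves a stray trailing backslash there, which the
            # check below converts back into '$'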
            if rule_from.endswith('\\'):
                rule_from = rule_from[:-1] + '$'
            rule_to = ruleset.attrib['to'].replace('$', '\\')
            if rule_to.endswith('\\'):
                rule_to = rule_to[:-1] + '$'

            # TODO, not working yet because of the hack above,
            # currently doing that in webapp.py
            # rule_from_rgx = re.compile(rule_from, re.I)

            # append the rule
            try:
                rules.append((re.compile(rule_from, re.I | re.U), rule_to))
            except re.error:
                logger.exception('invalid regex in ruleset: %s', rules_path)
                continue

        # this child defines an exclusion
        elif ruleset.tag == 'exclusion':
            # check if the required attribute is available
            if not ruleset.attrib.get('pattern'):
                continue

            exclusion_rgx = re.compile(ruleset.attrib.get('pattern'))

            # append the exclusion
            exclusions.append(exclusion_rgx)

    # convert the list of possible hosts to a simple regex
    # TODO compress the regex to improve performance
    try:
        target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)
    except re.error:
        logger.exception('invalid target regex in ruleset: %s', rules_path)
        return ()

    # return the ruleset as a (target_hosts, rules, exclusions) tuple
    return (target_hosts, rules, exclusions)


# load all https rewrite rules
def load_https_rules(rules_path):
    # check if the directory exists
    if not isdir(rules_path):
        logger.error("directory not found: '" + rules_path + "'")
        return

    # find all xml files stored in the https rule directory
    xml_files = [join(rules_path, f)
                 for f in listdir(rules_path)
                 if isfile(join(rules_path, f)) and f.endswith('.xml')]

    # load the xml files
    for ruleset_file in xml_files:
        # parse the rewrite rules
        ruleset = load_single_https_ruleset(ruleset_file)

        # skip the file if no ruleset was returned
        if not ruleset:
            continue

        # append the ruleset
        https_rules.append(ruleset)

    logger.info('{n} rules loaded'.format(n=len(https_rules)))


def https_url_rewrite(result):
    skip_https_rewrite = False
    # check if an HTTPS rewrite is possible
    for target, rules, exclusions in https_rules:

        # check if the target regex matches the url
        if target.match(result['parsed_url'].netloc):
            # process exclusions
            for exclusion in exclusions:

                # check if the exclusion matches the url
                if exclusion.match(result['url']):
                    skip_https_rewrite = True
                    break

            # skip the https rewrite if required
            if skip_https_rewrite:
                break

            # process rules
            for rule in rules:
                try:
                    new_result_url = rule[0].sub(rule[1], result['url'])
                except Exception:
                    break

                # parse the new url
                new_parsed_url = urlparse(new_result_url)

                # continue if nothing was rewritten
                if result['url'] == new_result_url:
                    continue

                # get the domain name from the result
                # TODO, only works correctly with TLDs like
                #  asdf.com, not with asdf.com.de
                # TODO, use publicsuffix instead of this rewrite rule
                old_result_domainname = '.'.join(
                    result['parsed_url'].hostname.split('.')[-2:])
                new_result_domainname = '.'.join(
                    new_parsed_url.hostname.split('.')[-2:])

                # check if the rewritten hostname is the same,
                # to protect against wrong or malicious rewrite rules
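                # hedged example (hosts are illustrative, not real rules): a
                # rule mapping http://example.com/ to https://login.example.com/
                # keeps the registered domain 'example.com' and is accepted,
                # while one mapping it to https://example.net/ changes the
                # domain and is discarded by the comparison below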
                if old_result_domainname == new_result_domainname:
                    # set the new url
                    result['url'] = new_result_url

            # the target has matched, do not check the other rulesets
            break
    return result


def on_result(request, search, result):
    if result['parsed_url'].scheme == 'http':
        https_url_rewrite(result)

    return True


load_https_rules(rules_path)
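
# A minimal, hedged smoke test of the rewrite path, not part of the plugin
# API: the result dict layout ('url' plus a pre-parsed 'parsed_url') mirrors
# what searx passes to on_result, and http://www.example.com/ is a made-up
# URL that is only rewritten if a matching ruleset exists under rules_path.
if __name__ == '__main__':
    test_url = 'http://www.example.com/'  # hypothetical, for illustration
    test_result = {'url': test_url, 'parsed_url': urlparse(test_url)}
    # request and search are unused by this plugin's on_result hook
    on_result(None, None, test_result)
    print('rewritten url: {0}'.format(test_result['url']))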