10 years ago · f141773814
--- a/searx/https_rewrite.py
+++ b/searx/https_rewrite.py
@@ -16,6 +16,7 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
 
				 '''
			
 
				 
			
 
				 import re
			
 
				+from urlparse import urlparse
			
 
				 from lxml import etree
			
 
				 from os import listdir
			
 
				 from os.path import isfile, isdir, join
			
@@ -86,15 +87,23 @@ def load_single_https_ruleset(filepath):
 
				 
			
 
				             # TODO hack, which convert a javascript regex group
			
 
				             # into a valid python regex group
			
 
				-            rule_from = ruleset.attrib.get('from').replace('$', '\\')
			
 
				-            rule_to = ruleset.attrib.get('to').replace('$', '\\')
			
 
				+            rule_from = ruleset.attrib['from'].replace('$', '\\')
			
 
				+            if rule_from.endswith('\\'):
			
 
				+                rule_from = rule_from[:-1]+'$'
			
 
				+            rule_to = ruleset.attrib['to'].replace('$', '\\')
			
 
				+            if rule_to.endswith('\\'):
			
 
				+                rule_to = rule_to[:-1]+'$'
			
 
				 
			
 
				             # TODO, not working yet because of the hack above,
			
 
				             # currently doing that in webapp.py
			
 
				             # rule_from_rgx = re.compile(rule_from, re.I)
			
 
				 
			
 
				             # append rule
			
 
				-            rules.append((rule_from, rule_to))
			
 
				+            try:
			
 
				+                rules.append((re.compile(rule_from, re.I | re.U), rule_to))
			
 
				+            except:
			
 
				+                # TODO log regex error
			
 
				+                continue
			
 
				 
			
 
				         # this child define an exclusion
			
 
				         elif ruleset.tag == 'exclusion':
			
@@ -143,3 +152,56 @@ def load_https_rules(rules_path):
 
				         https_rules.append(ruleset)
			
 
				 
			
 
				     print(' * {n} https-rules loaded'.format(n=len(https_rules)))
			
 
				+
			
 
				+
			
 
				+
			
 
				+def https_url_rewrite(result):
			
 
				+    skip_https_rewrite = False
			
 
				+    # check if HTTPS rewrite is possible
			
 
				+    for target, rules, exclusions in https_rules:
			
 
				+
			
 
				+        # check if target regex match with url
			
 
				+        if target.match(result['parsed_url'].netloc):
			
 
				+            # process exclusions
			
 
				+            for exclusion in exclusions:
			
 
				+                # check if exclusion match with url
			
 
				+                if exclusion.match(result['url']):
			
 
				+                    skip_https_rewrite = True
			
 
				+                    break
			
 
				+
			
 
				+            # skip https rewrite if required
			
 
				+            if skip_https_rewrite:
			
 
				+                break
			
 
				+
			
 
				+            # process rules
			
 
				+            for rule in rules:
			
 
				+                try:
			
 
				+                    new_result_url = rule[0].sub(rule[1], result['url'])
			
 
				+                except:
			
 
				+                    break
			
 
				+
			
 
				+                # parse new url
			
 
				+                new_parsed_url = urlparse(new_result_url)
			
 
				+
			
 
				+                # continiue if nothing was rewritten
			
 
				+                if result['url'] == new_result_url:
			
 
				+                    continue
			
 
				+
			
 
				+                # get domainname from result
			
 
				+                # TODO, does only work correct with TLD's like
			
 
				+                #  asdf.com, not for asdf.com.de
			
 
				+                # TODO, using publicsuffix instead of this rewrite rule
			
 
				+                old_result_domainname = '.'.join(
			
 
				+                    result['parsed_url'].hostname.split('.')[-2:])
			
 
				+                new_result_domainname = '.'.join(
			
 
				+                    new_parsed_url.hostname.split('.')[-2:])
			
 
				+
			
 
				+                # check if rewritten hostname is the same,
			
 
				+                # to protect against wrong or malicious rewrite rules
			
 
				+                if old_result_domainname == new_result_domainname:
			
 
				+                    # set new url
			
 
				+                    result['url'] = new_result_url
			
 
				+
			
 
				+            # target has matched, do not search over the other rules
			
 
				+            break
			
 
				+    return result
			
--- a/searx/https_rules/Soundcloud.xml
+++ b/searx/https_rules/Soundcloud.xml
@@ -89,7 +89,7 @@
 
				 	<rule from="^http://([aiw]\d|api|wis)\.sndcdn\.com/"
			
 
				 		to="https://$1.sndcdn.com/" />
			
 
				 
			
 
				-	<rule from="^http://((?:api|backstage|blog|connect|developers|ec-media|eventlogger|help-assets|media|visuals|w|www)\.)?soundcloud\.com/"
			
 
				+	<rule from="^http://((?:api|backstage|blog|connect|developers|ec-media|eventlogger|help-assets|media|visuals|w|www)\.|)soundcloud\.com/"
			
 
				 		to="https://$1soundcloud.com/" />
			
 
				 
			
 
				 	<rule from="^https?://scbackstage\.wpengine\.netdna-cdn\.com/"
			
--- a/searx/webapp.py
+++ b/searx/webapp.py
@@ -41,15 +41,12 @@ from searx.utils import (
 
				     UnicodeWriter, highlight_content, html_to_text, get_themes
			
 
				 )
			
 
				 from searx.version import VERSION_STRING
			
 
				-from searx.https_rewrite import https_rules
			
 
				 from searx.languages import language_codes
			
 
				+from searx.https_rewrite import https_url_rewrite
			
 
				 from searx.search import Search
			
 
				 from searx.query import Query
			
 
				 from searx.autocomplete import backends as autocomplete_backends
			
 
				 
			
 
				-from urlparse import urlparse
			
 
				-import re
			
 
				-
			
 
				 
			
 
				 static_path, templates_path, themes =\
			
 
				     get_themes(settings['themes_path']
			
@@ -215,59 +212,7 @@ def index():
 
				         if settings['server']['https_rewrite']\
			
 
				            and result['parsed_url'].scheme == 'http':
			
 
				 
			
 
				-            skip_https_rewrite = False
			
 
				-
			
 
				-            # check if HTTPS rewrite is possible
			
 
				-            for target, rules, exclusions in https_rules:
			
 
				-
			
 
				-                # check if target regex match with url
			
 
				-                if target.match(result['url']):
			
 
				-                    # process exclusions
			
 
				-                    for exclusion in exclusions:
			
 
				-                        # check if exclusion match with url
			
 
				-                        if exclusion.match(result['url']):
			
 
				-                            skip_https_rewrite = True
			
 
				-                            break
			
 
				-
			
 
				-                    # skip https rewrite if required
			
 
				-                    if skip_https_rewrite:
			
 
				-                        break
			
 
				-
			
 
				-                    # process rules
			
 
				-                    for rule in rules:
			
 
				-                        try:
			
 
				-                            # TODO, precompile rule
			
 
				-                            p = re.compile(rule[0])
			
 
				-
			
 
				-                            # rewrite url if possible
			
 
				-                            new_result_url = p.sub(rule[1], result['url'])
			
 
				-                        except:
			
 
				-                            break
			
 
				-
			
 
				-                        # parse new url
			
 
				-                        new_parsed_url = urlparse(new_result_url)
			
 
				-
			
 
				-                        # continiue if nothing was rewritten
			
 
				-                        if result['url'] == new_result_url:
			
 
				-                            continue
			
 
				-
			
 
				-                        # get domainname from result
			
 
				-                        # TODO, does only work correct with TLD's like
			
 
				-                        #  asdf.com, not for asdf.com.de
			
 
				-                        # TODO, using publicsuffix instead of this rewrite rule
			
 
				-                        old_result_domainname = '.'.join(
			
 
				-                            result['parsed_url'].hostname.split('.')[-2:])
			
 
				-                        new_result_domainname = '.'.join(
			
 
				-                            new_parsed_url.hostname.split('.')[-2:])
			
 
				-
			
 
				-                        # check if rewritten hostname is the same,
			
 
				-                        # to protect against wrong or malicious rewrite rules
			
 
				-                        if old_result_domainname == new_result_domainname:
			
 
				-                            # set new url
			
 
				-                            result['url'] = new_result_url
			
 
				-
			
 
				-                    # target has matched, do not search over the other rules
			
 
				-                    break
			
 
				+            result = https_url_rewrite(result)
			
 
				 
			
 
				         if search.request_data.get('format', 'html') == 'html':
			
 
				             if 'content' in result: