Browse Source

[mod][fix] https rewrite refactor ++ fixes

Adam Tauber 10 years ago
parent
commit
f141773814
3 changed files with 68 additions and 61 deletions
  1. 65 3
      searx/https_rewrite.py
  2. 1 1
      searx/https_rules/Soundcloud.xml
  3. 2 57
      searx/webapp.py

+ 65 - 3
searx/https_rewrite.py

@@ -16,6 +16,7 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
 '''
 '''
 
 
 import re
 import re
+from urlparse import urlparse
 from lxml import etree
 from lxml import etree
 from os import listdir
 from os import listdir
 from os.path import isfile, isdir, join
 from os.path import isfile, isdir, join
@@ -86,15 +87,23 @@ def load_single_https_ruleset(filepath):
 
 
             # TODO hack, which convert a javascript regex group
             # TODO hack, which convert a javascript regex group
             # into a valid python regex group
             # into a valid python regex group
-            rule_from = ruleset.attrib.get('from').replace('$', '\\')
-            rule_to = ruleset.attrib.get('to').replace('$', '\\')
+            rule_from = ruleset.attrib['from'].replace('$', '\\')
+            if rule_from.endswith('\\'):
+                rule_from = rule_from[:-1]+'$'
+            rule_to = ruleset.attrib['to'].replace('$', '\\')
+            if rule_to.endswith('\\'):
+                rule_to = rule_to[:-1]+'$'
 
 
             # TODO, not working yet because of the hack above,
             # TODO, not working yet because of the hack above,
             # currently doing that in webapp.py
             # currently doing that in webapp.py
             # rule_from_rgx = re.compile(rule_from, re.I)
             # rule_from_rgx = re.compile(rule_from, re.I)
 
 
             # append rule
             # append rule
-            rules.append((rule_from, rule_to))
+            try:
+                rules.append((re.compile(rule_from, re.I | re.U), rule_to))
+            except:
+                # TODO log regex error
+                continue
 
 
         # this child define an exclusion
         # this child define an exclusion
         elif ruleset.tag == 'exclusion':
         elif ruleset.tag == 'exclusion':
@@ -143,3 +152,56 @@ def load_https_rules(rules_path):
         https_rules.append(ruleset)
         https_rules.append(ruleset)
 
 
     print(' * {n} https-rules loaded'.format(n=len(https_rules)))
     print(' * {n} https-rules loaded'.format(n=len(https_rules)))
+
+
+
+def https_url_rewrite(result):
+    skip_https_rewrite = False
+    # check if HTTPS rewrite is possible
+    for target, rules, exclusions in https_rules:
+
+        # check if target regex match with url
+        if target.match(result['parsed_url'].netloc):
+            # process exclusions
+            for exclusion in exclusions:
+                # check if exclusion match with url
+                if exclusion.match(result['url']):
+                    skip_https_rewrite = True
+                    break
+
+            # skip https rewrite if required
+            if skip_https_rewrite:
+                break
+
+            # process rules
+            for rule in rules:
+                try:
+                    new_result_url = rule[0].sub(rule[1], result['url'])
+                except:
+                    break
+
+                # parse new url
+                new_parsed_url = urlparse(new_result_url)
+
+                # continue if nothing was rewritten
+                if result['url'] == new_result_url:
+                    continue
+
+                # get domainname from result
+                # TODO, only works correctly with TLDs like
+                #  asdf.com, not for asdf.com.de
+                # TODO, use publicsuffix instead of this rewrite rule
+                old_result_domainname = '.'.join(
+                    result['parsed_url'].hostname.split('.')[-2:])
+                new_result_domainname = '.'.join(
+                    new_parsed_url.hostname.split('.')[-2:])
+
+                # check if rewritten hostname is the same,
+                # to protect against wrong or malicious rewrite rules
+                if old_result_domainname == new_result_domainname:
+                    # set new url
+                    result['url'] = new_result_url
+
+            # target has matched, do not search over the other rules
+            break
+    return result

+ 1 - 1
searx/https_rules/Soundcloud.xml

@@ -89,7 +89,7 @@
 	<rule from="^http://([aiw]\d|api|wis)\.sndcdn\.com/"
 	<rule from="^http://([aiw]\d|api|wis)\.sndcdn\.com/"
 		to="https://$1.sndcdn.com/" />
 		to="https://$1.sndcdn.com/" />
 
 
-	<rule from="^http://((?:api|backstage|blog|connect|developers|ec-media|eventlogger|help-assets|media|visuals|w|www)\.)?soundcloud\.com/"
+	<rule from="^http://((?:api|backstage|blog|connect|developers|ec-media|eventlogger|help-assets|media|visuals|w|www)\.|)soundcloud\.com/"
 		to="https://$1soundcloud.com/" />
 		to="https://$1soundcloud.com/" />
 
 
 	<rule from="^https?://scbackstage\.wpengine\.netdna-cdn\.com/"
 	<rule from="^https?://scbackstage\.wpengine\.netdna-cdn\.com/"

+ 2 - 57
searx/webapp.py

@@ -41,15 +41,12 @@ from searx.utils import (
     UnicodeWriter, highlight_content, html_to_text, get_themes
     UnicodeWriter, highlight_content, html_to_text, get_themes
 )
 )
 from searx.version import VERSION_STRING
 from searx.version import VERSION_STRING
-from searx.https_rewrite import https_rules
 from searx.languages import language_codes
 from searx.languages import language_codes
+from searx.https_rewrite import https_url_rewrite
 from searx.search import Search
 from searx.search import Search
 from searx.query import Query
 from searx.query import Query
 from searx.autocomplete import backends as autocomplete_backends
 from searx.autocomplete import backends as autocomplete_backends
 
 
-from urlparse import urlparse
-import re
-
 
 
 static_path, templates_path, themes =\
 static_path, templates_path, themes =\
     get_themes(settings['themes_path']
     get_themes(settings['themes_path']
@@ -215,59 +212,7 @@ def index():
         if settings['server']['https_rewrite']\
         if settings['server']['https_rewrite']\
            and result['parsed_url'].scheme == 'http':
            and result['parsed_url'].scheme == 'http':
 
 
-            skip_https_rewrite = False
-
-            # check if HTTPS rewrite is possible
-            for target, rules, exclusions in https_rules:
-
-                # check if target regex match with url
-                if target.match(result['url']):
-                    # process exclusions
-                    for exclusion in exclusions:
-                        # check if exclusion match with url
-                        if exclusion.match(result['url']):
-                            skip_https_rewrite = True
-                            break
-
-                    # skip https rewrite if required
-                    if skip_https_rewrite:
-                        break
-
-                    # process rules
-                    for rule in rules:
-                        try:
-                            # TODO, precompile rule
-                            p = re.compile(rule[0])
-
-                            # rewrite url if possible
-                            new_result_url = p.sub(rule[1], result['url'])
-                        except:
-                            break
-
-                        # parse new url
-                        new_parsed_url = urlparse(new_result_url)
-
-                        # continue if nothing was rewritten
-                        if result['url'] == new_result_url:
-                            continue
-
-                        # get domainname from result
-                        # TODO, only works correctly with TLDs like
-                        #  asdf.com, not for asdf.com.de
-                        # TODO, use publicsuffix instead of this rewrite rule
-                        old_result_domainname = '.'.join(
-                            result['parsed_url'].hostname.split('.')[-2:])
-                        new_result_domainname = '.'.join(
-                            new_parsed_url.hostname.split('.')[-2:])
-
-                        # check if rewritten hostname is the same,
-                        # to protect against wrong or malicious rewrite rules
-                        if old_result_domainname == new_result_domainname:
-                            # set new url
-                            result['url'] = new_result_url
-
-                    # target has matched, do not search over the other rules
-                    break
+            result = https_url_rewrite(result)
 
 
         if search.request_data.get('format', 'html') == 'html':
         if search.request_data.get('format', 'html') == 'html':
             if 'content' in result:
             if 'content' in result: