
[mod] move extract_text, extract_url to searx.utils

Alexandre Flament committed 4 years ago · commit 2006eb4680
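
Across the engines the change is purely mechanical: the extract_text (and, where used, extract_url) import moves from searx.engines.xpath to searx.utils, merged into the existing searx.utils import where one already exists. A hedged before/after sketch (the second imported name is just an example taken from one of the engines below):

# before this commit
from searx.engines.xpath import extract_text
from searx.utils import get_torrent_size

# after this commit: a single import from searx.utils
from searx.utils import extract_text, get_torrent_size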

+ 1 - 2
searx/engines/1337x.py

@@ -1,7 +1,6 @@
 from urllib.parse import quote, urljoin
 from lxml import html
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size
+from searx.utils import extract_text, get_torrent_size
 
 
 url = 'https://1337x.to/'

+ 1 - 2
searx/engines/acgsou.py

@@ -11,8 +11,7 @@
 
 from urllib.parse import urlencode
 from lxml import html
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size, int_or_zero
+from searx.utils import extract_text, get_torrent_size, int_or_zero
 
 # engine dependent config
 categories = ['files', 'images', 'videos', 'music']

+ 1 - 1
searx/engines/apkmirror.py

@@ -11,7 +11,7 @@
 
 from urllib.parse import urlencode
 from lxml import html
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text
 
 
 # engine dependent config

+ 1 - 1
searx/engines/archlinux.py

@@ -13,7 +13,7 @@
 
 from urllib.parse import urlencode, urljoin
 from lxml import html
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text
 
 # engine dependent config
 categories = ['it']

+ 1 - 2
searx/engines/bing.py

@@ -17,8 +17,7 @@ import re
 from urllib.parse import urlencode
 from lxml import html
 from searx import logger, utils
-from searx.engines.xpath import extract_text
-from searx.utils import match_language, gen_useragent, eval_xpath
+from searx.utils import extract_text, match_language, gen_useragent, eval_xpath
 
 logger = logger.getChild('bing engine')
 

+ 1 - 2
searx/engines/btdigg.py

@@ -13,8 +13,7 @@
 from lxml import html
 from operator import itemgetter
 from urllib.parse import quote, urljoin
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size
+from searx.utils import extract_text, get_torrent_size
 
 # engine dependent config
 categories = ['videos', 'music', 'files']

+ 1 - 1
searx/engines/deviantart.py

@@ -15,7 +15,7 @@
 from lxml import html
 import re
 from urllib.parse import urlencode
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text
 
 
 # engine dependent config

+ 1 - 2
searx/engines/digbt.py

@@ -12,8 +12,7 @@
 
 from urllib.parse import urljoin
 from lxml import html
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size
+from searx.utils import extract_text, get_torrent_size
 
 
 categories = ['videos', 'music', 'files']

+ 1 - 2
searx/engines/doku.py

@@ -11,8 +11,7 @@
 
 from urllib.parse import urlencode
 from lxml.html import fromstring
-from searx.engines.xpath import extract_text
-from searx.utils import eval_xpath
+from searx.utils import extract_text, eval_xpath
 
 # engine dependent config
 categories = ['general']  # TODO , 'images', 'music', 'videos', 'files'

+ 1 - 2
searx/engines/duckduckgo.py

@@ -16,9 +16,8 @@
 from lxml.html import fromstring
 from json import loads
 from urllib.parse import urlencode
-from searx.engines.xpath import extract_text
 from searx.poolrequests import get
-from searx.utils import match_language, eval_xpath
+from searx.utils import extract_text, match_language, eval_xpath
 
 # engine dependent config
 categories = ['general']

+ 1 - 2
searx/engines/duckduckgo_definitions.py

@@ -13,9 +13,8 @@ import json
 from urllib.parse import urlencode
 from lxml import html
 from re import compile
-from searx.engines.xpath import extract_text
 from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, language_aliases
-from searx.utils import html_to_text, match_language
+from searx.utils import extract_text, html_to_text, match_language
 
 url = 'https://api.duckduckgo.com/'\
     + '?{query}&format=json&pretty=0&no_redirect=1&d=1'

+ 1 - 1
searx/engines/duckduckgo_images.py

@@ -15,12 +15,12 @@
 
 from json import loads
 from urllib.parse import urlencode
-from searx.engines.xpath import extract_text
 from searx.engines.duckduckgo import (
     _fetch_supported_languages, supported_languages_url,
     get_region_code, language_aliases
 )
 from searx.poolrequests import get
+from searx.utils import extract_text
 
 # engine dependent config
 categories = ['images']

+ 1 - 2
searx/engines/duden.py

@@ -11,8 +11,7 @@
 from lxml import html, etree
 import re
 from urllib.parse import quote, urljoin
-from searx.engines.xpath import extract_text
-from searx.utils import eval_xpath
+from searx.utils import extract_text, eval_xpath
 from searx import logger
 
 categories = ['general']

+ 1 - 2
searx/engines/etools.py

@@ -11,8 +11,7 @@
 
 from lxml import html
 from urllib.parse import quote
-from searx.engines.xpath import extract_text
-from searx.utils import eval_xpath
+from searx.utils import extract_text, eval_xpath
 
 categories = ['general']
 paging = False

+ 1 - 1
searx/engines/fdroid.py

@@ -11,7 +11,7 @@
 
 from urllib.parse import urlencode
 from lxml import html
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text
 
 # engine dependent config
 categories = ['files']

+ 1 - 1
searx/engines/framalibre.py

@@ -13,7 +13,7 @@
 from html import escape
 from urllib.parse import urljoin, urlencode
 from lxml import html
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text
 
 # engine dependent config
 categories = ['it']

+ 1 - 1
searx/engines/gentoo.py

@@ -13,7 +13,7 @@
 
 from urllib.parse import urlencode, urljoin
 from lxml import html
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text
 
 # engine dependent config
 categories = ['it']

+ 1 - 2
searx/engines/google.py

@@ -21,9 +21,8 @@ Definitions`_.
 from urllib.parse import urlencode, urlparse
 from lxml import html
 from flask_babel import gettext
-from searx.engines.xpath import extract_text
 from searx import logger
-from searx.utils import match_language, eval_xpath
+from searx.utils import match_language, extract_text, eval_xpath
 
 logger = logger.getChild('google engine')
 

+ 1 - 2
searx/engines/google_images.py

@@ -28,8 +28,7 @@ from urllib.parse import urlencode, urlparse, unquote
 from lxml import html
 from flask_babel import gettext
 from searx import logger
-from searx.utils import eval_xpath
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text, eval_xpath
 
 # pylint: disable=unused-import
 from searx.engines.google import (

+ 1 - 1
searx/engines/google_videos.py

@@ -14,7 +14,7 @@ from datetime import date, timedelta
 from json import loads
 from urllib.parse import urlencode
 from lxml import html
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text
 import re
 
 # engine dependent config

+ 1 - 1
searx/engines/ina.py

@@ -16,7 +16,7 @@ from urllib.parse import urlencode
 from lxml import html
 from dateutil import parser
 from html.parser import HTMLParser
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text
 
 
 # engine dependent config

+ 1 - 2
searx/engines/kickass.py

@@ -13,8 +13,7 @@
 from lxml import html
 from operator import itemgetter
 from urllib.parse import quote, urljoin
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size, convert_str_to_int
+from searx.utils import extract_text, get_torrent_size, convert_str_to_int
 
 # engine dependent config
 categories = ['videos', 'music', 'files']

+ 1 - 2
searx/engines/nyaa.py

@@ -11,8 +11,7 @@
 
 from lxml import html
 from urllib.parse import urlencode
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size, int_or_zero
+from searx.utils import extract_text, get_torrent_size, int_or_zero
 
 # engine dependent config
 categories = ['files', 'images', 'videos', 'music']

+ 1 - 2
searx/engines/piratebay.py

@@ -13,8 +13,7 @@ from datetime import datetime
 from operator import itemgetter
 
 from urllib.parse import quote, urljoin
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size
+from searx.utils import extract_text, get_torrent_size
 
 # engine dependent config
 categories = ["videos", "music", "files"]

+ 1 - 1
searx/engines/seedpeer.py

@@ -12,7 +12,7 @@ from lxml import html
 from json import loads
 from operator import itemgetter
 from urllib.parse import quote, urljoin
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text
 
 
 url = 'https://seedpeer.me/'

+ 1 - 1
searx/engines/stackoverflow.py

@@ -12,7 +12,7 @@
 
 from urllib.parse import urlencode, urljoin
 from lxml import html
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text
 
 # engine dependent config
 categories = ['it']

+ 1 - 2
searx/engines/startpage.py

@@ -17,9 +17,8 @@ import re
 from unicodedata import normalize, combining
 from babel import Locale
 from babel.localedata import locale_identifiers
-from searx.engines.xpath import extract_text
 from searx.languages import language_codes
-from searx.utils import eval_xpath, match_language
+from searx.utils import extract_text, eval_xpath, match_language
 
 # engine dependent config
 categories = ['general']

+ 1 - 2
searx/engines/tokyotoshokan.py

@@ -13,9 +13,8 @@
 import re
 from urllib.parse import urlencode
 from lxml import html
-from searx.engines.xpath import extract_text
 from datetime import datetime
-from searx.utils import get_torrent_size, int_or_zero
+from searx.utils import extract_text, get_torrent_size, int_or_zero
 
 # engine dependent config
 categories = ['files', 'videos', 'music']

+ 1 - 2
searx/engines/torrentz.py

@@ -15,8 +15,7 @@ import re
 from urllib.parse import urlencode
 from lxml import html
 from datetime import datetime
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size
+from searx.utils import extract_text, get_torrent_size
 
 # engine dependent config
 categories = ['files', 'videos', 'music']

+ 1 - 1
searx/engines/twitter.py

@@ -15,7 +15,7 @@
 from urllib.parse import urlencode, urljoin
 from lxml import html
 from datetime import datetime
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text
 
 # engine dependent config
 categories = ['social media']

+ 1 - 2
searx/engines/wikidata.py

@@ -13,9 +13,8 @@
 
 from searx import logger
 from searx.poolrequests import get
-from searx.engines.xpath import extract_text
 from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url
-from searx.utils import match_language, eval_xpath
+from searx.utils import extract_text, match_language, eval_xpath
 
 from urllib.parse import urlencode
 from json import loads

+ 1 - 1
searx/engines/www1x.py

@@ -12,7 +12,7 @@
 
 from lxml import html
 from urllib.parse import urlencode, urljoin
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text
 
 # engine dependent config
 categories = ['images']

+ 2 - 73
searx/engines/xpath.py

@@ -1,7 +1,6 @@
-from urllib.parse import unquote, urlencode, urljoin, urlparse
 from lxml import html
-from lxml.etree import _ElementStringResult, _ElementUnicodeResult
-from searx.utils import html_to_text, eval_xpath
+from urllib.parse import urlencode
+from searx.utils import extract_text, extract_url, eval_xpath
 
 search_url = None
 url_xpath = None
@@ -21,76 +20,6 @@ page_size = 1
 first_page_num = 1
 
 
-'''
-if xpath_results is list, extract the text from each result and concat the list
-if xpath_results is a xml element, extract all the text node from it
-   ( text_content() method from lxml )
-if xpath_results is a string element, then it's already done
-'''
-
-
-def extract_text(xpath_results):
-    if type(xpath_results) == list:
-        # it's list of result : concat everything using recursive call
-        result = ''
-        for e in xpath_results:
-            result = result + extract_text(e)
-        return result.strip()
-    elif type(xpath_results) in [_ElementStringResult, _ElementUnicodeResult]:
-        # it's a string
-        return ''.join(xpath_results)
-    else:
-        # it's a element
-        text = html.tostring(
-            xpath_results, encoding='unicode', method='text', with_tail=False
-        )
-        text = text.strip().replace('\n', ' ')
-        return ' '.join(text.split())
-
-
-def extract_url(xpath_results, search_url):
-    if xpath_results == []:
-        raise Exception('Empty url resultset')
-    url = extract_text(xpath_results)
-
-    if url.startswith('//'):
-        # add http or https to this kind of url //example.com/
-        parsed_search_url = urlparse(search_url)
-        url = '{0}:{1}'.format(parsed_search_url.scheme or 'http', url)
-    elif url.startswith('/'):
-        # fix relative url to the search engine
-        url = urljoin(search_url, url)
-
-    # fix relative urls that fall through the crack
-    if '://' not in url:
-        url = urljoin(search_url, url)
-
-    # normalize url
-    url = normalize_url(url)
-
-    return url
-
-
-def normalize_url(url):
-    parsed_url = urlparse(url)
-
-    # add a / at this end of the url if there is no path
-    if not parsed_url.netloc:
-        raise Exception('Cannot parse url')
-    if not parsed_url.path:
-        url += '/'
-
-    # FIXME : hack for yahoo
-    if parsed_url.hostname == 'search.yahoo.com'\
-       and parsed_url.path.startswith('/r'):
-        p = parsed_url.path
-        mark = p.find('/**')
-        if mark != -1:
-            return unquote(p[mark + 3:]).decode()
-
-    return url
-
-
 def request(query, params):
     query = urlencode({'q': query})[2:]
 

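Because searx/engines/xpath.py now imports the helpers from searx.utils itself, code that still reaches them through the old module keeps working. A small sanity-check sketch, assuming both modules import cleanly in the target environment:

from searx.engines import xpath
from searx import utils

# the xpath engine re-imports the helpers, so both names resolve to the
# same function objects after this commit
assert xpath.extract_text is utils.extract_text
assert xpath.extract_url is utils.extract_url
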
+ 1 - 2
searx/engines/yahoo.py

@@ -13,8 +13,7 @@
 
 from urllib.parse import unquote, urlencode
 from lxml import html
-from searx.engines.xpath import extract_text, extract_url
-from searx.utils import match_language, eval_xpath
+from searx.utils import extract_text, extract_url, match_language, eval_xpath
 
 # engine dependent config
 categories = ['general']

+ 1 - 2
searx/engines/yahoo_news.py

@@ -13,12 +13,11 @@ import re
 from datetime import datetime, timedelta
 from urllib.parse import urlencode
 from lxml import html
-from searx.engines.xpath import extract_text, extract_url
 from searx.engines.yahoo import (
     parse_url, _fetch_supported_languages, supported_languages_url, language_aliases
 )
 from dateutil import parser
-from searx.utils import match_language
+from searx.utils import extract_text, extract_url, match_language
 
 # engine dependent config
 categories = ['news']

+ 1 - 2
searx/engines/yggtorrent.py

@@ -12,8 +12,7 @@ from lxml import html
 from operator import itemgetter
 from datetime import datetime
 from urllib.parse import quote
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size
+from searx.utils import extract_text, get_torrent_size
 from searx.poolrequests import get as http_get
 
 # engine dependent config

+ 1 - 2
searx/engines/youtube_noapi.py

@@ -11,8 +11,7 @@
 from functools import reduce
 from json import loads
 from urllib.parse import quote_plus
-from searx.engines.xpath import extract_text
-from searx.utils import list_get
+from searx.utils import extract_text, list_get
 
 # engine dependent config
 categories = ['videos', 'music']

+ 73 - 1
searx/utils.py

@@ -10,9 +10,13 @@ from os.path import splitext, join
 from io import open
 from random import choice
 from html.parser import HTMLParser
-from lxml.etree import XPath
+from urllib.parse import urljoin, urlparse, unquote
+
+from lxml import html
+from lxml.etree import XPath, _ElementStringResult, _ElementUnicodeResult
 from babel.core import get_global
 
+
 from searx import settings
 from searx.version import VERSION_STRING
 from searx.languages import language_codes
@@ -106,6 +110,74 @@ def html_to_text(html):
     return s.get_text()
 
 
+def extract_text(xpath_results):
+    '''
+    if xpath_results is list, extract the text from each result and concat the list
+    if xpath_results is a xml element, extract all the text node from it
+    ( text_content() method from lxml )
+    if xpath_results is a string element, then it's already done
+    '''
+    if type(xpath_results) == list:
+        # it's list of result : concat everything using recursive call
+        result = ''
+        for e in xpath_results:
+            result = result + extract_text(e)
+        return result.strip()
+    elif type(xpath_results) in [_ElementStringResult, _ElementUnicodeResult]:
+        # it's a string
+        return ''.join(xpath_results)
+    else:
+        # it's a element
+        text = html.tostring(
+            xpath_results, encoding='unicode', method='text', with_tail=False
+        )
+        text = text.strip().replace('\n', ' ')
+        return ' '.join(text.split())
+
+
+def extract_url(xpath_results, search_url):
+    if xpath_results == []:
+        raise Exception('Empty url resultset')
+    url = extract_text(xpath_results)
+
+    if url.startswith('//'):
+        # add http or https to this kind of url //example.com/
+        parsed_search_url = urlparse(search_url)
+        url = '{0}:{1}'.format(parsed_search_url.scheme or 'http', url)
+    elif url.startswith('/'):
+        # fix relative url to the search engine
+        url = urljoin(search_url, url)
+
+    # fix relative urls that fall through the crack
+    if '://' not in url:
+        url = urljoin(search_url, url)
+
+    # normalize url
+    url = normalize_url(url)
+
+    return url
+
+
+def normalize_url(url):
+    parsed_url = urlparse(url)
+
+    # add a / at this end of the url if there is no path
+    if not parsed_url.netloc:
+        raise Exception('Cannot parse url')
+    if not parsed_url.path:
+        url += '/'
+
+    # FIXME : hack for yahoo
+    if parsed_url.hostname == 'search.yahoo.com'\
+       and parsed_url.path.startswith('/r'):
+        p = parsed_url.path
+        mark = p.find('/**')
+        if mark != -1:
+            return unquote(p[mark + 3:]).decode()
+
+    return url
+
+
 def dict_subset(d, properties):
     result = {}
     for k in properties:

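For reference, a minimal usage sketch of the relocated helpers (the HTML snippet and URLs are made up for the example; behaviour follows the implementations shown above):

from lxml import html
from searx.utils import extract_text, extract_url

dom = html.fromstring('<div><a href="/path?q=1">Result <b>title</b></a></div>')

# extract_text handles elements, attribute/string results and lists of either
extract_text(dom.xpath('//a'))        # 'Result title'
extract_text(dom.xpath('//a/@href'))  # '/path?q=1'

# extract_url resolves relative and protocol-relative URLs against the search URL
extract_url(dom.xpath('//a/@href'), 'https://example.com/search')
# 'https://example.com/path?q=1'
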
+ 45 - 4
tests/unit/test_utils.py

@@ -1,4 +1,7 @@
 # -*- coding: utf-8 -*-
+import lxml.etree
+from lxml import html
+
 from searx.testing import SearxTestCase
 from searx import utils
 
@@ -16,7 +19,30 @@ class TestUtils(SearxTestCase):
         self.assertTrue(utils.searx_useragent().startswith('searx'))
 
     def test_html_to_text(self):
-        html = """
+        html_str = """
+        <a href="/testlink" class="link_access_account">
+            <style>
+                .toto {
+                    color: red;
+                }
+            </style>
+            <span class="toto">
+                <span>
+                    <img src="test.jpg" />
+                </span>
+            </span>
+            <span class="titi">
+                            Test text
+            </span>
+            <script>value='dummy';</script>
+        </a>
+        """
+        self.assertIsInstance(utils.html_to_text(html_str), str)
+        self.assertIsNotNone(utils.html_to_text(html_str))
+        self.assertEqual(utils.html_to_text(html_str), "Test text")
+
+    def test_extract_text(self):
+        html_str = """
         <a href="/testlink" class="link_access_account">
             <span class="toto">
                 <span>
@@ -28,9 +54,24 @@ class TestUtils(SearxTestCase):
             </span>
         </a>
         """
-        self.assertIsInstance(utils.html_to_text(html), str)
-        self.assertIsNotNone(utils.html_to_text(html))
-        self.assertEqual(utils.html_to_text(html), "Test text")
+        dom = html.fromstring(html_str)
+        self.assertEqual(utils.extract_text(dom), 'Test text')
+        self.assertEqual(utils.extract_text(dom.xpath('//span')), 'Test text')
+        self.assertEqual(utils.extract_text(dom.xpath('//img/@src')), 'test.jpg')
+        self.assertEqual(utils.extract_text(dom.xpath('//unexistingtag')), '')
+
+    def test_extract_url(self):
+        def f(html_str, search_url):
+            return utils.extract_url(html.fromstring(html_str), search_url)
+        self.assertEqual(f('<span id="42">https://example.com</span>', 'http://example.com/'), 'https://example.com/')
+        self.assertEqual(f('https://example.com', 'http://example.com/'), 'https://example.com/')
+        self.assertEqual(f('//example.com', 'http://example.com/'), 'http://example.com/')
+        self.assertEqual(f('//example.com', 'https://example.com/'), 'https://example.com/')
+        self.assertEqual(f('/path?a=1', 'https://example.com'), 'https://example.com/path?a=1')
+        with self.assertRaises(lxml.etree.ParserError):
+            f('', 'https://example.com')
+        with self.assertRaises(Exception):
+            utils.extract_url([], 'https://example.com')
 
     def test_html_to_text_invalid(self):
         html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'