Merge pull request #1260 from MarcAbonce/engine-fixes

[fix] Engine fixes
Adam Tauber 7 years ago
commit e5def5b019

+ 2 - 2
searx/engines/google_news.py

@@ -68,8 +68,8 @@ def response(resp):
     for result in dom.xpath('//div[@class="g"]|//div[@class="g _cy"]'):
         try:
             r = {
-                'url': result.xpath('.//a[@class="l _PMs"]')[0].attrib.get("href"),
-                'title': ''.join(result.xpath('.//a[@class="l _PMs"]//text()')),
+                'url': result.xpath('.//a[@class="l lLrAF"]')[0].attrib.get("href"),
+                'title': ''.join(result.xpath('.//a[@class="l lLrAF"]//text()')),
                 'content': ''.join(result.xpath('.//div[@class="st"]//text()')),
             }
         except:

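Google apparently changed the CSS class on result links from `l _PMs` to `l lLrAF`, so the engine and its test fixture are updated together. As a minimal, self-contained sketch of what the updated selectors extract (the HTML fragment is illustrative, modelled on the test fixture further down; lxml is assumed to be installed):

```python
from lxml.html import fromstring

# Illustrative fragment modelled on the updated test fixture below
dom = fromstring('''
<div>
  <div class="g">
    <h3 class="r _gJs"><a class="l lLrAF" href="https://example.com/">Example title</a></h3>
    <div class="st">Example content</div>
  </div>
</div>
''')

for result in dom.xpath('//div[@class="g"]|//div[@class="g _cy"]'):
    r = {
        'url': result.xpath('.//a[@class="l lLrAF"]')[0].attrib.get("href"),
        'title': ''.join(result.xpath('.//a[@class="l lLrAF"]//text()')),
        'content': ''.join(result.xpath('.//div[@class="st"]//text()')),
    }
    print(r)  # {'url': 'https://example.com/', 'title': 'Example title', 'content': 'Example content'}
```
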
+ 6 - 7
searx/engines/wikidata.py

@@ -27,7 +27,7 @@ result_count = 1
 # urls
 wikidata_host = 'https://www.wikidata.org'
 url_search = wikidata_host \
-    + '/wiki/Special:ItemDisambiguation?{query}'
+    + '/w/index.php?{query}'

 wikidata_api = wikidata_host + '/w/api.php'
 url_detail = wikidata_api\
@@ -40,7 +40,7 @@ url_map = 'https://www.openstreetmap.org/'\
 url_image = 'https://commons.wikimedia.org/wiki/Special:FilePath/{filename}?width=500&height=400'

 # xpaths
-wikidata_ids_xpath = '//div/ul[@class="wikibase-disambiguation"]/li/a/@title'
+wikidata_ids_xpath = '//ul[@class="mw-search-results"]/li//a/@href'
 title_xpath = '//*[contains(@class,"wikibase-title-label")]'
 description_xpath = '//div[contains(@class,"wikibase-entitytermsview-heading-description")]'
 property_xpath = '//div[@id="{propertyid}"]'
@@ -57,22 +57,21 @@ calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]'


 def request(query, params):
-    language = match_language(params['language'], supported_languages).split('-')[0]
-
     params['url'] = url_search.format(
-        query=urlencode({'label': query, 'language': language}))
+        query=urlencode({'search': query}))
     return params


 def response(resp):
     results = []
     html = fromstring(resp.text)
-    wikidata_ids = html.xpath(wikidata_ids_xpath)
+    search_results = html.xpath(wikidata_ids_xpath)

     language = match_language(resp.search_params['language'], supported_languages).split('-')[0]

     # TODO: make requests asynchronous to avoid timeout when result_count > 1
-    for wikidata_id in wikidata_ids[:result_count]:
+    for search_result in search_results[:result_count]:
+        wikidata_id = search_result.split('/')[-1]
         url = url_detail.format(query=urlencode({'page': wikidata_id, 'uselang': language}))
         htmlresponse = get(url)
         jsonresponse = loads(htmlresponse.text)

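With Special:ItemDisambiguation no longer available, the engine now queries the regular MediaWiki search page and pulls entity IDs out of the result links, whose hrefs look like `/wiki/Q42`; the last path segment is the ID. A minimal standalone sketch of that extraction (the markup is illustrative, not a live response):

```python
from urllib.parse import urlencode

from lxml.html import fromstring

wikidata_ids_xpath = '//ul[@class="mw-search-results"]/li//a/@href'

# Illustrative markup shaped like a MediaWiki search result list
html = fromstring('''
<ul class="mw-search-results">
  <li><a href="/wiki/Q42" title="Q42">Douglas Adams</a></li>
  <li><a href="/wiki/Q5" title="Q5">human</a></li>
</ul>
''')

for search_result in html.xpath(wikidata_ids_xpath)[:1]:
    wikidata_id = search_result.split('/')[-1]  # '/wiki/Q42' -> 'Q42'
    # the detail request is then built as in the engine code above
    print(urlencode({'page': wikidata_id, 'uselang': 'en'}))  # page=Q42&uselang=en
```
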
+ 1 - 1
searx/engines/xpath.py

@@ -53,7 +53,7 @@ def extract_url(xpath_results, search_url):
     if url.startswith('//'):
         # add http or https to this kind of url //example.com/
         parsed_search_url = urlparse(search_url)
-        url = u'{0}:{1}'.format(parsed_search_url.scheme, url)
+        url = u'{0}:{1}'.format(parsed_search_url.scheme or 'http', url)
     elif url.startswith('/'):
         # fix relative url to the search engine
         url = urljoin(search_url, url)

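The `or 'http'` fallback covers engines whose configured search_url has no scheme, which previously produced malformed URLs such as `://example.com/`. A small sketch of the behaviour; the helper name `qualify` is hypothetical, used only for illustration:

```python
from urllib.parse import urlparse


def qualify(url, search_url):
    """Hypothetical stand-in for the relevant branch of extract_url()."""
    if url.startswith('//'):
        # add http or https to this kind of url //example.com/
        parsed_search_url = urlparse(search_url)
        url = u'{0}:{1}'.format(parsed_search_url.scheme or 'http', url)
    return url


print(qualify('//example.com/page', 'https://searx.example/search'))  # https://example.com/page
print(qualify('//example.com/page', 'searx.example/search'))          # http://example.com/page
```
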
+ 1 - 0
searx/settings.yml

@@ -174,6 +174,7 @@ engines:
   - name : wikidata
     engine : wikidata
     shortcut : wd
+    timeout : 3.0
     weight : 2

   - name : duckduckgo

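The explicit 3.0 second timeout presumably gives the wikidata engine headroom for the follow-up detail request it issues per result in response() (see the TODO in the wikidata hunk above). A tiny sketch, assuming PyYAML, showing that the new key is just another value on the engine mapping once the YAML is parsed:

```python
import yaml  # PyYAML assumed available

snippet = """
engines:
  - name : wikidata
    engine : wikidata
    shortcut : wd
    timeout : 3.0
    weight : 2
"""

engine = yaml.safe_load(snippet)['engines'][0]
print(engine['timeout'])  # 3.0
```
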
+ 2 - 2
tests/unit/engines/test_google_news.py

@@ -42,7 +42,7 @@ class TestGoogleNewsEngine(SearxTestCase):
                 <div class="ts _JGs _JHs _tJs _KGs _jHs">
                     <div class="_hJs">
                         <h3 class="r _gJs">
-                            <a class="l _PMs" href="https://example.com/" onmousedown="return rwt(this,'','','','11','AFQjCNEyehpzD5cJK1KUfXBx9RmsbqqG9g','','0ahUKEwjB58OR54HWAhWnKJoKHSQhAMY4ChCpAggiKAAwAA','','',event)">Example title</a>
+                            <a class="l lLrAF" href="https://example.com/" onmousedown="return rwt(this,'','','','11','AFQjCNEyehpzD5cJK1KUfXBx9RmsbqqG9g','','0ahUKEwjB58OR54HWAhWnKJoKHSQhAMY4ChCpAggiKAAwAA','','',event)">Example title</a>
                         </h3>
                         <div class="slp">
                             <span class="_OHs _PHs">
@@ -63,7 +63,7 @@ class TestGoogleNewsEngine(SearxTestCase):
                     </a>
                     <div class="_hJs">
                         <h3 class="r _gJs">
-                            <a class="l _PMs" href="https://example2.com/" onmousedown="return rwt(this,'','','','12','AFQjCNHObfH7sYmLWI1SC-YhWXKZFRzRjw','','0ahUKEwjB58OR54HWAhWnKJoKHSQhAMY4ChCpAgglKAAwAQ','','',event)">Example title 2</a>
+                            <a class="l lLrAF" href="https://example2.com/" onmousedown="return rwt(this,'','','','12','AFQjCNHObfH7sYmLWI1SC-YhWXKZFRzRjw','','0ahUKEwjB58OR54HWAhWnKJoKHSQhAMY4ChCpAgglKAAwAQ','','',event)">Example title 2</a>
                         </h3>
                         <div class="slp">
                             <span class="_OHs _PHs">

+ 1 - 5
tests/unit/engines/test_wikidata.py

@@ -9,20 +9,15 @@ from searx.testing import SearxTestCase
 class TestWikidataEngine(SearxTestCase):

     def test_request(self):
-        wikidata.supported_languages = ['en', 'es']
         query = 'test_query'
         dicto = defaultdict(dict)
-        dicto['language'] = 'en-US'
         params = wikidata.request(query, dicto)
         self.assertIn('url', params)
         self.assertIn(query, params['url'])
         self.assertIn('wikidata.org', params['url'])
-        self.assertIn('en', params['url'])

-        dicto['language'] = 'es-ES'
         params = wikidata.request(query, dicto)
         self.assertIn(query, params['url'])
-        self.assertIn('es', params['url'])

     # successful cases are not tested here to avoid sending additional requests
     def test_response(self):
@@ -31,6 +26,7 @@ class TestWikidataEngine(SearxTestCase):
         self.assertRaises(AttributeError, wikidata.response, '')
         self.assertRaises(AttributeError, wikidata.response, '[]')

+        wikidata.supported_languages = ['en', 'es']
         response = mock.Mock(text='<html></html>', search_params={"language": "en"})
         self.assertEqual(wikidata.response(response), [])