Browse Source

Merge pull request #249 from dalf/master

[fix] update yahoo engine according to the web site changes
Adam Tauber 10 years ago
parent
commit
7f7f10bb6f
2 changed files with 65 additions and 67 deletions
  1. +6 -5
      searx/engines/yahoo.py
  2. +59 -62
      searx/tests/engines/test_yahoo.py

+ 6 - 5
searx/engines/yahoo.py

@@ -24,11 +24,11 @@ base_url = 'https://search.yahoo.com/'
 search_url = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}'
 search_url = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}'
 
 
 # specific xpath variables
 # specific xpath variables
-results_xpath = '//div[@class="res"]'
+results_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' Sr ')]"
 url_xpath = './/h3/a/@href'
 url_xpath = './/h3/a/@href'
 title_xpath = './/h3/a'
 title_xpath = './/h3/a'
-content_xpath = './/div[@class="abstr"]'
-suggestion_xpath = '//div[@id="satat"]//a'
+content_xpath = './/div[@class="compText aAbs"]'
+suggestion_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' AlsoTry ')]//a"
 
 
 
 
 # remove yahoo-specific tracking-url
 # remove yahoo-specific tracking-url
@@ -91,11 +91,12 @@ def response(resp):
                         'content': content})
                         'content': content})
 
 
     # if no suggestion found, return results
     # if no suggestion found, return results
-    if not dom.xpath(suggestion_xpath):
+    suggestions = dom.xpath(suggestion_xpath)
+    if not suggestions:
         return results
         return results
 
 
     # parse suggestion
     # parse suggestion
-    for suggestion in dom.xpath(suggestion_xpath):
+    for suggestion in suggestions:
         # append suggestion
         # append suggestion
         results.append({'suggestion': extract_text(suggestion)})
         results.append({'suggestion': extract_text(suggestion)})
 
 

+ 59 - 62
searx/tests/engines/test_yahoo.py

@@ -55,86 +55,83 @@ class TestYahooEngine(SearxTestCase):
         self.assertEqual(yahoo.response(response), [])
         self.assertEqual(yahoo.response(response), [])
 
 
         html = """
         html = """
-        <div class="res">
-            <div>
-                <h3>
-                <a id="link-1" class="yschttl spt" href="http://r.search.yahoo.com/_ylt=A0LEVzClb9JUSKcAEGRXNyoA;
-                    _ylu=X3oDMTEzZm1qazYwBHNlYwNzcgRwb3MDMQRjb2xvA2JmMQR2dGlkA1NNRTcwM18x/RV=2/RE=1423106085/RO=10
-                    /RU=https%3a%2f%2fthis.is.the.url%2f/RK=0/RS=dtcJsfP4mEeBOjnVfUQ-"target="_blank" data-bk="5063.1">
-                    <b>This</b> is the title
-                </a>
+<ol class="reg mb-15 searchCenterMiddle">
+    <li class="first">
+        <div class="dd algo fst Sr">
+            <div class="compTitle">
+                <h3 class="title"><a class=" td-u" href="http://r.search.yahoo.com/_ylt=A0LEb9JUSKcAEGRXNyoA;
+                     _ylu=X3oDMTEzZm1qazYwBHNlYwNzcgRwb3MDMQRjb2xvA2Jm2dGlkA1NNRTcwM18x/RV=2/RE=1423106085/RO=10
+                     /RU=https%3a%2f%2fthis.is.the.url%2f/RK=0/RS=dtcJsfP4mEeBOjnVfUQ-"
+                     target="_blank" data-bid="54e712e13671c">
+                     <b><b>This is the title</b></b></a>
                 </h3>
                 </h3>
             </div>
             </div>
-            <span class="url" dir="ltr">www.<b>test</b>.com</span>
-            <div class="abstr">
-                <b>This</b> is the content
+            <div class="compText aAbs">
+                <p class="lh-18"><b><b>This is the </b>content</b>
+                </p>
             </div>
             </div>
         </div>
         </div>
-        <div id="satat"  data-bns="Yahoo" data-bk="124.1">
-            <h2>Also Try</h2>
-            <table>
-                <tbody>
-                    <tr>
-                        <td>
-                            <a id="srpnat0" class="" href="https://search.yahoo.com/search=rs-bottom" >
-                                <span>
-                                    <b></b>This is <b>the suggestion</b>
-                                </span>
-                            </a>
-                        </td>
-                    </tr>
-                </tbody>
-            </table>
+    </li>
+    <li>
+        <div class="dd algo lst Sr">
+            <div class="compTitle">
+                <h3 class="title"><a class=" td-u" href="http://r.search.yahoo.com/_ylt=AwrBT7zgEudUW.wAe2ZXNyoA;
+                     _ylu=X3oDMTBybGY3bmpvBGNvbG8DYmYxBHBvcwMyBHZ0aWQDBHNlYwNzcg--/RV=2\/RE=1424458593/RO=10
+                     /RU=https%3a%2f%2fthis.is.the.second.url%2f/RK=0/RS=jIctjj_cBH1Efj88GCgHKp3__Qk-"
+                     target="_blank" data-bid="54e712e136926">
+                     This is the second <b><b>title</b></b></a>
+                </h3>
+            </div>
+            <div class="compText aAbs">
+                <p class="lh-18">This is the second content</p>
+            </div>
         </div>
         </div>
+    </li>
+</ol>
+<div class="dd assist fst lst AlsoTry" data-bid="54e712e138d04">
+    <div class="compTitle mb-4 h-17">
+        <h3 class="title">Also Try</h3> </div>
+    <table class="compTable m-0 ac-1st td-u fz-ms">
+        <tbody>
+            <tr>
+                <td class="w-50p pr-28"><a href="https://search.yahoo.com/"><B>This is the </B>suggestion<B></B></a>
+                </td>
+            </tr>
+    </table>
+</div>
         """
         """
         response = mock.Mock(text=html)
         response = mock.Mock(text=html)
         results = yahoo.response(response)
         results = yahoo.response(response)
+        print results
         self.assertEqual(type(results), list)
         self.assertEqual(type(results), list)
-        self.assertEqual(len(results), 2)
+        self.assertEqual(len(results), 3)
         self.assertEqual(results[0]['title'], 'This is the title')
         self.assertEqual(results[0]['title'], 'This is the title')
         self.assertEqual(results[0]['url'], 'https://this.is.the.url/')
         self.assertEqual(results[0]['url'], 'https://this.is.the.url/')
         self.assertEqual(results[0]['content'], 'This is the content')
         self.assertEqual(results[0]['content'], 'This is the content')
-        self.assertEqual(results[1]['suggestion'], 'This is the suggestion')
+        self.assertEqual(results[1]['title'], 'This is the second title')
+        self.assertEqual(results[1]['url'], 'https://this.is.the.second.url/')
+        self.assertEqual(results[1]['content'], 'This is the second content')
+        self.assertEqual(results[2]['suggestion'], 'This is the suggestion')
 
 
         html = """
         html = """
-        <div class="res">
-            <div>
-                <h3>
-                <a id="link-1" class="yschttl spt" href="http://r.search.yahoo.com/_ylt=A0LEVzClb9JUSKcAEGRXNyoA;
-                    _ylu=X3oDMTEzZm1qazYwBHNlYwNzcgRwb3MDMQRjb2xvA2JmMQR2dGlkA1NNRTcwM18x/RV=2/RE=1423106085/RO=10
-                    /RU=https%3a%2f%2fthis.is.the.url%2f/RK=0/RS=dtcJsfP4mEeBOjnVfUQ-"target="_blank" data-bk="5063.1">
-                    <b>This</b> is the title
-                </a>
-                </h3>
-            </div>
-            <span class="url" dir="ltr">www.<b>test</b>.com</span>
-            <div class="abstr">
-                <b>This</b> is the content
-            </div>
-        </div>
-        <div class="res">
-            <div>
-                <h3>
-                <a id="link-1" class="yschttl spt">
-                    <b>This</b> is the title
-                </a>
-                </h3>
-            </div>
-            <span class="url" dir="ltr">www.<b>test</b>.com</span>
-            <div class="abstr">
-                <b>This</b> is the content
-            </div>
-        </div>
-        <div class="res">
-            <div>
-                <h3>
+<ol class="reg mb-15 searchCenterMiddle">
+    <li class="first">
+        <div class="dd algo fst Sr">
+            <div class="compTitle">
+                <h3 class="title"><a class=" td-u" href="http://r.search.yahoo.com/_ylt=A0LEb9JUSKcAEGRXNyoA;
+                     _ylu=X3oDMTEzZm1qazYwBHNlYwNzcgRwb3MDMQRjb2xvA2Jm2dGlkA1NNRTcwM18x/RV=2/RE=1423106085/RO=10
+                     /RU=https%3a%2f%2fthis.is.the.url%2f/RK=0/RS=dtcJsfP4mEeBOjnVfUQ-"
+                     target="_blank" data-bid="54e712e13671c">
+                  <b><b>This is the title</b></b></a>
                 </h3>
                 </h3>
             </div>
             </div>
-            <span class="url" dir="ltr">www.<b>test</b>.com</span>
-            <div class="abstr">
-                <b>This</b> is the content
+            <div class="compText aAbs">
+                <p class="lh-18"><b><b>This is the </b>content</b>
+                </p>
             </div>
             </div>
         </div>
         </div>
+    </li>
+</ol>
         """
         """
         response = mock.Mock(text=html)
         response = mock.Mock(text=html)
         results = yahoo.response(response)
         results = yahoo.response(response)