Browse Source

Merge pull request #1628 from MarcAbonce/google_fix

[fix] Update xpaths for new Google results page
Adam Tauber 5 years ago
parent
commit
4cddb829f9
2 changed files with 44 additions and 94 deletions
  1. 7 14
      searx/engines/google.py
  2. 37 80
      tests/unit/engines/test_google.py

+ 7 - 14
searx/engines/google.py

@@ -107,13 +107,12 @@ images_path = '/images'
 supported_languages_url = 'https://www.google.com/preferences?#languages'
 
 # specific xpath variables
-results_xpath = '//div[@class="g"]'
-url_xpath = './/h3/a/@href'
-title_xpath = './/h3'
-content_xpath = './/span[@class="st"]'
-content_misc_xpath = './/div[@class="f slp"]'
-suggestion_xpath = '//p[@class="_Bmc"]'
-spelling_suggestion_xpath = '//a[@class="spell"]'
+results_xpath = '//div[contains(@class, "ZINbbc")]'
+url_xpath = './/div[@class="kCrYT"][1]/a/@href'
+title_xpath = './/div[@class="kCrYT"][1]/a/div[1]'
+content_xpath = './/div[@class="kCrYT"][2]//div[contains(@class, "BNeawe")]//div[contains(@class, "BNeawe")]'
+suggestion_xpath = '//div[contains(@class, "ZINbbc")][last()]//div[@class="rVLSBd"]/a//div[contains(@class, "BNeawe")]'
+spelling_suggestion_xpath = '//div[@id="scc"]//a'
 
 # map : detail location
 map_address_xpath = './/div[@class="s"]//table//td[2]/span/text()'
@@ -199,10 +198,6 @@ def request(query, params):
     params['headers']['Accept-Language'] = language + ',' + language + '-' + country
     params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
 
-    # Force Safari 3.1 on Mac OS X (Leopard) user agent to avoid loading the new UI that Searx can't parse
-    params['headers']['User-Agent'] = ("Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_4)"
-                                       "AppleWebKit/525.18 (KHTML, like Gecko) Version/3.1.2 Safari/525.20.1")
-
     params['google_hostname'] = google_hostname
 
     return params
@@ -274,9 +269,7 @@ def response(resp):
                 content = extract_text_from_dom(result, content_xpath)
                 if content is None:
                     continue
-                content_misc = extract_text_from_dom(result, content_misc_xpath)
-                if content_misc is not None:
-                    content = content_misc + "<br />" + content
+
                 # append result
                 results.append({'url': url,
                                 'title': title,

+ 37 - 80
tests/unit/engines/test_google.py

@@ -58,93 +58,50 @@ class TestGoogleEngine(SearxTestCase):
         self.assertEqual(google.response(response), [])
 
         html = """
-        <div class="g">
-            <h3 class="r">
-                <a href="http://this.should.be.the.link/">
-                    <b>This</b> is <b>the</b> title
-                </a>
-            </h3>
-            <div class="s">
-                <div class="kv" style="margin-bottom:2px">
-                    <cite>
-                        <b>test</b>.psychologies.com/
-                    </cite>
-                    <div class="_nBb">‎
-                        <div style="display:inline" onclick="google.sham(this);" aria-expanded="false"
-                            aria-haspopup="true" tabindex="0" data-ved="0CBUQ7B0wAA">
-                            <span class="_O0">
-                            </span>
+        <div class="ZINbbc xpd O9g5cc uUPGi">
+            <div>
+                <div class="kCrYT">
+                    <a href="/url?q=http://this.should.be.the.link/">
+                        <div class="BNeawe">
+                            <b>This</b> is <b>the</b> title
                         </div>
-                        <div style="display:none" class="am-dropdown-menu" role="menu" tabindex="-1">
-                            <ul>
-                                <li class="_Ykb">
-                                    <a class="_Zkb" href="http://www.google.fr/url?url=http://webcache.googleusercontent
-                                        .com/search%3Fcache:R1Z_4pGXjuIJ:http://test.psychologies.com/">
-                                        En cache
-                                    </a>
-                                </li>
-                                <li class="_Ykb">
-                                    <a class="_Zkb" href="/search?safe=off&amp;q=related:test.psy.com/">
-                                        Pages similaires
-                                    </a>
-                                </li>
-                            </ul>
+                        <div class="BNeawe">
+                            http://website
+                        </div>
+                    </a>
+                </div>
+                <div class="kCrYT">
+                    <div>
+                        <div class="BNeawe">
+                            <div>
+                                <div class="BNeawe">
+                                    This should be the content.
+                                </div>
+                            </div>
                         </div>
                     </div>
                 </div>
-                <span class="st">
-                    This should be the content.
-                </span>
-                <br>
-                <div class="osl">‎
-                    <a href="http://www.google.fr/url?url=http://test.psychologies.com/tests/">
-                        Test Personnalité
-                    </a> - ‎
-                    <a href="http://www.google.fr/url?url=http://test.psychologies.com/test/">
-                        Tests - Moi
-                    </a> - ‎
-                    <a href="http://www.google.fr/url?url=http://test.psychologies.com/test/tests-couple">
-                        Test Couple
-                    </a>
-                    - ‎
-                    <a href="http://www.google.fr/url?url=http://test.psychologies.com/tests/tests-amour">
-                        Test Amour
+            </div>
+        </div>
+        <div class="ZINbbc xpd O9g5cc uUPGi">
+            <div>
+                <div class="kCrYT">
+                    <span>
+                        <div class="BNeawe">
+                            Related searches
+                        </div>
+                    </span>
+                </div>
+                <div class="rVLSBd">
+                    <a>
+                        <div>
+                            <div class="BNeawe">
+                                suggestion title
+                            </div>
+                        </div>
                     </a>
                 </div>
             </div>
-        </div>
-        <div class="g">
-            <h3 class="r">
-                <a href="http://www.google.com/images?q=toto">
-                    <b>This</b>
-                </a>
-            </h3>
-        </div>
-        <div class="g">
-            <h3 class="r">
-                <a href="http://www.google.com/search?q=toto">
-                    <b>This</b> is
-                </a>
-            </h3>
-        </div>
-        <div class="g">
-            <h3 class="r">
-                <a href="€">
-                    <b>This</b> is <b>the</b>
-                </a>
-            </h3>
-        </div>
-        <div class="g">
-            <h3 class="r">
-                <a href="/url?q=url">
-                    <b>This</b> is <b>the</b>
-                </a>
-            </h3>
-        </div>
-        <p class="_Bmc" style="margin:3px 8px">
-            <a href="/search?num=20&amp;safe=off&amp;q=t&amp;revid=1754833769&amp;sa=X&amp;ei=-&amp;ved=">
-                suggestion <b>title</b>
-            </a>
         </p>
         """
         response = self.mock_response(html)