Browse Source

[feat] bing: raise error upon receiving wrong results page

0xhtml 2 months ago
parent
commit
6a2cb45791
1 changed file with 21 additions and 14 deletions
  1. 21 14
      searx/engines/bing.py

+ 21 - 14
searx/engines/bing.py

@@ -38,6 +38,7 @@ import babel.languages
 from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex
 from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex
 from searx.locales import language_tag, region_tag
 from searx.locales import language_tag, region_tag
 from searx.enginelib.traits import EngineTraits
 from searx.enginelib.traits import EngineTraits
+from searx.exceptions import SearxEngineAPIException
 
 
 if TYPE_CHECKING:
 if TYPE_CHECKING:
     import logging
     import logging
@@ -161,27 +162,33 @@ def response(resp):
         results.append({'url': url, 'title': title, 'content': content})
         results.append({'url': url, 'title': title, 'content': content})
 
 
     # get number_of_results
     # get number_of_results
-    try:
+    if results:
         result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
         result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
         if "-" in result_len_container:
         if "-" in result_len_container:
-
-            # Remove the part "from-to" for paginated request ...
-            result_len_container = result_len_container[result_len_container.find("-") * 2 + 2 :]
+            start_str, result_len_container = re.split(r'-\d+', result_len_container)
+            start = int(start_str)
+        else:
+            start = 1
 
 
         result_len_container = re.sub('[^0-9]', '', result_len_container)
         result_len_container = re.sub('[^0-9]', '', result_len_container)
-
         if len(result_len_container) > 0:
         if len(result_len_container) > 0:
             result_len = int(result_len_container)
             result_len = int(result_len_container)
 
 
-    except Exception as e:  # pylint: disable=broad-except
-        logger.debug('result error :\n%s', e)
-
-    if result_len and _page_offset(resp.search_params.get("pageno", 0)) > result_len:
-        # Avoid reading more results than available.
-        # For example, if there is 100 results from some search and we try to get results from 120 to 130,
-        # Bing will send back the results from 0 to 10 and no error.
-        # If we compare results count with the first parameter of the request we can avoid this "invalid" results.
-        return []
+        expected_start = _page_offset(resp.search_params.get("pageno", 1))
+
+        if expected_start != start:
+            if expected_start > result_len:
+                # Avoid reading more results than available.
+                # For example, if there are 100 results for some search and we try to get results from 120 to 130,
+                # Bing will send back the results from 0 to 10 and no error.
+                # If we compare the results count with the first parameter of the request we can avoid these
+                # "invalid" results.
+                return []
+
+            # Sometimes Bing will send back the first result page instead of the requested page as a rate limiting
+            # measure.
+            msg = f"Expected results to start at {expected_start}, but got results starting at {start}"
+            raise SearxEngineAPIException(msg)
 
 
     results.append({'number_of_results': result_len})
     results.append({'number_of_results': result_len})
     return results
     return results