Browse Source

[enh] checker: background check

See settings.yml for the options
SIGUSR1 signal starts the checker.
The result is available at /stats/checker
Alexandre Flament 4 years ago
parent
commit
3a9f513521

+ 5 - 67
searx/search/__init__.py

@@ -28,7 +28,9 @@ from searx.external_bang import get_bang_url
 from searx.results import ResultContainer
 from searx import logger
 from searx.plugins import plugins
+from searx.search.models import EngineRef, SearchQuery
 from searx.search.processors import processors, initialize as initialize_processors
+from searx.search.checker import initialize as initialize_checker
 
 
 logger = logger.getChild('search')
@@ -45,75 +47,11 @@ else:
         sys.exit(1)
 
 
-def initialize(settings_engines=None):
+def initialize(settings_engines=None, enable_checker=False):
     settings_engines = settings_engines or settings['engines']
     initialize_processors(settings_engines)
-
-
-class EngineRef:
-
-    __slots__ = 'name', 'category'
-
-    def __init__(self, name: str, category: str):
-        self.name = name
-        self.category = category
-
-    def __repr__(self):
-        return "EngineRef({!r}, {!r})".format(self.name, self.category)
-
-    def __eq__(self, other):
-        return self.name == other.name and self.category == other.category
-
-    def __hash__(self):
-        return hash((self.name, self.category))
-
-
-class SearchQuery:
-    """container for all the search parameters (query, language, etc...)"""
-
-    __slots__ = 'query', 'engineref_list', 'lang', 'safesearch', 'pageno', 'time_range',\
-                'timeout_limit', 'external_bang'
-
-    def __init__(self,
-                 query: str,
-                 engineref_list: typing.List[EngineRef],
-                 lang: str='all',
-                 safesearch: int=0,
-                 pageno: int=1,
-                 time_range: typing.Optional[str]=None,
-                 timeout_limit: typing.Optional[float]=None,
-                 external_bang: typing.Optional[str]=None):
-        self.query = query
-        self.engineref_list = engineref_list
-        self.lang = lang
-        self.safesearch = safesearch
-        self.pageno = pageno
-        self.time_range = time_range
-        self.timeout_limit = timeout_limit
-        self.external_bang = external_bang
-
-    @property
-    def categories(self):
-        return list(set(map(lambda engineref: engineref.category, self.engineref_list)))
-
-    def __repr__(self):
-        return "SearchQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".\
-               format(self.query, self.engineref_list, self.lang, self.safesearch,
-                      self.pageno, self.time_range, self.timeout_limit, self.external_bang)
-
-    def __eq__(self, other):
-        return self.query == other.query\
-            and self.engineref_list == other.engineref_list\
-            and self.lang == other.lang\
-            and self.safesearch == other.safesearch\
-            and self.pageno == other.pageno\
-            and self.time_range == other.time_range\
-            and self.timeout_limit == other.timeout_limit\
-            and self.external_bang == other.external_bang
-
-    def __hash__(self):
-        return hash((self.query, tuple(self.engineref_list), self.lang, self.safesearch, self.pageno, self.time_range,
-                     self.timeout_limit, self.external_bang))
+    if enable_checker:
+        initialize_checker()
 
 
 class Search:

+ 3 - 0
searx/search/checker/__init__.py

@@ -1 +1,4 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
 from .impl import Checker
+from .background import initialize, get_result

+ 23 - 7
searx/search/checker/__main__.py

@@ -1,9 +1,13 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
 import sys
 import os
+import argparse
 
 import searx.search
-import searx.search.processors
 import searx.search.checker
+from searx.search import processors
+from searx.engines import engine_shortcuts
 
 
 if sys.stdout.isatty() and os.environ.get('TERM') not in ['dumb', 'unknown']:
@@ -18,20 +22,24 @@ else:
     BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = "", "", "", "", "", "", "", ""
 
 
-def iter_processor():
-    if len(sys.argv) > 1:
-        for name, processor in searx.search.processors.items():
-            if name in sys.argv:
+def iter_processor(engine_name_list):
+    if len(engine_name_list) > 0:
+        for name in engine_name_list:
+            name = engine_shortcuts.get(name, name)
+            processor = processors.get(name)
+            if processor is not None:
                 yield name, processor
+            else:
+                print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, RED, ' Not found ', RESET_SEQ)
     else:
         for name, processor in searx.search.processors.items():
             yield name, processor
 
 
-def main():
+def run(engine_name_list):
     searx.search.initialize()
     broken_urls = []
-    for name, processor in iter_processor():
+    for name, processor in iter_processor(engine_name_list):
         if sys.stdout.isatty():
             print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, WHITE, ' Checking', RESET_SEQ)
         checker = searx.search.checker.Checker(processor)
@@ -48,5 +56,13 @@ def main():
         print('Error fetching', url)
 
 
+def main():
+    """Command line entry point: parse the engine names and run the checker."""
+    arg_parser = argparse.ArgumentParser(description='Check searx engines.')
+    arg_parser.add_argument(
+        'engine_name_list',
+        metavar='engine name',
+        type=str,
+        nargs='*',
+        help='engines name or shortcut list. Empty for all engines.'
+    )
+    engine_name_list = arg_parser.parse_args().engine_name_list
+    run(engine_name_list)
+
+
 if __name__ == '__main__':
     main()

+ 106 - 0
searx/search/checker/background.py

@@ -0,0 +1,106 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
+import json
+import random
+import time
+import threading
+import os
+import signal
+
+from searx import logger, settings, searx_debug
+from searx.exceptions import SearxSettingsException
+from searx.search.processors import processors
+from searx.search.checker import Checker
+from searx.shared import schedule, storage
+
+
+CHECKER_RESULT = 'CHECKER_RESULT'
+running = threading.Lock()
+
+
+def _get_interval(every, error_msg):
+    """Normalize an interval setting to a ``(minimum, maximum)`` pair.
+
+    :param every: either a single ``int`` (used for both bounds) or a
+        two-item tuple / list of ``int``.
+    :param error_msg: message of the exception raised for an invalid value.
+    :returns: the two-item interval; an ``int`` input becomes a tuple, a
+        tuple / list input is returned unchanged.
+    :raises SearxSettingsException: if the value has the wrong type or
+        length, or if the minimum is greater than the maximum.
+    """
+    if isinstance(every, int):
+        every = (every, every)
+    if not isinstance(every, (tuple, list))\
+       or len(every) != 2\
+       or not isinstance(every[0], int)\
+       or not isinstance(every[1], int)\
+       or every[0] > every[1]:
+        # a reversed interval would otherwise only fail later, inside
+        # random.randint(), with an unrelated "empty range" ValueError
+        raise SearxSettingsException(error_msg, None)
+    return every
+
+
+def _get_every():
+    """Read ``checker.scheduling.every`` from the settings and validate it.
+
+    Falls back to ``(300, 1800)`` when the option, or one of its parent
+    sections, is missing.
+    """
+    scheduling = settings.get('checker', {}).get('scheduling', {})
+    interval = scheduling.get('every', (300, 1800))
+    return _get_interval(interval, 'checker.scheduling.every is not a int or list')
+
+
+def get_result():
+    """Return the checker result found in the shared storage.
+
+    :returns: the JSON-decoded value stored under ``CHECKER_RESULT``, or
+        ``None`` when nothing is stored under that key.
+    """
+    # use the module constant rather than a duplicated string literal
+    serialized_result = storage.get_str(CHECKER_RESULT)
+    if serialized_result is None:
+        return None
+    return json.loads(serialized_result)
+
+
+def run():
+    """Check every engine and store the outcome in the shared storage.
+
+    Only one check runs at a time: when the ``running`` lock is already
+    held, this call returns immediately without doing anything.
+
+    The stored value is a JSON object mapping each engine name to
+    ``{'status': True}`` or to ``{'status': False, 'errors': ...}``.
+    Any exception is logged and then re-raised.
+    """
+    if not running.acquire(blocking=False):
+        return
+    try:
+        logger.info('Starting checker')
+        result = {}
+        for name, processor in processors.items():
+            logger.debug('Checking %s engine', name)
+            checker = Checker(processor)
+            checker.run()
+            if checker.test_results.succesfull:
+                result[name] = {'status': True}
+            else:
+                result[name] = {'status': False, 'errors': checker.test_results.errors}
+
+        # use the module constant rather than a duplicated string literal
+        storage.set_str(CHECKER_RESULT, json.dumps(result))
+        logger.info('Check done')
+    except Exception:
+        # run() is started from background threads (signal handler, timer):
+        # record the failure in the application log before propagating it
+        logger.exception('Error while running the checker')
+        raise
+    finally:
+        running.release()
+
+
+def _run_with_delay():
+    """Sleep for a random delay, then run the checker.
+
+    The delay is drawn between 0 and the width of the
+    ``checker.scheduling.every`` interval.
+    """
+    lower, upper = _get_every()
+    delay = random.randint(0, upper - lower)
+    logger.debug('Start checker in %i seconds', delay)
+    time.sleep(delay)
+    run()
+
+
+def _start_scheduling():
+    """Register ``_run_with_delay`` with ``schedule()`` and run a check now.
+
+    The lower bound of ``checker.scheduling.every`` is passed to
+    ``schedule()`` as its first argument.
+    """
+    lower_bound = _get_every()[0]
+    schedule(lower_bound, _run_with_delay)
+    run()
+
+
+def _signal_handler(signum, frame):
+    """Signal handler: start a checker run in a daemon thread.
+
+    ``signum`` and ``frame`` are required by the handler signature and are
+    not used.
+    """
+    threading.Thread(target=run, daemon=True).start()
+
+
+def initialize():
+    """Install the SIGUSR1 trigger and, unless disabled, start the scheduler.
+
+    The scheduler is not started when debug mode is on and
+    ``checker.off_when_debug`` is true (the default), nor when
+    ``checker.scheduling`` is missing or false.  Otherwise the first run is
+    started by a daemon timer after a random delay taken from
+    ``checker.scheduling.start_after``.
+
+    :raises SearxSettingsException: if ``checker.scheduling.every`` or
+        ``checker.scheduling.start_after`` is not a valid interval.
+    """
+    if hasattr(signal, 'SIGUSR1'):
+        # SIGUSR1 is not available on every platform (e.g. Windows)
+        logger.info('Send SIGUSR1 signal to pid %i to start the checker', os.getpid())
+        signal.signal(signal.SIGUSR1, _signal_handler)
+
+    # special case when debug is activated
+    if searx_debug and settings.get('checker', {}).get('off_when_debug', True):
+        logger.info('debug mode: checker is disabled')
+        return
+
+    scheduling = settings.get('checker', {}).get('scheduling', None)
+    if scheduling is None or not scheduling:
+        logger.info('Checker scheduler is disabled')
+        return
+
+    # check the value of checker.scheduling.every now, so that a wrong
+    # setting is reported at startup instead of later in the timer thread
+    _get_every()
+
+    # pick the delay before the first run
+    start_after = scheduling.get('start_after', (300, 1800))
+    start_after = _get_interval(start_after, 'checker.scheduling.start_after is not a int or list')
+    delay = random.randint(start_after[0], start_after[1])
+    logger.info('Start checker in %i seconds', delay)
+    t = threading.Timer(delay, _start_scheduling)
+    t.daemon = True
+    t.start()

+ 11 - 1
searx/search/checker/impl.py

@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
 import typing
 import types
 import functools
@@ -11,7 +13,7 @@ import requests.exceptions
 
 from searx import poolrequests, logger
 from searx.results import ResultContainer
-from searx.search import SearchQuery, EngineRef
+from searx.search.models import SearchQuery, EngineRef
 from searx.search.processors import EngineProcessor
 
 
@@ -240,18 +242,24 @@ class ResultContainerTests:
             self._check_infoboxes(self.result_container.infoboxes)
 
     def has_infobox(self):
+        """Check the ResultContainer has at least one infobox"""
         if len(self.result_container.infoboxes) == 0:
             self._record_error('No infobox')
 
     def has_answer(self):
+        """Check the ResultContainer has at least one answer"""
         if len(self.result_container.answers) == 0:
             self._record_error('No answer')
 
     def has_language(self, lang):
+        """Check at least one title or content of the results is written in the `lang`.
+
+        Detected using pycld3; the detection may not be accurate."""
         if lang not in self.languages:
             self._record_error(lang + ' not found')
 
     def not_empty(self):
+        """Check the ResultContainer has at least one answer or infobox or result"""
         result_types = set()
         results = self.result_container.get_ordered_results()
         if len(results) > 0:
@@ -267,6 +275,7 @@ class ResultContainerTests:
             self._record_error('No result')
 
     def one_title_contains(self, title: str):
+        """Check that at least one title contains `title` (case-insensitive comparison)"""
         title = title.lower()
         for result in self.result_container.get_ordered_results():
             if title in result['title'].lower():
@@ -287,6 +296,7 @@ class CheckerTests:
         self.result_container_tests_list = result_container_tests_list
 
     def unique_results(self):
+        """Check that the results of each ResultContainer are unique"""
         urls_list = [rct.result_urls for rct in self.result_container_tests_list]
         if len(urls_list[0]) > 0:
             # results on the first page

+ 69 - 0
searx/search/models.py

@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
+import typing
+
+
+class EngineRef:
+    """Pair of an engine name and a category.
+
+    Instances are compared and hashed by value: ``(name, category)``.
+    """
+
+    __slots__ = 'name', 'category'
+
+    def __init__(self, name: str, category: str):
+        self.name = name
+        self.category = category
+
+    def __repr__(self):
+        return "EngineRef({!r}, {!r})".format(self.name, self.category)
+
+    def __eq__(self, other):
+        # let Python fall back to its default comparison for foreign types
+        # instead of failing with AttributeError on ``other.name``
+        if not isinstance(other, EngineRef):
+            return NotImplemented
+        return self.name == other.name and self.category == other.category
+
+    def __hash__(self):
+        return hash((self.name, self.category))
+
+
+class SearchQuery:
+    """container for all the search parameters (query, language, etc...)
+
+    Instances are compared and hashed by value, using every attribute
+    listed in ``__slots__``.
+    """
+
+    __slots__ = 'query', 'engineref_list', 'lang', 'safesearch', 'pageno', 'time_range',\
+                'timeout_limit', 'external_bang'
+
+    def __init__(self,
+                 query: str,
+                 engineref_list: typing.List[EngineRef],
+                 lang: str = 'all',
+                 safesearch: int = 0,
+                 pageno: int = 1,
+                 time_range: typing.Optional[str] = None,
+                 timeout_limit: typing.Optional[float] = None,
+                 external_bang: typing.Optional[str] = None):
+        self.query = query
+        self.engineref_list = engineref_list
+        self.lang = lang
+        self.safesearch = safesearch
+        self.pageno = pageno
+        self.time_range = time_range
+        self.timeout_limit = timeout_limit
+        self.external_bang = external_bang
+
+    @property
+    def categories(self):
+        """List of the distinct categories of ``engineref_list`` (order not defined)."""
+        return list({engineref.category for engineref in self.engineref_list})
+
+    def __repr__(self):
+        return "SearchQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".\
+               format(self.query, self.engineref_list, self.lang, self.safesearch,
+                      self.pageno, self.time_range, self.timeout_limit, self.external_bang)
+
+    def __eq__(self, other):
+        # let Python fall back to its default comparison for foreign types
+        # instead of failing with AttributeError on ``other.query``
+        if not isinstance(other, SearchQuery):
+            return NotImplemented
+        return self.query == other.query\
+            and self.engineref_list == other.engineref_list\
+            and self.lang == other.lang\
+            and self.safesearch == other.safesearch\
+            and self.pageno == other.pageno\
+            and self.time_range == other.time_range\
+            and self.timeout_limit == other.timeout_limit\
+            and self.external_bang == other.external_bang
+
+    def __hash__(self):
+        # engineref_list is a list: convert it to a tuple to make it hashable
+        return hash((self.query, tuple(self.engineref_list), self.lang, self.safesearch, self.pageno, self.time_range,
+                     self.timeout_limit, self.external_bang))

+ 27 - 18
searx/settings.yml

@@ -102,24 +102,33 @@ outgoing: # communication with search engines
 #   - "HTTPS rewrite"
 #   - ...
 
-additional_tests:
-  rosebud: &test_rosebud
-    matrix:
-        query: rosebud
-        lang: en
-    result_container:
-        - not_empty
-        - [one_title_contains', 'citizen kane']
-    test:
-        - unique_results
-
-tests:
-  infobox: &tests_infobox
-    infobox:
-        matrix:
-            query: ["linux", "new york", "bbc"]
-        result_container:
-            - has_infobox
+checker:
+    # disable checker when in debug mode
+    off_when_debug: True
+    # scheduling: interval or int
+    # use "scheduling: False" to disable scheduling
+    scheduling:
+        start_after: [300, 1800]  # delay to start the first run of the checker
+        every: [86400, 90000]  # how often the checker runs
+    # additional tests: only for the YAML anchors (see the engines section)
+    additional_tests:
+        rosebud: &test_rosebud
+          matrix:
+              query: rosebud
+              lang: en
+          result_container:
+              - not_empty
+              - ['one_title_contains', 'citizen kane']
+          test:
+              - unique_results
+    # tests: only for the YAML anchors (see the engines section)
+    tests:
+        infobox: &tests_infobox
+          infobox:
+              matrix:
+                  query: ["linux", "new york", "bbc"]
+              result_container:
+                  - has_infobox
 
 engines:
   - name: apk mirror

+ 9 - 3
searx/webapp.py

@@ -71,7 +71,8 @@ from searx.webadapter import get_search_query_from_webapp, get_selected_categori
 from searx.utils import html_to_text, gen_useragent, dict_subset, match_language
 from searx.version import VERSION_STRING
 from searx.languages import language_codes as languages
-from searx.search import SearchWithPlugins, initialize
+from searx.search import SearchWithPlugins, initialize as search_initialize
+from searx.search.checker import get_result as checker_get_result
 from searx.query import RawTextQuery
 from searx.autocomplete import searx_bang, backends as autocomplete_backends
 from searx.plugins import plugins
@@ -81,7 +82,6 @@ from searx.answerers import answerers
 from searx.poolrequests import get_global_proxies
 from searx.metrology.error_recorder import errors_per_engines
 
-
 # serve pages with HTTP/1.1
 from werkzeug.serving import WSGIRequestHandler
 WSGIRequestHandler.protocol_version = "HTTP/{}".format(settings['server'].get('http_protocol_version', '1.0'))
@@ -136,7 +136,7 @@ werkzeug_reloader = flask_run_development or (searx_debug and __name__ == "__mai
 # initialize the engines except on the first run of the werkzeug server.
 if not werkzeug_reloader\
    or (werkzeug_reloader and os.environ.get("WERKZEUG_RUN_MAIN") == "true"):
-    initialize()
+    search_initialize(enable_checker=True)
 
 babel = Babel(app)
 
@@ -977,6 +977,12 @@ def stats_errors():
     return jsonify(result)
 
 
+@app.route('/stats/checker', methods=['GET'])
+def stats_checker():
+    """Serve the value returned by the checker's ``get_result()`` as JSON."""
+    return jsonify(checker_get_result())
+
+
 @app.route('/robots.txt', methods=['GET'])
 def robots():
     return Response("""User-agent: *

+ 2 - 1
setup.py

@@ -49,7 +49,8 @@ setup(
     },
     entry_points={
         'console_scripts': [
-            'searx-run = searx.webapp:run'
+            'searx-run = searx.webapp:run',
+            'searx-checker = searx.search.checker.__main__:main'
         ]
     },
     package_data={