
[doc] add documentation about the XPath engine

- pylint searx/engines/xpath.py
- fix indentation of some long lines
- add logging
- add doc-strings

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Markus Heiser, 3 years ago
commit 8cd544b2a6
4 changed files with 107 additions and 25 deletions
  1. docs/dev/engine_overview.rst (+5 -1)
  2. docs/dev/index.rst (+1 -0)
  3. docs/dev/xpath_engine.rst (+9 -0)
  4. searx/engines/xpath.py (+92 -24)

docs/dev/engine_overview.rst (+5 -1)

@@ -43,7 +43,7 @@ argument                type        information
 categories              list        pages, in which the engine is working
 paging                  boolean     support multible pages
 time_range_support      boolean     support search time range
-engine_type             str         ``online`` by default, other possibles values are 
+engine_type             str         ``online`` by default, other possibles values are
                                     ``offline``, ``online_dictionnary``, ``online_currency``
 ======================= =========== ========================================================

@@ -100,6 +100,8 @@ example code
    paging = True


+.. _engine request:
+
 making a request
 ================

@@ -198,6 +200,8 @@ example code
        return params


+.. _engine results:
+
 returned results
 ================


docs/dev/index.rst (+1 -0)

@@ -9,6 +9,7 @@ Developer documentation
    quickstart
    contribution_guide
    engine_overview
+   xpath_engine
    search_api
    plugins
    translation

docs/dev/xpath_engine.rst (+9 -0)

@@ -0,0 +1,9 @@
+.. _xpath_engine:
+
+================
+The XPath engine
+================
+
+.. automodule:: searx.engines.xpath
+  :members:
+

searx/engines/xpath.py (+92 -24)

@@ -1,51 +1,106 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+# pylint: disable=missing-function-docstring
+"""The XPath engine is a *generic* engine with which it is possible to configure
+engines in the settings.
+
+Here is a simple example of an XPath engine configured in the
+:ref:`settings engine` section; for further reading see :ref:`engines-dev`.
+
+.. code:: yaml
+
+  - name : bitbucket
+    engine : xpath
+    paging : True
+    search_url : https://bitbucket.org/repo/all/{pageno}?name={query}
+    url_xpath : //article[@class="repo-summary"]//a[@class="repo-link"]/@href
+    title_xpath : //article[@class="repo-summary"]//a[@class="repo-link"]
+    content_xpath : //article[@class="repo-summary"]/p
+
+"""
-from lxml import html
 from urllib.parse import urlencode
+
+from lxml import html
 from searx.utils import extract_text, extract_url, eval_xpath, eval_xpath_list
+from searx import logger
+
+logger = logger.getChild('XPath engine')

 search_url = None
+"""
+Search URL of the engine, replacements are:
+
+``{query}``:
+  Search terms from user.
+
+``{pageno}``:
+  Page number if the engine supports paging, see :py:obj:`paging`.
+
+"""
+
+soft_max_redirects = 0
+'''Maximum redirects, soft limit. Record an error but don't stop the engine'''
+
+results_xpath = ''
+'''XPath selector for the list of result items'''
+
 url_xpath = None
+'''XPath selector of result's ``url``.'''
+
 content_xpath = None
+'''XPath selector of result's ``content``.'''
+
 title_xpath = None
+'''XPath selector of result's ``title``.'''
+
 thumbnail_xpath = False
-paging = False
+'''XPath selector of result's ``img_src``.'''
+
 suggestion_xpath = ''
-results_xpath = ''
+'''XPath selector of result's ``suggestion``.'''
+
 cached_xpath = ''
 cached_url = ''
-soft_max_redirects = 0

-# parameters for engines with paging support
-#
-# number of results on each page
-# (only needed if the site requires not a page number, but an offset)
+paging = False
+'''Engine supports paging [True or False].'''
+
 page_size = 1
-# number of the first page (usually 0 or 1)
-first_page_num = 1
+'''Number of results on each page.  Only needed if the site expects an
+offset rather than a page number.'''

+first_page_num = 1
+'''Number of the first page (usually 0 or 1).'''
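
These two settings feed the page-number arithmetic in request() below. For a
site that pages by offset rather than page number, with (for example)
page_size = 20 and first_page_num = 0, searx page numbers map to offsets like
this:

.. code:: python

  # example values only; the defaults above are page_size=1, first_page_num=1
  page_size = 20       # the site shows 20 results per request
  first_page_num = 0   # the site counts offsets from 0

  for pageno in (1, 2, 3):
      offset = (pageno - 1) * page_size + first_page_num
      print(pageno, '->', offset)   # 1 -> 0, 2 -> 20, 3 -> 40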
 def request(query, params):
+    '''Build request parameters (see :ref:`engine request`).
+
+    '''
     query = urlencode({'q': query})[2:]

-    fp = {'query': query}
+    fargs = {'query': query}
     if paging and search_url.find('{pageno}') >= 0:
-        fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num
+        fargs['pageno'] = (params['pageno'] - 1) * page_size + first_page_num

-    params['url'] = search_url.format(**fp)
+    params['url'] = search_url.format(**fargs)
     params['query'] = query
     params['soft_max_redirects'] = soft_max_redirects
+    logger.debug("query_url --> %s", params['url'])

     return params

-
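
A hypothetical call of request(), assuming the module globals were filled from
the bitbucket example in the module docstring (a search_url with a {pageno}
placeholder and paging = True) and the defaults page_size = 1,
first_page_num = 1:

.. code:: python

  params = request('search syntax', {'pageno': 2})
  print(params['url'])
  # https://bitbucket.org/repo/all/2?name=search+syntax
  # with the defaults the searx page number passes through unchanged:
  # (2 - 1) * 1 + 1 == 2
  print(params['soft_max_redirects'])   # 0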
 def response(resp):
+    '''Scrape *results* from the response (see :ref:`engine results`).
+
+    '''
     results = []
     dom = html.fromstring(resp.text)
-    is_onion = True if 'onions' in categories else False  # pylint: disable=undefined-variable
+    is_onion = 'onions' in categories  # pylint: disable=undefined-variable

     if results_xpath:
         for result in eval_xpath_list(dom, results_xpath):
+
             url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url)
             title = extract_text(eval_xpath_list(result, title_xpath, min_len=1))
             content = extract_text(eval_xpath_list(result, content_xpath, min_len=1))
@@ -59,13 +114,16 @@ def response(resp):
             # add alternative cached url if available
             if cached_xpath:
-                tmp_result['cached_url'] = cached_url\
+                tmp_result['cached_url'] = (
+                    cached_url
                     + extract_text(eval_xpath_list(result, cached_xpath, min_len=1))
+                )

             if is_onion:
                 tmp_result['is_onion'] = True

             results.append(tmp_result)
+
     else:
         if cached_xpath:
             for url, title, content, cached in zip(
@@ -75,8 +133,12 @@ def response(resp):
                 map(extract_text, eval_xpath_list(dom, content_xpath)),
                 map(extract_text, eval_xpath_list(dom, cached_xpath))
             ):
-                results.append({'url': url, 'title': title, 'content': content,
-                                'cached_url': cached_url + cached, 'is_onion': is_onion})
+                results.append({
+                    'url': url,
+                    'title': title,
+                    'content': content,
+                    'cached_url': cached_url + cached,
+                    'is_onion': is_onion,
+                })
         else:
             for url, title, content in zip(
                 (extract_url(x, search_url) for
@@ -84,10 +146,16 @@ def response(resp):
                 map(extract_text, eval_xpath_list(dom, title_xpath)),
                 map(extract_text, eval_xpath_list(dom, content_xpath))
             ):
-                results.append({'url': url, 'title': title, 'content': content, 'is_onion': is_onion})
-
-    if not suggestion_xpath:
-        return results
-    for suggestion in eval_xpath(dom, suggestion_xpath):
-        results.append({'suggestion': extract_text(suggestion)})
+                results.append({
+                    'url': url,
+                    'title': title,
+                    'content': content,
+                    'is_onion': is_onion
+                })
+
+    if suggestion_xpath:
+        for suggestion in eval_xpath(dom, suggestion_xpath):
+            results.append({'suggestion': extract_text(suggestion)})
+
+    logger.debug("found %s results", len(results))
     return results
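
And a hypothetical smoke test for response(), again assuming the bitbucket
selectors from the module docstring (no results_xpath is set, so the zip()
branch runs) and that the loader has populated categories; FakeResponse stands
in for the response object searx normally passes in:

.. code:: python

  class FakeResponse:
      text = '''
        <article class="repo-summary">
          <a class="repo-link" href="/repo/foo">foo</a>
          <p>a demo repository</p>
        </article>'''

  for result in response(FakeResponse()):
      print(result)
  # expected, with the relative href resolved against search_url:
  # {'url': 'https://bitbucket.org/repo/foo', 'title': 'foo',
  #  'content': 'a demo repository', 'is_onion': False}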