Browse Source

[mod] json_engine: add content_html_to_text and title_html_to_text

Some JSON APIs return HTML in either the title or the content.
This commit adds two new parameters to the json_engine:
content_html_to_text and title_html_to_text, False by default.

If True, searx.utils.html_to_text is applied to the field to strip the HTML tags.

Update crossref, openairedatasets and openairepublications engines
Alexandre Flament 4 years ago
parent
commit
ff84a1af35
2 changed files with 19 additions and 5 deletions
  1. 15 5
      searx/engines/json_engine.py
  2. 4 0
      searx/settings.yml

+ 15 - 5
searx/engines/json_engine.py

@@ -3,13 +3,15 @@
 from collections.abc import Iterable
 from json import loads
 from urllib.parse import urlencode
-from searx.utils import to_string
+from searx.utils import to_string, html_to_text
 
 
 search_url = None
 url_query = None
 content_query = None
 title_query = None
+content_html_to_text = False
+title_html_to_text = False
 paging = False
 suggestion_query = ''
 results_query = ''
@@ -92,9 +94,17 @@ def request(query, params):
     return params
 
 
+def identity(arg):
+    return arg
+
+
 def response(resp):
     results = []
     json = loads(resp.text)
+
+    title_filter = html_to_text if title_html_to_text else identity
+    content_filter = html_to_text if content_html_to_text else identity
+
     if results_query:
         rs = query(json, results_query)
         if not len(rs):
@@ -111,8 +121,8 @@ def response(resp):
                 content = ""
             results.append({
                 'url': to_string(url),
-                'title': to_string(title),
-                'content': to_string(content),
+                'title': title_filter(to_string(title)),
+                'content': content_filter(to_string(content)),
             })
     else:
         for url, title, content in zip(
@@ -122,8 +132,8 @@ def response(resp):
         ):
             results.append({
                 'url': to_string(url),
-                'title': to_string(title),
-                'content': to_string(content),
+                'title': title_filter(to_string(title)),
+                'content': content_filter(to_string(content)),
             })
 
     if not suggestion_query:

+ 4 - 0
searx/settings.yml

@@ -267,7 +267,9 @@ engines:
     search_url : https://search.crossref.org/dois?q={query}&page={pageno}
     url_query : doi
     title_query : title
+    title_html_to_text: True
     content_query : fullCitation
+    content_html_to_text: True
     categories : science
     shortcut : cr
     about:
@@ -757,6 +759,7 @@ engines:
     url_query : metadata/oaf:entity/oaf:result/children/instance/webresource/url/$
     title_query : metadata/oaf:entity/oaf:result/title/$
     content_query : metadata/oaf:entity/oaf:result/description/$
+    content_html_to_text: True
     categories : science
     shortcut : oad
     timeout: 5.0
@@ -776,6 +779,7 @@ engines:
     url_query : metadata/oaf:entity/oaf:result/children/instance/webresource/url/$
     title_query : metadata/oaf:entity/oaf:result/title/$
     content_query : metadata/oaf:entity/oaf:result/description/$
+    content_html_to_text: True
     categories : science
     shortcut : oap
     timeout: 5.0