Browse Source

Merge pull request #683 from return42/fix-doc

Document & Pylint scripts in searxng_extra/update
Martin Fischer 3 years ago
parent
commit
160f3e022e

+ 5 - 4
docs/dev/searxng_extra/index.rst

@@ -1,14 +1,15 @@
 .. _searxng_extra:
 
-======================================================
-Tooling box ``searxng_extra`` for developers and users
-======================================================
+=============================
+Tooling box ``searxng_extra``
+=============================
 
-In the folder :origin:`searxng_extra/` we maintain some tools useful for
+In the folder :origin:`searxng_extra/` we maintain some tools useful for CI and
 developers.
 
 .. toctree::
    :maxdepth: 2
    :caption: Contents
 
+   update
    standalone_searx.py

+ 88 - 0
docs/dev/searxng_extra/update.rst

@@ -0,0 +1,88 @@
+=========================
+``searxng_extra/update/``
+=========================
+
+:origin:`[source] <searxng_extra/update/__init__.py>`
+
+Scripts to update static data in :origin:`searx/data/`
+
+.. _update_ahmia_blacklist.py:
+
+``update_ahmia_blacklist.py``
+=============================
+
+:origin:`[source] <searxng_extra/update/update_ahmia_blacklist.py>`
+
+.. automodule:: searxng_extra.update.update_ahmia_blacklist
+  :members:
+
+
+``update_currencies.py``
+========================
+
+:origin:`[source] <searxng_extra/update/update_currencies.py>`
+
+.. automodule:: searxng_extra.update.update_currencies
+  :members:
+
+``update_engine_descriptions.py``
+=================================
+
+:origin:`[source] <searxng_extra/update/update_engine_descriptions.py>`
+
+.. automodule:: searxng_extra.update.update_engine_descriptions
+  :members:
+
+
+``update_external_bangs.py``
+============================
+
+:origin:`[source] <searxng_extra/update/update_external_bangs.py>`
+
+.. automodule:: searxng_extra.update.update_external_bangs
+  :members:
+
+
+``update_firefox_version.py``
+=============================
+
+:origin:`[source] <searxng_extra/update/update_firefox_version.py>`
+
+.. automodule:: searxng_extra.update.update_firefox_version
+  :members:
+
+
+``update_languages.py``
+=======================
+
+:origin:`[source] <searxng_extra/update/update_languages.py>`
+
+.. automodule:: searxng_extra.update.update_languages
+  :members:
+
+
+``update_osm_keys_tags.py``
+===========================
+
+:origin:`[source] <searxng_extra/update/update_osm_keys_tags.py>`
+
+.. automodule:: searxng_extra.update.update_osm_keys_tags
+  :members:
+
+
+``update_pygments.py``
+======================
+
+:origin:`[source] <searxng_extra/update/update_pygments.py>`
+
+.. automodule:: searxng_extra.update.update_pygments
+  :members:
+
+
+``update_wikidata_units.py``
+============================
+
+:origin:`[source] <searxng_extra/update/update_wikidata_units.py>`
+
+.. automodule:: searxng_extra.update.update_wikidata_units
+  :members:

+ 13 - 9
searxng_extra/update/update_ahmia_blacklist.py

@@ -1,10 +1,15 @@
 #!/usr/bin/env python
+# lint: pylint
 # SPDX-License-Identifier: AGPL-3.0-or-later
+"""This script saves `Ahmia's blacklist`_ for onion sites.
 
-# This script saves Ahmia's blacklist for onion sites.
-# More info in https://ahmia.fi/blacklist/
+Output file: :origin:`searx/data/ahmia_blacklist.txt` (:origin:`CI Update data
+...  <.github/workflows/data-update.yml>`).
+
+.. _Ahmia's blacklist: https://ahmia.fi/blacklist/
+
+"""
 
-# set path
 from os.path import join
 
 import requests
@@ -17,15 +22,14 @@ def fetch_ahmia_blacklist():
     resp = requests.get(URL, timeout=3.0)
     if resp.status_code != 200:
         raise Exception("Error fetching Ahmia blacklist, HTTP code " + resp.status_code)
-    else:
-        blacklist = resp.text.split()
-        return blacklist
+    return resp.text.split()
 
 
 def get_ahmia_blacklist_filename():
     return join(join(searx_dir, "data"), "ahmia_blacklist.txt")
 
 
-blacklist = fetch_ahmia_blacklist()
-with open(get_ahmia_blacklist_filename(), "w") as f:
-    f.write('\n'.join(blacklist))
+if __name__ == '__main__':
+    blacklist = fetch_ahmia_blacklist()
+    with open(get_ahmia_blacklist_filename(), "w", encoding='utf-8') as f:
+        f.write('\n'.join(blacklist))

+ 11 - 2
searxng_extra/update/update_currencies.py

@@ -1,13 +1,22 @@
 #!/usr/bin/env python
+# lint: pylint
 # SPDX-License-Identifier: AGPL-3.0-or-later
 
+"""Fetch currencies from :origin:`searx/engines/wikidata.py` engine.
+
+Output file: :origin:`searx/data/currencies.json` (:origin:`CI Update data ...
+<.github/workflows/data-update.yml>`).
+
+"""
+
+# pylint: disable=invalid-name
+
 import re
 import unicodedata
 import json
 
 # set path
-from sys import path
-from os.path import realpath, dirname, join
+from os.path import join
 
 from searx import searx_dir
 from searx.locales import LOCALE_NAMES

+ 12 - 2
searxng_extra/update/update_engine_descriptions.py

@@ -1,6 +1,16 @@
 #!/usr/bin/env python
+# lint: pylint
 # SPDX-License-Identifier: AGPL-3.0-or-later
 
+"""Fetch website description from websites and from
+:origin:`searx/engines/wikidata.py` engine.
+
+Output file: :origin:`searx/data/engine_descriptions.json`.
+
+"""
+
+# pylint: disable=invalid-name, global-statement
+
 import json
 from urllib.parse import urlparse
 from os.path import join
@@ -102,7 +112,7 @@ def get_wikipedia_summary(lang, pageid):
         response.raise_for_status()
         api_result = json.loads(response.text)
         return api_result.get('extract')
-    except:
+    except Exception:  # pylint: disable=broad-except
         return None
 
 
@@ -134,7 +144,7 @@ def get_website_description(url, lang1, lang2=None):
     try:
         response = searx.network.get(url, headers=headers, timeout=10)
         response.raise_for_status()
-    except Exception:
+    except Exception:  # pylint: disable=broad-except
         return (None, None)
 
     try:

+ 8 - 5
searxng_extra/update/update_external_bangs.py

@@ -1,17 +1,20 @@
 #!/usr/bin/env python
 # lint: pylint
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""
-Update searx/data/external_bangs.json using the duckduckgo bangs.
+"""Update :origin:`searx/data/external_bangs.json` using the duckduckgo bangs
+(:origin:`CI Update data ... <.github/workflows/data-update.yml>`).
+
+https://duckduckgo.com/newbang loads:
 
-https://duckduckgo.com/newbang loads
 * a javascript which provides the bang version ( https://duckduckgo.com/bv1.js )
 * a JSON file which contains the bangs ( https://duckduckgo.com/bang.v260.js for example )
 
 This script loads the javascript, then the bangs.
 
-The javascript URL may change in the future ( for example https://duckduckgo.com/bv2.js ),
-but most probably it will requires to update RE_BANG_VERSION
+The javascript URL may change in the future ( for example
+https://duckduckgo.com/bv2.js ), but most probably it will require updating
+RE_BANG_VERSION
+
 """
 # pylint: disable=C0116
 

+ 30 - 21
searxng_extra/update/update_firefox_version.py

@@ -1,21 +1,30 @@
 #!/usr/bin/env python
+# lint: pylint
 # SPDX-License-Identifier: AGPL-3.0-or-later
 
+"""Fetch firefox useragent signatures
+
+Output file: :origin:`searx/data/useragents.json` (:origin:`CI Update data ...
+<.github/workflows/data-update.yml>`).
+
+"""
+
 import json
-import requests
 import re
-from os.path import dirname, join
+from os.path import join
 from urllib.parse import urlparse, urljoin
-from distutils.version import LooseVersion, StrictVersion
+from distutils.version import LooseVersion
+
+import requests
 from lxml import html
 from searx import searx_dir
 
 URL = 'https://ftp.mozilla.org/pub/firefox/releases/'
 RELEASE_PATH = '/pub/firefox/releases/'
 
-NORMAL_REGEX = re.compile('^[0-9]+\.[0-9](\.[0-9])?$')
-# BETA_REGEX = re.compile('.*[0-9]b([0-9\-a-z]+)$')
-# ESR_REGEX = re.compile('^[0-9]+\.[0-9](\.[0-9])?esr$')
+NORMAL_REGEX = re.compile(r'^[0-9]+\.[0-9](\.[0-9])?$')
+# BETA_REGEX = re.compile(r'.*[0-9]b([0-9\-a-z]+)$')
+# ESR_REGEX = re.compile(r'^[0-9]+\.[0-9](\.[0-9])?esr$')
 
 #
 useragents = {
@@ -32,20 +41,19 @@ def fetch_firefox_versions():
     resp = requests.get(URL, timeout=2.0)
     if resp.status_code != 200:
         raise Exception("Error fetching firefox versions, HTTP code " + resp.status_code)
-    else:
-        dom = html.fromstring(resp.text)
-        versions = []
+    dom = html.fromstring(resp.text)
+    versions = []
 
-        for link in dom.xpath('//a/@href'):
-            url = urlparse(urljoin(URL, link))
-            path = url.path
-            if path.startswith(RELEASE_PATH):
-                version = path[len(RELEASE_PATH) : -1]
-                if NORMAL_REGEX.match(version):
-                    versions.append(LooseVersion(version))
+    for link in dom.xpath('//a/@href'):
+        url = urlparse(urljoin(URL, link))
+        path = url.path
+        if path.startswith(RELEASE_PATH):
+            version = path[len(RELEASE_PATH) : -1]
+            if NORMAL_REGEX.match(version):
+                versions.append(LooseVersion(version))
 
-        list.sort(versions, reverse=True)
-        return versions
+    list.sort(versions, reverse=True)
+    return versions
 
 
 def fetch_firefox_last_versions():
@@ -66,6 +74,7 @@ def get_useragents_filename():
     return join(join(searx_dir, "data"), "useragents.json")
 
 
-useragents["versions"] = fetch_firefox_last_versions()
-with open(get_useragents_filename(), "w") as f:
-    json.dump(useragents, f, indent=4, ensure_ascii=False)
+if __name__ == '__main__':
+    useragents["versions"] = fetch_firefox_last_versions()
+    with open(get_useragents_filename(), "w", encoding='utf-8') as f:
+        json.dump(useragents, f, indent=4, ensure_ascii=False)

+ 24 - 15
searxng_extra/update/update_languages.py

@@ -1,9 +1,17 @@
 #!/usr/bin/env python
+# lint: pylint
+
 # SPDX-License-Identifier: AGPL-3.0-or-later
+"""This script generates languages.py from intersecting each engine's supported
+languages.
+
+Output files: :origin:`searx/data/engines_languages.json` and
+:origin:`searx/languages.py` (:origin:`CI Update data ...
+<.github/workflows/data-update.yml>`).
+
+"""
 
-# This script generates languages.py from intersecting each engine's supported languages.
-#
-# Output files: searx/data/engines_languages.json and searx/languages.py
+# pylint: disable=invalid-name
 
 import json
 from pathlib import Path
@@ -24,7 +32,7 @@ languages_file = Path(searx_dir) / 'languages.py'
 def fetch_supported_languages():
     set_timeout_for_thread(10.0)
 
-    engines_languages = dict()
+    engines_languages = {}
     names = list(engines)
     names.sort()
 
@@ -32,7 +40,7 @@ def fetch_supported_languages():
         if hasattr(engines[engine_name], 'fetch_supported_languages'):
             engines_languages[engine_name] = engines[engine_name].fetch_supported_languages()
             print("fetched %s languages from engine %s" % (len(engines_languages[engine_name]), engine_name))
-            if type(engines_languages[engine_name]) == list:
+            if type(engines_languages[engine_name]) == list:  # pylint: disable=unidiomatic-typecheck
                 engines_languages[engine_name] = sorted(engines_languages[engine_name])
 
     print("fetched languages from %s engines" % len(engines_languages))
@@ -55,7 +63,7 @@ def get_locale(lang_code):
 
 # Join all language lists.
 def join_language_lists(engines_languages):
-    language_list = dict()
+    language_list = {}
     for engine_name in engines_languages:
         for lang_code in engines_languages[engine_name]:
 
@@ -91,7 +99,7 @@ def join_language_lists(engines_languages):
                     'name': language_name,
                     'english_name': english_name,
                     'counter': set(),
-                    'countries': dict(),
+                    'countries': {},
                 }
 
             # add language with country if not in list
@@ -119,6 +127,7 @@ def join_language_lists(engines_languages):
 def filter_language_list(all_languages):
     min_engines_per_lang = 13
     min_engines_per_country = 7
+    # pylint: disable=consider-using-dict-items, consider-iterating-dictionary
     main_engines = [
         engine_name
         for engine_name in engines.keys()
@@ -138,7 +147,7 @@ def filter_language_list(all_languages):
     }
 
     def _copy_lang_data(lang, country_name=None):
-        new_dict = dict()
+        new_dict = {}
         new_dict['name'] = all_languages[lang]['name']
         new_dict['english_name'] = all_languages[lang]['english_name']
         if country_name:
@@ -146,10 +155,10 @@ def filter_language_list(all_languages):
         return new_dict
 
     # for each language get country codes supported by most engines or at least one country code
-    filtered_languages_with_countries = dict()
+    filtered_languages_with_countries = {}
     for lang, lang_data in filtered_languages.items():
         countries = lang_data['countries']
-        filtered_countries = dict()
+        filtered_countries = {}
 
         # get language's country codes with enough supported engines
         for lang_country, country_data in countries.items():
@@ -211,7 +220,7 @@ def write_languages_file(languages):
 
     language_codes = tuple(language_codes)
 
-    with open(languages_file, 'w') as new_file:
+    with open(languages_file, 'w', encoding='utf-8') as new_file:
         file_content = "{file_headers} {language_codes},\n)\n".format(
             # fmt: off
             file_headers = '\n'.join(file_headers),
@@ -224,7 +233,7 @@ def write_languages_file(languages):
 
 if __name__ == "__main__":
     load_engines(settings['engines'])
-    engines_languages = fetch_supported_languages()
-    all_languages = join_language_lists(engines_languages)
-    filtered_languages = filter_language_list(all_languages)
-    write_languages_file(filtered_languages)
+    _engines_languages = fetch_supported_languages()
+    _all_languages = join_language_lists(_engines_languages)
+    _filtered_languages = filter_language_list(_all_languages)
+    write_languages_file(_filtered_languages)

+ 4 - 1
searxng_extra/update/update_osm_keys_tags.py

@@ -5,7 +5,10 @@
 
 To get the i18n names, the scripts uses `Wikidata Query Service`_ instead of for
 example `OSM tags API`_ (sidenote: the actual change log from
-map.atownsend.org.uk_ might be useful to normalize OSM tags)
+map.atownsend.org.uk_ might be useful to normalize OSM tags).
+
+Output file: :origin:`searx/data/osm_keys_tags` (:origin:`CI Update data ...
+<.github/workflows/data-update.yml>`).
 
 .. _Wikidata Query Service: https://query.wikidata.org/
 .. _OSM tags API: https://taginfo.openstreetmap.org/taginfo/apidoc

+ 10 - 2
searxng_extra/update/update_wikidata_units.py

@@ -3,6 +3,13 @@
 # lint: pylint
 # pylint: disable=missing-module-docstring
 
+"""Fetch units from :origin:`searx/engines/wikidata.py` engine.
+
+Output file: :origin:`searx/data/wikidata_units.json` (:origin:`CI Update data
+...  <.github/workflows/data-update.yml>`).
+
+"""
+
 import json
 import collections
 
@@ -54,5 +61,6 @@ def get_wikidata_units_filename():
     return join(join(searx_dir, "data"), "wikidata_units.json")
 
 
-with open(get_wikidata_units_filename(), 'w', encoding="utf8") as f:
-    json.dump(get_data(), f, indent=4, ensure_ascii=False)
+if __name__ == '__main__':
+    with open(get_wikidata_units_filename(), 'w', encoding="utf8") as f:
+        json.dump(get_data(), f, indent=4, ensure_ascii=False)