Browse Source

[mod] implement searx.wikidata_units for unit converters

Markus Heiser 1 month ago
parent
commit
a800dd0473
3 changed files with 234 additions and 190 deletions
  1. 1 127
      searx/plugins/unit_converter.py
  2. 231 0
      searx/wikidata_units.py
  3. 2 63
      searxng_extra/update/update_wikidata_units.py

+ 1 - 127
searx/plugins/unit_converter.py

@@ -15,7 +15,7 @@ import babel.numbers
 
 from flask_babel import gettext, get_locale
 
-from searx import data
+from searx.wikidata_units import symbol_to_si
 from searx.plugins import Plugin, PluginInfo
 from searx.result_types import EngineResults
 
@@ -86,132 +86,6 @@ RE_MEASURE = r'''
 '''
 
 
-ADDITIONAL_UNITS = [
-    {
-        "si_name": "Q11579",
-        "symbol": "°C",
-        "to_si": lambda val: val + 273.15,
-        "from_si": lambda val: val - 273.15,
-    },
-    {
-        "si_name": "Q11579",
-        "symbol": "°F",
-        "to_si": lambda val: (val + 459.67) * 5 / 9,
-        "from_si": lambda val: (val * 9 / 5) - 459.67,
-    },
-]
-"""Additional items to convert from a measure unit to a SI unit (vice versa).
-
-.. code:: python
-
-    {
-        "si_name": "Q11579",                 # Wikidata item ID of the SI unit (Kelvin)
-        "symbol": "°C",                      # symbol of the measure unit
-        "to_si": lambda val: val + 273.15,   # convert measure value (val) to SI unit
-        "from_si": lambda val: val - 273.15, # convert SI value (val) measure unit
-    },
-    {
-        "si_name": "Q11573",
-        "symbol": "mi",
-        "to_si": 1609.344,                   # convert measure value (val) to SI unit
-        "from_si": 1 / 1609.344              # convert SI value (val) measure unit
-    },
-
-The values of ``to_si`` and ``from_si`` can be of :py:obj:`float` (a multiplier)
-or a callable_ (val in / converted value returned).
-
-.. _callable: https://docs.python.org/3/glossary.html#term-callable
-"""
-
-
-ALIAS_SYMBOLS = {
-    '°C': ('C',),
-    '°F': ('F',),
-    'mi': ('L',),
-}
-"""Alias symbols for known unit of measure symbols / by example::
-
-    '°C': ('C', ...),  # list of alias symbols for °C (Q69362731)
-    '°F': ('F', ...),  # list of alias symbols for °F (Q99490479)
-    'mi': ('L',),      # list of alias symbols for mi (Q253276)
-"""
-
-
-SYMBOL_TO_SI = []
-
-
-def symbol_to_si():
-    """Generates a list of tuples, each tuple is a measure unit and the fields
-    in the tuple are:
-
-    0. Symbol of the measure unit (e.g. 'mi' for measure unit 'miles' Q253276)
-
-    1. SI name of the measure unit (e.g. Q11573 for SI unit 'metre')
-
-    2. Factor to get SI value from measure unit (e.g. 1mi is equal to SI 1m
-       multiplied by 1609.344)
-
-    3. Factor to get measure value from from SI value (e.g. SI 100m is equal to
-       100mi divided by 1609.344)
-
-    The returned list is sorted, the first items are created from
-    ``WIKIDATA_UNITS``, the second group of items is build from
-    :py:obj:`ADDITIONAL_UNITS` and items created from :py:obj:`ALIAS_SYMBOLS`.
-
-    If you search this list for a symbol, then a match with a symbol from
-    Wikidata has the highest weighting (first hit in the list), followed by the
-    symbols from the :py:obj:`ADDITIONAL_UNITS` and the lowest weighting is
-    given to the symbols resulting from the aliases :py:obj:`ALIAS_SYMBOLS`.
-
-    """
-
-    global SYMBOL_TO_SI  # pylint: disable=global-statement
-    if SYMBOL_TO_SI:
-        return SYMBOL_TO_SI
-
-    # filter out units which can't be normalized to a SI unit and filter out
-    # units without a symbol / arcsecond does not have a symbol
-    # https://www.wikidata.org/wiki/Q829073
-
-    for item in data.WIKIDATA_UNITS.values():
-        if item['to_si_factor'] and item['symbol']:
-            SYMBOL_TO_SI.append(
-                (
-                    item['symbol'],
-                    item['si_name'],
-                    1 / item['to_si_factor'],  # from_si
-                    item['to_si_factor'],  # to_si
-                    item['symbol'],
-                )
-            )
-
-    for item in ADDITIONAL_UNITS:
-        SYMBOL_TO_SI.append(
-            (
-                item['symbol'],
-                item['si_name'],
-                item['from_si'],
-                item['to_si'],
-                item['symbol'],
-            )
-        )
-
-    alias_items = []
-    for item in SYMBOL_TO_SI:
-        for alias in ALIAS_SYMBOLS.get(item[0], ()):
-            alias_items.append(
-                (
-                    alias,
-                    item[1],
-                    item[2],  # from_si
-                    item[3],  # to_si
-                    item[0],  # origin unit
-                )
-            )
-    SYMBOL_TO_SI = SYMBOL_TO_SI + alias_items
-    return SYMBOL_TO_SI
-
-
 def _parse_text_and_convert(from_query, to_query) -> str | None:
 
     # pylint: disable=too-many-branches, too-many-locals

+ 231 - 0
searx/wikidata_units.py

@@ -0,0 +1,231 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Unit conversion on the basis of `SPARQL/WIKIDATA Precision, Units and
+Coordinates`_
+
+.. _SPARQL/WIKIDATA Precision, Units and Coordinates:
+   https://en.wikibooks.org/wiki/SPARQL/WIKIDATA_Precision,_Units_and_Coordinates#Quantities
+"""
+
+__all__ = ["convert_from_si", "convert_to_si", "symbol_to_si"]
+
+import collections
+
+from searx import data
+from searx.engines import wikidata
+
+ADDITIONAL_UNITS = [
+    {
+        "si_name": "Q11579",
+        "symbol": "°C",
+        "to_si": lambda val: val + 273.15,
+        "from_si": lambda val: val - 273.15,
+    },
+    {
+        "si_name": "Q11579",
+        "symbol": "°F",
+        "to_si": lambda val: (val + 459.67) * 5 / 9,
+        "from_si": lambda val: (val * 9 / 5) - 459.67,
+    },
+]
+"""Additional items to convert from a measure unit to a SI unit (and vice versa).
+
+.. code:: python
+
+    {
+        "si_name": "Q11579",                 # Wikidata item ID of the SI unit (Kelvin)
+        "symbol": "°C",                      # symbol of the measure unit
+        "to_si": lambda val: val + 273.15,   # convert measure value (val) to SI unit
+        "from_si": lambda val: val - 273.15, # convert SI value (val) to measure unit
+    },
+    {
+        "si_name": "Q11573",
+        "symbol": "mi",
+        "to_si": 1609.344,                   # convert measure value (val) to SI unit
+        "from_si": 1 / 1609.344              # convert SI value (val) to measure unit
+    },
+
+The values of ``to_si`` and ``from_si`` can be of :py:obj:`float` (a multiplier)
+or a callable_ (val in / converted value returned).
+
+.. _callable: https://docs.python.org/3/glossary.html#term-callable
+"""
+
+
+ALIAS_SYMBOLS = {
+    '°C': ('C',),
+    '°F': ('F',),
+    'mi': ('L',),
+}
+"""Alias symbols for known unit of measure symbols / by example::
+
+    '°C': ('C', ...),  # list of alias symbols for °C (Q69362731)
+    '°F': ('F', ...),  # list of alias symbols for °F (Q99490479)
+    'mi': ('L',),      # list of alias symbols for mi (Q253276)
+"""
+
+
+SYMBOL_TO_SI = []
+UNITS_BY_SI_NAME: dict | None = None
+
+
+# Positions of the conversion fields in the tuples built by symbol_to_si():
+# (symbol, si_name, from_si, to_si, origin symbol)
+_POS_SYMBOL = 0
+_POS_SI_NAME = 1
+_POS_FROM_SI = 2
+_POS_TO_SI = 3
+
+
+def convert_from_si(si_name: str, symbol: str, value: float | int) -> float:
+    """Convert ``value``, given in the SI unit ``si_name``, to the measure unit
+    ``symbol``.
+
+    :param si_name: Wikidata item ID of the SI unit (e.g. ``Q11573``).
+    :param symbol: symbol (or alias symbol) of the target measure unit.
+    :param value: the value in the SI unit.
+    :return: the value expressed in the measure unit ``symbol``.
+    :raises KeyError: if ``si_name`` or ``symbol`` is unknown.
+    """
+    from_si = units_by_si_name(si_name)[symbol][_POS_FROM_SI]
+    # from_si is either a plain multiplier or a callable (val in / val out)
+    if isinstance(from_si, (float, int)):
+        return float(value) * from_si
+    return from_si(float(value))
+
+
+def convert_to_si(si_name: str, symbol: str, value: float | int) -> float:
+    """Convert ``value``, given in the measure unit ``symbol``, to the SI unit
+    ``si_name``.
+
+    :param si_name: Wikidata item ID of the SI unit (e.g. ``Q11573``).
+    :param symbol: symbol (or alias symbol) of the source measure unit.
+    :param value: the value in the measure unit ``symbol``.
+    :return: the value expressed in the SI unit.
+    :raises KeyError: if ``si_name`` or ``symbol`` is unknown.
+    """
+    to_si = units_by_si_name(si_name)[symbol][_POS_TO_SI]
+    # to_si is either a plain multiplier or a callable (val in / val out)
+    if isinstance(to_si, (float, int)):
+        return float(value) * to_si
+    return to_si(float(value))
+
+
+def units_by_si_name(si_name):
+    """Return the measure units of the SI unit ``si_name``, keyed by symbol.
+
+    The values of the returned dict are the tuples built by
+    :py:obj:`symbol_to_si`.  The lookup table for all SI units is built on the
+    first call and cached in ``UNITS_BY_SI_NAME``.
+
+    :param si_name: Wikidata item ID of the SI unit (e.g. ``Q11573``).
+    :raises KeyError: if there is no unit for ``si_name``.
+    """
+
+    global UNITS_BY_SI_NAME  # pylint: disable=global-statement
+    if UNITS_BY_SI_NAME is None:
+        # Build into a local first, so that an error while building does not
+        # leave a partially filled cache behind.
+        by_si_name = {}
+        for item in symbol_to_si():
+            # Group by the SI unit of the *item* (not by the requested one).
+            # setdefault keeps the first hit for a symbol, which is the order
+            # of priority in the list from symbol_to_si().
+            by_symbol = by_si_name.setdefault(item[_POS_SI_NAME], {})
+            by_symbol.setdefault(item[_POS_SYMBOL], item)
+        UNITS_BY_SI_NAME = by_si_name
+    return UNITS_BY_SI_NAME[si_name]
+
+
+def symbol_to_si():
+    """Return the list of known measure units, one 5-tuple per unit symbol.
+
+    The fields of a tuple are:
+
+    0. Symbol of the measure unit (e.g. 'mi' for measure unit 'miles' Q253276)
+
+    1. SI name of the measure unit (e.g. Q11573 for SI unit 'metre')
+
+    2. ``from_si``: factor (or callable) to get the measure value from the SI
+       value (e.g. ``1 / 1609.344`` for 'mi')
+
+    3. ``to_si``: factor (or callable) to get the SI value from the measure
+       value (e.g. ``1609.344`` for 'mi')
+
+    4. Symbol of the origin unit: identical to field 0, except for aliases,
+       where it is the symbol the alias stands for.
+
+    The list is ordered: items from ``WIKIDATA_UNITS`` come first, then the
+    items from :py:obj:`ADDITIONAL_UNITS` and finally the items generated from
+    :py:obj:`ALIAS_SYMBOLS`.
+
+    If you search this list for a symbol and take the first hit, a symbol from
+    Wikidata therefore wins over a symbol from :py:obj:`ADDITIONAL_UNITS`,
+    which in turn wins over an alias from :py:obj:`ALIAS_SYMBOLS`.
+
+    The list is built on the first call and cached in ``SYMBOL_TO_SI``.
+
+    """
+
+    global SYMBOL_TO_SI  # pylint: disable=global-statement
+    if SYMBOL_TO_SI:
+        return SYMBOL_TO_SI
+
+    # Skip units which can't be normalized to a SI unit and units without a
+    # symbol / arcsecond does not have a symbol
+    # https://www.wikidata.org/wiki/Q829073
+
+    for unit in data.WIKIDATA_UNITS.values():
+        factor = unit['to_si_factor']
+        if not factor or not unit['symbol']:
+            continue
+        entry = (
+            unit['symbol'],
+            unit['si_name'],
+            1 / factor,  # from_si
+            factor,  # to_si
+            unit['symbol'],
+        )
+        SYMBOL_TO_SI.append(entry)
+
+    for unit in ADDITIONAL_UNITS:
+        entry = (
+            unit['symbol'],
+            unit['si_name'],
+            unit['from_si'],
+            unit['to_si'],
+            unit['symbol'],
+        )
+        SYMBOL_TO_SI.append(entry)
+
+    # an alias shares the conversion of its origin unit
+    alias_items = [
+        (alias, si_name, from_si, to_si, symbol)
+        for symbol, si_name, from_si, to_si, _origin in SYMBOL_TO_SI
+        for alias in ALIAS_SYMBOLS.get(symbol, ())
+    ]
+    SYMBOL_TO_SI = SYMBOL_TO_SI + alias_items
+    return SYMBOL_TO_SI
+
+
+# The response contains duplicate ?item rows with different ?symbol values.
+# "ORDER BY ?item DESC(?rank) ?symbol" provides a deterministic result
+# even if an ?item has several ?symbol values of the same rank.
+# See:
+# * https://www.wikidata.org/wiki/Help:Ranking
+# * https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format ("Statement representation" section)
+# * https://w.wiki/32BT
+# * https://en.wikibooks.org/wiki/SPARQL/WIKIDATA_Precision,_Units_and_Coordinates#Quantities
+#   see the result for https://www.wikidata.org/wiki/Q11582
+#   there are multiple symbols with the same rank
+
+SARQL_REQUEST = """
+SELECT DISTINCT ?item ?symbol ?tosi ?tosiUnit
+WHERE
+{
+  ?item wdt:P31/wdt:P279 wd:Q47574 .
+  ?item p:P5061 ?symbolP .
+  ?symbolP ps:P5061 ?symbol ;
+           wikibase:rank ?rank .
+  OPTIONAL {
+    ?item p:P2370 ?tosistmt .
+    ?tosistmt psv:P2370 ?tosinode .
+    ?tosinode wikibase:quantityAmount ?tosi .
+    ?tosinode wikibase:quantityUnit ?tosiUnit .
+  }
+  FILTER(LANG(?symbol) = "en").
+}
+ORDER BY ?item DESC(?rank) ?symbol
+"""
+
+
+def fetch_units():
+    """Query Wikidata for the units of measurement and return them keyed by
+    their Wikidata item ID.  Function is used to update persistence of
+    :py:obj:`searx.data.WIKIDATA_UNITS`.
+
+    Each value is a dict with the keys ``symbol``, ``si_name`` (item ID of the
+    SI unit or ``None``) and ``to_si_factor`` (:py:obj:`float` or ``None``).
+    """
+
+    units = collections.OrderedDict()
+    response = wikidata.send_wikidata_query(SARQL_REQUEST)
+    for binding in response['results']['bindings']:
+
+        symbol = binding['symbol']['value']
+        # the item ID is the last path segment of the entity URI
+        item_id = binding['item']['value'].rsplit('/', 1)[1]
+
+        # ?tosiUnit and ?tosi come from the OPTIONAL part of the query
+        si_name = binding.get('tosiUnit', {}).get('value', '')
+        if si_name:
+            si_name = si_name.rsplit('/', 1)[1]
+        factor = binding.get('tosi', {}).get('value', '')
+
+        if item_id in units:
+            # ignore duplicate: always use the first one
+            continue
+        units[item_id] = {
+            'symbol': symbol,
+            'si_name': si_name or None,
+            'to_si_factor': float(factor) if factor else None,
+        }
+    return units

+ 2 - 63
searxng_extra/update/update_wikidata_units.py

@@ -8,76 +8,15 @@ Output file: :origin:`searx/data/wikidata_units.json` (:origin:`CI Update data
 """
 
 import json
-import collections
 
-# set path
-from os.path import join
-
-from searx import searx_dir
 from searx.engines import wikidata, set_loggers
 from searx.data import data_dir
+from searx.wikidata_units import fetch_units
 
 DATA_FILE = data_dir / 'wikidata_units.json'
-
 set_loggers(wikidata, 'wikidata')
 
-# the response contains duplicate ?item with the different ?symbol
-# "ORDER BY ?item DESC(?rank) ?symbol" provides a deterministic result
-# even if a ?item has different ?symbol of the same rank.
-# A deterministic result
-# see:
-# * https://www.wikidata.org/wiki/Help:Ranking
-# * https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format ("Statement representation" section)
-# * https://w.wiki/32BT
-# * https://en.wikibooks.org/wiki/SPARQL/WIKIDATA_Precision,_Units_and_Coordinates#Quantities
-#   see the result for https://www.wikidata.org/wiki/Q11582
-#   there are multiple symbols the same rank
-SARQL_REQUEST = """
-SELECT DISTINCT ?item ?symbol ?tosi ?tosiUnit
-WHERE
-{
-  ?item wdt:P31/wdt:P279 wd:Q47574 .
-  ?item p:P5061 ?symbolP .
-  ?symbolP ps:P5061 ?symbol ;
-           wikibase:rank ?rank .
-  OPTIONAL {
-    ?item p:P2370 ?tosistmt .
-    ?tosistmt psv:P2370 ?tosinode .
-    ?tosinode wikibase:quantityAmount ?tosi .
-    ?tosinode wikibase:quantityUnit ?tosiUnit .
-  }
-  FILTER(LANG(?symbol) = "en").
-}
-ORDER BY ?item DESC(?rank) ?symbol
-"""
-
-
-def get_data():
-    results = collections.OrderedDict()
-    response = wikidata.send_wikidata_query(SARQL_REQUEST)
-    for unit in response['results']['bindings']:
-
-        symbol = unit['symbol']['value']
-        name = unit['item']['value'].rsplit('/', 1)[1]
-        si_name = unit.get('tosiUnit', {}).get('value', '')
-        if si_name:
-            si_name = si_name.rsplit('/', 1)[1]
-
-        to_si_factor = unit.get('tosi', {}).get('value', '')
-        if name not in results:
-            # ignore duplicate: always use the first one
-            results[name] = {
-                'symbol': symbol,
-                'si_name': si_name if si_name else None,
-                'to_si_factor': float(to_si_factor) if to_si_factor else None,
-            }
-    return results
-
-
-def get_wikidata_units_filename():
-    return join(join(searx_dir, "data"), "")
-
 
 if __name__ == '__main__':
     with DATA_FILE.open('w', encoding="utf8") as f:
-        json.dump(get_data(), f, indent=4, sort_keys=True, ensure_ascii=False)
+        json.dump(fetch_units(), f, indent=4, sort_keys=True, ensure_ascii=False)