update_wikidata_units.py 1.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758
  1. #!/usr/bin/env python
  2. # SPDX-License-Identifier: AGPL-3.0-or-later
  3. # lint: pylint
  4. # pylint: disable=missing-module-docstring
  5. import json
  6. import collections
  7. # set path
  8. from os.path import join
  9. from searx import searx_dir
  10. from searx.engines import wikidata, set_loggers
  11. set_loggers(wikidata, 'wikidata')
# SPARQL query sent to Wikidata by get_data(): it selects ?item / ?symbol
# pairs and keeps only the symbols whose language tag is "en".
#
# The response contains duplicate ?item rows, one for each different ?symbol.
# "ORDER BY ?item DESC(?rank) ?symbol" provides a deterministic result
# even if an ?item has several ?symbol values of the same rank.
#
# see:
# * https://www.wikidata.org/wiki/Help:Ranking
# * https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format ("Statement representation" section)
# * https://w.wiki/32BT
#
# For an example, see the result for https://www.wikidata.org/wiki/Q11582:
# there are multiple symbols with the same rank.
SARQL_REQUEST = """
SELECT DISTINCT ?item ?symbol
WHERE
{
?item wdt:P31/wdt:P279 wd:Q47574 .
?item p:P5061 ?symbolP .
?symbolP ps:P5061 ?symbol ;
wikibase:rank ?rank .
FILTER(LANG(?symbol) = "en").
}
ORDER BY ?item DESC(?rank) ?symbol
"""
  34. def get_data():
  35. results = collections.OrderedDict()
  36. response = wikidata.send_wikidata_query(SARQL_REQUEST)
  37. for unit in response['results']['bindings']:
  38. name = unit['item']['value'].replace('http://www.wikidata.org/entity/', '')
  39. unit = unit['symbol']['value']
  40. if name not in results:
  41. # ignore duplicate: always use the first one
  42. results[name] = unit
  43. return results
  44. def get_wikidata_units_filename():
  45. return join(join(searx_dir, "data"), "wikidata_units.json")
  46. with open(get_wikidata_units_filename(), 'w', encoding="utf8") as f:
  47. json.dump(get_data(), f, indent=4, ensure_ascii=False)