update_wikidata_units.py 1.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455
  1. #!/usr/bin/env python
  2. import json
  3. import collections
  4. # set path
  5. from os.path import join
  6. from searx import searx_dir
  7. from searx.engines import wikidata, set_loggers
# Set up loggers for the imported wikidata engine module before it is used.
# NOTE(review): the exact effect depends on searx.engines.set_loggers, which
# is not visible here -- confirm against that helper.
set_loggers(wikidata, 'wikidata')
# The response contains duplicate ?item entries with different ?symbol values.
# "ORDER BY ?item DESC(?rank) ?symbol" provides a deterministic result
# even if an ?item has several ?symbol values of the same rank.
# See:
# * https://www.wikidata.org/wiki/Help:Ranking
# * https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format ("Statement representation" section)
# * https://w.wiki/32BT
# For an example, see the result for https://www.wikidata.org/wiki/Q11582:
# it has multiple symbols of the same rank.
# SPARQL query (the constant's name is spelled "SARQL"; it is kept as-is
# because other code in this file refers to it by that name).
# It selects every distinct (?item, ?symbol) pair where ?item is an instance
# of a direct subclass of wd:Q47574 and has a P5061 statement whose value is
# tagged as English, ordered by item, then by descending statement rank, then
# by symbol.
# NOTE(review): Q47574 / P5061 are presumably Wikidata's "unit of
# measurement" class and "unit symbol" property -- confirm on wikidata.org.
SARQL_REQUEST = """
SELECT DISTINCT ?item ?symbol
WHERE
{
?item wdt:P31/wdt:P279 wd:Q47574 .
?item p:P5061 ?symbolP .
?symbolP ps:P5061 ?symbol ;
wikibase:rank ?rank .
FILTER(LANG(?symbol) = "en").
}
ORDER BY ?item DESC(?rank) ?symbol
"""
  31. def get_data():
  32. results = collections.OrderedDict()
  33. response = wikidata.send_wikidata_query(SARQL_REQUEST)
  34. for unit in response['results']['bindings']:
  35. name = unit['item']['value'].replace('http://www.wikidata.org/entity/', '')
  36. unit = unit['symbol']['value']
  37. if name not in results:
  38. # ignore duplicate: always use the first one
  39. results[name] = unit
  40. return results
  41. def get_wikidata_units_filename():
  42. return join(join(searx_dir, "data"), "wikidata_units.json")
  43. with open(get_wikidata_units_filename(), 'w') as f:
  44. json.dump(get_data(), f, indent=4, ensure_ascii=False)