fetch_wikidata_units.py 1.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556
  1. #!/usr/bin/env python
  2. import json
  3. import collections
  4. # set path
  5. from sys import path
  6. from os.path import realpath, dirname, join
  7. path.append(realpath(dirname(realpath(__file__)) + '/../'))
  8. from searx import searx_dir
  9. from searx.engines.wikidata import send_wikidata_query
# SPARQL query sent to Wikidata to list items together with their symbols.
#
# The response contains duplicate ?item rows, each with a different ?symbol.
# "ORDER BY ?item DESC(?rank) ?symbol" provides a deterministic result
# even if an ?item has several ?symbol values of the same rank.
# See:
# * https://www.wikidata.org/wiki/Help:Ranking
# * https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format ("Statement representation" section)
# * https://w.wiki/32BT
# For an example, see the result for https://www.wikidata.org/wiki/Q11582:
# it has multiple symbols with the same rank.
#
# Only symbols whose language tag is "en" are kept (see the FILTER clause).
# NOTE(review): Q47574 is presumably "unit of measurement" and P5061
# "unit symbol" -- confirm on wikidata.org.
SARQL_REQUEST = """
SELECT DISTINCT ?item ?symbol
WHERE
{
?item wdt:P31/wdt:P279 wd:Q47574 .
?item p:P5061 ?symbolP .
?symbolP ps:P5061 ?symbol ;
wikibase:rank ?rank .
FILTER(LANG(?symbol) = "en").
}
ORDER BY ?item DESC(?rank) ?symbol
"""
  32. def get_data():
  33. results = collections.OrderedDict()
  34. response = send_wikidata_query(SARQL_REQUEST)
  35. for unit in response['results']['bindings']:
  36. name = unit['item']['value'].replace('http://www.wikidata.org/entity/', '')
  37. unit = unit['symbol']['value']
  38. if name not in results:
  39. # ignore duplicate: always use the first one
  40. results[name] = unit
  41. return results
  42. def get_wikidata_units_filename():
  43. return join(join(searx_dir, "data"), "wikidata_units.json")
  44. with open(get_wikidata_units_filename(), 'w') as f:
  45. json.dump(get_data(), f, indent=4, ensure_ascii=False)