Browse Source

Merge pull request #2224 from dalf/update-infobox-engines

[enh] update infobox engines
Noémi Ványi 4 years ago
parent
commit
10ddd421f2

+ 4 - 1
searx/data/__init__.py

@@ -2,7 +2,8 @@ import json
 from pathlib import Path
 
 
-__init__ = ['ENGINES_LANGUGAGES', 'CURRENCIES', 'USER_AGENTS', 'bangs_loader', 'ahmia_blacklist_loader']
+__init__ = ['ENGINES_LANGUGAGES', 'CURRENCIES', 'USER_AGENTS', 'EXTERNAL_URLS', 'WIKIDATA_UNITS',
+            'bangs_loader', 'ahmia_blacklist_loader']
 data_dir = Path(__file__).parent
 
 
@@ -24,3 +25,5 @@ def ahmia_blacklist_loader():
 ENGINES_LANGUAGES = load('engines_languages.json')
 CURRENCIES = load('currencies.json')
 USER_AGENTS = load('useragents.json')
+EXTERNAL_URLS = load('external_urls.json')
+WIKIDATA_UNITS = load('wikidata_units.json')

+ 156 - 0
searx/data/external_urls.json

@@ -0,0 +1,156 @@
+{
+    "facebook_profile": {
+        "category_name": "Facebook",
+        "url_name": "Facebook profile",
+        "urls": {
+            "default": "https://facebook.com/$1"
+        }
+    },
+    "youtube_channel": {
+        "category_name": "YouTube",
+        "url_name": "YouTube channel",
+        "urls": {
+            "default": "https://www.youtube.com/channel/$1"
+        }
+    },
+    "youtube_video": {
+        "category_name": "YouTube",
+        "url_name": "YouTube video",
+        "urls": {
+            "default": "https://www.youtube.com/watch?v=$1"
+        }
+    },
+    "twitter_profile": {
+        "category_name": "Twitter",
+        "url_name": "Twitter profile",
+        "urls": {
+            "default": "https://twitter.com/$1"
+        }
+    },
+    "instagram_profile": {
+        "category_name": "Instagram",
+        "url_name": "Instagram profile",
+        "urls": {
+            "default": "https://www.instagram.com/$1"
+        }
+    },
+    "imdb_title": {
+        "category_name": "IMDB",
+        "url_name": "IMDB title",
+        "urls": {
+            "default": "https://www.imdb.com/title/$1"
+        }
+    },
+    "imdb_name": {
+        "category_name": "IMDB",
+        "url_name": "IMDB name",
+        "urls": {
+            "default": "https://www.imdb.com/name/$1"
+        }
+    },
+    "imdb_character": {
+        "category_name": "IMDB",
+        "url_name": "IMDB character",
+        "urls": {
+            "default": "https://www.imdb.com/character/$1"
+        }
+    },
+    "imdb_company": {
+        "category_name": "IMDB",
+        "url_name": "IMDB company",
+        "urls": {
+            "default": "https://www.imdb.com/company/$1"
+        }
+    },
+    "imdb_event": {
+        "category_name": "IMDB",
+        "url_name": "IMDB event",
+        "urls": {
+            "default": "https://www.imdb.com/event/$1"
+        }
+    },
+    "rotten_tomatoes": {
+        "category_name": "Rotten tomatoes",
+        "url_name": "Rotten tomatoes title",
+        "urls": {
+            "default": "https://www.rottentomatoes.com/$1"
+        }
+    },
+    "spotify_artist_id": {
+        "category_name": "Spotify",
+        "url_name": "Spotify artist",
+        "urls": {
+            "default": "https://open.spotify.com/artist/$1"
+        }
+    },
+    "itunes_artist_id": {
+        "category_name": "iTunes",
+        "url_name": "iTunes artist",
+        "urls": {
+            "default": "https://music.apple.com/us/artist/$1"
+        }
+    },
+    "soundcloud_id": {
+        "category_name": "Soundcloud",
+        "url_name": "Soundcloud artist",
+        "urls": {
+            "default": "https://soundcloud.com/$1"
+        }
+    },
+    "netflix_id": {
+        "category_name": "Netflix",
+        "url_name": "Netflix movie",
+        "urls": {
+            "default": "https://www.netflix.com/watch/$1"
+        }
+    },
+    "github_profile": {
+        "category_name": "Github",
+        "url_name": "Github profile",
+        "urls": {
+            "default": "https://www.github.com/$1"
+        }
+    },
+    "musicbrainz_artist": {
+        "category_name": "Musicbrainz",
+        "url_name": "Musicbrainz artist",
+        "urls": {
+            "default": "http://musicbrainz.org/artist/$1"
+        }
+    },
+    "musicbrainz_work": {
+        "category_name": "Musicbrainz",
+        "url_name": "Musicbrainz work",
+        "urls": {
+            "default": "http://musicbrainz.org/work/$1"
+        }
+    },
+    "musicbrainz_release_group": {
+        "category_name": "Musicbrainz",
+        "url_name": "Musicbrainz release group",
+        "urls": {
+            "default": "http://musicbrainz.org/release-group/$1"
+        }
+    },
+    "musicbrainz_label": {
+        "category_name": "Musicbrainz",
+        "url_name": "Musicbrainz label",
+        "urls": {
+            "default": "http://musicbrainz.org/label/$1"
+        }
+    },
+    "wikimedia_image": {
+        "category_name": "Wikipedia",
+        "url_name": "Wikipedia image",
+        "urls": {
+            "default": "https://commons.wikimedia.org/wiki/Special:FilePath/$1?width=500&height=400"
+        }
+    },
+    "map": {
+        "category_name": "Map",
+        "url_name": "geo map",
+        "urls": {
+            "default": "https://www.openstreetmap.org/?lat=${latitude}&lon=${longitude}&zoom=${zoom}&layers=M"
+        }
+    }
+}

+ 1006 - 0
searx/data/wikidata_units.json

@@ -0,0 +1,1006 @@
+{
+    "Q199": "1",
+    "Q100149279": "°We",
+    "Q100995": "lb",
+    "Q1022113": "cm³",
+    "Q102573": "Bq",
+    "Q103246": "Sv",
+    "Q103510": "bar",
+    "Q10380431": "TJ",
+    "Q1040401": "das",
+    "Q1040427": "hs",
+    "Q1042866": "Zibit",
+    "Q1050958": "inHg",
+    "Q1051665": "m/s²",
+    "Q1052397": "rad",
+    "Q1054140": "Mm",
+    "Q10543042": "Ym",
+    "Q1057069": "hg",
+    "Q1063756": "rad/s",
+    "Q1063786": "in²",
+    "Q1065153": "mrad",
+    "Q1066138": "Ps",
+    "Q1067722": "Fg",
+    "Q1069725": "p.",
+    "Q1084321": "Tb/s",
+    "Q1086691": "fg",
+    "Q1091257": "tex",
+    "Q1092296": "a",
+    "Q1104069": "CAD$",
+    "Q11061003": "μm²",
+    "Q11061005": "nm²",
+    "Q1131660": "st",
+    "Q1137675": "cr",
+    "Q1140444": "Zbit",
+    "Q1140577": "Ybit",
+    "Q1152074": "Pbit",
+    "Q1152323": "Tbit",
+    "Q1165799": "mil",
+    "Q11776930": "Mg",
+    "Q11830636": "psf",
+    "Q11929860": "kpc",
+    "Q1194225": "lbf",
+    "Q1194580": "Mibit",
+    "Q1195111": "Ebit",
+    "Q1196837": "ω_P",
+    "Q1197459": "Ms",
+    "Q11982285": "Em³",
+    "Q11982288": "Zm³",
+    "Q11982289": "Tm³",
+    "Q12011178": "Zs",
+    "Q1204894": "Gibit",
+    "Q12257695": "Eb/s",
+    "Q12257696": "EB/s",
+    "Q12261466": "kB/s",
+    "Q12265780": "Pb/s",
+    "Q12265783": "PB/s",
+    "Q12269121": "Yb/s",
+    "Q12269122": "YB/s",
+    "Q12269308": "Zb/s",
+    "Q12269309": "ZB/s",
+    "Q1247300": "cm H₂O",
+    "Q12714022": "sh cwt",
+    "Q12789864": "GeV",
+    "Q12874593": "W h",
+    "Q128822": "kn",
+    "Q13035094": "J/mol",
+    "Q130964": "cal",
+    "Q131255": "F",
+    "Q13147228": "g/cm³",
+    "Q1322380": "Ts",
+    "Q1323615": "oz t",
+    "Q132643": "kr",
+    "Q13400897": "g",
+    "Q13479685": "mm wg",
+    "Q1351253": "Eibit",
+    "Q1351334": "Pibit",
+    "Q13542672": "Ry",
+    "Q13548586": "THz",
+    "Q13582667": "kgf/cm²",
+    "Q1361854": "dwt",
+    "Q1363007": "slug",
+    "Q1374438": "ks",
+    "Q13753469": "MB/s",
+    "Q1377051": "Gs",
+    "Q1394540": "bm",
+    "Q1396128": "F",
+    "Q1413142": "Gb",
+    "Q14158377": "A_P",
+    "Q14623803": "MDa",
+    "Q14623804": "kDa",
+    "Q1472674": "Sv",
+    "Q14754979": "Zg",
+    "Q14786969": "MJ",
+    "Q14913554": "Ys",
+    "Q14914907": "th",
+    "Q14916719": "Gpc",
+    "Q14923662": "Pm³",
+    "Q1511773": "LSd",
+    "Q15120301": "l atm",
+    "Q1542309": "xu",
+    "Q1545979": "ft³",
+    "Q1550511": "yd²",
+    "Q15551713": "Sh",
+    "Q1569733": "St",
+    "Q15784325": "apc",
+    "Q160680": "Br",
+    "Q160857": "hp",
+    "Q1628990": "hph",
+    "Q163343": "T",
+    "Q163354": "H",
+    "Q1640501": "hyl",
+    "Q1645498": "μg",
+    "Q16859309": "lb·ft",
+    "Q169893": "S",
+    "Q170804": "Wb",
+    "Q17093295": "m/h",
+    "Q17255465": "v_P",
+    "Q173117": "R$",
+    "Q1741429": "kpm",
+    "Q174467": "Lm",
+    "Q174728": "cm",
+    "Q174789": "mm",
+    "Q175821": "μm",
+    "Q1768929": "p",
+    "Q1770733": "Tg",
+    "Q1772386": "dg",
+    "Q177493": "Gs",
+    "Q177612": "sr",
+    "Q1777507": "fs",
+    "Q177974": "atm",
+    "Q178506": "bbl",
+    "Q178674": "nm",
+    "Q1793863": "sn",
+    "Q179836": "lx",
+    "Q180154": "km/h",
+    "Q180892": "M☉",
+    "Q1815100": "cl",
+    "Q182098": "kWh",
+    "Q1823150": "μW",
+    "Q182429": "m/s",
+    "Q1826195": "dl",
+    "Q18413919": "cm/s",
+    "Q184172": "FF",
+    "Q185078": "a",
+    "Q185153": "erg",
+    "Q185648": "Torr",
+    "Q185759": "span",
+    "Q1872619": "zs",
+    "Q189097": "₧",
+    "Q190095": "Gy",
+    "Q190951": "S$",
+    "Q191118": "t",
+    "Q1913097": "fg",
+    "Q1916026": "μV",
+    "Q192027": "Bd",
+    "Q192274": "pm",
+    "Q193098": "KD",
+    "Q1935515": "mA s",
+    "Q19392152": "TL",
+    "Q193933": "dpt",
+    "Q194339": "B$",
+    "Q1970718": "mam",
+    "Q1972579": "pdl",
+    "Q199462": "LE",
+    "Q199471": "Afs",
+    "Q200323": "dm",
+    "Q200337": "Kz",
+    "Q201880": "LL",
+    "Q201933": "dyn",
+    "Q2029156": "quad",
+    "Q2029519": "hl",
+    "Q203567": "₦",
+    "Q2042279": "m H₂O",
+    "Q204737": "៛",
+    "Q2051195": "GWh",
+    "Q2055118": "ppb",
+    "Q2064166": "fc",
+    "Q206600": "ރ",
+    "Q20706220": "cmm",
+    "Q20706221": "dmm",
+    "Q2080811": "vol%",
+    "Q208526": "NT$",
+    "Q208528": "gon",
+    "Q208634": "kat",
+    "Q208788": "fm",
+    "Q209351": "b",
+    "Q209426": "′",
+    "Q21006887": "ppm",
+    "Q2100949": "P",
+    "Q21014455": "m/min",
+    "Q210472": "B/.",
+    "Q21061369": "g/kg",
+    "Q21062777": "MPa",
+    "Q21064807": "kPa",
+    "Q21064845": "mol/L",
+    "Q21075844": "ml/l",
+    "Q21077820": "mg/m³",
+    "Q21091747": "mg/kg",
+    "Q211256": "mph",
+    "Q211580": "BTU (th)",
+    "Q212120": "A h",
+    "Q2140397": "in³",
+    "Q214377": "ell",
+    "Q2143992": "kHz",
+    "Q21489891": "nm³",
+    "Q21489892": "Gm³",
+    "Q21489893": "Mm³",
+    "Q21489894": "μm³",
+    "Q21500224": "mas",
+    "Q2151240": "mag",
+    "Q215571": "N m",
+    "Q21604951": "g/m³",
+    "Q2165290": "yd³",
+    "Q216880": "kp",
+    "Q217208": "a",
+    "Q2175964": "dm³",
+    "Q218593": "in",
+    "Q2199357": "dec",
+    "Q22137107": "mas/y",
+    "Q2215478": "ppt",
+    "Q2221356": "mW h",
+    "Q22350885": "da",
+    "Q2243141": "Gb/s",
+    "Q2254856": "ca",
+    "Q22673229": "ft/min",
+    "Q2269250": "kb/s",
+    "Q2282891": "μl",
+    "Q2282906": "ng",
+    "Q229354": "Ci",
+    "Q232291": "mi²",
+    "Q2332346": "ml",
+    "Q23823681": "TW",
+    "Q23925410": "gal (UK)",
+    "Q23925413": "gal (US)",
+    "Q23931040": "dam²",
+    "Q23931103": "nmi²",
+    "Q2414435": "$b.",
+    "Q242988": "Lib$",
+    "Q2438073": "ag",
+    "Q2448803": "mV",
+    "Q2451296": "μF",
+    "Q246868": "lot",
+    "Q2474258": "mSv",
+    "Q2483628": "as",
+    "Q2489298": "cm²",
+    "Q249439": "q_P",
+    "Q2518569": "nSv",
+    "Q253276": "mi",
+    "Q25472681": "GB/s",
+    "Q25472693": "TB/s",
+    "Q25499149": "oct",
+    "Q25511288": "mb",
+    "Q2553708": "MV",
+    "Q2554092": "kV",
+    "Q259502": "AU$",
+    "Q260126": "rem",
+    "Q2612219": "Pg",
+    "Q261247": "ct",
+    "Q2619500": "foe",
+    "Q2636421": "nH",
+    "Q2637946": "dal",
+    "Q2642547": "ha",
+    "Q2652700": "Osm",
+    "Q2655272": "Eg",
+    "Q2659078": "TW h",
+    "Q2670039": "₶",
+    "Q26708069": "kcal",
+    "Q267391": "K",
+    "Q2679083": "μH",
+    "Q2682463": "nF",
+    "Q2691798": "cg",
+    "Q271206": "sud£",
+    "Q2737347": "mm²",
+    "Q2739114": "μSv",
+    "Q275112": "Bz$",
+    "Q2756030": "pF",
+    "Q2757753": "PW h",
+    "Q2762458": "ys",
+    "Q27864215": "μW h",
+    "Q2793566": "GV",
+    "Q27949241": "R",
+    "Q2799294": "Gg",
+    "Q281096": "cd/m²",
+    "Q28719934": "keV",
+    "Q28924752": "g/mol",
+    "Q28924753": "kg/mol",
+    "Q2924137": "mH",
+    "Q296936": "toe",
+    "Q29924639": "kVA",
+    "Q30001811": "aBq",
+    "Q30001813": "aC",
+    "Q30001814": "aHz",
+    "Q30001815": "aJ",
+    "Q30001816": "akat",
+    "Q30001818": "aL",
+    "Q30001819": "alm",
+    "Q30001820": "alx",
+    "Q30001822": "aN",
+    "Q30001823": "aΩ",
+    "Q30001825": "aPa",
+    "Q30001826": "arad",
+    "Q30001827": "aS",
+    "Q30001828": "aSv",
+    "Q30001829": "asr",
+    "Q30001830": "aT",
+    "Q30001831": "aV",
+    "Q30001832": "aW",
+    "Q30001833": "aWb",
+    "Q3013059": "kyr",
+    "Q3194304": "kbit",
+    "Q3207456": "mW",
+    "Q321017": "R",
+    "Q3221356": "ym",
+    "Q3239557": "pg",
+    "Q3241121": "mg",
+    "Q324923": "Hart",
+    "Q3249364": "cs",
+    "Q3251645": "ds",
+    "Q3267417": "Tm",
+    "Q3270676": "zm",
+    "Q32750621": "liq pt (US)",
+    "Q32750759": "fl oz (US)",
+    "Q32750816": "bu (US)",
+    "Q32751272": "dry pt (US)",
+    "Q32751296": "bbl (US)",
+    "Q3276763": "GHz",
+    "Q3277907": "Em",
+    "Q3277915": "Zm",
+    "Q3277919": "Pm",
+    "Q3312063": "fL",
+    "Q3320608": "kW",
+    "Q3331719": "dm²",
+    "Q3332689": "ToR",
+    "Q3332814": "Mbit",
+    "Q3396758": "daa",
+    "Q3414243": "rps",
+    "Q3421309": "R_J",
+    "Q3495543": "mbar",
+    "Q355198": "px",
+    "Q3674704": "km/s",
+    "Q3675550": "mm³",
+    "Q3712659": "$",
+    "Q376660": "nat",
+    "Q37732658": "°R",
+    "Q3773454": "Mpc",
+    "Q3815076": "Kibit",
+    "Q3833309": "£",
+    "Q3858002": "mA h",
+    "Q3867152": "ft/s²",
+    "Q389062": "Tibit",
+    "Q3902688": "pl",
+    "Q3902709": "ps",
+    "Q39360235": "US lea",
+    "Q39360471": "nl",
+    "Q39362962": "µin",
+    "Q39363132": "UK lg",
+    "Q39363209": "UK nl",
+    "Q39380159": "US nmi",
+    "Q39462789": "µin²",
+    "Q39467934": "kgf/m²",
+    "Q39469927": "N/m²",
+    "Q39617688": "cwt long",
+    "Q39617818": "t lb",
+    "Q39628023": "y",
+    "Q39699418": "cm/s²",
+    "Q39708248": "S",
+    "Q39709980": "bd",
+    "Q39710113": "bhp EDR",
+    "Q3972226": "kL",
+    "Q4041686": "iwg",
+    "Q4068266": "Ʒ",
+    "Q4176683": "aC",
+    "Q420266": "oz. fl.",
+    "Q42319606": "people/m²",
+    "Q4243638": "km³",
+    "Q4456994": "mF",
+    "Q469356": "tn. sh.",
+    "Q476572": "Ha",
+    "Q482798": "yd",
+    "Q483261": "Da",
+    "Q483725": "A.M.",
+    "Q484092": "lm",
+    "Q4861171": "H",
+    "Q494083": "fur",
+    "Q4989854": "kJ",
+    "Q500515": "Gal",
+    "Q5042194": "£",
+    "Q50808017": "kg m²",
+    "Q5139563": "hPa",
+    "Q514845": "pz",
+    "Q5195628": "hm³",
+    "Q5198770": "dam³",
+    "Q524410": "byr",
+    "Q53393488": "PHz",
+    "Q53393490": "EHz",
+    "Q53393494": "ZHz",
+    "Q53393498": "YHz",
+    "Q53393659": "ML",
+    "Q53393664": "GL",
+    "Q53393674": "ZL",
+    "Q53393678": "YL",
+    "Q53393771": "yL",
+    "Q53393868": "GJ",
+    "Q53393886": "PJ",
+    "Q53393890": "EJ",
+    "Q53448786": "yHz",
+    "Q53448790": "zHz",
+    "Q53448794": "fHz",
+    "Q53448797": "pHz",
+    "Q53448801": "nHz",
+    "Q53448806": "μHz",
+    "Q53448808": "mHz",
+    "Q53448813": "cHz",
+    "Q53448817": "dHz",
+    "Q53448820": "daHz",
+    "Q53448826": "hHz",
+    "Q53448828": "yJ",
+    "Q53448832": "zJ",
+    "Q53448842": "pJ",
+    "Q53448844": "nJ",
+    "Q53448847": "μJ",
+    "Q53448851": "mJ",
+    "Q53448856": "cJ",
+    "Q53448860": "dJ",
+    "Q53448864": "daJ",
+    "Q53448875": "hJ",
+    "Q53448879": "yPa",
+    "Q53448883": "zPa",
+    "Q53448886": "fPa",
+    "Q53448892": "pPa",
+    "Q53448897": "nPa",
+    "Q53448900": "μPa",
+    "Q53448906": "mPa",
+    "Q53448909": "cPa",
+    "Q53448914": "dPa",
+    "Q53448918": "daPa",
+    "Q53448922": "GPa",
+    "Q53448927": "TPa",
+    "Q53448931": "PPa",
+    "Q53448936": "EPa",
+    "Q53448939": "ZPa",
+    "Q53448943": "YPa",
+    "Q53448949": "yV",
+    "Q53448952": "zV",
+    "Q53448957": "fV",
+    "Q53448960": "pV",
+    "Q53448965": "nV",
+    "Q53448969": "cV",
+    "Q53448973": "dV",
+    "Q53448977": "daV",
+    "Q53448981": "hV",
+    "Q53448985": "TV",
+    "Q53448990": "PV",
+    "Q53448994": "EV",
+    "Q53448996": "ZV",
+    "Q53449001": "YV",
+    "Q53449006": "yW",
+    "Q53449008": "zW",
+    "Q53449013": "fW",
+    "Q53449018": "pW",
+    "Q53449021": "nW",
+    "Q53449025": "cW",
+    "Q53449029": "dW",
+    "Q53449033": "daW",
+    "Q53449036": "hW",
+    "Q53449040": "PW",
+    "Q53449045": "EW",
+    "Q53449049": "ZW",
+    "Q53449054": "YW",
+    "Q53561461": "wf",
+    "Q53561822": "wf",
+    "Q53651160": "zm³",
+    "Q53651201": "Ym³",
+    "Q53651356": "ym³",
+    "Q53651512": "pm³",
+    "Q53651713": "fm³",
+    "Q536785": "ρ_P",
+    "Q53951982": "Mt",
+    "Q53952048": "kt",
+    "Q54006645": "ZWb",
+    "Q54081925": "ZSv",
+    "Q54082468": "ZS",
+    "Q54083144": "ZΩ",
+    "Q54083318": "ZN",
+    "Q54083566": "Zlm",
+    "Q54083579": "Zlx",
+    "Q54083712": "ZBq",
+    "Q54083746": "ZC",
+    "Q54083766": "ZF",
+    "Q54083779": "ZGy",
+    "Q54083795": "ZH",
+    "Q54083813": "Zkat",
+    "Q5409016": "MVA",
+    "Q5465723": "ft-pdl",
+    "Q549389": "bit/s",
+    "Q550341": "V A",
+    "Q552299": "ch",
+    "Q55442349": "U/L",
+    "Q55726194": "mg/L",
+    "Q56156859": "mmol",
+    "Q56156949": "μmol",
+    "Q56157046": "nmol",
+    "Q56157048": "pmol",
+    "Q56160603": "fmol",
+    "Q56302633": "UM",
+    "Q56317116": "mgal",
+    "Q56317622": "Q_P",
+    "Q56318907": "kbar",
+    "Q56349362": "Bs.S",
+    "Q56402798": "kN",
+    "Q5711261": "am³",
+    "Q581432": "‴",
+    "Q5879479": "GW",
+    "Q6003257": "am",
+    "Q6009164": "MW h",
+    "Q6014364": "in/s",
+    "Q603071": "E°",
+    "Q605704": "doz",
+    "Q60742631": "AU/yr",
+    "Q608697": "Mx",
+    "Q610135": "G",
+    "Q613726": "Yg",
+    "Q6170164": "yg",
+    "Q6171168": "zg",
+    "Q61756607": "yd",
+    "Q61793198": "rd",
+    "Q61794766": "ch (US survey)",
+    "Q61994988": "Wth",
+    "Q61995006": "KWth",
+    "Q626299": "psi",
+    "Q630369": "var",
+    "Q636200": "U",
+    "Q640907": "sb",
+    "Q6414556": "kip",
+    "Q648908": "bya",
+    "Q64996135": "gal (US)/min",
+    "Q65028392": "mm/yr",
+    "Q651336": "M_J",
+    "Q6517513": "dag",
+    "Q667419": "UK t",
+    "Q681996": "M⊕",
+    "Q685662": "p_P",
+    "Q6859652": "mm Hg",
+    "Q686163": "$",
+    "Q68725821": "°Rø",
+    "Q68726230": "°De",
+    "Q68726625": "°N",
+    "Q69362731": "°C",
+    "Q69363953": "K",
+    "Q693944": "gr",
+    "Q6982035": "MW",
+    "Q69878540": "fl oz (UK)",
+    "Q70378044": "dmol",
+    "Q70378549": "dK",
+    "Q70393458": "kmol",
+    "Q70395375": "Tmol",
+    "Q70395643": "Mmol",
+    "Q70395830": "kK",
+    "Q70396179": "mK",
+    "Q70397275": "μK",
+    "Q70397725": "cmol",
+    "Q70397932": "cK",
+    "Q70398457": "nK",
+    "Q70398619": "MK",
+    "Q70398813": "Gmol",
+    "Q70398991": "GK",
+    "Q70440025": "daK",
+    "Q70440438": "hK",
+    "Q70440620": "damol",
+    "Q70440823": "hmol",
+    "Q70443020": "EK",
+    "Q70443154": "yK",
+    "Q70443282": "zK",
+    "Q70443367": "fK",
+    "Q70443453": "TK",
+    "Q70443757": "pK",
+    "Q70443901": "YK",
+    "Q70444029": "PK",
+    "Q70444141": "Emol",
+    "Q70444284": "ymol",
+    "Q70444386": "zmol",
+    "Q70444514": "Ymol",
+    "Q70444609": "Pmol",
+    "Q712226": "km²",
+    "Q72081071": "MeV",
+    "Q723733": "ms",
+    "Q730251": "ft·lbf",
+    "Q732707": "MHz",
+    "Q73408": "K",
+    "Q7350781": "Mb/s",
+    "Q743895": "bpm",
+    "Q748716": "ft/s",
+    "Q750178": "‱",
+    "Q752197": "kJ/mol",
+    "Q7672057": "TU",
+    "Q777017": "dBm",
+    "Q78754556": "rot",
+    "Q78756901": "rev",
+    "Q78757683": "windings",
+    "Q79726": "kB",
+    "Q79735": "MB",
+    "Q79738": "GB",
+    "Q79741": "TB",
+    "Q79744": "PB",
+    "Q79745": "EB",
+    "Q79747": "ZB",
+    "Q7974920": "W s",
+    "Q79752": "YB",
+    "Q79756": "KiB",
+    "Q79758": "MiB",
+    "Q79765": "GiB",
+    "Q79769": "TiB",
+    "Q79774": "PiB",
+    "Q79777": "EiB",
+    "Q79779": "ZiB",
+    "Q79781": "YiB",
+    "Q80237579": "J/nm",
+    "Q809678": "Ba",
+    "Q81062869": "W/nm",
+    "Q81073100": "W/(sr nm)",
+    "Q81292": "acre",
+    "Q81454": "Å",
+    "Q8229770": "B/s",
+    "Q828224": "km",
+    "Q829073": "\"",
+    "Q83216": "cd",
+    "Q83327": "eV",
+    "Q834105": "g/L",
+    "Q835916": "IU",
+    "Q838801": "ns",
+    "Q842015": "μs",
+    "Q842981": "thm (US)",
+    "Q844211": "kg/m³",
+    "Q844338": "hm",
+    "Q844976": "Oe",
+    "Q845958": "¥",
+    "Q848856": "dam",
+    "Q851872": "o",
+    "Q854546": "Gm",
+    "Q855161": "Yibit",
+    "Q856240": "ft³/min",
+    "Q857027": "ft²",
+    "Q85854198": "MN",
+    "Q864818": "abA",
+    "Q87262709": "kΩ",
+    "Q87416053": "MΩ",
+    "Q88296091": "tsp",
+    "Q9026416": "MWth",
+    "Q9048643": "nl",
+    "Q905912": "L",
+    "Q906223": "Es",
+    "Q909066": "at",
+    "Q911730": "nx",
+    "Q914151": "P_P",
+    "Q915169": "F_P",
+    "Q93318": "nmi",
+    "Q940052": "q",
+    "Q94076025": "dalm",
+    "Q94076717": "dakat",
+    "Q942092": "BWI$",
+    "Q94414053": "Prad",
+    "Q94414499": "PC",
+    "Q94415026": "Grad",
+    "Q94415255": "GC",
+    "Q94415438": "Yrad",
+    "Q94415526": "YC",
+    "Q94415782": "Mrad",
+    "Q94416260": "GN",
+    "Q94416535": "cN",
+    "Q94416879": "YN",
+    "Q94417138": "PN",
+    "Q94417481": "μGy",
+    "Q94417583": "μS",
+    "Q94417598": "μT",
+    "Q94417933": "μlm",
+    "Q94418102": "μN",
+    "Q94418220": "μsr",
+    "Q94418481": "μBq",
+    "Q94479580": "GΩ",
+    "Q94480021": "PΩ",
+    "Q94480081": "YΩ",
+    "Q94480128": "cΩ",
+    "Q94480131": "TΩ",
+    "Q94480136": "pΩ",
+    "Q94480254": "nΩ",
+    "Q94480476": "dΩ",
+    "Q94480633": "EΩ",
+    "Q94480967": "daΩ",
+    "Q94481176": "hΩ",
+    "Q94481339": "fΩ",
+    "Q94481646": "yΩ",
+    "Q94487174": "zΩ",
+    "Q94487366": "mΩ",
+    "Q94487561": "μΩ",
+    "Q94487750": "kGy",
+    "Q94488007": "klx",
+    "Q94488361": "MF",
+    "Q94488759": "GBq",
+    "Q94489041": "PBq",
+    "Q94489223": "YBq",
+    "Q94489429": "MBq",
+    "Q94489465": "kBq",
+    "Q94489476": "TBq",
+    "Q94489494": "kWb",
+    "Q94489520": "kS",
+    "Q94490951": "klm",
+    "Q94491129": "kkat",
+    "Q94634634": "cC",
+    "Q94634655": "MC",
+    "Q94634666": "kC",
+    "Q94634677": "TC",
+    "Q94634684": "μC",
+    "Q94634699": "mC",
+    "Q94693759": "csr",
+    "Q94693773": "msr",
+    "Q94693786": "mWb",
+    "Q94693805": "μWb",
+    "Q94693819": "GS",
+    "Q94693849": "cS",
+    "Q94693918": "MS",
+    "Q94694019": "TS",
+    "Q94694096": "pS",
+    "Q94694154": "nS",
+    "Q94694206": "mS",
+    "Q94731530": "mlm",
+    "Q94731808": "mkat",
+    "Q94731887": "μkat",
+    "Q94732218": "nkat",
+    "Q94732627": "pkat",
+    "Q94733432": "fkat",
+    "Q94733760": "cGy",
+    "Q94734107": "dGy",
+    "Q94734232": "mGy",
+    "Q94734359": "daGy",
+    "Q94734468": "aGy",
+    "Q94734527": "pGy",
+    "Q94734593": "nGy",
+    "Q94734689": "kT",
+    "Q94734788": "mT",
+    "Q94939947": "Gkat",
+    "Q94940018": "Pkat",
+    "Q94940081": "ykat",
+    "Q94940160": "dkat",
+    "Q94940232": "Ekat",
+    "Q94940295": "Ykat",
+    "Q94940582": "Tkat",
+    "Q94940892": "hkat",
+    "Q94941461": "zkat",
+    "Q94942602": "MGy",
+    "Q94942863": "GGy",
+    "Q94986863": "YWb",
+    "Q94986889": "PWb",
+    "Q94986906": "cWb",
+    "Q94986920": "GWb",
+    "Q94986942": "MWb",
+    "Q94986962": "TWb",
+    "Q95178536": "Mlm",
+    "Q95178777": "Tlm",
+    "Q95178881": "clm",
+    "Q95179024": "plm",
+    "Q95179137": "nlm",
+    "Q95179382": "hlm",
+    "Q95179467": "flm",
+    "Q95179608": "zlm",
+    "Q95179695": "Mkat",
+    "Q95179788": "ckat",
+    "Q95179882": "PGy",
+    "Q95377836": "PF",
+    "Q95377853": "YF",
+    "Q95378017": "kF",
+    "Q95378296": "TF",
+    "Q95379145": "cF",
+    "Q95379382": "GF",
+    "Q95379491": "daC",
+    "Q95379580": "hC",
+    "Q95379588": "dC",
+    "Q95379596": "EC",
+    "Q95445986": "nC",
+    "Q95446327": "pC",
+    "Q95446670": "fC",
+    "Q95447079": "zC",
+    "Q95447237": "yC",
+    "Q95447253": "fF",
+    "Q95447263": "zF",
+    "Q95447276": "aF",
+    "Q95447555": "dF",
+    "Q95447863": "EF",
+    "Q95448262": "yF",
+    "Q95448479": "hF",
+    "Q95448689": "daF",
+    "Q95448950": "kSv",
+    "Q95559229": "GSv",
+    "Q95559368": "YSv",
+    "Q95559441": "MSv",
+    "Q95559576": "TSv",
+    "Q95559603": "PSv",
+    "Q95609154": "nWb",
+    "Q95609210": "fWb",
+    "Q95609261": "zWb",
+    "Q95609291": "dWb",
+    "Q95609317": "EWb",
+    "Q95676212": "pWb",
+    "Q95676232": "yWb",
+    "Q95676243": "hWb",
+    "Q95676250": "daWb",
+    "Q95676257": "PS",
+    "Q95676260": "YS",
+    "Q95676273": "zS",
+    "Q95676275": "fS",
+    "Q95676279": "yS",
+    "Q95676287": "hS",
+    "Q95676291": "daS",
+    "Q95676297": "dS",
+    "Q95676298": "ES",
+    "Q95720731": "YGy",
+    "Q95720734": "TGy",
+    "Q95720736": "fGy",
+    "Q95720739": "yGy",
+    "Q95720741": "zGy",
+    "Q95720742": "EGy",
+    "Q95720746": "hGy",
+    "Q95720749": "mlx",
+    "Q95720758": "μlx",
+    "Q95720773": "dalx",
+    "Q95720777": "hlx",
+    "Q95720781": "dlx",
+    "Q95720786": "clx",
+    "Q95857671": "zSv",
+    "Q95859071": "fSv",
+    "Q95860960": "daSv",
+    "Q95861107": "hSv",
+    "Q95861296": "dSv",
+    "Q95862182": "ESv",
+    "Q95863358": "cSv",
+    "Q95863591": "ySv",
+    "Q95863894": "pSv",
+    "Q95864194": "zBq",
+    "Q95864378": "fBq",
+    "Q95864695": "daBq",
+    "Q95864940": "hBq",
+    "Q95865286": "dBq",
+    "Q95865530": "EBq",
+    "Q95865716": "cBq",
+    "Q95865877": "yBq",
+    "Q95866173": "pBq",
+    "Q95866344": "nBq",
+    "Q95866767": "mBq",
+    "Q95867993": "mN",
+    "Q95948345": "crad",
+    "Q95948364": "drad",
+    "Q95948734": "daN",
+    "Q95948739": "hN",
+    "Q95948747": "dN",
+    "Q95976839": "Plm",
+    "Q95976853": "Glm",
+    "Q95976869": "Ylm",
+    "Q95976889": "ylm",
+    "Q95976917": "dlm",
+    "Q95976919": "Elm",
+    "Q95976921": "nT",
+    "Q95993516": "TN",
+    "Q95993522": "nN",
+    "Q95993524": "fN",
+    "Q95993526": "yN",
+    "Q95993528": "zN",
+    "Q95993530": "EN",
+    "Q95993532": "pN",
+    "Q95993537": "μrad",
+    "Q95993542": "nrad",
+    "Q95993547": "frad",
+    "Q95993553": "prad",
+    "Q95993554": "darad",
+    "Q95993557": "hrad",
+    "Q95993619": "pT",
+    "Q96025401": "daT",
+    "Q96025405": "Trad",
+    "Q96025407": "Zrad",
+    "Q96025409": "zrad",
+    "Q96025413": "yrad",
+    "Q96025414": "Erad",
+    "Q96025419": "Ylx",
+    "Q96025422": "Glx",
+    "Q96025427": "Plx",
+    "Q96025431": "Mlx",
+    "Q96025433": "Tlx",
+    "Q96025435": "nlx",
+    "Q96025441": "flx",
+    "Q96050953": "GH",
+    "Q96051010": "PH",
+    "Q96051029": "YH",
+    "Q96051052": "cH",
+    "Q96051074": "TH",
+    "Q96051106": "MH",
+    "Q96051123": "kH",
+    "Q96051126": "fH",
+    "Q96051133": "yH",
+    "Q96051139": "hH",
+    "Q96051142": "dH",
+    "Q96051144": "EH",
+    "Q96051150": "pH",
+    "Q96051160": "daH",
+    "Q96051186": "zH",
+    "Q96051199": "aH",
+    "Q96051245": "ylx",
+    "Q96051267": "Elx",
+    "Q96051282": "plx",
+    "Q96051312": "zlx",
+    "Q96070067": "PT",
+    "Q96070074": "YT",
+    "Q96070076": "GT",
+    "Q96070087": "cT",
+    "Q96070103": "MT",
+    "Q96070125": "hT",
+    "Q96070145": "fT",
+    "Q96070174": "TT",
+    "Q96070195": "zT",
+    "Q96070247": "yT",
+    "Q96070254": "dT",
+    "Q96070264": "ET",
+    "Q96070276": "m°C",
+    "Q96070318": "dsr",
+    "Q96070329": "nsr",
+    "Q96070341": "psr",
+    "Q96095866": "fsr",
+    "Q96095897": "zsr",
+    "Q96095917": "ysr",
+    "Q96095927": "dasr",
+    "Q96095928": "hsr",
+    "Q96095931": "ksr",
+    "Q96095933": "Msr",
+    "Q96095939": "Gsr",
+    "Q96095941": "μ°C",
+    "Q96095955": "n°C",
+    "Q96095960": "k°C",
+    "Q96106290": "Tsr",
+    "Q96106298": "Psr",
+    "Q96106311": "Esr",
+    "Q96106319": "Zsr",
+    "Q96106332": "Ysr",
+    "Q96106346": "c°C",
+    "Q96106360": "d°C",
+    "Q96106368": "da°C",
+    "Q96106385": "h°C",
+    "Q96106393": "M°C",
+    "Q96236286": "G°C",
+    "Q97059641": "p°C",
+    "Q97059652": "T°C",
+    "Q97143826": "P°C",
+    "Q97143831": "y°C",
+    "Q97143835": "f°C",
+    "Q97143838": "Z°C",
+    "Q97143842": "E°C",
+    "Q97143843": "z°C",
+    "Q97143849": "Y°C",
+    "Q97143851": "a°C",
+    "Q98538634": "eV/m²",
+    "Q98635536": "eV/m",
+    "Q98642859": "eV m²/kg",
+    "Q11229": "%",
+    "Q11570": "kg",
+    "Q11573": "m",
+    "Q11574": "s",
+    "Q11579": "K",
+    "Q11582": "L",
+    "Q12129": "pc",
+    "Q12438": "N",
+    "Q16068": "DM",
+    "Q1811": "ua",
+    "Q20764": "Myr",
+    "Q2101": "e",
+    "Q25235": "h",
+    "Q25236": "W",
+    "Q25250": "V",
+    "Q25267": "°C",
+    "Q25269": "J",
+    "Q25272": "A",
+    "Q25343": "m²",
+    "Q25406": "C",
+    "Q25517": "m³",
+    "Q33680": "rad",
+    "Q35852": "ha",
+    "Q36384": "equiv",
+    "Q3710": "ft",
+    "Q39274": "Sv",
+    "Q39369": "Hz",
+    "Q41509": "mol",
+    "Q41803": "g",
+    "Q42289": "°F",
+    "Q4406": "TV$",
+    "Q44395": "Pa",
+    "Q4587": "Le",
+    "Q4588": "WS$",
+    "Q4592": "F$",
+    "Q4596": "Rs",
+    "Q4597": "$",
+    "Q47083": "Ω",
+    "Q48013": "oz",
+    "Q50094": "Np",
+    "Q50098": "B",
+    "Q531": "ly",
+    "Q5329": "dB",
+    "Q573": "d",
+    "Q577": "a",
+    "Q7727": "min",
+    "Q8799": "B"
+}

+ 156 - 71
searx/engines/duckduckgo_definitions.py

@@ -12,28 +12,53 @@ DuckDuckGo (definitions)
 import json
 from urllib.parse import urlencode
 from lxml import html
-from re import compile
+
+from searx import logger
+from searx.data import WIKIDATA_UNITS
 from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, language_aliases
-from searx.utils import extract_text, html_to_text, match_language
+from searx.utils import extract_text, html_to_text, match_language, get_string_replaces_function
+from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
+
+
+logger = logger.getChild('duckduckgo_definitions')
 
-url = 'https://api.duckduckgo.com/'\
+URL = 'https://api.duckduckgo.com/'\
     + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
 
-http_regex = compile(r'^http:')
+WIKIDATA_PREFIX = [
+    'http://www.wikidata.org/entity/',
+    'https://www.wikidata.org/entity/'
+]
+
+replace_http_by_https = get_string_replaces_function({'http:': 'https:'})
+
+
+def is_broken_text(text):
+    """ duckduckgo may return something like "<a href="xxxx">http://somewhere Related website</a>"
 
+    The href URL is broken, the "Related website" may contain some HTML.
 
-def result_to_text(url, text, htmlResult):
+    The best solution seems to be to ignore these results.
+    """
+    return text.startswith('http') and ' ' in text
+
+
+def result_to_text(text, htmlResult):
     # TODO : remove result ending with "Meaning" or "Category"
+    result = None
     dom = html.fromstring(htmlResult)
     a = dom.xpath('//a')
     if len(a) >= 1:
-        return extract_text(a[0])
+        result = extract_text(a[0])
     else:
-        return text
+        result = text
+    if not is_broken_text(result):
+        return result
+    return None
 
 
 def request(query, params):
-    params['url'] = url.format(query=urlencode({'q': query}))
+    params['url'] = URL.format(query=urlencode({'q': query}))
     language = match_language(params['language'], supported_languages, language_aliases)
     language = language.split('-')[0]
     params['headers']['Accept-Language'] = language
@@ -45,6 +70,14 @@ def response(resp):
 
     search_res = json.loads(resp.text)
 
+    # search_res.get('Entity') possible values (not exhaustive) :
+    # * continent / country / department / location / waterfall
+    # * actor / musician / artist
+    # * book / performing art / film / television  / media franchise / concert tour / playwright
+    # * prepared food
+    # * website / software / os / programming language / file format / software engineer
+    # * company
+
     content = ''
     heading = search_res.get('Heading', '')
     attributes = []
@@ -55,7 +88,8 @@ def response(resp):
     # add answer if there is one
     answer = search_res.get('Answer', '')
     if answer:
-        if search_res.get('AnswerType', '') not in ['calc']:
+        logger.debug('AnswerType="%s" Answer="%s"', search_res.get('AnswerType'), answer)
+        if search_res.get('AnswerType') not in ['calc', 'ip']:
             results.append({'answer': html_to_text(answer)})
 
     # add infobox
@@ -66,42 +100,36 @@ def response(resp):
         content = content + search_res.get('Abstract', '')
 
     # image
-    image = search_res.get('Image', '')
+    image = search_res.get('Image')
     image = None if image == '' else image
 
-    # attributes
-    if 'Infobox' in search_res:
-        infobox = search_res.get('Infobox', None)
-        if 'content' in infobox:
-            for info in infobox.get('content'):
-                attributes.append({'label': info.get('label'),
-                                  'value': info.get('value')})
-
     # urls
+    # Official website, Wikipedia page
     for ddg_result in search_res.get('Results', []):
-        if 'FirstURL' in ddg_result:
-            firstURL = ddg_result.get('FirstURL', '')
-            text = ddg_result.get('Text', '')
+        firstURL = ddg_result.get('FirstURL')
+        text = ddg_result.get('Text')
+        if firstURL is not None and text is not None:
             urls.append({'title': text, 'url': firstURL})
             results.append({'title': heading, 'url': firstURL})
 
     # related topics
     for ddg_result in search_res.get('RelatedTopics', []):
         if 'FirstURL' in ddg_result:
-            suggestion = result_to_text(ddg_result.get('FirstURL', None),
-                                        ddg_result.get('Text', None),
-                                        ddg_result.get('Result', None))
-            if suggestion != heading:
-                results.append({'suggestion': suggestion})
+            firstURL = ddg_result.get('FirstURL')
+            text = ddg_result.get('Text')
+            if not is_broken_text(text):
+                suggestion = result_to_text(text,
+                                            ddg_result.get('Result'))
+                if suggestion != heading and suggestion is not None:
+                    results.append({'suggestion': suggestion})
         elif 'Topics' in ddg_result:
             suggestions = []
             relatedTopics.append({'name': ddg_result.get('Name', ''),
-                                 'suggestions': suggestions})
+                                  'suggestions': suggestions})
             for topic_result in ddg_result.get('Topics', []):
-                suggestion = result_to_text(topic_result.get('FirstURL', None),
-                                            topic_result.get('Text', None),
-                                            topic_result.get('Result', None))
-                if suggestion != heading:
+                suggestion = result_to_text(topic_result.get('Text'),
+                                            topic_result.get('Result'))
+                if suggestion != heading and suggestion is not None:
                     suggestions.append(suggestion)
 
     # abstract
@@ -110,7 +138,10 @@ def response(resp):
         # add as result ? problem always in english
         infobox_id = abstractURL
         urls.append({'title': search_res.get('AbstractSource'),
-                    'url': abstractURL})
+                     'url': abstractURL,
+                     'official': True})
+        results.append({'url': abstractURL,
+                        'title': heading})
 
     # definition
     definitionURL = search_res.get('DefinitionURL', '')
@@ -118,53 +149,107 @@ def response(resp):
         # add as result ? as answer ? problem always in english
         infobox_id = definitionURL
         urls.append({'title': search_res.get('DefinitionSource'),
-                    'url': definitionURL})
+                     'url': definitionURL})
 
     # to merge with wikidata's infobox
     if infobox_id:
-        infobox_id = http_regex.sub('https:', infobox_id)
-
-    # entity
-    entity = search_res.get('Entity', None)
-    # TODO continent / country / department / location / waterfall /
-    #      mountain range :
-    #      link to map search, get weather, near by locations
-    # TODO musician : link to music search
-    # TODO concert tour : ??
-    # TODO film / actor / television  / media franchise :
-    #      links to IMDB / rottentomatoes (or scrap result)
-    # TODO music : link tu musicbrainz / last.fm
-    # TODO book : ??
-    # TODO artist / playwright : ??
-    # TODO compagny : ??
-    # TODO software / os : ??
-    # TODO software engineer : ??
-    # TODO prepared food : ??
-    # TODO website : ??
-    # TODO performing art : ??
-    # TODO prepared food : ??
-    # TODO programming language : ??
-    # TODO file format : ??
+        infobox_id = replace_http_by_https(infobox_id)
+
+    # attributes
+    # some will be converted to urls
+    if 'Infobox' in search_res:
+        infobox = search_res.get('Infobox')
+        if 'content' in infobox:
+            osm_zoom = 17
+            coordinates = None
+            for info in infobox.get('content'):
+                data_type = info.get('data_type')
+                data_label = info.get('label')
+                data_value = info.get('value')
+
+                # Workaround: ddg may return a double quote
+                if data_value == '""':
+                    continue
+
+                # Is it an external URL ?
+                # * imdb_id / facebook_profile / youtube_channel / youtube_video / twitter_profile
+                # * instagram_profile / rotten_tomatoes / spotify_artist_id / itunes_artist_id / soundcloud_id
+                # * netflix_id
+                external_url = get_external_url(data_type, data_value)
+                if external_url is not None:
+                    urls.append({'title': data_label,
+                                 'url': external_url})
+                elif data_type in ['instance', 'wiki_maps_trigger', 'google_play_artist_id']:
+                    # ignore instance: Wikidata value from "Instance Of" (Qxxxx)
+                    # ignore wiki_maps_trigger: reference to a javascript
+                    # ignore google_play_artist_id: service shutdown
+                    pass
+                elif data_type == 'string' and data_label == 'Website':
+                    # There is already an URL for the website
+                    pass
+                elif data_type == 'area':
+                    attributes.append({'label': data_label,
+                                       'value': area_to_str(data_value),
+                                       'entity': 'P2046'})
+                    osm_zoom = area_to_osm_zoom(data_value.get('amount'))
+                elif data_type == 'coordinates':
+                    if data_value.get('globe') == 'http://www.wikidata.org/entity/Q2':
+                        # coordinate on Earth
+                        # get the zoom information from the area
+                        coordinates = info
+                    else:
+                        # coordinate NOT on Earth
+                        attributes.append({'label': data_label,
+                                           'value': data_value,
+                                           'entity': 'P625'})
+                elif data_type == 'string':
+                    attributes.append({'label': data_label,
+                                       'value': data_value})
+
+            if coordinates:
+                data_label = coordinates.get('label')
+                data_value = coordinates.get('value')
+                latitude = data_value.get('latitude')
+                longitude = data_value.get('longitude')
+                url = get_earth_coordinates_url(latitude, longitude, osm_zoom)
+                urls.append({'title': 'OpenStreetMap',
+                             'url': url,
+                             'entity': 'P625'})
 
     if len(heading) > 0:
         # TODO get infobox.meta.value where .label='article_title'
         if image is None and len(attributes) == 0 and len(urls) == 1 and\
            len(relatedTopics) == 0 and len(content) == 0:
-            results.append({
-                           'url': urls[0]['url'],
-                           'title': heading,
-                           'content': content
-                           })
+            results.append({'url': urls[0]['url'],
+                            'title': heading,
+                            'content': content})
         else:
-            results.append({
-                           'infobox': heading,
-                           'id': infobox_id,
-                           'entity': entity,
-                           'content': content,
-                           'img_src': image,
-                           'attributes': attributes,
-                           'urls': urls,
-                           'relatedTopics': relatedTopics
-                           })
+            results.append({'infobox': heading,
+                            'id': infobox_id,
+                            'content': content,
+                            'img_src': image,
+                            'attributes': attributes,
+                            'urls': urls,
+                            'relatedTopics': relatedTopics})
 
     return results
+
+
+def unit_to_str(unit):
+    """Translate a Wikidata unit URL into its entry in WIKIDATA_UNITS.
+
+    The entity id is what remains of ``unit`` once one of the WIKIDATA_PREFIX
+    URLs has been stripped from its start.
+
+    ``unit`` is returned unchanged when it is not a string (for example ``None``
+    when the unit is missing), when it starts with none of the known prefixes,
+    or when the entity id has no entry in WIKIDATA_UNITS.
+    """
+    if not isinstance(unit, str):
+        # a missing unit used to raise AttributeError on .startswith();
+        # hand it back so the caller can test it against None
+        return unit
+    for prefix in WIKIDATA_PREFIX:
+        if unit.startswith(prefix):
+            wikidata_entity = unit[len(prefix):]
+            return WIKIDATA_UNITS.get(wikidata_entity, unit)
+    return unit
+
+
+def area_to_str(area):
+    """Format an area value as "<amount> <unit>".
+
+    ``area`` is a dict such as
+    {'unit': 'http://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}
+
+    When the unit is present and the amount parses as a float, the result is
+    the float followed by the unit translated by unit_to_str().
+    Otherwise the raw amount and the raw unit are joined as they are, a missing
+    one being rendered as an empty string.
+    """
+    raw_unit = area.get('unit')
+    raw_amount = area.get('amount')
+    if raw_unit is not None:
+        # only translate a unit that exists: unit_to_str() expects a string
+        unit = unit_to_str(raw_unit)
+        try:
+            return '{} {}'.format(float(raw_amount), unit)
+        except (TypeError, ValueError):
+            # TypeError: the amount is missing, ValueError: it is not a number
+            pass
+    return '{} {}'.format('' if raw_amount is None else raw_amount,
+                          '' if raw_unit is None else raw_unit)

+ 637 - 452
searx/engines/wikidata.py

@@ -3,501 +3,686 @@
  Wikidata
 
  @website     https://wikidata.org
- @provide-api yes (https://wikidata.org/w/api.php)
+ @provide-api yes (https://query.wikidata.org/)
 
- @using-api   partially (most things require scraping)
- @results     JSON, HTML
- @stable      no (html can change)
+ @using-api   yes
+ @results     JSON
+ @stable      yes
  @parse       url, infobox
 """
 
-from searx import logger
-from searx.poolrequests import get
-from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url
-from searx.utils import extract_text, match_language, eval_xpath
 
 from urllib.parse import urlencode
 from json import loads
-from lxml.html import fromstring
-from lxml import etree
+
+from dateutil.parser import isoparse
+from babel.dates import format_datetime, format_date, format_time, get_datetime_format
+
+from searx import logger
+from searx.data import WIKIDATA_UNITS
+from searx.poolrequests import post, get
+from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url
+from searx.utils import match_language, searx_useragent, get_string_replaces_function
+from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
 
 logger = logger.getChild('wikidata')
-result_count = 1
-
-# urls
-wikidata_host = 'https://www.wikidata.org'
-url_search = wikidata_host \
-    + '/w/index.php?{query}&ns0=1'
-
-wikidata_api = wikidata_host + '/w/api.php'
-url_detail = wikidata_api\
-    + '?action=parse&format=json&{query}'\
-    + '&redirects=1&prop=text%7Cdisplaytitle%7Cparsewarnings'\
-    + '&disableeditsection=1&preview=1&sectionpreview=1&disabletoc=1&utf8=1&formatversion=2'
-
-url_map = 'https://www.openstreetmap.org/'\
-    + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M'
-url_image = 'https://commons.wikimedia.org/wiki/Special:FilePath/{filename}?width=500&height=400'
-
-# xpaths
-div_ids_xpath = '//div[@id]'
-wikidata_ids_xpath = '//ul[@class="mw-search-results"]/li//a/@href'
-title_xpath = '//*[contains(@class,"wikibase-title-label")]'
-description_xpath = '//div[contains(@class,"wikibase-entitytermsview-heading-description")]'
-label_xpath = './/div[contains(@class,"wikibase-statementgroupview-property-label")]/a'
-url_xpath = './/a[contains(@class,"external free") or contains(@class, "wb-external-id")]'
-wikilink_xpath = './/ul[contains(@class,"wikibase-sitelinklistview-listview")]'\
-    + '/li[contains(@data-wb-siteid,"{wikiid}")]//a/@href'
-property_row_xpath = './/div[contains(@class,"wikibase-statementview")]'
-preferred_rank_xpath = './/span[contains(@class,"wikibase-rankselector-preferred")]'
-value_xpath = './/div[contains(@class,"wikibase-statementview-mainsnak")]'\
-    + '/*/div[contains(@class,"wikibase-snakview-value")]'
-language_fallback_xpath = '//sup[contains(@class,"wb-language-fallback-indicator")]'
-calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]'
-media_xpath = value_xpath + '//div[contains(@class,"commons-media-caption")]//a'
-
-
-def get_id_cache(result):
-    id_cache = {}
-    for e in eval_xpath(result, div_ids_xpath):
-        id = e.get('id')
-        if id.startswith('P'):
-            id_cache[id] = e
-    return id_cache
 
+# SPARQL
+SPARQL_ENDPOINT_URL = 'https://query.wikidata.org/sparql'
+SPARQL_EXPLAIN_URL = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql?explain'
+WIKIDATA_PROPERTIES = {
+    'P434': 'MusicBrainz',
+    'P435': 'MusicBrainz',
+    'P436': 'MusicBrainz',
+    'P966': 'MusicBrainz',
+    'P345': 'IMDb',
+    'P2397': 'YouTube',
+    'P1651': 'YouTube',
+    'P2002': 'Twitter',
+    'P2013': 'Facebook',
+    'P2003': 'Instagram',
+}
+
+# SERVICE wikibase:mwapi : https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual/MWAPI
+# SERVICE wikibase:label: https://en.wikibooks.org/wiki/SPARQL/SERVICE_-_Label#Manual_Label_SERVICE
+# https://en.wikibooks.org/wiki/SPARQL/WIKIDATA_Precision,_Units_and_Coordinates
+# https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format#Data_model
+# optimization:
+# * https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/query_optimization
+# * https://github.com/blazegraph/database/wiki/QueryHints
+QUERY_TEMPLATE = """
+SELECT ?item ?itemLabel ?itemDescription ?lat ?long %SELECT%
+WHERE
+{
+  SERVICE wikibase:mwapi {
+        bd:serviceParam wikibase:endpoint "www.wikidata.org";
+        wikibase:api "EntitySearch";
+        wikibase:limit 1;
+        mwapi:search "%QUERY%";
+        mwapi:language "%LANGUAGE%".
+        ?item wikibase:apiOutputItem mwapi:item.
+  }
+
+  %WHERE%
+
+  SERVICE wikibase:label {
+      bd:serviceParam wikibase:language "%LANGUAGE%,en".
+      ?item rdfs:label ?itemLabel .
+      ?item schema:description ?itemDescription .
+      %WIKIBASE_LABELS%
+  }
+
+}
+GROUP BY ?item ?itemLabel ?itemDescription ?lat ?long %GROUP_BY%
+"""
 
-def request(query, params):
-    params['url'] = url_search.format(
-        query=urlencode({'search': query}))
-    return params
+# Get the calendar names and the property names
+QUERY_PROPERTY_NAMES = """
+SELECT ?item ?name
+WHERE {
+    {
+      SELECT ?item
+      WHERE { ?item wdt:P279* wd:Q12132 }
+    } UNION {
+      VALUES ?item { %ATTRIBUTES% }
+    }
+    OPTIONAL { ?item rdfs:label ?name. }
+}
+"""
 
 
-def response(resp):
-    results = []
-    htmlparser = etree.HTMLParser()
-    html = fromstring(resp.content.decode(), parser=htmlparser)
-    search_results = eval_xpath(html, wikidata_ids_xpath)
+# Escape the characters that must not appear verbatim in a SPARQL string literal.
+# Control characters are replaced by a backslash followed by their ECHAR letter
+# (t, n, r, b, f); quotes and the backslash are prefixed with a backslash.
+# https://www.w3.org/TR/sparql11-query/#rSTRING_LITERAL1
+# https://lists.w3.org/Archives/Public/public-rdf-dawg/2011OctDec/0175.html
+sparql_string_escape = get_string_replaces_function({'\t': '\\t',
+                                                     '\n': '\\n',
+                                                     '\r': '\\r',
+                                                     '\b': '\\b',
+                                                     '\f': '\\f',
+                                                     '\"': '\\\"',
+                                                     '\'': '\\\'',
+                                                     '\\': '\\\\'})
+
+replace_http_by_https = get_string_replaces_function({'http:': 'https:'})
+
+
+def get_headers():
+    """Return the HTTP headers sent with the SPARQL requests.
+
+    The response is requested as SPARQL JSON results and the request is
+    identified with the searx user agent.
+    """
+    # user agent: https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual#Query_limits
+    headers = {'Accept': 'application/sparql-results+json'}
+    headers['User-Agent'] = searx_useragent()
+    return headers
+
+
+def get_label_for_entity(entity_id, language):
+    """Return the label stored in WIKIDATA_PROPERTIES for ``entity_id``.
+
+    The keys are tried in this order and the first value that is not None wins:
+    the bare entity id, (entity id, language), (entity id, language without its
+    region suffix), (entity id, 'en').  The entity id itself is returned when
+    none of them matches.
+    """
+    def candidate_keys():
+        # generator: a key is only built when the previous ones have failed
+        yield entity_id
+        yield (entity_id, language)
+        yield (entity_id, language.split('-')[0])
+        yield (entity_id, 'en')
+
+    for key in candidate_keys():
+        label = WIKIDATA_PROPERTIES.get(key)
+        if label is not None:
+            return label
+    return entity_id
+
+
+def send_wikidata_query(query, method='GET'):
+    """Send ``query`` to the SPARQL endpoint and return the decoded JSON body.
+
+    With method 'GET' the query travels in the URL, with any other value it is
+    sent as POST data.  A status code other than 200 is logged and then raised
+    by raise_for_status().
+    """
+    if method != 'GET':
+        # query won't be cached by wikidata
+        http_response = post(SPARQL_ENDPOINT_URL, data={'query': query}, headers=get_headers())
+    else:
+        # query will be cached by wikidata
+        request_url = SPARQL_ENDPOINT_URL + '?' + urlencode({'query': query})
+        http_response = get(request_url, headers=get_headers())
+
+    if http_response.status_code != 200:
+        logger.debug('SPARQL endpoint error %s', http_response.content.decode())
+    logger.debug('request time %s', str(http_response.elapsed))
+    http_response.raise_for_status()
+
+    body = http_response.content.decode()
+    return loads(body)
+
 
-    if resp.search_params['language'].split('-')[0] == 'all':
+def request(query, params):
+    language = params['language'].split('-')[0]
+    if language == 'all':
         language = 'en'
     else:
-        language = match_language(resp.search_params['language'], supported_languages, language_aliases).split('-')[0]
+        language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
+
+    query, attributes = get_query(query, language)
 
-    # TODO: make requests asynchronous to avoid timeout when result_count > 1
-    for search_result in search_results[:result_count]:
-        wikidata_id = search_result.split('/')[-1]
-        url = url_detail.format(query=urlencode({'page': wikidata_id, 'uselang': language}))
-        htmlresponse = get(url)
-        jsonresponse = loads(htmlresponse.content.decode())
-        results += getDetail(jsonresponse, wikidata_id, language, resp.search_params['language'], htmlparser)
+    params['method'] = 'POST'
+    params['url'] = SPARQL_ENDPOINT_URL
+    params['data'] = {'query': query}
+    params['headers'] = get_headers()
+
+    params['language'] = language
+    params['attributes'] = attributes
+    return params
+
+
+def response(resp):
+    """Turn the SPARQL JSON response into a list of searx results.
+
+    Each binding is flattened to a {variable name: value} dict and handed to
+    get_results(); only the first binding of a given item URL is used, later
+    ones are logged as duplicates.  A status code other than 200 is logged and
+    then raised by raise_for_status().
+    """
+    results = []
+    if resp.status_code != 200:
+        logger.debug('SPARQL endpoint error %s', resp.content.decode())
+    resp.raise_for_status()
+    payload = loads(resp.content.decode())
+
+    search_params = resp.search_params
+    language = search_params['language'].lower()
+    attributes = search_params['attributes']
+
+    known_entities = set()
+
+    for binding in payload.get('results', {}).get('bindings', []):
+        row = {name: cell['value'] for name, cell in binding.items()}
+        entity_url = row['item']
+        if entity_url in known_entities:
+            logger.debug('The SPARQL request returns duplicate entities: %s', str(row))
+            continue
+        known_entities.add(entity_url)
+        results.extend(get_results(row, attributes, language))
 
     return results
 
 
-def getDetail(jsonresponse, wikidata_id, language, locale, htmlparser):
+def get_results(attribute_result, attributes, language):
     results = []
-    urls = []
-    attributes = []
+    infobox_title = attribute_result.get('itemLabel')
+    infobox_id = attribute_result['item']
+    infobox_id_lang = None
+    infobox_urls = []
+    infobox_attributes = []
+    infobox_content = attribute_result.get('itemDescription')
+    img_src = None
+    img_src_priority = 100
+
+    for attribute in attributes:
+        value = attribute.get_str(attribute_result, language)
+        if value is not None and value != '':
+            attribute_type = type(attribute)
+
+            if attribute_type in (WDURLAttribute, WDArticle):
+                # get_select() method : there is group_concat(distinct ...;separator=", ")
+                # split the value here
+                for url in value.split(', '):
+                    infobox_urls.append({'title': attribute.get_label(language), 'url': url, **attribute.kwargs})
+                    # "normal" results (not infobox) include official website and Wikipedia links.
+                    if attribute.kwargs.get('official') or attribute_type == WDArticle:
+                        results.append({'title': infobox_title, 'url': url})
+                    # update the infobox_id with the wikipedia URL
+                    # first the local wikipedia URL, and as fallback the english wikipedia URL
+                    if attribute_type == WDArticle\
+                       and ((attribute.language == 'en' and infobox_id_lang is None)
+                            or attribute.language != 'en'):
+                        infobox_id_lang = attribute.language
+                        infobox_id = url
+            elif attribute_type == WDImageAttribute:
+                # this attribute is an image.
+                # replace the current image only if the priority is lower
+                # (the infobox contain only one image).
+                if attribute.priority < img_src_priority:
+                    img_src = value
+                    img_src_priority = attribute.priority
+            elif attribute_type == WDGeoAttribute:
+                # geocoordinate link
+                # use the area to get the OSM zoom
+                # Note: ignore the unit (must be km² otherwise the calculation is wrong)
+                # Should use normalized value p:P2046/psn:P2046/wikibase:quantityAmount
+                area = attribute_result.get('P2046')
+                osm_zoom = area_to_osm_zoom(area) if area else 19
+                url = attribute.get_str(attribute_result, language, osm_zoom=osm_zoom)
+                if url:
+                    infobox_urls.append({'title': attribute.get_label(language),
+                                         'url': url,
+                                         'entity': attribute.name})
+            else:
+                infobox_attributes.append({'label': attribute.get_label(language),
+                                           'value': value,
+                                           'entity': attribute.name})
+
+    if infobox_id:
+        infobox_id = replace_http_by_https(infobox_id)
 
-    title = jsonresponse.get('parse', {}).get('displaytitle', {})
-    result = jsonresponse.get('parse', {}).get('text', {})
-
-    if not title or not result:
-        return results
-
-    title = fromstring(title, parser=htmlparser)
-    for elem in eval_xpath(title, language_fallback_xpath):
-        elem.getparent().remove(elem)
-    title = extract_text(eval_xpath(title, title_xpath))
-
-    result = fromstring(result, parser=htmlparser)
-    for elem in eval_xpath(result, language_fallback_xpath):
-        elem.getparent().remove(elem)
-
-    description = extract_text(eval_xpath(result, description_xpath))
-
-    id_cache = get_id_cache(result)
-
-    # URLS
-
-    # official website
-    add_url(urls, result, id_cache, 'P856', results=results)
-
-    # wikipedia
-    wikipedia_link_count = 0
-    wikipedia_link = get_wikilink(result, language + 'wiki')
-    if wikipedia_link:
-        wikipedia_link_count += 1
-        urls.append({'title': 'Wikipedia (' + language + ')',
-                     'url': wikipedia_link})
-
-    if language != 'en':
-        wikipedia_en_link = get_wikilink(result, 'enwiki')
-        if wikipedia_en_link:
-            wikipedia_link_count += 1
-            urls.append({'title': 'Wikipedia (en)',
-                         'url': wikipedia_en_link})
-
-    # TODO: get_wiki_firstlanguage
-    # if wikipedia_link_count == 0:
-
-    # more wikis
-    add_url(urls, result, id_cache, default_label='Wikivoyage (' + language + ')', link_type=language + 'wikivoyage')
-    add_url(urls, result, id_cache, default_label='Wikiquote (' + language + ')', link_type=language + 'wikiquote')
-    add_url(urls, result, id_cache, default_label='Wikimedia Commons', link_type='commonswiki')
-
-    add_url(urls, result, id_cache, 'P625', 'OpenStreetMap', link_type='geo')
-
-    # musicbrainz
-    add_url(urls, result, id_cache, 'P434', 'MusicBrainz', 'http://musicbrainz.org/artist/')
-    add_url(urls, result, id_cache, 'P435', 'MusicBrainz', 'http://musicbrainz.org/work/')
-    add_url(urls, result, id_cache, 'P436', 'MusicBrainz', 'http://musicbrainz.org/release-group/')
-    add_url(urls, result, id_cache, 'P966', 'MusicBrainz', 'http://musicbrainz.org/label/')
-
-    # IMDb
-    add_url(urls, result, id_cache, 'P345', 'IMDb', 'https://www.imdb.com/', link_type='imdb')
-    # source code repository
-    add_url(urls, result, id_cache, 'P1324')
-    # blog
-    add_url(urls, result, id_cache, 'P1581')
-    # social media links
-    add_url(urls, result, id_cache, 'P2397', 'YouTube', 'https://www.youtube.com/channel/')
-    add_url(urls, result, id_cache, 'P1651', 'YouTube', 'https://www.youtube.com/watch?v=')
-    add_url(urls, result, id_cache, 'P2002', 'Twitter', 'https://twitter.com/')
-    add_url(urls, result, id_cache, 'P2013', 'Facebook', 'https://facebook.com/')
-    add_url(urls, result, id_cache, 'P2003', 'Instagram', 'https://instagram.com/')
-
-    urls.append({'title': 'Wikidata',
-                 'url': 'https://www.wikidata.org/wiki/'
-                 + wikidata_id + '?uselang=' + language})
-
-    # INFOBOX ATTRIBUTES (ROWS)
-
-    # DATES
-    # inception date
-    add_attribute(attributes, id_cache, 'P571', date=True)
-    # dissolution date
-    add_attribute(attributes, id_cache, 'P576', date=True)
-    # start date
-    add_attribute(attributes, id_cache, 'P580', date=True)
-    # end date
-    add_attribute(attributes, id_cache, 'P582', date=True)
-    # date of birth
-    add_attribute(attributes, id_cache, 'P569', date=True)
-    # date of death
-    add_attribute(attributes, id_cache, 'P570', date=True)
-    # date of spacecraft launch
-    add_attribute(attributes, id_cache, 'P619', date=True)
-    # date of spacecraft landing
-    add_attribute(attributes, id_cache, 'P620', date=True)
-
-    # nationality
-    add_attribute(attributes, id_cache, 'P27')
-    # country of origin
-    add_attribute(attributes, id_cache, 'P495')
-    # country
-    add_attribute(attributes, id_cache, 'P17')
-    # headquarters
-    add_attribute(attributes, id_cache, 'Q180')
-
-    # PLACES
-    # capital
-    add_attribute(attributes, id_cache, 'P36', trim=True)
-    # head of state
-    add_attribute(attributes, id_cache, 'P35', trim=True)
-    # head of government
-    add_attribute(attributes, id_cache, 'P6', trim=True)
-    # type of government
-    add_attribute(attributes, id_cache, 'P122')
-    # official language
-    add_attribute(attributes, id_cache, 'P37')
-    # population
-    add_attribute(attributes, id_cache, 'P1082', trim=True)
-    # area
-    add_attribute(attributes, id_cache, 'P2046')
-    # currency
-    add_attribute(attributes, id_cache, 'P38', trim=True)
-    # heigth (building)
-    add_attribute(attributes, id_cache, 'P2048')
-
-    # MEDIA
-    # platform (videogames)
-    add_attribute(attributes, id_cache, 'P400')
-    # author
-    add_attribute(attributes, id_cache, 'P50')
-    # creator
-    add_attribute(attributes, id_cache, 'P170')
-    # director
-    add_attribute(attributes, id_cache, 'P57')
-    # performer
-    add_attribute(attributes, id_cache, 'P175')
-    # developer
-    add_attribute(attributes, id_cache, 'P178')
-    # producer
-    add_attribute(attributes, id_cache, 'P162')
-    # manufacturer
-    add_attribute(attributes, id_cache, 'P176')
-    # screenwriter
-    add_attribute(attributes, id_cache, 'P58')
-    # production company
-    add_attribute(attributes, id_cache, 'P272')
-    # record label
-    add_attribute(attributes, id_cache, 'P264')
-    # publisher
-    add_attribute(attributes, id_cache, 'P123')
-    # original network
-    add_attribute(attributes, id_cache, 'P449')
-    # distributor
-    add_attribute(attributes, id_cache, 'P750')
-    # composer
-    add_attribute(attributes, id_cache, 'P86')
-    # publication date
-    add_attribute(attributes, id_cache, 'P577', date=True)
-    # genre
-    add_attribute(attributes, id_cache, 'P136')
-    # original language
-    add_attribute(attributes, id_cache, 'P364')
-    # isbn
-    add_attribute(attributes, id_cache, 'Q33057')
-    # software license
-    add_attribute(attributes, id_cache, 'P275')
-    # programming language
-    add_attribute(attributes, id_cache, 'P277')
-    # version
-    add_attribute(attributes, id_cache, 'P348', trim=True)
-    # narrative location
-    add_attribute(attributes, id_cache, 'P840')
-
-    # LANGUAGES
-    # number of speakers
-    add_attribute(attributes, id_cache, 'P1098')
-    # writing system
-    add_attribute(attributes, id_cache, 'P282')
-    # regulatory body
-    add_attribute(attributes, id_cache, 'P1018')
-    # language code
-    add_attribute(attributes, id_cache, 'P218')
-
-    # OTHER
-    # ceo
-    add_attribute(attributes, id_cache, 'P169', trim=True)
-    # founder
-    add_attribute(attributes, id_cache, 'P112')
-    # legal form (company/organization)
-    add_attribute(attributes, id_cache, 'P1454')
-    # operator
-    add_attribute(attributes, id_cache, 'P137')
-    # crew members (tripulation)
-    add_attribute(attributes, id_cache, 'P1029')
-    # taxon
-    add_attribute(attributes, id_cache, 'P225')
-    # chemical formula
-    add_attribute(attributes, id_cache, 'P274')
-    # winner (sports/contests)
-    add_attribute(attributes, id_cache, 'P1346')
-    # number of deaths
-    add_attribute(attributes, id_cache, 'P1120')
-    # currency code
-    add_attribute(attributes, id_cache, 'P498')
-
-    image = add_image(id_cache)
-
-    if len(attributes) == 0 and len(urls) == 2 and len(description) == 0:
+    # add the wikidata URL at the end
+    infobox_urls.append({'title': 'Wikidata', 'url': attribute_result['item']})
+
+    if img_src is None and len(infobox_attributes) == 0 and len(infobox_urls) == 1 and\
+       len(infobox_content) == 0:
         results.append({
-                       'url': urls[0]['url'],
-                       'title': title,
-                       'content': description
-                       })
+            'url': infobox_urls[0]['url'],
+            'title': infobox_title,
+            'content': infobox_content
+        })
     else:
         results.append({
-                       'infobox': title,
-                       'id': wikipedia_link,
-                       'content': description,
-                       'img_src': image,
-                       'attributes': attributes,
-                       'urls': urls
-                       })
-
+            'infobox': infobox_title,
+            'id': infobox_id,
+            'content': infobox_content,
+            'img_src': img_src,
+            'urls': infobox_urls,
+            'attributes': infobox_attributes
+        })
     return results
 
 
-# only returns first match
-def add_image(id_cache):
-    # P15: route map, P242: locator map, P154: logo, P18: image, P242: map, P41: flag, P2716: collage, P2910: icon
-    property_ids = ['P15', 'P242', 'P154', 'P18', 'P242', 'P41', 'P2716', 'P2910']
+def get_query(query, language):
+    """Build the SPARQL request for a user query.
+
+    Args:
+        query (str): raw text typed by the user
+        language (str): language code substituted for ``%LANGUAGE%``
+
+    Returns:
+        tuple: ``(sparql_query, attributes)`` where ``attributes`` is the list
+        returned by ``get_attributes(language)`` that was used to build the query.
+    """
+    attributes = get_attributes(language)
+    select = [a.get_select() for a in attributes]
+    # attributes return an empty string when they contribute nothing to a clause
+    where = [s for s in (a.get_where() for a in attributes) if len(s) > 0]
+    wikibase_label = [s for s in (a.get_wikibase_label() for a in attributes) if len(s) > 0]
+    group_by = [s for s in (a.get_group_by() for a in attributes) if len(s) > 0]
+    # The user text is substituted last: otherwise a query containing
+    # "%SELECT%", "%WHERE%", ... would be expanded by the following replaces.
+    sparql_query = QUERY_TEMPLATE\
+        .replace('%SELECT%', ' '.join(select))\
+        .replace('%WHERE%', '\n  '.join(where))\
+        .replace('%WIKIBASE_LABELS%', '\n      '.join(wikibase_label))\
+        .replace('%GROUP_BY%', ' '.join(group_by))\
+        .replace('%LANGUAGE%', language)\
+        .replace('%QUERY%', sparql_string_escape(query))
+    return sparql_query, attributes
 
-    for property_id in property_ids:
-        image = id_cache.get(property_id, None)
-        if image is not None:
-            image_name = eval_xpath(image, media_xpath)
-            image_src = url_image.replace('{filename}', extract_text(image_name[0]))
-            return image_src
 
+def get_attributes(language):
+    attributes = []
 
-# setting trim will only returned high ranked rows OR the first row
-def add_attribute(attributes, id_cache, property_id, default_label=None, date=False, trim=False):
-    attribute = id_cache.get(property_id, None)
-    if attribute is not None:
+    def add_value(name):
+        attributes.append(WDAttribute(name))
+
+    def add_amount(name):
+        attributes.append(WDAmountAttribute(name))
+
+    def add_label(name):
+        attributes.append(WDLabelAttribute(name))
+
+    def add_url(name, url_id=None, **kwargs):
+        attributes.append(WDURLAttribute(name, url_id, kwargs))
+
+    def add_image(name, url_id=None, priority=1):
+        attributes.append(WDImageAttribute(name, url_id, priority))
+
+    def add_date(name):
+        attributes.append(WDDateAttribute(name))
+
+    # Dates
+    for p in ['P571',    # inception date
+              'P576',    # dissolution date
+              'P580',    # start date
+              'P582',    # end date
+              'P569',    # date of birth
+              'P570',    # date of death
+              'P619',    # date of spacecraft launch
+              'P620']:   # date of spacecraft landing
+        add_date(p)
+
+    for p in ['P27',     # country of citizenship
+              'P495',    # country of origin
+              'P17',     # country
+              'P159']:   # headquarters location
+        add_label(p)
+
+    # Places
+    for p in ['P36',     # capital
+              'P35',     # head of state
+              'P6',      # head of government
+              'P122',    # basic form of government
+              'P37']:    # official language
+        add_label(p)
+
+    add_value('P1082')   # population
+    add_amount('P2046')  # area
+    add_amount('P281')   # postal code
+    add_label('P38')     # currency
+    add_amount('P2048')  # height (building)
+
+    # Media
+    for p in ['P400',    # platform (videogames, computing)
+              'P50',     # author
+              'P170',    # creator
+              'P57',     # director
+              'P175',    # performer
+              'P178',    # developer
+              'P162',    # producer
+              'P176',    # manufacturer
+              'P58',     # screenwriter
+              'P272',    # production company
+              'P264',    # record label
+              'P123',    # publisher
+              'P449',    # original network
+              'P750',    # distributed by
+              'P86']:    # composer
+        add_label(p)
+
+    add_date('P577')     # publication date
+    add_label('P136')    # genre (music, film, artistic...)
+    add_label('P364')    # original language
+    add_value('P212')    # ISBN-13
+    add_value('P957')    # ISBN-10
+    add_label('P275')    # copyright license
+    add_label('P277')    # programming language
+    add_value('P348')    # version
+    add_label('P840')    # narrative location
+
+    # Languages
+    add_value('P1098')   # number of speakers
+    add_label('P282')    # writing system
+    add_label('P1018')   # language regulatory body
+    add_value('P218')    # language code (ISO 639-1)
+
+    # Other
+    add_label('P169')    # ceo
+    add_label('P112')    # founded by
+    add_label('P1454')   # legal form (company, organization)
+    add_label('P137')    # operator (service, facility, ...)
+    add_label('P1029')   # crew members (tripulation)
+    add_label('P225')    # taxon name
+    add_value('P274')    # chemical formula
+    add_label('P1346')   # winner (sports, contests, ...)
+    add_value('P1120')   # number of deaths
+    add_value('P498')    # currency code (ISO 4217)
+
+    # URL
+    add_url('P856', official=True)          # official website
+    attributes.append(WDArticle(language))  # wikipedia (user language)
+    if not language.startswith('en'):
+        attributes.append(WDArticle('en'))  # wikipedia (english)
+
+    add_url('P1324')     # source code repository
+    add_url('P1581')     # blog
+    add_url('P434', url_id='musicbrainz_artist')
+    add_url('P435', url_id='musicbrainz_work')
+    add_url('P436', url_id='musicbrainz_release_group')
+    add_url('P966', url_id='musicbrainz_label')
+    add_url('P345', url_id='imdb_id')
+    add_url('P2397', url_id='youtube_channel')
+    add_url('P1651', url_id='youtube_video')
+    add_url('P2002', url_id='twitter_profile')
+    add_url('P2013', url_id='facebook_profile')
+    add_url('P2003', url_id='instagram_profile')
+
+    # Map
+    attributes.append(WDGeoAttribute('P625'))
+
+    # Image
+    add_image('P15', priority=1, url_id='wikimedia_image')    # route map
+    add_image('P242', priority=2, url_id='wikimedia_image')   # locator map
+    add_image('P154', priority=3, url_id='wikimedia_image')   # logo
+    add_image('P18', priority=4, url_id='wikimedia_image')    # image
+    add_image('P41', priority=5, url_id='wikimedia_image')    # flag
+    add_image('P2716', priority=6, url_id='wikimedia_image')  # collage
+    add_image('P2910', priority=7, url_id='wikimedia_image')  # icon
+
+    return attributes
+
+
+class WDAttribute:
+
+    __slots__ = 'name',
+
+    def __init__(self, name):
+        self.name = name
+
+    def get_select(self):
+        return '(group_concat(distinct ?{name};separator=", ") as ?{name}s)'.replace('{name}', self.name)
+
+    def get_label(self, language):
+        return get_label_for_entity(self.name, language)
+
+    def get_where(self):
+        return "OPTIONAL { ?item wdt:{name} ?{name} . }".replace('{name}', self.name)
+
+    def get_wikibase_label(self):
+        return ""
+
+    def get_group_by(self):
+        return ""
+
+    def get_str(self, result, language):
+        return result.get(self.name + 's')
 
-        if default_label:
-            label = default_label
-        else:
-            label = extract_text(eval_xpath(attribute, label_xpath))
-            label = label[0].upper() + label[1:]
-
-        if date:
-            trim = True
-            # remove calendar name
-            calendar_name = eval_xpath(attribute, calendar_name_xpath)
-            for calendar in calendar_name:
-                calendar.getparent().remove(calendar)
-
-        concat_values = ""
-        values = []
-        first_value = None
-        for row in eval_xpath(attribute, property_row_xpath):
-            if not first_value or not trim or eval_xpath(row, preferred_rank_xpath):
-                value = eval_xpath(row, value_xpath)
-                if not value:
-                    continue
-                value = extract_text(value)
-
-                # save first value in case no ranked row is found
-                if trim and not first_value:
-                    first_value = value
-                else:
-                    # to avoid duplicate values
-                    if value not in values:
-                        concat_values += value + ", "
-                        values.append(value)
-
-        if trim and not values:
-            attributes.append({'label': label,
-                               'value': first_value})
-        else:
-            attributes.append({'label': label,
-                               'value': concat_values[:-2]})
+    def __repr__(self):
+        return '<' + str(type(self).__name__) + ':' + self.name + '>'
 
 
-# requires property_id unless it's a wiki link (defined in link_type)
-def add_url(urls, result, id_cache, property_id=None, default_label=None, url_prefix=None, results=None,
-            link_type=None, only_first=True):
-    links = []
+class WDAmountAttribute(WDAttribute):
 
-    # wiki links don't have property in wikidata page
-    if link_type and 'wiki' in link_type:
-            links.append(get_wikilink(result, link_type))
-    else:
-        dom_element = id_cache.get(property_id, None)
-        if dom_element is not None:
-            if not default_label:
-                label = extract_text(eval_xpath(dom_element, label_xpath))
-                label = label[0].upper() + label[1:]
+    def get_select(self):
+        return '?{name} ?{name}Unit'.replace('{name}', self.name)
 
-            if link_type == 'geo':
-                links.append(get_geolink(dom_element))
+    def get_where(self):
+        return """  OPTIONAL { ?item p:{name} ?{name}Node .
+    ?{name}Node rdf:type wikibase:BestRank ; ps:{name} ?{name} .
+    OPTIONAL { ?{name}Node psv:{name}/wikibase:quantityUnit ?{name}Unit. } }""".replace('{name}', self.name)
 
-            elif link_type == 'imdb':
-                links.append(get_imdblink(dom_element, url_prefix))
+    def get_group_by(self):
+        return self.get_select()
 
-            else:
-                url_results = eval_xpath(dom_element, url_xpath)
-                for link in url_results:
-                    if link is not None:
-                        if url_prefix:
-                            link = url_prefix + extract_text(link)
-                        else:
-                            link = extract_text(link)
-                        links.append(link)
-
-    # append urls
-    for url in links:
-        if url is not None:
-            u = {'title': default_label or label, 'url': url}
-            if property_id == 'P856':
-                u['official'] = True
-                u['domain'] = url.split('/')[2]
-            urls.append(u)
-            if results is not None:
-                results.append(u)
-            if only_first:
-                break
-
-
-def get_imdblink(result, url_prefix):
-    imdb_id = eval_xpath(result, value_xpath)
-    if imdb_id:
-        imdb_id = extract_text(imdb_id)
-        id_prefix = imdb_id[:2]
-        if id_prefix == 'tt':
-            url = url_prefix + 'title/' + imdb_id
-        elif id_prefix == 'nm':
-            url = url_prefix + 'name/' + imdb_id
-        elif id_prefix == 'ch':
-            url = url_prefix + 'character/' + imdb_id
-        elif id_prefix == 'co':
-            url = url_prefix + 'company/' + imdb_id
-        elif id_prefix == 'ev':
-            url = url_prefix + 'event/' + imdb_id
-        else:
-            url = None
-        return url
+    def get_str(self, result, language):
+        value = result.get(self.name)
+        unit = result.get(self.name + "Unit")
+        if unit is not None:
+            unit = unit.replace('http://www.wikidata.org/entity/', '')
+            return value + " " + get_label_for_entity(unit, language)
+        return value
 
 
-def get_geolink(result):
-    coordinates = eval_xpath(result, value_xpath)
-    if not coordinates:
-        return None
-    coordinates = extract_text(coordinates[0])
-    latitude, longitude = coordinates.split(',')
-
-    # convert to decimal
-    lat = int(latitude[:latitude.find('°')])
-    if latitude.find('\'') >= 0:
-        lat += int(latitude[latitude.find('°') + 1:latitude.find('\'')] or 0) / 60.0
-    if latitude.find('"') >= 0:
-        lat += float(latitude[latitude.find('\'') + 1:latitude.find('"')] or 0) / 3600.0
-    if latitude.find('S') >= 0:
-        lat *= -1
-    lon = int(longitude[:longitude.find('°')])
-    if longitude.find('\'') >= 0:
-        lon += int(longitude[longitude.find('°') + 1:longitude.find('\'')] or 0) / 60.0
-    if longitude.find('"') >= 0:
-        lon += float(longitude[longitude.find('\'') + 1:longitude.find('"')] or 0) / 3600.0
-    if longitude.find('W') >= 0:
-        lon *= -1
-
-    # TODO: get precision
-    precision = 0.0002
-    # there is no zoom information, deduce from precision (error prone)
-    # samples :
-    # 13 --> 5
-    # 1 --> 6
-    # 0.016666666666667 --> 9
-    # 0.00027777777777778 --> 19
-    # wolframalpha :
-    # quadratic fit { {13, 5}, {1, 6}, {0.0166666, 9}, {0.0002777777,19}}
-    # 14.1186-8.8322 x+0.625447 x^2
-    if precision < 0.0003:
-        zoom = 19
-    else:
-        zoom = int(15 - precision * 8.8322 + precision * precision * 0.625447)
+class WDArticle(WDAttribute):
+    """Wikipedia article about the item, for one given language."""
+
+    __slots__ = ('language', 'kwargs')
+
+    def __init__(self, language, kwargs=None):
+        super().__init__('wikipedia')
+        self.language = language
+        self.kwargs = kwargs if kwargs else {}
+
+    def _expand(self, template):
+        """Replace every ``{language}`` placeholder of ``template`` with the article language."""
+        return template.replace('{language}', self.language)
+
+    def get_label(self, language):
+        # the ``language`` argument is not used: the label names the article's own language
+        return self._expand("Wikipedia ({language})")
+
+    def get_select(self):
+        return self._expand("?article{language} ?articleName{language}")
+
+    def get_where(self):
+        return self._expand("""OPTIONAL { ?article{language} schema:about ?item ;
+             schema:inLanguage "{language}" ;
+             schema:isPartOf <https://{language}.wikipedia.org/> ;
+             schema:name ?articleName{language} . }""")
+
+    def get_group_by(self):
+        # the selected variables are reused as-is for the GROUP BY clause
+        return self.get_select()
+
+    def get_str(self, result, language):
+        # the article URL is stored under "article<language>" in the result
+        return result.get(self._expand('article{language}'))
+
+
+class WDLabelAttribute(WDAttribute):
+
+    def get_select(self):
+        return '(group_concat(distinct ?{name}Label;separator=", ") as ?{name}Labels)'.replace('{name}', self.name)
 
-    url = url_map\
-        .replace('{latitude}', str(lat))\
-        .replace('{longitude}', str(lon))\
-        .replace('{zoom}', str(zoom))
+    def get_where(self):
+        return "OPTIONAL { ?item wdt:{name} ?{name} . }".replace('{name}', self.name)
 
-    return url
+    def get_wikibase_label(self):
+        return "?{name} rdfs:label ?{name}Label .".replace('{name}', self.name)
 
+    def get_str(self, result, language):
+        return result.get(self.name + 'Labels')
 
-def get_wikilink(result, wikiid):
-    url = eval_xpath(result, wikilink_xpath.replace('{wikiid}', wikiid))
-    if not url:
+
+class WDURLAttribute(WDAttribute):
+    """Attribute rendered as a URL.
+
+    When ``url_id`` is set, the first value is converted with ``get_external_url``;
+    otherwise the raw (concatenated) value is returned unchanged.
+    """
+
+    HTTP_WIKIMEDIA_IMAGE = 'http://commons.wikimedia.org/wiki/Special:FilePath/'
+
+    __slots__ = 'url_id', 'kwargs'
+
+    def __init__(self, name, url_id=None, kwargs=None):
+        super().__init__(name)
+        self.url_id = url_id
+        # always a dict (same convention as WDArticle), so users of ``kwargs``
+        # do not have to special-case None
+        self.kwargs = kwargs or {}
+
+    def get_str(self, result, language):
+        """Return the URL for this attribute, or the raw value when no ``url_id`` is set."""
+        value = result.get(self.name + 's')
+        if self.url_id and value is not None and value != '':
+            # several values may be concatenated: keep only the first one
+            value = value.split(',')[0]
+            url_id = self.url_id
+            if value.startswith(WDURLAttribute.HTTP_WIKIMEDIA_IMAGE):
+                # the value is already a Wikimedia file URL: keep only the
+                # file name and build the URL from the "wikimedia_image" template
+                value = value[len(WDURLAttribute.HTTP_WIKIMEDIA_IMAGE):]
+                url_id = 'wikimedia_image'
+            return get_external_url(url_id, value)
+        return value
+
+
+class WDGeoAttribute(WDAttribute):
+
+    def get_label(self, language):
+        return "OpenStreetMap"
+
+    def get_select(self):
+        return "?{name}Lat ?{name}Long".replace('{name}', self.name)
+
+    def get_where(self):
+        return """OPTIONAL { ?item p:{name}/psv:{name} [
+    wikibase:geoLatitude ?{name}Lat ;
+    wikibase:geoLongitude ?{name}Long ] }""".replace('{name}', self.name)
+
+    def get_group_by(self):
+        return self.get_select()
+
+    def get_str(self, result, language, osm_zoom=19):
+        latitude = result.get(self.name + 'Lat')
+        longitude = result.get(self.name + 'Long')
+        if latitude and longitude:
+            return get_earth_coordinates_url(latitude, longitude, osm_zoom)
         return None
-    url = url[0]
-    if url.startswith('http://'):
-        url = url.replace('http://', 'https://')
-    elif url.startswith('//'):
-        url = 'https:' + url
-    return url
+
+
+class WDImageAttribute(WDURLAttribute):
+    """URL attribute pointing to an image, with a ``priority`` value attached."""
+
+    __slots__ = ('priority',)
+
+    def __init__(self, name, url_id=None, priority=100):
+        super().__init__(name, url_id=url_id)
+        self.priority = priority
+
+
+class WDDateAttribute(WDAttribute):
+    """Attribute holding a time value, formatted according to its precision.
+
+    The precision found in the result selects one of the ``format_*`` methods
+    through the ``DATE_FORMAT`` table.
+    """
+
+    def get_select(self):
+        # the time value plus its precision, timezone and calendar model
+        return '?{name} ?{name}timePrecision ?{name}timeZone ?{name}timeCalendar'.replace('{name}', self.name)
+
+    def get_where(self):
+        # To remove duplicates, one could add
+        # FILTER NOT EXISTS { ?item p:{name}/psv:{name}/wikibase:timeValue ?{name}bis FILTER (?{name}bis < ?{name}) }
+        # but this filter is too slow, so the response function ignores duplicate results
+        # (see the seen_entities variable)
+        return """OPTIONAL { ?item p:{name}/psv:{name} [
+    wikibase:timeValue ?{name} ;
+    wikibase:timePrecision ?{name}timePrecision ;
+    wikibase:timeTimezone ?{name}timeZone ;
+    wikibase:timeCalendarModel ?{name}timeCalendar ] . }
+    hint:Prior hint:rangeSafe true;""".replace('{name}', self.name)
+
+    def get_group_by(self):
+        # the selected variables are reused as-is for the GROUP BY clause
+        return self.get_select()
+
+    def format_8(self, value, locale):
+        # used for the precisions '0' to '8' of DATE_FORMAT (coarser than a year):
+        # the value (already reduced to its year part by get_str) is returned unchanged
+        return value
+
+    def format_9(self, value, locale):
+        year = int(value)
+        # precision: year
+        if year < 1584:
+            # years before 1584 are returned as plain numbers without going through isoparse
+            # NOTE(review): negative years are shifted by one, presumably a BCE / year-zero
+            # adjustment; the 1584 threshold is presumably related to the Gregorian calendar - confirm
+            if year < 0:
+                return str(year - 1)
+            return str(year)
+        timestamp = isoparse(value)
+        return format_date(timestamp, format='yyyy', locale=locale)
+
+    def format_10(self, value, locale):
+        # precision: month
+        timestamp = isoparse(value)
+        return format_date(timestamp, format='MMMM y', locale=locale)
+
+    def format_11(self, value, locale):
+        # precision: day
+        timestamp = isoparse(value)
+        return format_date(timestamp, format='full', locale=locale)
+
+    def format_13(self, value, locale):
+        timestamp = isoparse(value)
+        # precision: minute
+        # the locale's date-time pattern is fetched, its quotes are removed, then
+        # {0} is replaced by the formatted time and {1} by the formatted date
+        # NOTE(review): `format` is not a local name here; unless the module defines it,
+        # the builtin `format` function is passed to get_datetime_format - confirm the intended pattern name
+        return get_datetime_format(format, locale=locale) \
+            .replace("'", "") \
+            .replace('{0}', format_time(timestamp, 'full', tzinfo=None,
+                                        locale=locale)) \
+            .replace('{1}', format_date(timestamp, 'short', locale=locale))
+
+    def format_14(self, value, locale):
+        # precision: second.
+        return format_datetime(isoparse(value), format='full', locale=locale)
+
+    # precision (as found in the result) -> (name of the format method, number)
+    # get_str only checks whether the number is >= 1: in that case the value is
+    # reduced to its year part before the format method is called
+    DATE_FORMAT = {
+        '0': ('format_8', 1000000000),
+        '1': ('format_8', 100000000),
+        '2': ('format_8', 10000000),
+        '3': ('format_8', 1000000),
+        '4': ('format_8', 100000),
+        '5': ('format_8', 10000),
+        '6': ('format_8', 1000),
+        '7': ('format_8', 100),
+        '8': ('format_8', 10),
+        '9': ('format_9', 1),  # year
+        '10': ('format_10', 1),  # month
+        '11': ('format_11', 0),  # day
+        '12': ('format_13', 0),  # hour (not supported by babel, display minute)
+        '13': ('format_13', 0),  # minute
+        '14': ('format_14', 0)  # second
+    }
+
+    def get_str(self, result, language):
+        """Return the formatted date, the raw value as a fallback, or None when there is no value."""
+        value = result.get(self.name)
+        if value == '' or value is None:
+            return None
+        precision = result.get(self.name + 'timePrecision')
+        date_format = WDDateAttribute.DATE_FORMAT.get(precision)
+        if date_format is not None:
+            format_method = getattr(self, date_format[0])
+            precision = date_format[1]
+            try:
+                if precision >= 1:
+                    # keep only the first '-' separated field (the year);
+                    # a leading '-' (negative year) is preserved
+                    t = value.split('-')
+                    if value.startswith('-'):
+                        value = '-' + t[1]
+                    else:
+                        value = t[0]
+                # `language` is passed as the locale argument of the format method
+                return format_method(value, language)
+            except Exception:
+                # any formatting error falls back to the (possibly truncated) value
+                return value
+        # unknown precision: return the raw value
+        return value
+
+
+def debug_explain_wikidata_query(query, method='GET'):
+    """Send ``query`` to SPARQL_EXPLAIN_URL and return the raw response content.
+
+    ``method`` selects a GET request (query in the URL) or, for any other
+    value, a POST request (query in the form data). HTTP errors are raised.
+    """
+    if method != 'GET':
+        http_response = post(SPARQL_EXPLAIN_URL, data={'query': query}, headers=get_headers())
+    else:
+        explain_url = SPARQL_EXPLAIN_URL + '&' + urlencode({'query': query})
+        http_response = get(explain_url, headers=get_headers())
+    http_response.raise_for_status()
+    return http_response.content
+
+
+def init(engine_settings=None):
+    """Fill WIKIDATA_PROPERTIES with the unit symbols and the property labels."""
+    # unit symbols
+    WIKIDATA_PROPERTIES.update(WIKIDATA_UNITS)
+
+    # property labels: only the attributes whose type is exactly one of these
+    # classes are requested (subclasses not listed here are left out)
+    labelled_types = (WDAttribute, WDAmountAttribute, WDURLAttribute, WDDateAttribute, WDLabelAttribute)
+    wikidata_property_names = [
+        "wd:" + attribute.name
+        for attribute in get_attributes('en')
+        if type(attribute) in labelled_types and attribute.name not in WIKIDATA_PROPERTIES
+    ]
+    query = QUERY_PROPERTY_NAMES.replace('%ATTRIBUTES%', " ".join(wikidata_property_names))
+    bindings = send_wikidata_query(query).get('results', {}).get('bindings', {})
+    for binding in bindings:
+        name_binding = binding['name']
+        label, lang = name_binding['value'], name_binding['xml:lang']
+        entity_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '')
+        WIKIDATA_PROPERTIES[(entity_id, lang)] = label.capitalize()

+ 77 - 0
searx/external_urls.py

@@ -0,0 +1,77 @@
+import math
+
+from searx.data import EXTERNAL_URLS
+
+
+# two-letter prefix of an IMDb identifier -> url_id of data/external_urls.json
+IMDB_PREFIX_TO_URL_ID = {
+    'tt': 'imdb_title',
+    'nm': 'imdb_name',  # was misspelled 'mn': name identifiers start with "nm"
+    'ch': 'imdb_character',
+    'co': 'imdb_company',
+    'ev': 'imdb_event'
+}
+
+
+def get_imdb_url_id(imdb_item_id):
+    """Return the url_id matching the two-letter prefix of ``imdb_item_id``.
+
+    Returns None when the prefix is unknown.
+    """
+    id_prefix = imdb_item_id[:2]
+    return IMDB_PREFIX_TO_URL_ID.get(id_prefix)
+
+
+def get_external_url(url_id, item_id, alternative="default"):
+    """Return an external URL, or None if url_id or the alternative is not found.
+
+    url_id takes its values from data/external_urls.json.
+    The "imdb_id" value is automatically converted according to the item_id value.
+
+    If item_id is None, the raw URL template (still containing $1) is returned.
+    """
+    if item_id is not None and url_id == 'imdb_id':
+        url_id = get_imdb_url_id(item_id)
+
+    url_description = EXTERNAL_URLS.get(url_id)
+    if not url_description:
+        return None
+
+    url_template = url_description["urls"].get(alternative)
+    if url_template is None or item_id is None:
+        # unknown alternative (None), or no item: hand back the template as it is
+        return url_template
+    return url_template.replace('$1', item_id)
+
+
+def get_earth_coordinates_url(latitude, longitude, osm_zoom, alternative='default'):
+    """Return the map URL for the given coordinates and zoom.
+
+    The "map" URL template is filled with ``latitude``, ``longitude`` and
+    ``osm_zoom``. Returns None when no "map" template exists for ``alternative``
+    (previously this case raised AttributeError).
+    """
+    url_template = get_external_url('map', None, alternative)
+    if url_template is None:
+        return None
+    return url_template\
+        .replace('${latitude}', str(latitude))\
+        .replace('${longitude}', str(longitude))\
+        .replace('${zoom}', str(osm_zoom))
+
+
+def area_to_osm_zoom(area):
+    """Convert an area in km² into an OSM zoom. Less reliable if the shape is not round.
+
+    logarithm regression using these data:
+     * 9596961 -> 4 (China)
+     * 3287263 -> 5 (India)
+     * 643801 -> 6 (France)
+     * 6028 -> 9
+     * 1214 -> 10
+     * 891 -> 12
+     * 12 -> 13
+
+    In WolframAlpha:
+        >>> log fit {9596961,15},{3287263, 14},{643801,13},{6028,10},{1214,9},{891,7},{12,6}
+
+    with 15 = 19-4 (China); 14 = 19-5 (India) and so on
+
+    Args:
+        area (int,float,str): area in km²
+
+    Returns:
+        int: OSM zoom, or 19 if area is not a usable number
+        (None, non-numeric text, zero, negative, NaN or infinite)
+    """
+    try:
+        amount = float(area)
+        return max(0, min(19, round(19 - 0.688297 * math.log(226.878 * amount))))
+    except (TypeError, ValueError, OverflowError):
+        # TypeError: area is None or not convertible
+        # ValueError: non-numeric string, NaN, or logarithm of a value <= 0
+        # OverflowError: rounding an infinite value
+        return 19

+ 26 - 8
searx/results.py

@@ -20,6 +20,18 @@ def result_content_len(content):
 
 
 def compare_urls(url_a, url_b):
+    """Lazy comparison between two URLs.
+    "www.example.com" and "example.com" are equal.
+    "www.example.com/path/" and "www.example.com/path" are equal.
+    "https://www.example.com/" and "http://www.example.com/" are equal.
+
+    Args:
+        url_a (ParseResult): first URL
+        url_b (ParseResult): second URL
+
+    Returns:
+        bool: True if url_a and url_b are equal
+    """
     # ignore www. in comparison
     if url_a.netloc.startswith('www.'):
         host_a = url_a.netloc.replace('www.', '', 1)
@@ -68,8 +80,10 @@ def merge_two_infoboxes(infobox1, infobox2):
         for url2 in infobox2.get('urls', []):
             unique_url = True
             parsed_url2 = urlparse(url2.get('url', ''))
+            entity_url2 = url2.get('entity')
             for url1 in urls1:
-                if compare_urls(urlparse(url1.get('url', '')), parsed_url2):
+                if (entity_url2 is not None and url1.get('entity') == entity_url2)\
+                   or compare_urls(urlparse(url1.get('url', '')), parsed_url2):
                     unique_url = False
                     break
             if unique_url:
@@ -86,18 +100,22 @@ def merge_two_infoboxes(infobox1, infobox2):
             infobox1['img_src'] = img2
 
     if 'attributes' in infobox2:
-        attributes1 = infobox1.get('attributes', None)
+        attributes1 = infobox1.get('attributes')
         if attributes1 is None:
-            attributes1 = []
-            infobox1['attributes'] = attributes1
+            infobox1['attributes'] = attributes1 = []
 
         attributeSet = set()
-        for attribute in infobox1.get('attributes', []):
-            if attribute.get('label', None) not in attributeSet:
-                attributeSet.add(attribute.get('label', None))
+        for attribute in attributes1:
+            label = attribute.get('label')
+            if label not in attributeSet:
+                attributeSet.add(label)
+            entity = attribute.get('entity')
+            if entity not in attributeSet:
+                attributeSet.add(entity)
 
         for attribute in infobox2.get('attributes', []):
-            if attribute.get('label', None) not in attributeSet:
+            if attribute.get('label') not in attributeSet\
+               and attribute.get('entity') not in attributeSet:
                 attributes1.append(attribute)
 
     if 'content' in infobox2:

+ 1 - 5
searx/templates/oscar/infobox.html

@@ -25,11 +25,7 @@
                 {%- if attribute.image -%}
                 <td><img class="img-responsive" src="{{ image_proxify(attribute.image.src) }}" alt="{{ attribute.image.alt }}" /></td>
                 {%- else -%}
-                 {% if attribute.label == 'Instance of' %}
-                  <td><bdi><a href="https://wikidata.org/wiki/{{ attribute.value.id }}">{{ attribute.value.id }}</a></bdi></td>
-                 {% else %}
-                  <td><bdi>{{ attribute.value }}</bdi></td>
-                 {%- endif -%}
+                <td><bdi>{{ attribute.value }}</bdi></td>
                 {%- endif -%}
             </tr>
             {% endfor -%}

+ 0 - 1
searx/templates/simple/infobox.html

@@ -1,7 +1,6 @@
 <aside class="infobox">
   <h2><bdi>{{ infobox.infobox }}</bdi></h2>
   {% if infobox.img_src %}<img src="{{ image_proxify(infobox.img_src) }}" title="{{ infobox.infobox|striptags }}" alt="{{ infobox.infobox|striptags }}" />{% endif %}
-  <p><bdi>{{ infobox.entity }}</bdi></p>
   <p><bdi>{{ infobox.content | safe }}</bdi></p>
   {% if infobox.attributes %}
   <div class="attributes">

+ 10 - 0
searx/utils.py

@@ -481,6 +481,16 @@ def ecma_unescape(s):
     return s
 
 
+def get_string_replaces_function(replaces):
+    """Return a function replacing, in one pass, every key of ``replaces`` by its value.
+
+    Args:
+        replaces (dict): maps each substring to search for to its replacement
+
+    Returns:
+        callable: ``f(text)`` returning ``text`` with all the replacements applied.
+        With an empty mapping, ``f`` returns its argument unchanged.
+    """
+    # snapshot, so later changes to ``replaces`` do not affect the returned function
+    mapping = dict(replaces)
+    if not mapping:
+        # an empty alternation would match the empty string everywhere
+        # and the lookup of the replacement would raise KeyError
+        return lambda text: text
+
+    # longest keys first: otherwise a key that is a prefix of another one
+    # ("a" / "ab") could hide the longer key depending on the dict order
+    keys = sorted(mapping, key=len, reverse=True)
+    pattern = re.compile("|".join(re.escape(k) for k in keys))
+
+    def f(text):
+        return pattern.sub(lambda m: mapping[m.group(0)], text)
+
+    return f
+
+
 def get_engine_from_settings(name):
     """Return engine configuration from settings.yml of a given engine name"""
 

+ 47 - 0
utils/fetch_wikidata_units.py

@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+
+import json
+import collections
+
+# set path
+from sys import path
+from os.path import realpath, dirname, join
+path.append(realpath(dirname(realpath(__file__)) + '/../'))
+
+from searx import searx_dir
+from searx.engines.wikidata import send_wikidata_query
+
+
+SARQL_REQUEST = """
+SELECT DISTINCT ?item ?symbol ?P2370 ?P2370Unit ?P2442 ?P2442Unit
+WHERE
+{
+?item wdt:P31/wdt:P279 wd:Q47574.
+?item wdt:P5061 ?symbol.
+FILTER(LANG(?symbol) = "en").
+}
+ORDER BY ?item
+"""
+
+
+def get_data():
+    """Fetch the unit symbols from Wikidata.
+
+    Returns:
+        collections.OrderedDict: entity id -> symbol, sorted by entity id,
+        or None when the query returns no result.
+    """
+    def get_key(unit):
+        return unit['item']['value'].replace('http://www.wikidata.org/entity/', '')
+
+    def get_value(unit):
+        return unit['symbol']['value']
+
+    result = send_wikidata_query(SARQL_REQUEST)
+    if result is None:
+        return None
+    # sort the units by entity id, so that successive fetches keep the file unchanged.
+    # (the previous code sorted a temporary copy of the list and discarded it)
+    bindings = sorted(result['results']['bindings'], key=get_key)
+    return collections.OrderedDict((get_key(unit), get_value(unit)) for unit in bindings)
+
+
+def get_wikidata_units_filename():
+    """Return the path of data/wikidata_units.json inside the searx directory."""
+    return join(searx_dir, "data", "wikidata_units.json")
+
+
+# explicit UTF-8: ensure_ascii=False writes the unit symbols as-is, which
+# fails or produces another encoding when the locale default is not UTF-8
+with open(get_wikidata_units_filename(), 'w', encoding='utf-8') as f:
+    json.dump(get_data(), f, indent=4, ensure_ascii=False)