test_wikidata.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514
  1. # -*- coding: utf-8 -*-
  2. from lxml.html import fromstring
  3. from lxml import etree
  4. from collections import defaultdict
  5. import mock
  6. from searx.engines import wikidata
  7. from searx.testing import SearxTestCase
  8. class TestWikidataEngine(SearxTestCase):
  def test_request(self):
    """Check that ``wikidata.request`` puts the query into the request URL.

    The request is built twice with the same query: once with the
    language set to ``'all'`` and once with the locale ``'es_ES'``.
    """
    search_term = 'test_query'
    request_params = defaultdict(dict, language='all')

    built = wikidata.request(search_term, request_params)
    self.assertIn('url', built)
    self.assertIn(search_term, built['url'])
    self.assertIn('wikidata.org', built['url'])

    # same query again, this time with a concrete locale
    request_params['language'] = 'es_ES'
    built = wikidata.request(search_term, request_params)
    self.assertIn(search_term, built['url'])
  # successful cases are not tested here to avoid sending additional requests
  def test_response(self):
    """Check ``wikidata.response`` on invalid arguments and on an empty page.

    Arguments that are not response objects must raise
    ``AttributeError``; a response whose content is an empty HTML
    document must yield an empty result list.
    """
    for not_a_response in (None, [], '', '[]'):
      self.assertRaises(AttributeError, wikidata.response, not_a_response)

    empty_page = mock.Mock(
      content='<html></html>'.encode("utf-8"),
      search_params={"language": "en"},
    )

    # Patch the engine's module-level language settings only for the
    # duration of the call: they are restored (or removed again, hence
    # create=True) afterwards instead of leaking into other tests.
    with mock.patch.object(wikidata, 'supported_languages', ['en', 'es'], create=True), \
        mock.patch.object(wikidata, 'language_aliases', {}, create=True):
      self.assertEqual(wikidata.response(empty_page), [])
  30. def test_getDetail(self):
  31. response = {}
  32. results = wikidata.getDetail(response, "Q123", "en", "en-US", etree.HTMLParser())
  33. self.assertEqual(results, [])
  34. title_html = '<div><div class="wikibase-title-label">Test</div></div>'
  35. html = """
  36. <div>
  37. <div class="wikibase-entitytermsview-heading-description">
  38. </div>
  39. <div>
  40. <ul class="wikibase-sitelinklistview-listview">
  41. <li data-wb-siteid="enwiki"><a href="http://en.wikipedia.org/wiki/Test">Test</a></li>
  42. </ul>
  43. </div>
  44. </div>
  45. """
  46. response = {"parse": {"displaytitle": title_html, "text": html}}
  47. results = wikidata.getDetail(response, "Q123", "en", "en-US", etree.HTMLParser())
  48. self.assertEqual(len(results), 1)
  49. self.assertEqual(results[0]['url'], 'https://en.wikipedia.org/wiki/Test')
  50. title_html = """
  51. <div>
  52. <div class="wikibase-title-label">
  53. <span lang="en">Test</span>
  54. <sup class="wb-language-fallback-indicator">English</sup>
  55. </div>
  56. </div>
  57. """
  58. html = """
  59. <div>
  60. <div class="wikibase-entitytermsview-heading-description">
  61. <span lang="en">Description</span>
  62. <sup class="wb-language-fallback-indicator">English</sup>
  63. </div>
  64. <div id="P856">
  65. <div class="wikibase-statementgroupview-property-label">
  66. <a href="/wiki/Property:P856">
  67. <span lang="en">official website</span>
  68. <sup class="wb-language-fallback-indicator">English</sup>
  69. </a>
  70. </div>
  71. <div class="wikibase-statementview-mainsnak">
  72. <a class="external free" href="https://officialsite.com">
  73. https://officialsite.com
  74. </a>
  75. </div>
  76. </div>
  77. <div>
  78. <ul class="wikibase-sitelinklistview-listview">
  79. <li data-wb-siteid="enwiki"><a href="http://en.wikipedia.org/wiki/Test">Test</a></li>
  80. </ul>
  81. </div>
  82. </div>
  83. """
  84. response = {"parse": {"displaytitle": title_html, "text": html}}
  85. results = wikidata.getDetail(response, "Q123", "yua", "yua_MX", etree.HTMLParser())
  86. self.assertEqual(len(results), 2)
  87. self.assertEqual(results[0]['title'], 'Official website')
  88. self.assertEqual(results[0]['url'], 'https://officialsite.com')
  89. self.assertEqual(results[1]['infobox'], 'Test')
  90. self.assertEqual(results[1]['id'], None)
  91. self.assertEqual(results[1]['content'], 'Description')
  92. self.assertEqual(results[1]['attributes'], [])
  93. self.assertEqual(results[1]['urls'][0]['title'], 'Official website')
  94. self.assertEqual(results[1]['urls'][0]['url'], 'https://officialsite.com')
  95. self.assertEqual(results[1]['urls'][1]['title'], 'Wikipedia (en)')
  96. self.assertEqual(results[1]['urls'][1]['url'], 'https://en.wikipedia.org/wiki/Test')
  97. def test_add_image(self):
  98. image_src = wikidata.add_image(fromstring("<div></div>"))
  99. self.assertEqual(image_src, None)
  100. html = u"""
  101. <div>
  102. <div id="P18">
  103. <div class="wikibase-statementgroupview-property-label">
  104. <a href="/wiki/Property:P18">
  105. image
  106. </a>
  107. </div>
  108. <div class="wikibase-statementlistview">
  109. <div class="wikibase-statementview listview-item">
  110. <div class="wikibase-statementview-rankselector">
  111. <span class="wikibase-rankselector-normal"></span>
  112. </div>
  113. <div class="wikibase-statementview-mainsnak">
  114. <div>
  115. <div class="wikibase-snakview-value">
  116. <div class="commons-media-caption">
  117. <a href="https://commons.wikimedia.org/wiki/File:image.png">image.png</a>
  118. <br/>2,687 &#215; 3,356; 1.22 MB
  119. </div>
  120. </div>
  121. </div>
  122. </div>
  123. </div>
  124. </div>
  125. </div>
  126. </div>
  127. """
  128. html_etree = fromstring(html)
  129. id_cache = wikidata.get_id_cache(html_etree)
  130. image_src = wikidata.add_image(id_cache)
  131. self.assertEqual(image_src,
  132. "https://commons.wikimedia.org/wiki/Special:FilePath/image.png?width=500&height=400")
  133. html = u"""
  134. <div>
  135. <div id="P2910">
  136. <div class="wikibase-statementgroupview-property-label">
  137. <a href="/wiki/Property:P2910">
  138. icon
  139. </a>
  140. </div>
  141. <div class="wikibase-statementlistview">
  142. <div class="wikibase-statementview listview-item">
  143. <div class="wikibase-statementview-rankselector">
  144. <span class="wikibase-rankselector-normal"></span>
  145. </div>
  146. <div class="wikibase-statementview-mainsnak">
  147. <div>
  148. <div class="wikibase-snakview-value">
  149. <div class="commons-media-caption">
  150. <a href="https://commons.wikimedia.org/wiki/File:icon.png">icon.png</a>
  151. <br/>671 &#215; 671; 18 KB</div>
  152. </div>
  153. </div>
  154. </div>
  155. </div>
  156. </div>
  157. </div>
  158. </div>
  159. <div id="P154">
  160. <div class="wikibase-statementgroupview-property-label">
  161. <a href="/wiki/Property:P154">
  162. logo
  163. </a>
  164. </div>
  165. <div class="wikibase-statementlistview">
  166. <div class="wikibase-statementview listview-item">
  167. <div class="wikibase-statementview-rankselector">
  168. <span class="wikibase-rankselector-normal"></span>
  169. </div>
  170. <div class="wikibase-statementview-mainsnak">
  171. <div>
  172. <div class="wikibase-snakview-value">
  173. <div class="commons-media-caption">
  174. <a href="https://commons.wikimedia.org/wiki/File:logo.png">logo.png</a>
  175. <br/>170 &#215; 170; 1 KB
  176. </div>
  177. </div>
  178. </div>
  179. </div>
  180. </div>
  181. </div>
  182. </div>
  183. </div>
  184. """
  185. html_etree = fromstring(html)
  186. id_cache = wikidata.get_id_cache(html_etree)
  187. image_src = wikidata.add_image(id_cache)
  188. self.assertEqual(image_src,
  189. "https://commons.wikimedia.org/wiki/Special:FilePath/logo.png?width=500&height=400")
  190. def test_add_attribute(self):
  191. html = u"""
  192. <div>
  193. <div id="P27">
  194. <div class="wikibase-statementgroupview-property-label">
  195. <a href="/wiki/Property:P27">
  196. country of citizenship
  197. </a>
  198. </div>
  199. <div class="wikibase-statementlistview">
  200. <div class="wikibase-statementview listview-item">
  201. <div class="wikibase-statementview-rankselector">
  202. <span class="wikibase-rankselector-normal"></span>
  203. </div>
  204. <div class="wikibase-statementview-mainsnak">
  205. <div>
  206. <div class="wikibase-snakview-value">
  207. <a href="/wiki/Q145">
  208. United Kingdom
  209. </a>
  210. </div>
  211. </div>
  212. </div>
  213. </div>
  214. </div>
  215. </div>
  216. </div>
  217. """
  218. attributes = []
  219. html_etree = fromstring(html)
  220. id_cache = wikidata.get_id_cache(html_etree)
  221. wikidata.add_attribute(attributes, id_cache, "Fail")
  222. self.assertEqual(attributes, [])
  223. wikidata.add_attribute(attributes, id_cache, "P27")
  224. self.assertEqual(len(attributes), 1)
  225. self.assertEqual(attributes[0]["label"], "Country of citizenship")
  226. self.assertEqual(attributes[0]["value"], "United Kingdom")
  227. html = u"""
  228. <div>
  229. <div id="P569">
  230. <div class="wikibase-statementgroupview-property-label">
  231. <a href="/wiki/Property:P569">
  232. date of birth
  233. </a>
  234. </div>
  235. <div class="wikibase-statementlistview">
  236. <div class="wikibase-statementview listview-item">
  237. <div class="wikibase-statementview-rankselector">
  238. <span class="wikibase-rankselector-normal"></span>
  239. </div>
  240. <div class="wikibase-statementview-mainsnak">
  241. <div>
  242. <div class="wikibase-snakview-value">
  243. 27 January 1832
  244. <sup class="wb-calendar-name">
  245. Gregorian
  246. </sup>
  247. </div>
  248. </div>
  249. </div>
  250. </div>
  251. </div>
  252. </div>
  253. </div>
  254. """
  255. attributes = []
  256. html_etree = fromstring(html)
  257. id_cache = wikidata.get_id_cache(html_etree)
  258. wikidata.add_attribute(attributes, id_cache, "P569", date=True)
  259. self.assertEqual(len(attributes), 1)
  260. self.assertEqual(attributes[0]["label"], "Date of birth")
  261. self.assertEqual(attributes[0]["value"], "27 January 1832")
  262. html = u"""
  263. <div>
  264. <div id="P6">
  265. <div class="wikibase-statementgroupview-property-label">
  266. <a href="/wiki/Property:P27">
  267. head of government
  268. </a>
  269. </div>
  270. <div class="wikibase-statementlistview">
  271. <div class="wikibase-statementview listview-item">
  272. <div class="wikibase-statementview-rankselector">
  273. <span class="wikibase-rankselector-normal"></span>
  274. </div>
  275. <div class="wikibase-statementview-mainsnak">
  276. <div>
  277. <div class="wikibase-snakview-value">
  278. <a href="/wiki/Q206">
  279. Old Prime Minister
  280. </a>
  281. </div>
  282. </div>
  283. </div>
  284. </div>
  285. <div class="wikibase-statementview listview-item">
  286. <div class="wikibase-statementview-rankselector">
  287. <span class="wikibase-rankselector-preferred"></span>
  288. </div>
  289. <div class="wikibase-statementview-mainsnak">
  290. <div>
  291. <div class="wikibase-snakview-value">
  292. <a href="/wiki/Q3099714">
  293. Actual Prime Minister
  294. </a>
  295. </div>
  296. </div>
  297. </div>
  298. </div>
  299. </div>
  300. </div>
  301. </div>
  302. """
  303. attributes = []
  304. html_etree = fromstring(html)
  305. id_cache = wikidata.get_id_cache(html_etree)
  306. wikidata.add_attribute(attributes, id_cache, "P6")
  307. self.assertEqual(len(attributes), 1)
  308. self.assertEqual(attributes[0]["label"], "Head of government")
  309. self.assertEqual(attributes[0]["value"], "Old Prime Minister, Actual Prime Minister")
  310. attributes = []
  311. html_etree = fromstring(html)
  312. id_cache = wikidata.get_id_cache(html_etree)
  313. wikidata.add_attribute(attributes, id_cache, "P6", trim=True)
  314. self.assertEqual(len(attributes), 1)
  315. self.assertEqual(attributes[0]["value"], "Actual Prime Minister")
  316. def test_add_url(self):
  317. html = u"""
  318. <div>
  319. <div id="P856">
  320. <div class="wikibase-statementgroupview-property-label">
  321. <a href="/wiki/Property:P856">
  322. official website
  323. </a>
  324. </div>
  325. <div class="wikibase-statementlistview">
  326. <div class="wikibase-statementview listview-item">
  327. <div class="wikibase-statementview-mainsnak">
  328. <div>
  329. <div class="wikibase-snakview-value">
  330. <a class="external free" href="https://searx.me">
  331. https://searx.me/
  332. </a>
  333. </div>
  334. </div>
  335. </div>
  336. </div>
  337. </div>
  338. </div>
  339. </div>
  340. """
  341. urls = []
  342. html_etree = fromstring(html)
  343. id_cache = wikidata.get_id_cache(html_etree)
  344. wikidata.add_url(urls, html_etree, id_cache, 'P856')
  345. self.assertEquals(len(urls), 1)
  346. self.assertIn({'title': 'Official website', 'url': 'https://searx.me/'}, urls)
  347. urls = []
  348. results = []
  349. wikidata.add_url(urls, html_etree, id_cache, 'P856', 'custom label', results=results)
  350. self.assertEquals(len(urls), 1)
  351. self.assertEquals(len(results), 1)
  352. self.assertIn({'title': 'custom label', 'url': 'https://searx.me/'}, urls)
  353. self.assertIn({'title': 'custom label', 'url': 'https://searx.me/'}, results)
  354. html = u"""
  355. <div>
  356. <div id="P856">
  357. <div class="wikibase-statementgroupview-property-label">
  358. <a href="/wiki/Property:P856">
  359. official website
  360. </a>
  361. </div>
  362. <div class="wikibase-statementlistview">
  363. <div class="wikibase-statementview listview-item">
  364. <div class="wikibase-statementview-mainsnak">
  365. <div>
  366. <div class="wikibase-snakview-value">
  367. <a class="external free" href="http://www.worldofwarcraft.com">
  368. http://www.worldofwarcraft.com
  369. </a>
  370. </div>
  371. </div>
  372. </div>
  373. </div>
  374. <div class="wikibase-statementview listview-item">
  375. <div class="wikibase-statementview-mainsnak">
  376. <div>
  377. <div class="wikibase-snakview-value">
  378. <a class="external free" href="http://eu.battle.net/wow/en/">
  379. http://eu.battle.net/wow/en/
  380. </a>
  381. </div>
  382. </div>
  383. </div>
  384. </div>
  385. </div>
  386. </div>
  387. </div>
  388. """
  389. urls = []
  390. html_etree = fromstring(html)
  391. id_cache = wikidata.get_id_cache(html_etree)
  392. wikidata.add_url(urls, html_etree, id_cache, 'P856')
  393. self.assertEquals(len(urls), 2)
  394. self.assertIn({'title': 'Official website', 'url': 'http://www.worldofwarcraft.com'}, urls)
  395. self.assertIn({'title': 'Official website', 'url': 'http://eu.battle.net/wow/en/'}, urls)
  396. def test_get_imdblink(self):
  397. html = u"""
  398. <div>
  399. <div class="wikibase-statementview-mainsnak">
  400. <div>
  401. <div class="wikibase-snakview-value">
  402. <a class="wb-external-id" href="http://www.imdb.com/tt0433664">
  403. tt0433664
  404. </a>
  405. </div>
  406. </div>
  407. </div>
  408. </div>
  409. """
  410. html_etree = fromstring(html)
  411. imdblink = wikidata.get_imdblink(html_etree, 'https://www.imdb.com/')
  412. html = u"""
  413. <div>
  414. <div class="wikibase-statementview-mainsnak">
  415. <div>
  416. <div class="wikibase-snakview-value">
  417. <a class="wb-external-id"
  418. href="href="http://tools.wmflabs.org/...http://www.imdb.com/&id=nm4915994"">
  419. nm4915994
  420. </a>
  421. </div>
  422. </div>
  423. </div>
  424. </div>
  425. """
  426. html_etree = fromstring(html)
  427. imdblink = wikidata.get_imdblink(html_etree, 'https://www.imdb.com/')
  428. self.assertIn('https://www.imdb.com/name/nm4915994', imdblink)
  429. def test_get_geolink(self):
  430. html = u"""
  431. <div>
  432. <div class="wikibase-statementview-mainsnak">
  433. <div>
  434. <div class="wikibase-snakview-value">
  435. 60°N, 40°E
  436. </div>
  437. </div>
  438. </div>
  439. </div>
  440. """
  441. html_etree = fromstring(html)
  442. geolink = wikidata.get_geolink(html_etree)
  443. self.assertIn('https://www.openstreetmap.org/', geolink)
  444. self.assertIn('lat=60&lon=40', geolink)
  445. html = u"""
  446. <div>
  447. <div class="wikibase-statementview-mainsnak">
  448. <div>
  449. <div class="wikibase-snakview-value">
  450. 34°35'59"S, 58°22'55"W
  451. </div>
  452. </div>
  453. </div>
  454. </div>
  455. """
  456. html_etree = fromstring(html)
  457. geolink = wikidata.get_geolink(html_etree)
  458. self.assertIn('https://www.openstreetmap.org/', geolink)
  459. self.assertIn('lat=-34.59', geolink)
  460. self.assertIn('lon=-58.38', geolink)
  461. def test_get_wikilink(self):
  462. html = """
  463. <div>
  464. <div>
  465. <ul class="wikibase-sitelinklistview-listview">
  466. <li data-wb-siteid="arwiki"><a href="http://ar.wikipedia.org/wiki/Test">Test</a></li>
  467. <li data-wb-siteid="enwiki"><a href="http://en.wikipedia.org/wiki/Test">Test</a></li>
  468. </ul>
  469. </div>
  470. <div>
  471. <ul class="wikibase-sitelinklistview-listview">
  472. <li data-wb-siteid="enwikiquote"><a href="https://en.wikiquote.org/wiki/Test">Test</a></li>
  473. </ul>
  474. </div>
  475. </div>
  476. """
  477. html_etree = fromstring(html)
  478. wikilink = wikidata.get_wikilink(html_etree, 'nowiki')
  479. self.assertEqual(wikilink, None)
  480. wikilink = wikidata.get_wikilink(html_etree, 'enwiki')
  481. self.assertEqual(wikilink, 'https://en.wikipedia.org/wiki/Test')
  482. wikilink = wikidata.get_wikilink(html_etree, 'arwiki')
  483. self.assertEqual(wikilink, 'https://ar.wikipedia.org/wiki/Test')
  484. wikilink = wikidata.get_wikilink(html_etree, 'enwikiquote')
  485. self.assertEqual(wikilink, 'https://en.wikiquote.org/wiki/Test')