sjp.py 2.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """Słownik Języka Polskiego
  3. Dictionary of the polish language from PWN (sjp.pwn)
  4. """
  5. from lxml.html import fromstring
  6. from searx import logger
  7. from searx.utils import extract_text
  8. from searx.network import raise_for_httperror
# Engine-specific child logger (used by request() for query-URL debugging).
logger = logger.getChild('sjp engine')

# about — engine metadata displayed in the SearXNG preferences page.
about = {
    "website": 'https://sjp.pwn.pl',
    "wikidata_id": 'Q55117369',
    "official_api_documentation": None,
    "use_official_api": False,  # results are scraped from the HTML pages
    "require_api_key": False,
    "results": 'HTML',
    "language": 'pl',
}

categories = ['dictionaries']
paging = False  # the search page has no pagination to drive

# Base site URL and the search-page template filled in by request().
URL = 'https://sjp.pwn.pl'
SEARCH_URL = URL + '/szukaj/{query}.html'

# XPath for the headword echoed back on the result page.
word_xpath = '//div[@class="query"]'
# Result containers, one per source dictionary on sjp.pwn.pl
# (SJP "so", plain SJP, and the Doroszewski dictionary).
dict_xpath = [
    '//div[@class="wyniki sjp-so-wyniki sjp-so-anchor"]',
    '//div[@class="wyniki sjp-wyniki sjp-anchor"]',
    '//div[@class="wyniki sjp-doroszewski-wyniki sjp-doroszewski-anchor"]',
]
  30. def request(query, params):
  31. params['url'] = SEARCH_URL.format(query=query)
  32. logger.debug(f"query_url --> {params['url']}")
  33. return params
  34. def response(resp):
  35. results = []
  36. raise_for_httperror(resp)
  37. dom = fromstring(resp.text)
  38. word = extract_text(dom.xpath(word_xpath))
  39. definitions = []
  40. for dict_src in dict_xpath:
  41. for src in dom.xpath(dict_src):
  42. src_text = extract_text(src.xpath('.//span[@class="entry-head-title"]/text()')).strip()
  43. src_defs = []
  44. for def_item in src.xpath('.//div[contains(@class, "ribbon-element")]'):
  45. if def_item.xpath('./div[@class="znacz"]'):
  46. sub_defs = []
  47. for def_sub_item in def_item.xpath('./div[@class="znacz"]'):
  48. def_sub_text = extract_text(def_sub_item).lstrip('0123456789. ')
  49. sub_defs.append(def_sub_text)
  50. src_defs.append((word, sub_defs))
  51. else:
  52. def_text = extract_text(def_item).strip()
  53. def_link = def_item.xpath('./span/a/@href')
  54. if 'doroszewski' in def_link[0]:
  55. def_text = f"<a href='{def_link[0]}'>{def_text}</a>"
  56. src_defs.append((def_text, ''))
  57. definitions.append((src_text, src_defs))
  58. if not definitions:
  59. return results
  60. infobox = ''
  61. for src in definitions:
  62. infobox += f"<div><small>{src[0]}</small>"
  63. infobox += "<ul>"
  64. for def_text, sub_def in src[1]:
  65. infobox += f"<li>{def_text}</li>"
  66. if sub_def:
  67. infobox += "<ol>"
  68. for sub_def_text in sub_def:
  69. infobox += f"<li>{sub_def_text}</li>"
  70. infobox += "</ol>"
  71. infobox += "</ul></div>"
  72. results.append(
  73. {
  74. 'infobox': word,
  75. 'content': infobox,
  76. }
  77. )
  78. return results