# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Słownik Języka Polskiego
Dictionary of the polish language from PWN (sjp.pwn)
"""
from urllib.parse import quote

from lxml.html import fromstring

from searx import logger
from searx.network import raise_for_httperror
from searx.utils import extract_text
# Engine-scoped child logger so log lines are tagged with this engine's name
logger = logger.getChild('sjp engine')
  11. # about
  12. about = {
  13. "website": 'https://sjp.pwn.pl',
  14. "wikidata_id": 'Q55117369',
  15. "official_api_documentation": None,
  16. "use_official_api": False,
  17. "require_api_key": False,
  18. "results": 'HTML',
  19. "language": 'pl',
  20. }
  21. categories = ['dictionaries']
  22. paging = False
  23. URL = 'https://sjp.pwn.pl'
  24. SEARCH_URL = URL + '/szukaj/{query}.html'
  25. word_xpath = '//div[@class="query"]'
  26. dict_xpath = [
  27. '//div[@class="wyniki sjp-so-wyniki sjp-so-anchor"]',
  28. '//div[@class="wyniki sjp-wyniki sjp-anchor"]',
  29. '//div[@class="wyniki sjp-doroszewski-wyniki sjp-doroszewski-anchor"]',
  30. ]
  31. def request(query, params):
  32. params['url'] = SEARCH_URL.format(query=query)
  33. logger.debug(f"query_url --> {params['url']}")
  34. return params
  35. def response(resp):
  36. results = []
  37. raise_for_httperror(resp)
  38. dom = fromstring(resp.text)
  39. word = extract_text(dom.xpath(word_xpath))
  40. definitions = []
  41. for dict_src in dict_xpath:
  42. for src in dom.xpath(dict_src):
  43. src_text = extract_text(src.xpath('.//span[@class="entry-head-title"]/text()')).strip()
  44. src_defs = []
  45. for def_item in src.xpath('.//div[contains(@class, "ribbon-element")]'):
  46. if def_item.xpath('./div[@class="znacz"]'):
  47. sub_defs = []
  48. for def_sub_item in def_item.xpath('./div[@class="znacz"]'):
  49. def_sub_text = extract_text(def_sub_item).lstrip('0123456789. ')
  50. sub_defs.append(def_sub_text)
  51. src_defs.append((word, sub_defs))
  52. else:
  53. def_text = extract_text(def_item).strip()
  54. def_link = def_item.xpath('./span/a/@href')
  55. if 'doroszewski' in def_link[0]:
  56. def_text = f"<a href='{def_link[0]}'>{def_text}</a>"
  57. src_defs.append((def_text, ''))
  58. definitions.append((src_text, src_defs))
  59. if not definitions:
  60. return results
  61. infobox = ''
  62. for src in definitions:
  63. infobox += f"<div><small>{src[0]}</small>"
  64. infobox += "<ul>"
  65. for def_text, sub_def in src[1]:
  66. infobox += f"<li>{def_text}</li>"
  67. if sub_def:
  68. infobox += "<ol>"
  69. for sub_def_text in sub_def:
  70. infobox += f"<li>{sub_def_text}</li>"
  71. infobox += "</ol>"
  72. infobox += "</ul></div>"
  73. results.append(
  74. {
  75. 'infobox': word,
  76. 'content': infobox,
  77. }
  78. )
  79. return results