sjp.py

# SPDX-License-Identifier: AGPL-3.0-or-later
"""Słownik Języka Polskiego (sjp.pwn.pl), a general Polish dictionary.
"""

from lxml.html import fromstring

from searx import logger
from searx.utils import extract_text
from searx.raise_for_httperror import raise_for_httperror

logger = logger.getChild('sjp engine')

# about
about = {
    "website": 'https://sjp.pwn.pl',
    "wikidata_id": 'Q55117369',
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

categories = ['general']
paging = False

URL = 'https://sjp.pwn.pl'
SEARCH_URL = URL + '/szukaj/{query}.html'


def request(query, params):
    params['url'] = SEARCH_URL.format(query=query)
    logger.debug(f"query_url --> {params['url']}")
    return params


def response(resp):
    results = []

    raise_for_httperror(resp)
    dom = fromstring(resp.text)
    word = extract_text(dom.xpath('//*[@id="content"]/div/div[1]/div/div[1]/div[1]/div[2]/div/div/div[2]/div/div'))

    definitions = []
    for src in dom.xpath('//*[@id="content"]/div/div[1]/div/div[1]/div[1]/div[2]/div/div/div/div/div/div'):
        src_text = extract_text(src.xpath('./h1/span[@class="entry-head-title"]/text()')).strip()

        src_defs = []
        for def_item in src.xpath('./div/div[contains(@class, "ribbon-element")]'):
            if def_item.xpath('./div[@class="znacz"]'):
                # entry with numbered sub-definitions (div.znacz, from Polish
                # "znaczenie", meaning); strip the leading "1.", "2.", ... numbering
                sub_defs = []
                for def_sub_item in def_item.xpath('./div[@class="znacz"]'):
                    def_sub_text = extract_text(def_sub_item).lstrip('0123456789. ')
                    sub_defs.append(def_sub_text)
                src_defs.append((word, sub_defs))
            else:
                # single, unnumbered definition
                def_text = extract_text(def_item).strip()
                src_defs.append((def_text, ''))
        definitions.append((src_text, src_defs))

    if not definitions:
        return results

    # render all collected entries as a single HTML infobox
    infobox = ''
    for src in definitions:
        infobox += f"<div><small>{src[0]}</small>"
        infobox += "<ul>"
        for (def_text, sub_def) in src[1]:
            infobox += f"<li>{def_text}</li>"
            if sub_def:
                infobox += "<ol>"
                for sub_def_text in sub_def:
                    infobox += f"<li>{sub_def_text}</li>"
                infobox += "</ol>"
        infobox += "</ul></div>"

    results.append({
        'infobox': word,
        'content': infobox,
    })

    return results
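

if __name__ == '__main__':
    # Rough usage sketch, not part of the upstream engine: it only exercises
    # request() to show which URL would be fetched.  The query is substituted
    # into SEARCH_URL verbatim, so searx is expected to hand it over already
    # URL-encoded; 'test' and demo_params are made-up example values.
    demo_params = request('test', {})
    print(demo_params['url'])  # -> https://sjp.pwn.pl/szukaj/test.html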