dictzone.py

# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Dictzone
"""

import urllib.parse

from lxml import html

from searx.utils import eval_xpath, extract_text
from searx.result_types import EngineResults
from searx.network import get as http_get  # https://github.com/searxng/searxng/issues/762

# about
about = {
    "website": 'https://dictzone.com/',
    "wikidata_id": None,
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

engine_type = 'online_dictionary'
categories = ['general', 'translate']

base_url = "https://dictzone.com"
weight = 100
https_support = True


def request(query, params):  # pylint: disable=unused-argument
    from_lang = params["from_lang"][2]  # "english"
    to_lang = params["to_lang"][2]  # "german"
    query = params["query"]

    params["url"] = f"{base_url}/{from_lang}-{to_lang}-dictionary/{urllib.parse.quote_plus(query)}"
    return params
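
# A minimal sketch of how request() is driven (hedged: the exact tuple layout
# of params["from_lang"] / params["to_lang"] is filled in by SearXNG's
# 'online_dictionary' processor; (None, 'en', 'english') is an assumption
# based on the [1]/[2] indexing used in this file):
#
#   params = {"from_lang": (None, "en", "english"),
#             "to_lang": (None, "de", "german"),
#             "query": "hello world"}
#   request("hello world", params)
#   assert params["url"] == "https://dictzone.com/english-german-dictionary/hello+world"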


def _clean_up_node(node):
    """Remove child <i>, <span> and <button> elements so that extract_text()
    returns only the dictionary entry's own text."""
    for x in ["./i", "./span", "./button"]:
        for n in node.xpath(x):
            n.getparent().remove(n)


def response(resp) -> EngineResults:
    results = EngineResults()

    item_list = []

    if not resp.ok:
        return results

    dom = html.fromstring(resp.text)

    for result in eval_xpath(dom, ".//table[@id='r']//tr"):
        # each row is a Translations.Item
        td_list = result.xpath("./td")
        if len(td_list) != 2:
            # ignore header rows ("tr/th" has no "td" cells)
            continue

        col_from, col_to = td_list
        _clean_up_node(col_from)

        text = f"{extract_text(col_from)}"

        synonyms = []
        p_list = col_to.xpath(".//p")

        for i, p_item in enumerate(p_list):
            # read the usage sample first: _clean_up_node() removes the
            # <i class='smpl'> element it is taken from
            smpl: str = extract_text(p_item.xpath("./i[@class='smpl']"))  # type: ignore
            _clean_up_node(p_item)
            p_text: str = extract_text(p_item)  # type: ignore

            if smpl:
                p_text += " // " + smpl

            if i == 0:
                # the first <p> is the main translation, the rest are synonyms
                text += f" : {p_text}"
                continue

            synonyms.append(p_text)

        item = results.types.Translations.Item(text=text, synonyms=synonyms)
        item_list.append(item)
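
    # The markup assumed by the loop above (a sketch inferred from the XPath
    # expressions in this file, not from dictzone's documented output; the
    # entry texts are illustrative):
    #
    #   <table id="r">
    #     <tr><th>english</th><th>german</th></tr>
    #     <tr>
    #       <td>hello</td>
    #       <td><p>hallo <i class="smpl">...</i></p><p>servus</p></td>
    #     </tr>
    #   </table>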
  63. # the "autotranslate" of dictzone is loaded by the JS from URL:
  64. # https://dictzone.com/trans/hello%20world/en_de
  65. from_lang = resp.search_params["from_lang"][1] # "en"
  66. to_lang = resp.search_params["to_lang"][1] # "de"
  67. query = resp.search_params["query"]
  68. # works only sometimes?
  69. autotranslate = http_get(f"{base_url}/trans/{query}/{from_lang}_{to_lang}", timeout=1.0)
  70. if autotranslate.ok and autotranslate.text:
  71. item_list.insert(0, results.types.Translations.Item(text=autotranslate.text))
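
    # Sketch of the call above (hedged: the plain-text reply is inferred from
    # the direct use of autotranslate.text, not from a documented API; the
    # reply shown is illustrative):
    #
    #   GET https://dictzone.com/trans/hello%20world/en_de  ->  "Hallo Welt"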

    if item_list:
        results.add(results.types.Translations(translations=item_list, url=resp.search_params["url"]))

    return results