annas_archive.py 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. # lint: pylint
  3. """Anna's Archive
  4. """
  5. from typing import List, Dict, Any, Optional
  6. from urllib.parse import quote
  7. from lxml import html
  8. from searx.utils import extract_text, eval_xpath, eval_xpath_list
  9. from searx.enginelib.traits import EngineTraits
  10. # about
  11. about: Dict[str, Any] = {
  12. "website": "https://annas-archive.org/",
  13. "wikidata_id": "Q115288326",
  14. "official_api_documentation": None,
  15. "use_official_api": False,
  16. "require_api_key": False,
  17. "results": "HTML",
  18. }
  19. # engine dependent config
  20. categories: List[str] = ["files"]
  21. paging: bool = False
  22. # search-url
  23. base_url: str = "https://annas-archive.org"
  24. # xpath queries
  25. xpath_results: str = '//main//a[starts-with(@href,"/md5")]'
  26. xpath_url: str = ".//@href"
  27. xpath_title: str = ".//h3/text()[1]"
  28. xpath_authors: str = './/div[contains(@class, "italic")]'
  29. xpath_publisher: str = './/div[contains(@class, "text-sm")]'
  30. xpath_file_info: str = './/div[contains(@class, "text-xs")]'
  31. def request(query, params: Dict[str, Any]) -> Dict[str, Any]:
  32. search_url: str = base_url + "/search?q={search_query}&lang={lang}"
  33. lang: str = ""
  34. if params["language"] != "all":
  35. lang = params["language"]
  36. params["url"] = search_url.format(search_query=quote(query), lang=lang)
  37. return params
  38. def response(resp) -> List[Dict[str, Optional[str]]]:
  39. results: List[Dict[str, Optional[str]]] = []
  40. dom = html.fromstring(resp.text)
  41. for item in dom.xpath(xpath_results):
  42. result: Dict[str, Optional[str]] = {}
  43. result["url"] = base_url + item.xpath(xpath_url)[0]
  44. result["title"] = extract_text(eval_xpath(item, xpath_title))
  45. result["content"] = "{publisher}. {authors}. {file_info}".format(
  46. authors=extract_text(eval_xpath(item, xpath_authors)),
  47. publisher=extract_text(eval_xpath(item, xpath_publisher)),
  48. file_info=extract_text(eval_xpath(item, xpath_file_info)),
  49. )
  50. results.append(result)
  51. return results
  52. def fetch_traits(engine_traits: EngineTraits):
  53. """Fetch languages and other search arguments from Anna's search form."""
  54. # pylint: disable=import-outside-toplevel
  55. import babel
  56. from searx.network import get # see https://github.com/searxng/searxng/issues/762
  57. from searx.locales import language_tag
  58. engine_traits.all_locale = ''
  59. engine_traits.custom['content'] = []
  60. engine_traits.custom['ext'] = []
  61. engine_traits.custom['sort'] = []
  62. resp = get(base_url + '/search')
  63. if not resp.ok: # type: ignore
  64. raise RuntimeError("Response from Anna's search page is not OK.")
  65. dom = html.fromstring(resp.text) # type: ignore
  66. # supported language codes
  67. lang_map = {}
  68. for x in eval_xpath_list(dom, "//form//select[@name='lang']//option"):
  69. eng_lang = x.get("value")
  70. if eng_lang in ('', '_empty', 'nl-BE', 'und'):
  71. continue
  72. try:
  73. locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-')
  74. except babel.UnknownLocaleError:
  75. # silently ignore unknown languages
  76. # print("ERROR: %s -> %s is unknown by babel" % (x.get("data-name"), eng_lang))
  77. continue
  78. sxng_lang = language_tag(locale)
  79. conflict = engine_traits.languages.get(sxng_lang)
  80. if conflict:
  81. if conflict != eng_lang:
  82. print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang))
  83. continue
  84. engine_traits.languages[sxng_lang] = eng_lang
  85. for x in eval_xpath_list(dom, "//form//select[@name='content']//option"):
  86. engine_traits.custom['content'].append(x.get("value"))
  87. for x in eval_xpath_list(dom, "//form//select[@name='ext']//option"):
  88. engine_traits.custom['ext'].append(x.get("value"))
  89. for x in eval_xpath_list(dom, "//form//select[@name='sort']//option"):
  90. engine_traits.custom['sort'].append(x.get("value"))