mullvad_leta.py

# SPDX-License-Identifier: AGPL-3.0-or-later
"""Mullvad Leta is a search engine proxy.  Currently Leta only offers text
search results, not image, news or any other type of search result.  Leta acts
as a proxy to Google and Brave search results.  You can select which backend
search engine you wish to use, see (:py:obj:`leta_engine`).

.. hint::

   Leta caches each search for up to 30 days.  For example, if you use search
   terms like ``news``, contrary to your intention you'll get very old results!


Configuration
=============

The engine has the following additional settings:

- :py:obj:`leta_engine` (:py:obj:`LetaEnginesType`)

You can configure one Leta engine for Google and one for Brave:

.. code:: yaml

  - name: mullvadleta
    engine: mullvad_leta
    leta_engine: google
    shortcut: ml

  - name: mullvadleta brave
    engine: mullvad_leta
    network: mullvadleta  # use network from engine "mullvadleta" configured above
    leta_engine: brave
    shortcut: mlb

Implementations
===============

"""

from __future__ import annotations

import typing
from urllib.parse import urlencode

import babel
from httpx import Response
from lxml import html

from searx.enginelib.traits import EngineTraits
from searx.locales import get_official_locales, language_tag, region_tag
from searx.utils import eval_xpath_list
from searx.result_types import EngineResults, MainResult

if typing.TYPE_CHECKING:
    import logging

    logger = logging.getLogger()

traits: EngineTraits

search_url = "https://leta.mullvad.net"

# about
about = {
    "website": search_url,
    "wikidata_id": 'Q47008412',  # the Mullvad id - not leta, but related
    "official_api_documentation": 'https://leta.mullvad.net/faq',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ["general", "web"]
paging = True
max_page = 10
time_range_support = True
time_range_dict = {
    "day": "d",
    "week": "w",
    "month": "m",
    "year": "y",
}

LetaEnginesType = typing.Literal["google", "brave"]
"""Engine types supported by mullvadleta."""

leta_engine: LetaEnginesType = "google"
"""Select Leta's engine type from :py:obj:`LetaEnginesType`."""


def init(_):
    l = typing.get_args(LetaEnginesType)
    if leta_engine not in l:
        raise ValueError(f"leta_engine '{leta_engine}' is invalid, use one of {', '.join(l)}")
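
# Illustrative failure mode (the engine name below is hypothetical, not from
# the source): a settings.yml entry such as
#
#   leta_engine: duckduckgo
#
# makes init() raise at startup with something like
#   ValueError: leta_engine 'duckduckgo' is invalid, use one of google, brave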


class DataNodeQueryMetaDataIndices(typing.TypedDict):
    """Indices into query metadata."""

    success: int
    q: int  # pylint: disable=invalid-name
    country: int
    language: int
    lastUpdated: int
    engine: int
    items: int
    infobox: int
    news: int
    timestamp: int
    altered: int
    page: int
    next: int  # if -1, no more results are available
    previous: int


class DataNodeResultIndices(typing.TypedDict):
    """Indices into query results data."""

    link: int
    snippet: int
    title: int
    favicon: int
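
# Illustrative sketch of the flattened node layout (hypothetical values, not
# actual API output): every field in the index dicts above points back into
# the same flat "data" list, e.g.
#
#   data = [
#       {"q": 1, "items": 2, ...},          # DataNodeQueryMetaDataIndices
#       "example query",                    # data[1] -> the query string
#       [3, ...],                           # data[2] -> indices of result dicts
#       {"link": 4, "title": 5, "snippet": 6, "favicon": 7},  # DataNodeResultIndices
#       "https://example.org/",             # data[4]
#       "Example title",                    # data[5]
#       "Example snippet",                  # data[6]
#       ...
#   ]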


def request(query: str, params: dict):
    params["method"] = "GET"
    args = {
        "q": query,
        "engine": leta_engine,
        "x-sveltekit-invalidated": "001",  # hardcoded from all requests seen
    }

    country = traits.get_region(params.get("searxng_locale"), traits.all_locale)  # type: ignore
    if country:
        args["country"] = country

    language = traits.get_language(params.get("searxng_locale"), traits.all_locale)  # type: ignore
    if language:
        args["language"] = language

    if params["time_range"] in time_range_dict:
        args["lastUpdated"] = time_range_dict[params["time_range"]]

    if params["pageno"] > 1:
        args["page"] = params["pageno"]

    params["url"] = f"{search_url}/search/__data.json?{urlencode(args)}"
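    # Illustrative request URL (assumed from the args built above, not captured
    # from live traffic): a query "example" with the default google backend
    # yields roughly
    #   https://leta.mullvad.net/search/__data.json?q=example&engine=google&x-sveltekit-invalidated=001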
    return params


def response(resp: Response) -> EngineResults:
    json_response = resp.json()

    nodes = json_response["nodes"]
    # 0: is None
    # 1: has "connected=True", not useful
    # 2: query results within "data"
    data_nodes = nodes[2]["data"]

    # Instead of a nested object structure, all objects are flattened into a
    # list.  The first object in data_nodes provides indices into "data_nodes"
    # to access each search result (which is itself an object of more indices).
    #
    # Read the related TypedDict definitions for details.
    query_meta_data: DataNodeQueryMetaDataIndices = data_nodes[0]
    query_items_indices = query_meta_data["items"]

    results = EngineResults()
    for idx in data_nodes[query_items_indices]:
        query_item_indices: DataNodeResultIndices = data_nodes[idx]
        results.add(
            MainResult(
                url=data_nodes[query_item_indices["link"]],
                title=data_nodes[query_item_indices["title"]],
                content=data_nodes[query_item_indices["snippet"]],
            )
        )
    return results


def fetch_traits(engine_traits: EngineTraits) -> None:
    """Fetch languages and regions from Mullvad-Leta."""

    def extract_table_data(table):
        for row in table.xpath(".//tr")[2:]:
            cells = row.xpath(".//td | .//th")  # includes headers and data
            if len(cells) > 1:  # ensure the column exists
                cell0 = cells[0].text_content().strip()
                cell1 = cells[1].text_content().strip()
                yield [cell0, cell1]
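
    # Illustrative row (hypothetical, not taken from the live documentation
    # page): a table row such as
    #   <tr><td>Germany</td><td>de</td></tr>
    # makes extract_table_data() yield the pair ["Germany", "de"].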

    # pylint: disable=import-outside-toplevel
    # see https://github.com/searxng/searxng/issues/762
    from searx.network import get as http_get

    # pylint: enable=import-outside-toplevel
    resp = http_get(f"{search_url}/documentation")
    if not isinstance(resp, Response):
        print("ERROR: failed to get response from mullvad-leta. Are you connected to the VPN?")
        return
    if not resp.ok:
        print("ERROR: response from mullvad-leta is not OK. Are you connected to the VPN?")
        return

    dom = html.fromstring(resp.text)

    # There are 4 HTML tables on the documentation page for extracting information:
    # 0. Keyboard Shortcuts
    # 1. Query Parameters (shoutout to Mullvad for accessible docs for integration)
    # 2. Country Codes [Country, Code]
    # 3. Language Codes [Language, Code]
    tables = eval_xpath_list(dom.body, "//table")
    if tables is None or len(tables) <= 0:
        print("ERROR: could not find any tables. Was the page updated?")
        return

    language_table = tables[3]
    lang_map = {
        "zh-hant": "zh_Hant",
        "zh-hans": "zh_Hans",
        "jp": "ja",
    }
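    # The map above normalizes Leta's language codes to tags babel understands;
    # for example "jp" is not a valid language subtag, babel expects "ja"
    # (assumption: the remaining codes on the page parse as-is).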

    for language, code in extract_table_data(language_table):
        locale_tag = lang_map.get(code, code).replace("-", "_")  # type: ignore
        try:
            locale = babel.Locale.parse(locale_tag)
        except babel.UnknownLocaleError:
            print(f"ERROR: Mullvad-Leta language {language} ({code}) is unknown by babel")
            continue
        sxng_tag = language_tag(locale)
        engine_traits.languages[sxng_tag] = code

    country_table = tables[2]
    country_map = {
        "cn": "zh-CN",
        "hk": "zh-HK",
        "jp": "ja-JP",
        "my": "ms-MY",
        "tw": "zh-TW",
        "uk": "en-GB",
        "us": "en-US",
    }
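    # Region resolution below falls back in three steps (the example values are
    # hypothetical, not verified against the live country table):
    #   1. country_map lookup, e.g. "us" -> "en-US"
    #   2. babel parse of "<code>_<CODE>", e.g. "de" -> "de_DE" -> region "de-DE"
    #   3. get_official_locales() as a last resort for codes babel cannot parse
    #      directly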

    for country, code in extract_table_data(country_table):
        sxng_tag = country_map.get(code)
        if sxng_tag:
            engine_traits.regions[sxng_tag] = code
            continue

        try:
            locale = babel.Locale.parse(f"{code.lower()}_{code.upper()}")
        except babel.UnknownLocaleError:
            locale = None

        if locale:
            engine_traits.regions[region_tag(locale)] = code
            continue

        official_locales = get_official_locales(code, engine_traits.languages.keys(), regional=True)
        if not official_locales:
            print(f"ERROR: Mullvad-Leta country '{code}' ({country}) could not be mapped as expected.")
            continue

        for locale in official_locales:
            engine_traits.regions[region_tag(locale)] = code