mullvad_leta.py 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """This is the implementation of the Mullvad-Leta meta-search engine.
  3. This engine **REQUIRES** that SearXNG operates within a Mullvad VPN.
  4. If using Docker, consider using gluetun to connect easily to the Mullvad VPN:
  5. - https://github.com/qdm12/gluetun
  6. Otherwise, follow instructions provided by Mullvad for enabling the VPN on Linux
  7. - https://mullvad.net/en/help/install-mullvad-app-linux
  8. .. hint::
  9. The :py:obj:`EngineTraits` is empty by default. Maintainers have to run
  10. ``make data.traits`` (in the Mullvad VPN / :py:obj:`fetch_traits`) and rebase
  11. the modified JSON file ``searx/data/engine_traits.json`` on every single
  12. update of SearXNG!
  13. """
  14. from __future__ import annotations
  15. from typing import TYPE_CHECKING
  16. from httpx import Response
  17. from lxml import html
  18. from searx.enginelib.traits import EngineTraits
  19. from searx.locales import region_tag, get_official_locales
  20. from searx.utils import eval_xpath, extract_text, eval_xpath_list
  21. from searx.exceptions import SearxEngineResponseException
# ``logger`` is only declared here for static type checkers; this branch does
# not execute at runtime.
# NOTE(review): presumably the SearXNG engine loader injects ``logger`` into
# this module at runtime -- confirm against the loader.
if TYPE_CHECKING:
    import logging

    logger = logging.getLogger()

# Annotation only, no value is assigned in this module.
# NOTE(review): presumably set by the engine loader as well -- confirm.
traits: EngineTraits

# When true, ``request`` adds the form field ``oc=on``.
use_cache: bool = True  # non-cache use only has 100 searches per day!
# Backend requested from Leta; ``request`` falls back to the first entry of
# ``available_leta_engines`` when this value is not in that list.
leta_engine: str = 'google'
# URL the search form is POSTed to (also used by ``fetch_traits``).
search_url = "https://leta.mullvad.net"

# about
about = {
    "website": search_url,
    "wikidata_id": 'Q47008412',  # the Mullvad id - not leta, but related
    "official_api_documentation": 'https://leta.mullvad.net/faq',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ['general', 'web']
paging = True
max_page = 50
time_range_support = True
# Maps the ``time_range`` value found in the request params to the value that
# ``request`` stores under ``dateRestrict``.
time_range_dict = {
    "day": "d1",
    "week": "w1",
    "month": "m1",
    "year": "y1",
}

# Values accepted for ``leta_engine``.
available_leta_engines = [
    'google',  # first will be default if provided engine is invalid
    'brave',
]
  53. def is_vpn_connected(dom: html.HtmlElement) -> bool:
  54. """Returns true if the VPN is connected, False otherwise"""
  55. connected_text = extract_text(eval_xpath(dom, '//main/div/p[1]'))
  56. return connected_text != 'You are not connected to Mullvad VPN.'
  57. def assign_headers(headers: dict) -> dict:
  58. """Assigns the headers to make a request to Mullvad Leta"""
  59. headers['Accept'] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"
  60. headers['Content-Type'] = "application/x-www-form-urlencoded"
  61. headers['Host'] = "leta.mullvad.net"
  62. headers['Origin'] = "https://leta.mullvad.net"
  63. return headers
  64. def request(query: str, params: dict):
  65. country = traits.get_region(params.get('searxng_locale', 'all'), traits.all_locale) # type: ignore
  66. result_engine = leta_engine
  67. if leta_engine not in available_leta_engines:
  68. result_engine = available_leta_engines[0]
  69. logger.warning(
  70. 'Configured engine "%s" not one of the available engines %s, defaulting to "%s"',
  71. leta_engine,
  72. available_leta_engines,
  73. result_engine,
  74. )
  75. params['url'] = search_url
  76. params['method'] = 'POST'
  77. params['data'] = {
  78. "q": query,
  79. "gl": country if country is str else '',
  80. 'engine': result_engine,
  81. }
  82. # pylint: disable=undefined-variable
  83. if use_cache:
  84. params['data']['oc'] = "on"
  85. # pylint: enable=undefined-variable
  86. if params['time_range'] in time_range_dict:
  87. params['dateRestrict'] = time_range_dict[params['time_range']]
  88. else:
  89. params['dateRestrict'] = ''
  90. if params['pageno'] > 1:
  91. # Page 1 is n/a, Page 2 is 11, page 3 is 21, ...
  92. params['data']['start'] = ''.join([str(params['pageno'] - 1), "1"])
  93. if params['headers'] is None:
  94. params['headers'] = {}
  95. assign_headers(params['headers'])
  96. return params
  97. def extract_result(dom_result: list[html.HtmlElement]):
  98. # Infoboxes sometimes appear in the beginning and will have a length of 0
  99. if len(dom_result) == 3:
  100. [a_elem, h3_elem, p_elem] = dom_result
  101. elif len(dom_result) == 4:
  102. [_, a_elem, h3_elem, p_elem] = dom_result
  103. else:
  104. return None
  105. return {
  106. 'url': extract_text(a_elem.text),
  107. 'title': extract_text(h3_elem),
  108. 'content': extract_text(p_elem),
  109. }
  110. def extract_results(search_results: html.HtmlElement):
  111. for search_result in search_results:
  112. dom_result = eval_xpath_list(search_result, 'div/div/*')
  113. result = extract_result(dom_result)
  114. if result is not None:
  115. yield result
  116. def response(resp: Response):
  117. """Checks if connected to Mullvad VPN, then extracts the search results from
  118. the DOM resp: requests response object"""
  119. dom = html.fromstring(resp.text)
  120. if not is_vpn_connected(dom):
  121. raise SearxEngineResponseException('Not connected to Mullvad VPN')
  122. search_results = eval_xpath(dom.body, '//main/div[2]/div')
  123. return list(extract_results(search_results))
  124. def fetch_traits(engine_traits: EngineTraits):
  125. """Fetch languages and regions from Mullvad-Leta
  126. .. warning::
  127. Fetching the engine traits also requires a Mullvad VPN connection. If
  128. not connected, then an error message will print and no traits will be
  129. updated.
  130. """
  131. # pylint: disable=import-outside-toplevel
  132. # see https://github.com/searxng/searxng/issues/762
  133. from searx.network import post as http_post
  134. # pylint: enable=import-outside-toplevel
  135. resp = http_post(search_url, headers=assign_headers({}))
  136. if not isinstance(resp, Response):
  137. print("ERROR: failed to get response from mullvad-leta. Are you connected to the VPN?")
  138. return
  139. if not resp.ok:
  140. print("ERROR: response from mullvad-leta is not OK. Are you connected to the VPN?")
  141. return
  142. dom = html.fromstring(resp.text)
  143. if not is_vpn_connected(dom):
  144. print('ERROR: Not connected to Mullvad VPN')
  145. return
  146. # supported region codes
  147. options = eval_xpath_list(dom.body, '//main/div/form/div[2]/div/select[1]/option')
  148. if options is None or len(options) <= 0:
  149. print('ERROR: could not find any results. Are you connected to the VPN?')
  150. for x in options:
  151. eng_country = x.get("value")
  152. sxng_locales = get_official_locales(eng_country, engine_traits.languages.keys(), regional=True)
  153. if not sxng_locales:
  154. print(
  155. "ERROR: can't map from Mullvad-Leta country %s (%s) to a babel region."
  156. % (x.get('data-name'), eng_country)
  157. )
  158. continue
  159. for sxng_locale in sxng_locales:
  160. engine_traits.regions[region_tag(sxng_locale)] = eng_country