# -*- coding: utf-8 -*-
import os
import sys
import csv
import hashlib
import hmac
import re
import json

from codecs import getincrementalencoder
from imp import load_source
from numbers import Number
from os.path import splitext, join
from io import open, StringIO
from random import choice
from html.parser import HTMLParser

from lxml.etree import XPath
from babel.core import get_global
from babel.dates import format_date

from searx import settings
from searx import logger
from searx.version import VERSION_STRING
from searx.languages import language_codes

logger = logger.getChild('utils')

blocked_tags = ('script',
                'style')

ecma_unescape4_re = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
ecma_unescape2_re = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)

useragents = json.loads(open(os.path.dirname(os.path.realpath(__file__))
                             + "/data/useragents.json", 'r', encoding='utf-8').read())

xpath_cache = dict()
lang_to_lc_cache = dict()


def searx_useragent():
    return 'searx/{searx_version} {suffix}'.format(
        searx_version=VERSION_STRING,
        suffix=settings['outgoing'].get('useragent_suffix', ''))


def gen_useragent(os=None):
    return str(useragents['ua'].format(os=os or choice(useragents['os']),
                                       version=choice(useragents['versions'])))


def highlight_content(content, query):
    if not content:
        return None

    # ignoring html contents
    # TODO better html content detection
    if content.find('<') != -1:
        return content

    if content.lower().find(query.lower()) > -1:
        query_regex = '({0})'.format(re.escape(query))
        content = re.sub(query_regex, '<span class="highlight">\\1</span>',
                         content, flags=re.I | re.U)
    else:
        regex_parts = []
        for chunk in query.split():
            if len(chunk) == 1:
                regex_parts.append('\\W+{0}\\W+'.format(re.escape(chunk)))
            else:
                regex_parts.append('{0}'.format(re.escape(chunk)))
        query_regex = '({0})'.format('|'.join(regex_parts))
        content = re.sub(query_regex, '<span class="highlight">\\1</span>',
                         content, flags=re.I | re.U)

    return content
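
# Illustrative example (not part of the original source): when the query occurs
# verbatim in a plain-text snippet, it is wrapped in the highlight span above.
#   >>> highlight_content('Lorem ipsum dolor sit amet', 'ipsum')
#   'Lorem <span class="highlight">ipsum</span> dolor sit amet'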


class HTMLTextExtractor(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.result = []
        self.tags = []

    def handle_starttag(self, tag, attrs):
        self.tags.append(tag)

    def handle_endtag(self, tag):
        if not self.tags:
            return

        if tag != self.tags[-1]:
            raise Exception("invalid html")

        self.tags.pop()

    def is_valid_tag(self):
        return not self.tags or self.tags[-1] not in blocked_tags

    def handle_data(self, d):
        if not self.is_valid_tag():
            return
        self.result.append(d)

    def handle_charref(self, number):
        if not self.is_valid_tag():
            return
        if number[0] in ('x', 'X'):
            codepoint = int(number[1:], 16)
        else:
            codepoint = int(number)
        self.result.append(chr(codepoint))

    def handle_entityref(self, name):
        if not self.is_valid_tag():
            return
        # codepoint = htmlentitydefs.name2codepoint[name]
        # self.result.append(chr(codepoint))
        self.result.append(name)

    def get_text(self):
        return ''.join(self.result).strip()


def html_to_text(html):
    html = html.replace('\n', ' ')
    html = ' '.join(html.split())
    s = HTMLTextExtractor()
    s.feed(html)
    return s.get_text()
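
# Illustrative example (not part of the original source): tags are stripped,
# whitespace is collapsed, and text inside <script>/<style> is dropped.
#   >>> html_to_text('<p>Hello   <b>world</b><script>x()</script></p>')
#   'Hello world'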


class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = getincrementalencoder(encoding)()

    def writerow(self, row):
        self.writer.writerow(row)
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.strip('\x00')
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data.decode())
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)
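
# Minimal usage sketch (hypothetical, not part of the original module): rows are
# serialized through the in-memory queue and written to the given stream.
#   >>> import io
#   >>> output = io.StringIO()
#   >>> writer = UnicodeWriter(output)
#   >>> writer.writerow(['title', 'https://example.org'])
#   >>> output.getvalue()
#   'title,https://example.org\r\n'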


def get_resources_directory(searx_directory, subdirectory, resources_directory):
    if not resources_directory:
        resources_directory = os.path.join(searx_directory, subdirectory)
    if not os.path.isdir(resources_directory):
        raise Exception(resources_directory + " is not a directory")
    return resources_directory


def get_themes(templates_path):
    """Returns available themes list."""
    themes = os.listdir(templates_path)
    if '__common__' in themes:
        themes.remove('__common__')
    return themes


def get_static_files(static_path):
    static_files = set()
    static_path_length = len(static_path) + 1
    for directory, _, files in os.walk(static_path):
        for filename in files:
            f = os.path.join(directory[static_path_length:], filename)
            static_files.add(f)
    return static_files


def get_result_templates(templates_path):
    result_templates = set()
    templates_path_length = len(templates_path) + 1
    for directory, _, files in os.walk(templates_path):
        if directory.endswith('result_templates'):
            for filename in files:
                f = os.path.join(directory[templates_path_length:], filename)
                result_templates.add(f)
    return result_templates


def format_date_by_locale(date, locale_string):
    # strftime works only on dates after 1900
    if date.year <= 1900:
        return date.isoformat().split('T')[0]

    if locale_string == 'all':
        locale_string = settings['ui']['default_locale'] or 'en_US'

    # to avoid crashing if locale is not supported by babel
    try:
        formatted_date = format_date(date, locale=locale_string)
    except:
        formatted_date = format_date(date, "YYYY-MM-dd")

    return formatted_date


def dict_subset(d, properties):
    result = {}
    for k in properties:
        if k in d:
            result[k] = d[k]
    return result


def prettify_url(url, max_length=74):
    if len(url) > max_length:
        chunk_len = int(max_length / 2 + 1)
        return '{0}[...]{1}'.format(url[:chunk_len], url[-chunk_len:])
    else:
        return url
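
# Illustrative example (not in the original source): long URLs are shortened to
# the first and last max_length // 2 + 1 characters around "[...]".
#   >>> prettify_url('abcdefghijklmno', max_length=10)
#   'abcdef[...]jklmno'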


# get element in list or default value
def list_get(a_list, index, default=None):
    if len(a_list) > index:
        return a_list[index]
    else:
        return default


def get_torrent_size(filesize, filesize_multiplier):
    try:
        filesize = float(filesize)

        # decimal (SI) prefixes
        if filesize_multiplier == 'TB':
            filesize = int(filesize * 1000 * 1000 * 1000 * 1000)
        elif filesize_multiplier == 'GB':
            filesize = int(filesize * 1000 * 1000 * 1000)
        elif filesize_multiplier == 'MB':
            filesize = int(filesize * 1000 * 1000)
        elif filesize_multiplier == 'KB':
            filesize = int(filesize * 1000)
        # binary (IEC) prefixes
        elif filesize_multiplier == 'TiB':
            filesize = int(filesize * 1024 * 1024 * 1024 * 1024)
        elif filesize_multiplier == 'GiB':
            filesize = int(filesize * 1024 * 1024 * 1024)
        elif filesize_multiplier == 'MiB':
            filesize = int(filesize * 1024 * 1024)
        elif filesize_multiplier == 'KiB':
            filesize = int(filesize * 1024)
    except:
        filesize = None

    return filesize


def convert_str_to_int(number_str):
    if number_str.isdigit():
        return int(number_str)
    else:
        return 0


# convert a variable to integer or return 0 if it's not a number
def int_or_zero(num):
    if isinstance(num, list):
        if len(num) < 1:
            return 0
        num = num[0]
    return convert_str_to_int(num)
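
# Illustrative examples (not in the original source); non-digit strings and
# empty lists fall back to 0:
#   >>> int_or_zero('42')
#   42
#   >>> int_or_zero(['7', '8'])
#   7
#   >>> int_or_zero([])
#   0
#   >>> int_or_zero('4.5')
#   0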


def is_valid_lang(lang):
    # accept both str and bytes input
    if isinstance(lang, bytes):
        lang = lang.decode('utf-8')
    is_abbr = (len(lang) == 2)
    lang = lang.lower()
    if is_abbr:
        for l in language_codes:
            if l[0][:2] == lang:
                return (True, l[0][:2], l[3].lower())
        return False
    else:
        for l in language_codes:
            if l[1].lower() == lang or l[3].lower() == lang:
                return (True, l[0][:2], l[3].lower())
        return False


def _get_lang_to_lc_dict(lang_list):
    key = str(lang_list)
    value = lang_to_lc_cache.get(key, None)
    if value is None:
        value = dict()
        for lc in lang_list:
            value.setdefault(lc.split('-')[0], lc)
        lang_to_lc_cache[key] = value
    return value
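
# Illustrative example (not in the original source): the first locale seen for
# each bare language code wins, and the result is cached per lang_list.
#   >>> _get_lang_to_lc_dict(['en-US', 'en-GB', 'fr-FR'])
#   {'en': 'en-US', 'fr': 'fr-FR'}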


# auxiliary function to match lang_code in lang_list
def _match_language(lang_code, lang_list=[], custom_aliases={}):
    # replace language code with a custom alias if necessary
    if lang_code in custom_aliases:
        lang_code = custom_aliases[lang_code]

    if lang_code in lang_list:
        return lang_code

    # try to get the most likely country for this language
    subtags = get_global('likely_subtags').get(lang_code)
    if subtags:
        subtag_parts = subtags.split('_')
        new_code = subtag_parts[0] + '-' + subtag_parts[-1]
        if new_code in custom_aliases:
            new_code = custom_aliases[new_code]
        if new_code in lang_list:
            return new_code

    # try to get any supported country for this language
    return _get_lang_to_lc_dict(lang_list).get(lang_code, None)


# get the language code from lang_list that best matches locale_code
def match_language(locale_code, lang_list=[], custom_aliases={}, fallback='en-US'):
    # try to get language from given locale_code
    language = _match_language(locale_code, lang_list, custom_aliases)
    if language:
        return language

    locale_parts = locale_code.split('-')
    lang_code = locale_parts[0]

    # try to get language using an equivalent country code
    if len(locale_parts) > 1:
        country_alias = get_global('territory_aliases').get(locale_parts[-1])
        if country_alias:
            language = _match_language(lang_code + '-' + country_alias[0], lang_list, custom_aliases)
            if language:
                return language

    # try to get language using an equivalent language code
    alias = get_global('language_aliases').get(lang_code)
    if alias:
        language = _match_language(alias, lang_list, custom_aliases)
        if language:
            return language

    if lang_code != locale_code:
        # try to get language from given language without giving the country
        language = _match_language(lang_code, lang_list, custom_aliases)

    return language or fallback
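
# Hedged example (depends on babel's likely_subtags data, so treat it as a
# sketch rather than a guaranteed result): an unsupported locale usually falls
# back to another locale of the same language before using the fallback.
#   >>> match_language('es-MX', ['es-ES', 'en-US'])
#   'es-ES'
#   >>> match_language('xx-XX', ['es-ES', 'en-US'], fallback='en-US')
#   'en-US'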


def load_module(filename, module_dir):
    modname = splitext(filename)[0]
    if modname in sys.modules:
        del sys.modules[modname]
    filepath = join(module_dir, filename)
    module = load_source(modname, filepath)
    module.name = modname
    return module


def new_hmac(secret_key, url):
    try:
        secret_key_bytes = bytes(secret_key, 'utf-8')
    except TypeError as err:
        if isinstance(secret_key, bytes):
            secret_key_bytes = secret_key
        else:
            raise err
    return hmac.new(secret_key_bytes, url, hashlib.sha256).hexdigest()
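
# Usage note (inferred from the hmac call above, not documented in the original
# source): url must already be bytes; the secret may be str or bytes.
#   >>> digest = new_hmac('secret', 'https://example.org'.encode())
#   >>> len(digest)   # hex-encoded SHA-256
#   64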


def to_string(obj):
    if isinstance(obj, str):
        return obj
    if isinstance(obj, Number):
        return str(obj)
    if hasattr(obj, '__str__'):
        return obj.__str__()
    if hasattr(obj, '__repr__'):
        return obj.__repr__()


def ecma_unescape(s):
    """
    python implementation of the unescape javascript function
    https://www.ecma-international.org/ecma-262/6.0/#sec-unescape-string
    https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/unescape
    """
    # "%u5409" becomes "吉"
    s = ecma_unescape4_re.sub(lambda e: chr(int(e.group(1), 16)), s)
    # "%20" becomes " ", "%F3" becomes "ó"
    s = ecma_unescape2_re.sub(lambda e: chr(int(e.group(1), 16)), s)
    return s
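
# Examples taken from the comments above, written in doctest style:
#   >>> ecma_unescape('%u5409')
#   '吉'
#   >>> ecma_unescape('%20%F3')
#   ' ó'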


def get_engine_from_settings(name):
    """Return engine configuration from settings.yml of a given engine name"""

    if 'engines' not in settings:
        return {}

    for engine in settings['engines']:
        if 'name' not in engine:
            continue
        if name == engine['name']:
            return engine

    return {}


def get_xpath(xpath_str):
    result = xpath_cache.get(xpath_str, None)
    if result is None:
        result = XPath(xpath_str)
        xpath_cache[xpath_str] = result
    return result


def eval_xpath(element, xpath_str):
    xpath = get_xpath(xpath_str)
    return xpath(element)
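
# Illustrative example (not in the original source): compiled XPath objects are
# cached by expression string and reused on later calls.
#   >>> from lxml import html
#   >>> doc = html.fromstring('<div><a href="https://example.org">link</a></div>')
#   >>> eval_xpath(doc, '//a/@href')
#   ['https://example.org']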