# -*- coding: utf-8 -*-
import os
import sys
import csv
import hashlib
import hmac
import re
import json

from codecs import getincrementalencoder
from imp import load_source
from numbers import Number
from os.path import splitext, join
from io import open, StringIO
from random import choice
from html.parser import HTMLParser

from lxml.etree import XPath
from babel.core import get_global
from babel.dates import format_date

from searx import settings
from searx.version import VERSION_STRING
from searx.languages import language_codes
from searx import logger

logger = logger.getChild('utils')

blocked_tags = ('script',
                'style')

ecma_unescape4_re = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
ecma_unescape2_re = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)

useragents = json.loads(open(os.path.dirname(os.path.realpath(__file__))
                             + "/data/useragents.json", 'r', encoding='utf-8').read())

xpath_cache = dict()
lang_to_lc_cache = dict()

def searx_useragent():
    return 'searx/{searx_version} {suffix}'.format(
           searx_version=VERSION_STRING,
           suffix=settings['outgoing'].get('useragent_suffix', ''))


def gen_useragent(os=None):
    return str(useragents['ua'].format(os=os or choice(useragents['os']), version=choice(useragents['versions'])))

def highlight_content(content, query):
    if not content:
        return None

    # ignoring html contents
    # TODO better html content detection
    if content.find('<') != -1:
        return content

    if content.lower().find(query.lower()) > -1:
        query_regex = '({0})'.format(re.escape(query))
        content = re.sub(query_regex, '<span class="highlight">\\1</span>',
                         content, flags=re.I | re.U)
    else:
        regex_parts = []
        for chunk in query.split():
            if len(chunk) == 1:
                regex_parts.append('\\W+{0}\\W+'.format(re.escape(chunk)))
            else:
                regex_parts.append('{0}'.format(re.escape(chunk)))
        query_regex = '({0})'.format('|'.join(regex_parts))
        content = re.sub(query_regex, '<span class="highlight">\\1</span>',
                         content, flags=re.I | re.U)

    return content
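# Example (illustrative, not part of the original module): for plain-text
# content, occurrences of the query are wrapped in highlight spans, e.g.
#   highlight_content('searx is a metasearch engine', 'searx')
#   -> '<span class="highlight">searx</span> is a metasearch engine'
# Content containing '<' is treated as HTML and returned unchanged.

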
class HTMLTextExtractorException(Exception):
    pass


class HTMLTextExtractor(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.result = []
        self.tags = []

    def handle_starttag(self, tag, attrs):
        self.tags.append(tag)

    def handle_endtag(self, tag):
        if not self.tags:
            return

        if tag != self.tags[-1]:
            raise HTMLTextExtractorException()

        self.tags.pop()

    def is_valid_tag(self):
        return not self.tags or self.tags[-1] not in blocked_tags

    def handle_data(self, d):
        if not self.is_valid_tag():
            return
        self.result.append(d)

    def handle_charref(self, number):
        if not self.is_valid_tag():
            return
        if number[0] in ('x', 'X'):
            codepoint = int(number[1:], 16)
        else:
            codepoint = int(number)
        self.result.append(chr(codepoint))

    def handle_entityref(self, name):
        if not self.is_valid_tag():
            return
        # codepoint = htmlentitydefs.name2codepoint[name]
        # self.result.append(chr(codepoint))
        self.result.append(name)

    def get_text(self):
        return ''.join(self.result).strip()


def html_to_text(html):
    html = html.replace('\n', ' ')
    html = ' '.join(html.split())
    s = HTMLTextExtractor()
    try:
        s.feed(html)
    except HTMLTextExtractorException:
        logger.debug("HTMLTextExtractor: invalid HTML\n%s", html)
    return s.get_text()
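# Example (illustrative, not part of the original module): markup is stripped
# and the contents of blocked tags (script/style) are dropped, e.g.
#   html_to_text('<b>Hello</b> <script>alert(1)</script>world')
#   -> 'Hello world'

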
class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = getincrementalencoder(encoding)()

    def writerow(self, row):
        self.writer.writerow(row)
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.strip('\x00')
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data.decode())
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)
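# Usage sketch (illustrative, assuming a text-mode stream is passed as "f"):
# rows go through the incremental encoder before being written, e.g.
#   with open('results.csv', 'w', encoding='utf-8') as f:
#       writer = UnicodeWriter(f)
#       writer.writerow(['title', 'url'])

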
def get_resources_directory(searx_directory, subdirectory, resources_directory):
    if not resources_directory:
        resources_directory = os.path.join(searx_directory, subdirectory)
    if not os.path.isdir(resources_directory):
        raise Exception(resources_directory + " is not a directory")
    return resources_directory


def get_themes(templates_path):
    """Returns available themes list."""
    themes = os.listdir(templates_path)
    if '__common__' in themes:
        themes.remove('__common__')
    return themes


def get_static_files(static_path):
    static_files = set()
    static_path_length = len(static_path) + 1
    for directory, _, files in os.walk(static_path):
        for filename in files:
            f = os.path.join(directory[static_path_length:], filename)
            static_files.add(f)
    return static_files


def get_result_templates(templates_path):
    result_templates = set()
    templates_path_length = len(templates_path) + 1
    for directory, _, files in os.walk(templates_path):
        if directory.endswith('result_templates'):
            for filename in files:
                f = os.path.join(directory[templates_path_length:], filename)
                result_templates.add(f)
    return result_templates


def format_date_by_locale(date, locale_string):
    # strftime works only on dates after 1900
    if date.year <= 1900:
        return date.isoformat().split('T')[0]

    if locale_string == 'all':
        locale_string = settings['ui']['default_locale'] or 'en_US'

    # to avoid crashing if locale is not supported by babel
    try:
        formatted_date = format_date(date, locale=locale_string)
    except:
        formatted_date = format_date(date, "YYYY-MM-dd")

    return formatted_date


def dict_subset(d, properties):
    result = {}
    for k in properties:
        if k in d:
            result[k] = d[k]
    return result


def prettify_url(url, max_length=74):
    if len(url) > max_length:
        chunk_len = int(max_length / 2 + 1)
        return '{0}[...]{1}'.format(url[:chunk_len], url[-chunk_len:])
    else:
        return url
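# Example (illustrative, not part of the original module): URLs longer than
# max_length keep only their head and tail around an ellipsis marker, e.g.
#   prettify_url('https://example.org/' + 'a' * 100, max_length=20)
#   -> first 11 characters + '[...]' + last 11 characters

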
# get element in list or default value
def list_get(a_list, index, default=None):
    if len(a_list) > index:
        return a_list[index]
    else:
        return default


def get_torrent_size(filesize, filesize_multiplier):
    try:
        filesize = float(filesize)

        if filesize_multiplier == 'TB':
            filesize = int(filesize * 1024 * 1024 * 1024 * 1024)
        elif filesize_multiplier == 'GB':
            filesize = int(filesize * 1024 * 1024 * 1024)
        elif filesize_multiplier == 'MB':
            filesize = int(filesize * 1024 * 1024)
        elif filesize_multiplier == 'KB':
            filesize = int(filesize * 1024)
        elif filesize_multiplier == 'TiB':
            filesize = int(filesize * 1000 * 1000 * 1000 * 1000)
        elif filesize_multiplier == 'GiB':
            filesize = int(filesize * 1000 * 1000 * 1000)
        elif filesize_multiplier == 'MiB':
            filesize = int(filesize * 1000 * 1000)
        elif filesize_multiplier == 'KiB':
            filesize = int(filesize * 1000)
    except:
        filesize = None

    return filesize
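# Example (illustrative, not part of the original module): the result is a
# size in bytes, or None when the value cannot be parsed, e.g.
#   get_torrent_size('1.5', 'GB')   # -> 1610612736
#   get_torrent_size('n/a', 'GB')   # -> None

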
def convert_str_to_int(number_str):
    if number_str.isdigit():
        return int(number_str)
    else:
        return 0


# convert a variable to integer or return 0 if it's not a number
def int_or_zero(num):
    if isinstance(num, list):
        if len(num) < 1:
            return 0
        num = num[0]
    return convert_str_to_int(num)


def is_valid_lang(lang):
    if isinstance(lang, bytes):
        lang = lang.decode()
    is_abbr = (len(lang) == 2)
    lang = lang.lower()
    if is_abbr:
        for l in language_codes:
            if l[0][:2] == lang:
                return (True, l[0][:2], l[3].lower())
        return False
    else:
        for l in language_codes:
            if l[1].lower() == lang or l[3].lower() == lang:
                return (True, l[0][:2], l[3].lower())
        return False


def _get_lang_to_lc_dict(lang_list):
    key = str(lang_list)
    value = lang_to_lc_cache.get(key, None)
    if value is None:
        value = dict()
        for lc in lang_list:
            value.setdefault(lc.split('-')[0], lc)
        lang_to_lc_cache[key] = value
    return value


# auxiliary function to match lang_code in lang_list
def _match_language(lang_code, lang_list=[], custom_aliases={}):
    # replace language code with a custom alias if necessary
    if lang_code in custom_aliases:
        lang_code = custom_aliases[lang_code]

    if lang_code in lang_list:
        return lang_code

    # try to get the most likely country for this language
    subtags = get_global('likely_subtags').get(lang_code)
    if subtags:
        subtag_parts = subtags.split('_')
        new_code = subtag_parts[0] + '-' + subtag_parts[-1]
        if new_code in custom_aliases:
            new_code = custom_aliases[new_code]
        if new_code in lang_list:
            return new_code

    # try to get any supported country for this language
    return _get_lang_to_lc_dict(lang_list).get(lang_code, None)


# get the language code from lang_list that best matches locale_code
def match_language(locale_code, lang_list=[], custom_aliases={}, fallback='en-US'):
    # try to get language from given locale_code
    language = _match_language(locale_code, lang_list, custom_aliases)
    if language:
        return language

    locale_parts = locale_code.split('-')
    lang_code = locale_parts[0]

    # try to get language using an equivalent country code
    if len(locale_parts) > 1:
        country_alias = get_global('territory_aliases').get(locale_parts[-1])
        if country_alias:
            language = _match_language(lang_code + '-' + country_alias[0], lang_list, custom_aliases)
            if language:
                return language

    # try to get language using an equivalent language code
    alias = get_global('language_aliases').get(lang_code)
    if alias:
        language = _match_language(alias, lang_list, custom_aliases)
        if language:
            return language

    if lang_code != locale_code:
        # try to get language from given language without giving the country
        language = _match_language(lang_code, lang_list, custom_aliases)

    return language or fallback
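# Example (illustrative, not part of the original module): with Babel's
# likely-subtags data a bare language code is usually resolved to its most
# likely locale, and unknown codes fall back to the given fallback, e.g.
#   match_language('en', ['en-GB', 'en-US'])   # -> 'en-US'
#   match_language('xx', ['en-US'])            # -> 'en-US' (fallback)

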
def load_module(filename, module_dir):
    modname = splitext(filename)[0]
    if modname in sys.modules:
        del sys.modules[modname]
    filepath = join(module_dir, filename)
    module = load_source(modname, filepath)
    module.name = modname
    return module


def new_hmac(secret_key, url):
    try:
        secret_key_bytes = bytes(secret_key, 'utf-8')
    except TypeError as err:
        if isinstance(secret_key, bytes):
            secret_key_bytes = secret_key
        else:
            raise err
    return hmac.new(secret_key_bytes, url, hashlib.sha256).hexdigest()


def to_string(obj):
    if isinstance(obj, str):
        return obj
    if isinstance(obj, Number):
        return str(obj)
    if hasattr(obj, '__str__'):
        return obj.__str__()
    if hasattr(obj, '__repr__'):
        return obj.__repr__()


def ecma_unescape(s):
    """
    Python implementation of the JavaScript unescape function
    https://www.ecma-international.org/ecma-262/6.0/#sec-unescape-string
    https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/unescape
    """
    # s = unicode(s)
    # "%u5409" becomes "吉"
    s = ecma_unescape4_re.sub(lambda e: chr(int(e.group(1), 16)), s)
    # "%20" becomes " ", "%F3" becomes "ó"
    s = ecma_unescape2_re.sub(lambda e: chr(int(e.group(1), 16)), s)
    return s


def get_engine_from_settings(name):
    """Return the engine configuration from settings.yml for a given engine name."""
    if 'engines' not in settings:
        return {}

    for engine in settings['engines']:
        if 'name' not in engine:
            continue
        if name == engine['name']:
            return engine

    return {}


def get_xpath(xpath_str):
    result = xpath_cache.get(xpath_str, None)
    if result is None:
        result = XPath(xpath_str)
        xpath_cache[xpath_str] = result
    return result


def eval_xpath(element, xpath_str):
    xpath = get_xpath(xpath_str)
    return xpath(element)
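# Example (illustrative, assuming lxml.html is available alongside lxml.etree):
# compiled XPath objects are cached, so repeated evaluations of the same
# expression reuse one lxml.etree.XPath instance, e.g.
#   from lxml import html as lxml_html
#   dom = lxml_html.fromstring('<div><a href="https://example.org">x</a></div>')
#   eval_xpath(dom, '//a/@href')   # -> ['https://example.org']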