utils.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437
  1. # -*- coding: utf-8 -*-
  2. import csv
  3. import hashlib
  4. import hmac
  5. import os
  6. import re
  7. from babel.core import get_global
  8. from babel.dates import format_date
  9. from codecs import getincrementalencoder
  10. from imp import load_source
  11. from numbers import Number
  12. from os.path import splitext, join
  13. from io import open
  14. from random import choice
  15. import sys
  16. import json
  17. from searx import settings
  18. from searx.version import VERSION_STRING
  19. from searx.languages import language_codes
  20. from searx import settings
  21. from searx import logger
  22. try:
  23. from cStringIO import StringIO
  24. except:
  25. from io import StringIO
  26. try:
  27. from HTMLParser import HTMLParser
  28. except:
  29. from html.parser import HTMLParser
  30. if sys.version_info[0] == 3:
  31. unichr = chr
  32. unicode = str
  33. IS_PY2 = False
  34. basestring = str
  35. else:
  36. IS_PY2 = True
  37. logger = logger.getChild('utils')
  38. blocked_tags = ('script',
  39. 'style')
  40. ecma_unescape4_re = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
  41. ecma_unescape2_re = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
  42. useragents = json.loads(open(os.path.dirname(os.path.realpath(__file__))
  43. + "/data/useragents.json", 'r', encoding='utf-8').read())
  44. lang_to_lc_cache = dict()
  45. def searx_useragent():
  46. return 'searx/{searx_version} {suffix}'.format(
  47. searx_version=VERSION_STRING,
  48. suffix=settings['outgoing'].get('useragent_suffix', ''))
  49. def gen_useragent(os=None):
  50. return str(useragents['ua'].format(os=os or choice(useragents['os']), version=choice(useragents['versions'])))
  51. def highlight_content(content, query):
  52. if not content:
  53. return None
  54. # ignoring html contents
  55. # TODO better html content detection
  56. if content.find('<') != -1:
  57. return content
  58. query = query.decode('utf-8')
  59. if content.lower().find(query.lower()) > -1:
  60. query_regex = u'({0})'.format(re.escape(query))
  61. content = re.sub(query_regex, '<span class="highlight">\\1</span>',
  62. content, flags=re.I | re.U)
  63. else:
  64. regex_parts = []
  65. for chunk in query.split():
  66. if len(chunk) == 1:
  67. regex_parts.append(u'\\W+{0}\\W+'.format(re.escape(chunk)))
  68. else:
  69. regex_parts.append(u'{0}'.format(re.escape(chunk)))
  70. query_regex = u'({0})'.format('|'.join(regex_parts))
  71. content = re.sub(query_regex, '<span class="highlight">\\1</span>',
  72. content, flags=re.I | re.U)
  73. return content
  74. class HTMLTextExtractor(HTMLParser):
  75. def __init__(self):
  76. HTMLParser.__init__(self)
  77. self.result = []
  78. self.tags = []
  79. def handle_starttag(self, tag, attrs):
  80. self.tags.append(tag)
  81. def handle_endtag(self, tag):
  82. if not self.tags:
  83. return
  84. if tag != self.tags[-1]:
  85. raise Exception("invalid html")
  86. self.tags.pop()
  87. def is_valid_tag(self):
  88. return not self.tags or self.tags[-1] not in blocked_tags
  89. def handle_data(self, d):
  90. if not self.is_valid_tag():
  91. return
  92. self.result.append(d)
  93. def handle_charref(self, number):
  94. if not self.is_valid_tag():
  95. return
  96. if number[0] in (u'x', u'X'):
  97. codepoint = int(number[1:], 16)
  98. else:
  99. codepoint = int(number)
  100. self.result.append(unichr(codepoint))
  101. def handle_entityref(self, name):
  102. if not self.is_valid_tag():
  103. return
  104. # codepoint = htmlentitydefs.name2codepoint[name]
  105. # self.result.append(unichr(codepoint))
  106. self.result.append(name)
  107. def get_text(self):
  108. return u''.join(self.result).strip()
  109. def html_to_text(html):
  110. html = html.replace('\n', ' ')
  111. html = ' '.join(html.split())
  112. s = HTMLTextExtractor()
  113. s.feed(html)
  114. return s.get_text()
class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        # incremental encoder producing the target encoding
        self.encoder = getincrementalencoder(encoding)()

    def writerow(self, row):
        # Write one row: serialize through the in-memory queue, then
        # re-encode the result into the target stream.
        if IS_PY2:
            # csv on Python 2 cannot handle unicode: pre-encode cells to utf-8
            row = [s.encode("utf-8") if hasattr(s, 'encode') else s for s in row]
        self.writer.writerow(row)
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        if IS_PY2:
            data = data.decode("utf-8")
        else:
            # truncate(0) below does not rewind the stream position, so the
            # next write leaves NUL padding before the data — strip it off
            data = data.strip('\x00')
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        if IS_PY2:
            self.stream.write(data)
        else:
            # target stream is a text stream on Python 3
            self.stream.write(data.decode("utf-8"))
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        # Write a sequence of rows via writerow().
        for row in rows:
            self.writerow(row)
  148. def get_resources_directory(searx_directory, subdirectory, resources_directory):
  149. if not resources_directory:
  150. resources_directory = os.path.join(searx_directory, subdirectory)
  151. if not os.path.isdir(resources_directory):
  152. raise Exception(resources_directory + " is not a directory")
  153. return resources_directory
  154. def get_themes(templates_path):
  155. """Returns available themes list."""
  156. themes = os.listdir(templates_path)
  157. if '__common__' in themes:
  158. themes.remove('__common__')
  159. return themes
  160. def get_static_files(static_path):
  161. static_files = set()
  162. static_path_length = len(static_path) + 1
  163. for directory, _, files in os.walk(static_path):
  164. for filename in files:
  165. f = os.path.join(directory[static_path_length:], filename)
  166. static_files.add(f)
  167. return static_files
  168. def get_result_templates(templates_path):
  169. result_templates = set()
  170. templates_path_length = len(templates_path) + 1
  171. for directory, _, files in os.walk(templates_path):
  172. if directory.endswith('result_templates'):
  173. for filename in files:
  174. f = os.path.join(directory[templates_path_length:], filename)
  175. result_templates.add(f)
  176. return result_templates
  177. def format_date_by_locale(date, locale_string):
  178. # strftime works only on dates after 1900
  179. if date.year <= 1900:
  180. return date.isoformat().split('T')[0]
  181. if locale_string == 'all':
  182. locale_string = settings['ui']['default_locale'] or 'en_US'
  183. # to avoid crashing if locale is not supported by babel
  184. try:
  185. formatted_date = format_date(date, locale=locale_string)
  186. except:
  187. formatted_date = format_date(date, "YYYY-MM-dd")
  188. return formatted_date
  189. def dict_subset(d, properties):
  190. result = {}
  191. for k in properties:
  192. if k in d:
  193. result[k] = d[k]
  194. return result
  195. def prettify_url(url, max_length=74):
  196. if len(url) > max_length:
  197. chunk_len = int(max_length / 2 + 1)
  198. return u'{0}[...]{1}'.format(url[:chunk_len], url[-chunk_len:])
  199. else:
  200. return url
  201. # get element in list or default value
  202. def list_get(a_list, index, default=None):
  203. if len(a_list) > index:
  204. return a_list[index]
  205. else:
  206. return default
  207. def get_torrent_size(filesize, filesize_multiplier):
  208. try:
  209. filesize = float(filesize)
  210. if filesize_multiplier == 'TB':
  211. filesize = int(filesize * 1024 * 1024 * 1024 * 1024)
  212. elif filesize_multiplier == 'GB':
  213. filesize = int(filesize * 1024 * 1024 * 1024)
  214. elif filesize_multiplier == 'MB':
  215. filesize = int(filesize * 1024 * 1024)
  216. elif filesize_multiplier == 'KB':
  217. filesize = int(filesize * 1024)
  218. elif filesize_multiplier == 'TiB':
  219. filesize = int(filesize * 1000 * 1000 * 1000 * 1000)
  220. elif filesize_multiplier == 'GiB':
  221. filesize = int(filesize * 1000 * 1000 * 1000)
  222. elif filesize_multiplier == 'MiB':
  223. filesize = int(filesize * 1000 * 1000)
  224. elif filesize_multiplier == 'KiB':
  225. filesize = int(filesize * 1000)
  226. except:
  227. filesize = None
  228. return filesize
  229. def convert_str_to_int(number_str):
  230. if number_str.isdigit():
  231. return int(number_str)
  232. else:
  233. return 0
  234. # convert a variable to integer or return 0 if it's not a number
  235. def int_or_zero(num):
  236. if isinstance(num, list):
  237. if len(num) < 1:
  238. return 0
  239. num = num[0]
  240. return convert_str_to_int(num)
  241. def is_valid_lang(lang):
  242. is_abbr = (len(lang) == 2)
  243. lang = lang.lower().decode('utf-8')
  244. if is_abbr:
  245. for l in language_codes:
  246. if l[0][:2] == lang:
  247. return (True, l[0][:2], l[3].lower())
  248. return False
  249. else:
  250. for l in language_codes:
  251. if l[1].lower() == lang or l[3].lower() == lang:
  252. return (True, l[0][:2], l[3].lower())
  253. return False
  254. def _get_lang_to_lc_dict(lang_list):
  255. key = str(lang_list)
  256. value = lang_to_lc_cache.get(key, None)
  257. if value is None:
  258. value = dict()
  259. for lc in lang_list:
  260. value.setdefault(lc.split('-')[0], lc)
  261. lang_to_lc_cache[key] = value
  262. return value
  263. # auxiliary function to match lang_code in lang_list
  264. def _match_language(lang_code, lang_list=[], custom_aliases={}):
  265. # replace language code with a custom alias if necessary
  266. if lang_code in custom_aliases:
  267. lang_code = custom_aliases[lang_code]
  268. if lang_code in lang_list:
  269. return lang_code
  270. # try to get the most likely country for this language
  271. subtags = get_global('likely_subtags').get(lang_code)
  272. if subtags:
  273. subtag_parts = subtags.split('_')
  274. new_code = subtag_parts[0] + '-' + subtag_parts[-1]
  275. if new_code in custom_aliases:
  276. new_code = custom_aliases[new_code]
  277. if new_code in lang_list:
  278. return new_code
  279. # try to get the any supported country for this language
  280. return _get_lang_to_lc_dict(lang_list).get(lang_code, None)
  281. # get the language code from lang_list that best matches locale_code
  282. def match_language(locale_code, lang_list=[], custom_aliases={}, fallback='en-US'):
  283. # try to get language from given locale_code
  284. language = _match_language(locale_code, lang_list, custom_aliases)
  285. if language:
  286. return language
  287. locale_parts = locale_code.split('-')
  288. lang_code = locale_parts[0]
  289. # try to get language using an equivalent country code
  290. if len(locale_parts) > 1:
  291. country_alias = get_global('territory_aliases').get(locale_parts[-1])
  292. if country_alias:
  293. language = _match_language(lang_code + '-' + country_alias[0], lang_list, custom_aliases)
  294. if language:
  295. return language
  296. # try to get language using an equivalent language code
  297. alias = get_global('language_aliases').get(lang_code)
  298. if alias:
  299. language = _match_language(alias, lang_list, custom_aliases)
  300. if language:
  301. return language
  302. if lang_code != locale_code:
  303. # try to get language from given language without giving the country
  304. language = _match_language(lang_code, lang_list, custom_aliases)
  305. return language or fallback
  306. def load_module(filename, module_dir):
  307. modname = splitext(filename)[0]
  308. if modname in sys.modules:
  309. del sys.modules[modname]
  310. filepath = join(module_dir, filename)
  311. module = load_source(modname, filepath)
  312. module.name = modname
  313. return module
  314. def new_hmac(secret_key, url):
  315. try:
  316. secret_key_bytes = bytes(secret_key, 'utf-8')
  317. except TypeError as err:
  318. if isinstance(secret_key, bytes):
  319. secret_key_bytes = secret_key
  320. else:
  321. raise err
  322. if sys.version_info[0] == 2:
  323. return hmac.new(bytes(secret_key), url, hashlib.sha256).hexdigest()
  324. else:
  325. return hmac.new(secret_key_bytes, url, hashlib.sha256).hexdigest()
  326. def to_string(obj):
  327. if isinstance(obj, basestring):
  328. return obj
  329. if isinstance(obj, Number):
  330. return unicode(obj)
  331. if hasattr(obj, '__str__'):
  332. return obj.__str__()
  333. if hasattr(obj, '__repr__'):
  334. return obj.__repr__()
  335. def ecma_unescape(s):
  336. """
  337. python implementation of the unescape javascript function
  338. https://www.ecma-international.org/ecma-262/6.0/#sec-unescape-string
  339. https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/unescape
  340. """
  341. # s = unicode(s)
  342. # "%u5409" becomes "吉"
  343. s = ecma_unescape4_re.sub(lambda e: unichr(int(e.group(1), 16)), s)
  344. # "%20" becomes " ", "%F3" becomes "ó"
  345. s = ecma_unescape2_re.sub(lambda e: unichr(int(e.group(1), 16)), s)
  346. return s