utils.py
import csv
import hashlib
import hmac
import os
import re

from babel.core import get_global
from babel.dates import format_date
from codecs import getincrementalencoder
from imp import load_source
from numbers import Number
from os.path import splitext, join
from io import open
from random import choice
import sys
import json

from searx import settings
from searx.version import VERSION_STRING
from searx.languages import language_codes
from searx import logger
try:
    from cStringIO import StringIO
except ImportError:
    from io import StringIO

try:
    from HTMLParser import HTMLParser
except ImportError:
    from html.parser import HTMLParser
if sys.version_info[0] == 3:
    unichr = chr
    unicode = str
    IS_PY2 = False
    basestring = str
else:
    IS_PY2 = True

logger = logger.getChild('utils')

blocked_tags = ('script',
                'style')

useragents = json.loads(open(os.path.dirname(os.path.realpath(__file__))
                             + "/data/useragents.json", 'r', encoding='utf-8').read())

lang_to_lc_cache = dict()

def searx_useragent():
    return 'searx/{searx_version} {suffix}'.format(
        searx_version=VERSION_STRING,
        suffix=settings['outgoing'].get('useragent_suffix', ''))

def gen_useragent(os=None):
    return str(useragents['ua'].format(os=os or choice(useragents['os']),
                                       version=choice(useragents['versions'])))

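# Illustrative note (not part of the original module): gen_useragent() fills the
# 'ua' template from data/useragents.json with a randomly chosen 'os' and
# 'versions' entry, so the exact result depends on that data file, e.g. a string
# of the shape 'Mozilla/5.0 (<os>; rv:<version>) Gecko/20100101 Firefox/<version>'.
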
def highlight_content(content, query):
    if not content:
        return None

    # ignoring html contents
    # TODO better html content detection
    if content.find('<') != -1:
        return content

    if isinstance(query, bytes):
        query = query.decode('utf-8')

    if content.lower().find(query.lower()) > -1:
        query_regex = u'({0})'.format(re.escape(query))
        content = re.sub(query_regex, '<span class="highlight">\\1</span>',
                         content, flags=re.I | re.U)
    else:
        regex_parts = []
        for chunk in query.split():
            if len(chunk) == 1:
                regex_parts.append(u'\\W+{0}\\W+'.format(re.escape(chunk)))
            else:
                regex_parts.append(u'{0}'.format(re.escape(chunk)))
        query_regex = u'({0})'.format('|'.join(regex_parts))
        content = re.sub(query_regex, '<span class="highlight">\\1</span>',
                         content, flags=re.I | re.U)

    return content

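# Illustrative example (assumed inputs, not part of the original module):
#
#   >>> highlight_content(u'searx is a metasearch engine', u'metasearch')
#   u'searx is a <span class="highlight">metasearch</span> engine'
#
# Content that looks like HTML (anything containing '<') is returned unchanged.
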
class HTMLTextExtractor(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.result = []
        self.tags = []

    def handle_starttag(self, tag, attrs):
        self.tags.append(tag)

    def handle_endtag(self, tag):
        if not self.tags:
            return

        if tag != self.tags[-1]:
            raise Exception("invalid html")

        self.tags.pop()

    def is_valid_tag(self):
        return not self.tags or self.tags[-1] not in blocked_tags

    def handle_data(self, d):
        if not self.is_valid_tag():
            return
        self.result.append(d)

    def handle_charref(self, number):
        if not self.is_valid_tag():
            return
        if number[0] in (u'x', u'X'):
            codepoint = int(number[1:], 16)
        else:
            codepoint = int(number)
        self.result.append(unichr(codepoint))

    def handle_entityref(self, name):
        if not self.is_valid_tag():
            return
        # codepoint = htmlentitydefs.name2codepoint[name]
        # self.result.append(unichr(codepoint))
        self.result.append(name)

    def get_text(self):
        return u''.join(self.result).strip()

def html_to_text(html):
    html = html.replace('\n', ' ')
    html = ' '.join(html.split())
    s = HTMLTextExtractor()
    s.feed(html)
    return s.get_text()

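# Illustrative example (assumed input, not part of the original module):
#
#   >>> html_to_text(u'<p>Hello <b>world</b></p><script>alert(1)</script>')
#   u'Hello world'
#
# Text inside the blocked tags ('script', 'style') is dropped.
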
class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = getincrementalencoder(encoding)()

    def writerow(self, row):
        if IS_PY2:
            row = [s.encode("utf-8") if hasattr(s, 'encode') else s for s in row]
        self.writer.writerow(row)
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        if IS_PY2:
            data = data.decode("utf-8")
        else:
            data = data.strip('\x00')
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        if IS_PY2:
            self.stream.write(data)
        else:
            self.stream.write(data.decode("utf-8"))
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)

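# Illustrative usage sketch (assumed, not part of the original module):
#
#   output = StringIO()
#   csv_writer = UnicodeWriter(output, encoding='utf-8')
#   csv_writer.writerow([u'title', u'url'])
#   # output.getvalue() now starts with u'title,url'
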
def get_resources_directory(searx_directory, subdirectory, resources_directory):
    if not resources_directory:
        resources_directory = os.path.join(searx_directory, subdirectory)
    if not os.path.isdir(resources_directory):
        raise Exception(resources_directory + " is not a directory")
    return resources_directory

def get_themes(templates_path):
    """Returns available themes list."""
    themes = os.listdir(templates_path)
    if '__common__' in themes:
        themes.remove('__common__')
    return themes

def get_static_files(static_path):
    static_files = set()
    static_path_length = len(static_path) + 1
    for directory, _, files in os.walk(static_path):
        for filename in files:
            f = os.path.join(directory[static_path_length:], filename)
            static_files.add(f)
    return static_files

def get_result_templates(templates_path):
    result_templates = set()
    templates_path_length = len(templates_path) + 1
    for directory, _, files in os.walk(templates_path):
        if directory.endswith('result_templates'):
            for filename in files:
                f = os.path.join(directory[templates_path_length:], filename)
                result_templates.add(f)
    return result_templates

def format_date_by_locale(date, locale_string):
    # strftime works only on dates after 1900
    if date.year <= 1900:
        return date.isoformat().split('T')[0]

    if locale_string == 'all':
        locale_string = settings['ui']['default_locale'] or 'en_US'

    # to avoid crashing if locale is not supported by babel
    try:
        formatted_date = format_date(date, locale=locale_string)
    except Exception:
        formatted_date = format_date(date, "YYYY-MM-dd")

    return formatted_date

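# Illustrative example (babel-dependent, not part of the original module): with
# datetime.date(2020, 5, 1) and locale_string 'en_US', babel's default pattern
# yields u'May 1, 2020'; dates from 1900 or earlier fall back to the plain ISO
# form, e.g. '1815-12-10'.
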
def dict_subset(d, properties):
    result = {}
    for k in properties:
        if k in d:
            result[k] = d[k]
    return result

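# Illustrative example (assumed input, not part of the original module):
#
#   >>> dict_subset({'language': 'en', 'safesearch': 1}, ['language', 'pageno'])
#   {'language': 'en'}
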
def prettify_url(url, max_length=74):
    if len(url) > max_length:
        chunk_len = int(max_length / 2 + 1)
        return u'{0}[...]{1}'.format(url[:chunk_len], url[-chunk_len:])
    else:
        return url

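# Illustrative example (assumed input, not part of the original module):
#
#   >>> prettify_url(u'abcdefghijklmnopqrstuvwxyz', max_length=10)
#   u'abcdef[...]uvwxyz'
#
# URLs no longer than max_length are returned unchanged.
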
# get element in list or default value
def list_get(a_list, index, default=None):
    if len(a_list) > index:
        return a_list[index]
    else:
        return default

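# Illustrative examples (assumed inputs, not part of the original module):
#
#   >>> list_get(['a', 'b', 'c'], 1)
#   'b'
#   >>> list_get(['a', 'b', 'c'], 5, default='n/a')
#   'n/a'
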
def get_torrent_size(filesize, filesize_multiplier):
    # note: 1024-based factors are applied to the TB/GB/MB/KB suffixes
    # and 1000-based factors to TiB/GiB/MiB/KiB
    try:
        filesize = float(filesize)

        if filesize_multiplier == 'TB':
            filesize = int(filesize * 1024 * 1024 * 1024 * 1024)
        elif filesize_multiplier == 'GB':
            filesize = int(filesize * 1024 * 1024 * 1024)
        elif filesize_multiplier == 'MB':
            filesize = int(filesize * 1024 * 1024)
        elif filesize_multiplier == 'KB':
            filesize = int(filesize * 1024)
        elif filesize_multiplier == 'TiB':
            filesize = int(filesize * 1000 * 1000 * 1000 * 1000)
        elif filesize_multiplier == 'GiB':
            filesize = int(filesize * 1000 * 1000 * 1000)
        elif filesize_multiplier == 'MiB':
            filesize = int(filesize * 1000 * 1000)
        elif filesize_multiplier == 'KiB':
            filesize = int(filesize * 1000)
    except Exception:
        filesize = None

    return filesize

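# Illustrative examples (assumed inputs, not part of the original module):
#
#   >>> get_torrent_size('2', 'MB')
#   2097152
#   >>> get_torrent_size('not a number', 'MB') is None
#   True
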
def convert_str_to_int(number_str):
    if number_str.isdigit():
        return int(number_str)
    else:
        return 0

# convert a variable to integer or return 0 if it's not a number
def int_or_zero(num):
    if isinstance(num, list):
        if len(num) < 1:
            return 0
        num = num[0]
    return convert_str_to_int(num)

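# Illustrative examples (assumed inputs, not part of the original module):
#
#   >>> int_or_zero('42')
#   42
#   >>> int_or_zero(['42'])
#   42
#   >>> int_or_zero('n/a')
#   0
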
def is_valid_lang(lang):
    is_abbr = (len(lang) == 2)
    if is_abbr:
        for l in language_codes:
            if l[0][:2] == lang.lower():
                return (True, l[0][:2], l[3].lower())
        return False
    else:
        for l in language_codes:
            if l[1].lower() == lang.lower():
                return (True, l[0][:2], l[3].lower())
        return False

def _get_lang_to_lc_dict(lang_list):
    key = str(lang_list)
    value = lang_to_lc_cache.get(key, None)
    if value is None:
        value = dict()
        for lc in lang_list:
            value.setdefault(lc.split('-')[0], lc)
        lang_to_lc_cache[key] = value
    return value

# auxiliary function to match lang_code in lang_list
def _match_language(lang_code, lang_list=[], custom_aliases={}):
    # replace language code with a custom alias if necessary
    if lang_code in custom_aliases:
        lang_code = custom_aliases[lang_code]

    if lang_code in lang_list:
        return lang_code

    # try to get the most likely country for this language
    subtags = get_global('likely_subtags').get(lang_code)
    if subtags:
        subtag_parts = subtags.split('_')
        new_code = subtag_parts[0] + '-' + subtag_parts[-1]
        if new_code in custom_aliases:
            new_code = custom_aliases[new_code]
        if new_code in lang_list:
            return new_code

    # try to get any supported country for this language
    return _get_lang_to_lc_dict(lang_list).get(lang_code, None)

# get the language code from lang_list that best matches locale_code
def match_language(locale_code, lang_list=[], custom_aliases={}, fallback='en-US'):
    # try to get language from given locale_code
    language = _match_language(locale_code, lang_list, custom_aliases)
    if language:
        return language

    locale_parts = locale_code.split('-')
    lang_code = locale_parts[0]

    # try to get language using an equivalent country code
    if len(locale_parts) > 1:
        country_alias = get_global('territory_aliases').get(locale_parts[-1])
        if country_alias:
            language = _match_language(lang_code + '-' + country_alias[0], lang_list, custom_aliases)
            if language:
                return language

    # try to get language using an equivalent language code
    alias = get_global('language_aliases').get(lang_code)
    if alias:
        language = _match_language(alias, lang_list, custom_aliases)
        if language:
            return language

    if lang_code != locale_code:
        # try to get language from given language without giving the country
        language = _match_language(lang_code, lang_list, custom_aliases)

    return language or fallback

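# Illustrative examples (assumed inputs, not part of the original module):
#
#   >>> match_language('pt-BR', ['fr-FR', 'pt-PT', 'pt-BR'])
#   'pt-BR'
#   >>> match_language('xx-XX', ['fr-FR', 'en-US'], fallback='en-US')
#   'en-US'
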
def load_module(filename, module_dir):
    modname = splitext(filename)[0]
    if modname in sys.modules:
        del sys.modules[modname]
    filepath = join(module_dir, filename)
    module = load_source(modname, filepath)
    module.name = modname
    return module

def new_hmac(secret_key, url):
    try:
        secret_key_bytes = bytes(secret_key, 'utf-8')
    except TypeError as err:
        if isinstance(secret_key, bytes):
            secret_key_bytes = secret_key
        else:
            raise err
    if sys.version_info[0] == 2:
        return hmac.new(bytes(secret_key), url, hashlib.sha256).hexdigest()
    else:
        return hmac.new(secret_key_bytes, url, hashlib.sha256).hexdigest()

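# Illustrative usage sketch (assumed inputs, not part of the original module):
# under Python 3 the url argument must already be bytes, e.g.
#
#   digest = new_hmac('some secret', u'https://example.com'.encode('utf-8'))
#
# which returns the SHA-256 HMAC as a 64-character hex digest string.
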
def to_string(obj):
    if isinstance(obj, basestring):
        return obj
    if isinstance(obj, Number):
        return unicode(obj)
    if hasattr(obj, '__str__'):
        return obj.__str__()
    if hasattr(obj, '__repr__'):
        return obj.__repr__()
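

# Illustrative examples (assumed inputs, not part of the original module):
#
#   >>> to_string(u'already text')
#   u'already text'
#   >>> to_string(3.14)
#   u'3.14'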