__init__.py

'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with searx. If not, see <http://www.gnu.org/licenses/>.

(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''

from os.path import realpath, dirname, splitext, join
from imp import load_source
import grequests
from itertools import izip_longest, chain
from operator import itemgetter
from urlparse import urlparse
from searx import settings
import ConfigParser
import sys
import re
from datetime import datetime

engine_dir = dirname(realpath(__file__))
searx_dir = join(engine_dir, '../../')

engines_config = ConfigParser.SafeConfigParser()
engines_config.read(join(searx_dir, 'engines.cfg'))
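
# engines.cfg is parsed with ConfigParser: every section configures one
# engine instance, the mandatory ``engine`` option names the module to load
# and every other option is copied onto the module as an attribute.
# An illustrative section (the engine name and option values are examples):
#
#   [duckduckgo]
#   engine     = duckduckgo
#   categories = general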

number_of_searches = 0
engines = {}
categories = {'general': []}


def load_module(filename):
    modname = splitext(filename)[0]
    # drop any previously imported module of the same name, so the engine
    # file is always loaded fresh
    if modname in sys.modules:
        del sys.modules[modname]
    filepath = join(engine_dir, filename)
    module = load_source(modname, filepath)
    module.name = modname
    return module
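

# An engine module only has to define a request() function that fills in
# request_params (at least 'url') and a response() function that turns the
# downloaded page into a list of result dicts. A minimal sketch -- the module
# name, base_url and the result fields below are illustrative, not required:
#
#   # dummy.py
#   base_url = None  # must be set in engines.cfg: attributes that are still
#                    # None after configuration abort startup below
#
#   def request(query, params):
#       params['url'] = base_url + '?q=' + query
#       return params
#
#   def response(resp):
#       return [{'url': resp.url, 'title': 'dummy', 'content': resp.text}]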


if not engines_config.sections():
    print '[E] Error: no engines found. Edit your engines.cfg'
    exit(2)

# create and configure one engine instance per engines.cfg section
for engine_config_name in engines_config.sections():
    engine_data = engines_config.options(engine_config_name)
    engine = load_module(engines_config.get(engine_config_name, 'engine') + '.py')
    engine.name = engine_config_name
    for param_name in engine_data:
        if param_name == 'engine':
            continue
        if param_name == 'categories':
            if engines_config.get(engine_config_name, param_name) == 'none':
                engine.categories = []
            else:
                engine.categories = map(str.strip,
                                        engines_config.get(engine_config_name, param_name).split(','))
            continue
        setattr(engine, param_name, engines_config.get(engine_config_name, param_name))
    # an attribute that is still None was set neither by the module nor by
    # engines.cfg: treat it as a configuration error
    for engine_attr in dir(engine):
        if engine_attr.startswith('_'):
            continue
        if getattr(engine, engine_attr) is None:
            print '[E] Engine config error: Missing attribute "{0}.{1}"'.format(engine.name, engine_attr)
            sys.exit(1)
    engines[engine.name] = engine
    engine.stats = {'result_count': 0,
                    'search_count': 0,
                    'page_load_time': 0,
                    'score_count': 0,
                    'errors': 0}
    if hasattr(engine, 'categories'):
        for category_name in engine.categories:
            categories.setdefault(category_name, []).append(engine)
    else:
        categories['general'].append(engine)


def default_request_params():
    return {'method': 'GET', 'headers': {}, 'data': {}, 'url': '', 'cookies': {}}
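
# every outgoing request starts from the defaults above; engine.request() is
# expected to fill in at least params['url'] -- requests with an empty url
# are silently skipped in search() below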


def make_callback(engine_name, results, suggestions, callback, params):
    # creating a callback wrapper for the search engine results
    def process_callback(response, **kwargs):
        cb_res = []
        response.search_params = params
        engines[engine_name].stats['page_load_time'] += \
            (datetime.now() - params['started']).total_seconds()
        try:
            search_results = callback(response)
        except Exception, e:
            # a failing engine must not break the whole search: record the
            # error and return an empty result list for this engine
            engines[engine_name].stats['errors'] += 1
            results[engine_name] = cb_res
            print '[E] Error with engine "{0}":\n\t{1}'.format(engine_name, str(e))
            return
        for result in search_results:
            result['engine'] = engine_name
            if 'suggestion' in result:
                # TODO type checks
                suggestions.add(result['suggestion'])
                continue
            cb_res.append(result)
        results[engine_name] = cb_res
    return process_callback
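
# process_callback is attached as a requests response hook in search() below,
# so it runs as soon as grequests has downloaded the engine's page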


def highlight_content(content, query):
    if not content:
        return None
    # ignoring html contents
    # TODO better html content detection
    if content.find('<') != -1:
        return content
    query = query.decode('utf-8')
    if content.lower().find(query.lower()) > -1:
        query_regex = u'({0})'.format(re.escape(query))
        content = re.sub(query_regex, '<b>\\1</b>', content, flags=re.I | re.U)
    else:
        regex_parts = []
        for chunk in query.split():
            if len(chunk) == 1:
                regex_parts.append(u'\W+{0}\W+'.format(re.escape(chunk)))
            else:
                regex_parts.append(u'{0}'.format(re.escape(chunk)))
        query_regex = u'({0})'.format('|'.join(regex_parts))
        content = re.sub(query_regex, '<b>\\1</b>', content, flags=re.I | re.U)
    return content
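
# e.g. highlight_content('free software', 'software') -> 'free <b>software</b>';
# if the query does not occur verbatim in the content, every query word is
# highlighted on its own instead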


def score_results(results):
    # interleave the per-engine result lists, so every engine's first result
    # precedes any engine's second result
    flat_res = filter(None, chain.from_iterable(izip_longest(*results.values())))
    flat_len = len(flat_res)
    engines_len = len(results)
    results = []
    # deduplication + scoring
    for i, res in enumerate(flat_res):
        res['parsed_url'] = urlparse(res['url'])
        res['engines'] = [res['engine']]
        weight = 1.0
        if hasattr(engines[res['engine']], 'weight'):
            weight = float(engines[res['engine']].weight)
        elif res['engine'] in settings.weights:
            weight = float(settings.weights[res['engine']])
        score = int((flat_len - i) / engines_len) * weight + 1
        duplicated = False
        # urls pointing to the same page (ignoring a trailing slash) count
        # as duplicates of each other
        for new_res in results:
            p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path
            p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path
            if res['parsed_url'].netloc == new_res['parsed_url'].netloc and\
               p1 == p2 and\
               res['parsed_url'].query == new_res['parsed_url'].query and\
               res.get('template') == new_res.get('template'):
                duplicated = new_res
                break
        if duplicated:
            # merge duplicates: keep the longer content, add up the scores
            # and prefer the https version of the url
            if len(res.get('content', '')) > len(duplicated.get('content', '')):
                duplicated['content'] = res['content']
            duplicated['score'] += score
            duplicated['engines'].append(res['engine'])
            if duplicated['parsed_url'].scheme == 'https':
                continue
            elif res['parsed_url'].scheme == 'https':
                duplicated['url'] = res['parsed_url'].geturl()
                duplicated['parsed_url'] = res['parsed_url']
        else:
            res['score'] = score
            results.append(res)
    return sorted(results, key=itemgetter('score'), reverse=True)
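
# the position-based score decays with integer division: with two engines and
# six interleaved results of weight w, positions 0-5 score 3w+1, 2w+1, 2w+1,
# w+1, w+1 and 1 respectively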


def search(query, request, selected_engines):
    global engines, categories, number_of_searches
    requests = []
    results = {}
    suggestions = set()
    number_of_searches += 1
    user_agent = request.headers.get('User-Agent', '')
    # selected_engines is a list of {'name': ..., 'category': ...} dicts
    for selected_engine in selected_engines:
        if selected_engine['name'] not in engines:
            continue
        engine = engines[selected_engine['name']]
        request_params = default_request_params()
        request_params['headers']['User-Agent'] = user_agent
        request_params['category'] = selected_engine['category']
        request_params['started'] = datetime.now()
        request_params = engine.request(query, request_params)
        callback = make_callback(selected_engine['name'],
                                 results,
                                 suggestions,
                                 engine.response,
                                 request_params)
        request_args = dict(headers=request_params['headers'],
                            hooks=dict(response=callback),
                            cookies=request_params['cookies'],
                            timeout=settings.request_timeout)
        if request_params['method'] == 'GET':
            req = grequests.get
        else:
            req = grequests.post
            request_args['data'] = request_params['data']
        # ignoring empty urls
        if not request_params['url']:
            continue
        requests.append(req(request_params['url'], **request_args))
    # send all requests concurrently and wait until every callback has run
    grequests.map(requests)
    for engine_name, engine_results in results.items():
        engines[engine_name].stats['search_count'] += 1
        engines[engine_name].stats['result_count'] += len(engine_results)
    results = score_results(results)
    for result in results:
        if 'content' in result:
            result['content'] = highlight_content(result['content'], query)
        # credit the score to every engine that returned this result
        for res_engine in result['engines']:
            engines[res_engine].stats['score_count'] += result['score']
    return results, suggestions
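
# get_engines_stats() below feeds the stats page: every 'percentage' value is
# relative to the best-performing engine in its column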


def get_engines_stats():
    pageloads = []
    results = []
    scores = []
    errors = []
    max_pageload = max_results = max_score = max_errors = 0
    for engine in engines.values():
        if engine.stats['search_count'] == 0:
            continue
        results_num = engine.stats['result_count'] / float(engine.stats['search_count'])
        load_times = engine.stats['page_load_time'] / float(engine.stats['search_count'])
        if results_num:
            score = engine.stats['score_count'] / float(engine.stats['search_count'])
        else:
            score = 0
        max_results = max(results_num, max_results)
        max_pageload = max(load_times, max_pageload)
        max_score = max(score, max_score)
        max_errors = max(max_errors, engine.stats['errors'])
        pageloads.append({'avg': load_times, 'name': engine.name})
        results.append({'avg': results_num, 'name': engine.name})
        scores.append({'avg': score, 'name': engine.name})
        errors.append({'avg': engine.stats['errors'], 'name': engine.name})
    # any of the maxima can still be zero, so guard all of the divisions
    for engine in pageloads:
        engine['percentage'] = int(engine['avg'] / max_pageload * 100) if max_pageload else 0
    for engine in results:
        engine['percentage'] = int(engine['avg'] / max_results * 100) if max_results else 0
    for engine in scores:
        engine['percentage'] = int(engine['avg'] / max_score * 100) if max_score else 0
    for engine in errors:
        engine['percentage'] = int(engine['avg'] / max_errors * 100) if max_errors else 0
    return [('Page loads (sec)', sorted(pageloads, key=itemgetter('avg'))),
            ('Number of results', sorted(results, key=itemgetter('avg'), reverse=True)),
            ('Scores', sorted(scores, key=itemgetter('avg'), reverse=True)),
            ('Errors', sorted(errors, key=itemgetter('avg'), reverse=True))]