# filecrop.py -- filecrop.com search engine plugin (legacy Python 2 searx engine)
  1. from json import loads
  2. from urllib import urlencode
  3. from searx.utils import html_to_text
  4. from HTMLParser import HTMLParser
  5. url = 'http://www.filecrop.com/'
  6. search_url = url + '/search.php?w={query}&size_i=0&size_f=100000000&engine_r=1&engine_d=1&engine_e=1&engine_4=1&engine_m=1'
  7. class FilecropResultParser(HTMLParser):
  8. def __init__(self):
  9. HTMLParser.__init__(self)
  10. self.__start_processing = False
  11. self.results = []
  12. self.result = {}
  13. self.tr_counter = 0
  14. self.data_counter = 0
  15. def handle_starttag(self, tag, attrs):
  16. if tag == 'tr':
  17. if ('bgcolor', '#edeff5') in attrs or ('bgcolor', '#ffffff') in attrs:
  18. self.__start_processing = True
  19. if not self.__start_processing:
  20. return
  21. if tag == 'label':
  22. self.result['title'] = [attr[1] for attr in attrs if attr[0] == 'title'][0]
  23. elif tag == 'a' and ('rel', 'nofollow') in attrs and ('class', 'sourcelink') in attrs:
  24. if 'content' in self.result:
  25. self.result['content'] += [attr[1] for attr in attrs if attr[0] == 'title'][0]
  26. else:
  27. self.result['content'] = [attr[1] for attr in attrs if attr[0] == 'title'][0]
  28. self.result['content'] += ' '
  29. elif tag == 'a':
  30. self.result['url'] = url + [attr[1] for attr in attrs if attr[0] == 'href'][0]
  31. def handle_endtag(self, tag):
  32. if self.__start_processing is False:
  33. return
  34. if tag == 'tr':
  35. self.tr_counter += 1
  36. if self.tr_counter == 2:
  37. self.__start_processing = False
  38. self.tr_counter = 0
  39. self.data_counter = 0
  40. self.results.append(self.result)
  41. self.result = {}
  42. def handle_data(self, data):
  43. if not self.__start_processing:
  44. return
  45. print data
  46. if 'content' in self.result:
  47. self.result['content'] += data + ' '
  48. else:
  49. self.result['content'] = data + ' '
  50. self.data_counter += 1
  51. def request(query, params):
  52. params['url'] = search_url.format(query=urlencode({'q': query}))
  53. return params
  54. def response(resp):
  55. parser = FilecropResultParser()
  56. parser.feed(resp.text)
  57. return parser.results