# json_engine.py
from json import loads

try:  # Python 2 locations
    from urllib import urlencode
    from collections import Iterable
except ImportError:  # Python 3 locations
    from urllib.parse import urlencode
    from collections.abc import Iterable
  4. search_url = None
  5. url_query = None
  6. content_query = None
  7. title_query = None
  8. # suggestion_xpath = ''
  9. # parameters for engines with paging support
  10. #
  11. # number of results on each page
  12. # (only needed if the site requires not a page number, but an offset)
  13. page_size = 1
  14. # number of the first page (usually 0 or 1)
  15. first_page_num = 1
  16. def iterate(iterable):
  17. if type(iterable) == dict:
  18. it = iterable.iteritems()
  19. else:
  20. it = enumerate(iterable)
  21. for index, value in it:
  22. yield str(index), value
  23. def is_iterable(obj):
  24. if type(obj) == str:
  25. return False
  26. if type(obj) == unicode:
  27. return False
  28. return isinstance(obj, Iterable)
  29. def parse(query):
  30. q = []
  31. for part in query.split('/'):
  32. if part == '':
  33. continue
  34. else:
  35. q.append(part)
  36. return q
  37. def do_query(data, q):
  38. ret = []
  39. if not q:
  40. return ret
  41. qkey = q[0]
  42. for key, value in iterate(data):
  43. if len(q) == 1:
  44. if key == qkey:
  45. ret.append(value)
  46. elif is_iterable(value):
  47. ret.extend(do_query(value, q))
  48. else:
  49. if not is_iterable(value):
  50. continue
  51. if key == qkey:
  52. ret.extend(do_query(value, q[1:]))
  53. else:
  54. ret.extend(do_query(value, q))
  55. return ret
  56. def query(data, query_string):
  57. q = parse(query_string)
  58. return do_query(data, q)
  59. def request(query, params):
  60. query = urlencode({'q': query})[2:]
  61. fp = {'query': query}
  62. if paging and search_url.find('{pageno}') >= 0:
  63. fp['pageno'] = (params['pageno'] + first_page_num - 1) * page_size
  64. params['url'] = search_url.format(**fp)
  65. params['query'] = query
  66. return params
  67. def response(resp):
  68. results = []
  69. json = loads(resp.text)
  70. urls = query(json, url_query)
  71. contents = query(json, content_query)
  72. titles = query(json, title_query)
  73. for url, title, content in zip(urls, titles, contents):
  74. results.append({'url': url, 'title': title, 'content': content})
  75. return results