google_videos.py 3.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697
  1. """
  2. Google (Videos)
  3. @website https://www.google.com
  4. @provide-api yes (https://developers.google.com/custom-search/)
  5. @using-api no
  6. @results HTML
  7. @stable no
  8. @parse url, title, content, thumbnail
  9. """
  10. from datetime import date, timedelta
  11. from json import loads
  12. from lxml import html
  13. from searx.engines import logger
  14. from searx.engines.xpath import extract_text
  15. from searx.url_utils import urlencode
  16. import re
  17. # engine dependent config
  18. categories = ['videos']
  19. paging = True
  20. safesearch = True
  21. time_range_support = True
  22. number_of_results = 10
  23. search_url = 'https://www.google.com/search'\
  24. '?q={query}'\
  25. '&tbm=vid'\
  26. '&{search_options}'
  27. time_range_attr = "qdr:{range}"
  28. time_range_custom_attr = "cdr:1,cd_min:{start},cd_max{end}"
  29. time_range_dict = {'day': 'd',
  30. 'week': 'w',
  31. 'month': 'm'}
  32. # do search-request
  33. def request(query, params):
  34. search_options = {
  35. 'ijn': params['pageno'] - 1,
  36. 'start': (params['pageno'] - 1) * number_of_results
  37. }
  38. if params['time_range'] in time_range_dict:
  39. search_options['tbs'] = time_range_attr.format(range=time_range_dict[params['time_range']])
  40. elif params['time_range'] == 'year':
  41. now = date.today()
  42. then = now - timedelta(days=365)
  43. start = then.strftime('%m/%d/%Y')
  44. end = now.strftime('%m/%d/%Y')
  45. search_options['tbs'] = time_range_custom_attr.format(start=start, end=end)
  46. if safesearch and params['safesearch']:
  47. search_options['safe'] = 'on'
  48. params['url'] = search_url.format(query=urlencode({'q': query}),
  49. search_options=urlencode(search_options))
  50. return params
  51. # get response from search-request
  52. def response(resp):
  53. results = []
  54. dom = html.fromstring(resp.text)
  55. # parse results
  56. for result in dom.xpath('//div[@class="g"]'):
  57. title = extract_text(result.xpath('.//h3'))
  58. url = result.xpath('.//div[@class="r"]/a/@href')[0]
  59. content = extract_text(result.xpath('.//span[@class="st"]'))
  60. # get thumbnails
  61. script = str(dom.xpath('//script[contains(., "_setImagesSrc")]')[0].text)
  62. id = result.xpath('.//div[@class="s"]//img/@id')[0]
  63. thumbnails_data = re.findall('s=\'(.*?)(?:\\\\[a-z,1-9,\\\\]+\'|\')\;var ii=\[(?:|[\'vidthumb\d+\',]+)\'' + id,
  64. script)
  65. logger.debug('google video engine: ' + id + ' matched ' + str(len(thumbnails_data)) + ' times (thumbnail)')
  66. tmp = []
  67. if len(thumbnails_data) != 0:
  68. tmp = re.findall('(data:image/jpeg;base64,[a-z,A-Z,0-9,/,\+]+)', thumbnails_data[0])
  69. thumbnail = ''
  70. if len(tmp) != 0:
  71. thumbnail = tmp[-1]
  72. # append result
  73. results.append({'url': url,
  74. 'title': title,
  75. 'content': content,
  76. 'thumbnail': thumbnail,
  77. 'template': 'videos.html'})
  78. return results