google_videos.py 2.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697
  1. """
  2. Google (Videos)
  3. @website https://www.google.com
  4. @provide-api yes (https://developers.google.com/custom-search/)
  5. @using-api no
  6. @results HTML
  7. @stable no
  8. @parse url, title, content, thumbnail
  9. """
  10. from datetime import date, timedelta
  11. from json import loads
  12. from urllib.parse import urlencode
  13. from lxml import html
  14. from searx.engines.xpath import extract_text
  15. import re
  16. # engine dependent config
  17. categories = ['videos']
  18. paging = True
  19. safesearch = True
  20. time_range_support = True
  21. number_of_results = 10
  22. search_url = 'https://www.google.com/search'\
  23. '?q={query}'\
  24. '&tbm=vid'\
  25. '&{search_options}'
  26. time_range_attr = "qdr:{range}"
  27. time_range_custom_attr = "cdr:1,cd_min:{start},cd_max{end}"
  28. time_range_dict = {'day': 'd',
  29. 'week': 'w',
  30. 'month': 'm'}
  31. # do search-request
  32. def request(query, params):
  33. search_options = {
  34. 'ijn': params['pageno'] - 1,
  35. 'start': (params['pageno'] - 1) * number_of_results
  36. }
  37. if params['time_range'] in time_range_dict:
  38. search_options['tbs'] = time_range_attr.format(range=time_range_dict[params['time_range']])
  39. elif params['time_range'] == 'year':
  40. now = date.today()
  41. then = now - timedelta(days=365)
  42. start = then.strftime('%m/%d/%Y')
  43. end = now.strftime('%m/%d/%Y')
  44. search_options['tbs'] = time_range_custom_attr.format(start=start, end=end)
  45. if safesearch and params['safesearch']:
  46. search_options['safe'] = 'on'
  47. params['url'] = search_url.format(query=urlencode({'q': query}),
  48. search_options=urlencode(search_options))
  49. return params
  50. # get response from search-request
  51. def response(resp):
  52. results = []
  53. dom = html.fromstring(resp.text)
  54. # parse results
  55. for result in dom.xpath('//div[@class="g"]'):
  56. title = extract_text(result.xpath('.//h3'))
  57. url = result.xpath('.//div[@class="r"]/a/@href')[0]
  58. content = extract_text(result.xpath('.//span[@class="st"]'))
  59. # get thumbnails
  60. script = str(dom.xpath('//script[contains(., "_setImagesSrc")]')[0].text)
  61. ids = result.xpath('.//div[@class="s"]//img/@id')
  62. if len(ids) > 0:
  63. thumbnails_data = \
  64. re.findall('s=\'(.*?)(?:\\\\[a-z,1-9,\\\\]+\'|\')\;var ii=\[(?:|[\'vidthumb\d+\',]+)\'' + ids[0],
  65. script)
  66. tmp = []
  67. if len(thumbnails_data) != 0:
  68. tmp = re.findall('(data:image/jpeg;base64,[a-z,A-Z,0-9,/,\+]+)', thumbnails_data[0])
  69. thumbnail = ''
  70. if len(tmp) != 0:
  71. thumbnail = tmp[-1]
  72. # append result
  73. results.append({'url': url,
  74. 'title': title,
  75. 'content': content,
  76. 'thumbnail': thumbnail,
  77. 'template': 'videos.html'})
  78. return results