ina.py

# INA (Videos)
#
# @website https://www.ina.fr/
# @provide-api no
#
# @using-api no
# @results HTML (using search portal)
# @stable no (HTML can change)
# @parse url, title, content, publishedDate, thumbnail
#
# @todo set content-parameter with correct data
# @todo embedded (needs some md5 from video page)

from json import loads
from html import unescape
from urllib.parse import urlencode

from lxml import html
from dateutil import parser

from searx.utils import extract_text

# engine dependent config
categories = ['videos']
paging = True
page_size = 48

# search-url
base_url = 'https://www.ina.fr'
search_url = base_url + '/layout/set/ajax/recherche/result?autopromote=&hf={ps}&b={start}&type=Video&r=&{query}'
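# Query-string layout (inferred from how request() fills it in below): `hf`
# appears to be the number of hits per page, `b` the zero-based result offset,
# `type=Video` restricts the portal to videos, and the user query is appended
# URL-encoded as `q=...`.
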
# specific xpath variables
results_xpath = '//div[contains(@class,"search-results--list")]//div[@class="media-body"]'
url_xpath = './/a/@href'
title_xpath = './/h3[@class="h3--title media-heading"]'
thumbnail_xpath = './/img/@src'
publishedDate_xpath = './/span[@class="broadcast"]'
content_xpath = './/p[@class="media-body__summary"]'

# do search-request
def request(query, params):
    params['url'] = search_url.format(ps=page_size,
                                      start=params['pageno'] * page_size,
                                      query=urlencode({'q': query}))

    return params

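# For illustration (assuming searx's usual 1-based `pageno`), a search for
# "journal" on the first page would request:
#   https://www.ina.fr/layout/set/ajax/recherche/result?autopromote=&hf=48&b=48&type=Video&r=&q=journal
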
# get response from search-request
def response(resp):
    results = []

    # we get html in a JSON container...
    response = loads(resp.text)
    if "content" not in response:
        return []

    dom = html.fromstring(response["content"])

    # parse results
    for result in dom.xpath(results_xpath):
        videoid = result.xpath(url_xpath)[0]
        url = base_url + videoid
        title = unescape(extract_text(result.xpath(title_xpath)))
        try:
            thumbnail = extract_text(result.xpath(thumbnail_xpath)[0])
        except IndexError:
            # result has no <img> thumbnail
            thumbnail = ''
        if thumbnail and thumbnail[0] == '/':
            thumbnail = base_url + thumbnail

        # the broadcast date comes as DD/MM/YYYY; rebuild it as ISO
        # (YYYY-MM-DD) to avoid wrong parsing
        d = extract_text(result.xpath(publishedDate_xpath)[0])
        d = d.split('/')
        d = "%s-%s-%s" % (d[2], d[1], d[0])
        publishedDate = parser.parse(d)

        content = extract_text(result.xpath(content_xpath))
        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'template': 'videos.html',
                        'publishedDate': publishedDate,
                        'thumbnail': thumbnail})

    # return results
    return results
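

# Minimal offline check (a sketch, not part of the searx engine interface):
# response() only needs an object exposing a `.text` attribute holding the
# JSON payload, so a SimpleNamespace stands in for the real HTTP response.
# The sample payload below is hypothetical.
if __name__ == '__main__':
    from types import SimpleNamespace

    sample = '{"content": "<div class=\\"search-results--list\\"></div>"}'
    # the fragment contains no "media-body" entries, so this prints []
    print(response(SimpleNamespace(text=sample)))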