ina.py 2.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283
  1. # INA (Videos)
  2. #
  3. # @website https://www.ina.fr/
  4. # @provide-api no
  5. #
  6. # @using-api no
  7. # @results HTML (using search portal)
  8. # @stable no (HTML can change)
  9. # @parse url, title, content, publishedDate, thumbnail
  10. #
  11. # @todo set content-parameter with correct data
  12. # @todo embedded (needs some md5 from video page)
  13. from json import loads
  14. from urllib import urlencode
  15. from lxml import html
  16. from HTMLParser import HTMLParser
  17. from searx.engines.xpath import extract_text
  18. from dateutil import parser
  19. # engine dependent config
  20. categories = ['videos']
  21. paging = True
  22. page_size = 48
  23. # search-url
  24. base_url = 'https://www.ina.fr'
  25. search_url = base_url + '/layout/set/ajax/recherche/result?autopromote=&hf={ps}&b={start}&type=Video&r=&{query}'
  26. # specific xpath variables
  27. results_xpath = '//div[contains(@class,"search-results--list")]/div[@class="media"]'
  28. url_xpath = './/a/@href'
  29. title_xpath = './/h3[@class="h3--title media-heading"]'
  30. thumbnail_xpath = './/img/@src'
  31. publishedDate_xpath = './/span[@class="broadcast"]'
  32. content_xpath = './/p[@class="media-body__summary"]'
  33. # do search-request
  34. def request(query, params):
  35. params['url'] = search_url.format(ps=page_size,
  36. start=params['pageno'] * page_size,
  37. query=urlencode({'q': query}))
  38. return params
  39. # get response from search-request
  40. def response(resp):
  41. results = []
  42. # we get html in a JSON container...
  43. response = loads(resp.text)
  44. if "content" not in response:
  45. return []
  46. dom = html.fromstring(response["content"])
  47. p = HTMLParser()
  48. # parse results
  49. for result in dom.xpath(results_xpath):
  50. videoid = result.xpath(url_xpath)[0]
  51. url = base_url + videoid
  52. title = p.unescape(extract_text(result.xpath(title_xpath)))
  53. thumbnail = extract_text(result.xpath(thumbnail_xpath)[0])
  54. if thumbnail[0] == '/':
  55. thumbnail = base_url + thumbnail
  56. d = extract_text(result.xpath(publishedDate_xpath)[0])
  57. d = d.split('/')
  58. # force ISO date to avoid wrong parsing
  59. d = "%s-%s-%s" % (d[2], d[1], d[0])
  60. publishedDate = parser.parse(d)
  61. content = extract_text(result.xpath(content_xpath))
  62. # append result
  63. results.append({'url': url,
  64. 'title': title,
  65. 'content': content,
  66. 'template': 'videos.html',
  67. 'publishedDate': publishedDate,
  68. 'thumbnail': thumbnail})
  69. # return results
  70. return results