# INA (Videos)
#
# @website     https://www.ina.fr/
# @provide-api no
#
# @using-api   no
# @results     HTML (using search portal)
# @stable      no (HTML can change)
# @parse       url, title, content, publishedDate, thumbnail
#
# @todo set content-parameter with correct data
# @todo embedded (needs some md5 from video page)
# stdlib
from html import unescape
from html.parser import HTMLParser
from json import loads
from urllib.parse import urlencode

# third-party
from dateutil import parser
from lxml import html

# project
from searx.engines.xpath import extract_text
# engine dependent config
categories = ['videos']  # searx result category this engine feeds
paging = True            # engine supports page-by-page requests
page_size = 48           # results per request (the hf= query parameter)

# search-url
base_url = 'https://www.ina.fr'
# ajax endpoint; the reply is a JSON envelope whose "content" member is an
# HTML fragment.  Placeholders: ps = page size, start = result offset,
# query = urlencoded q parameter.
search_url = base_url + '/layout/set/ajax/recherche/result?autopromote=&hf={ps}&b={start}&type=Video&r=&{query}'

# specific xpath variables — results_xpath is absolute, the rest are
# evaluated relative to each result node
results_xpath = '//div[contains(@class,"search-results--list")]//div[@class="media-body"]'
url_xpath = './/a/@href'
title_xpath = './/h3[@class="h3--title media-heading"]'
thumbnail_xpath = './/img/@src'
publishedDate_xpath = './/span[@class="broadcast"]'
content_xpath = './/p[@class="media-body__summary"]'
  33. # do search-request
  34. def request(query, params):
  35. params['url'] = search_url.format(ps=page_size,
  36. start=params['pageno'] * page_size,
  37. query=urlencode({'q': query}))
  38. return params
  39. # get response from search-request
  40. def response(resp):
  41. results = []
  42. # we get html in a JSON container...
  43. response = loads(resp.text)
  44. if "content" not in response:
  45. return []
  46. dom = html.fromstring(response["content"])
  47. p = HTMLParser()
  48. # parse results
  49. for result in dom.xpath(results_xpath):
  50. videoid = result.xpath(url_xpath)[0]
  51. url = base_url + videoid
  52. title = p.unescape(extract_text(result.xpath(title_xpath)))
  53. try:
  54. thumbnail = extract_text(result.xpath(thumbnail_xpath)[0])
  55. except:
  56. thumbnail = ''
  57. if thumbnail and thumbnail[0] == '/':
  58. thumbnail = base_url + thumbnail
  59. d = extract_text(result.xpath(publishedDate_xpath)[0])
  60. d = d.split('/')
  61. # force ISO date to avoid wrong parsing
  62. d = "%s-%s-%s" % (d[2], d[1], d[0])
  63. publishedDate = parser.parse(d)
  64. content = extract_text(result.xpath(content_xpath))
  65. # append result
  66. results.append({'url': url,
  67. 'title': title,
  68. 'content': content,
  69. 'template': 'videos.html',
  70. 'publishedDate': publishedDate,
  71. 'thumbnail': thumbnail})
  72. # return results
  73. return results