  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """
  3. INA (Videos)
  4. """
  5. from json import loads
  6. from html import unescape
  7. from urllib.parse import urlencode
  8. from lxml import html
  9. from dateutil import parser
  10. from searx.utils import extract_text
  11. # about
  12. about = {
  13. "website": 'https://www.ina.fr/',
  14. "wikidata_id": 'Q1665109',
  15. "official_api_documentation": None,
  16. "use_official_api": False,
  17. "require_api_key": False,
  18. "results": 'HTML',
  19. "language": 'fr',
  20. }
  21. # engine dependent config
  22. categories = ['videos']
  23. paging = True
  24. page_size = 48
  25. # search-url
  26. base_url = 'https://www.ina.fr'
  27. search_url = base_url + '/layout/set/ajax/recherche/result?autopromote=&hf={ps}&b={start}&type=Video&r=&{query}'
  28. # specific xpath variables
  29. results_xpath = '//div[contains(@class,"search-results--list")]//div[@class="media-body"]'
  30. url_xpath = './/a/@href'
  31. title_xpath = './/h3[@class="h3--title media-heading"]'
  32. thumbnail_xpath = './/img/@src'
  33. publishedDate_xpath = './/span[@class="broadcast"]'
  34. content_xpath = './/p[@class="media-body__summary"]'
  35. # do search-request
  36. def request(query, params):
  37. params['url'] = search_url.format(ps=page_size, start=params['pageno'] * page_size, query=urlencode({'q': query}))
  38. return params
  39. # get response from search-request
  40. def response(resp):
  41. results = []
  42. # we get html in a JSON container...
  43. response = loads(resp.text)
  44. dom = html.fromstring(response)
  45. # parse results
  46. for result in dom.xpath(results_xpath):
  47. videoid = result.xpath(url_xpath)[0]
  48. url = base_url + videoid
  49. title = unescape(extract_text(result.xpath(title_xpath)))
  50. try:
  51. thumbnail = extract_text(result.xpath(thumbnail_xpath)[0])
  52. except:
  53. thumbnail = ''
  54. if thumbnail and thumbnail[0] == '/':
  55. thumbnail = base_url + thumbnail
  56. d = extract_text(result.xpath(publishedDate_xpath)[0])
  57. d = d.split('/')
  58. # force ISO date to avoid wrong parsing
  59. d = "%s-%s-%s" % (d[2], d[1], d[0])
  60. publishedDate = parser.parse(d)
  61. content = extract_text(result.xpath(content_xpath))
  62. # append result
  63. results.append(
  64. {
  65. 'url': url,
  66. 'title': title,
  67. 'content': content,
  68. 'template': 'videos.html',
  69. 'publishedDate': publishedDate,
  70. 'thumbnail': thumbnail,
  71. }
  72. )
  73. # return results
  74. return results