findx.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115
"""
FindX (General, Images, Videos)
@website https://www.findx.com
@provide-api no
@using-api no
@results HTML
@stable no
@parse url, title, content, embedded, img_src, thumbnail_src
"""
  10. from dateutil import parser
  11. from json import loads
  12. import re
  13. from lxml import html
  14. from searx import logger
  15. from searx.engines.xpath import extract_text
  16. from searx.engines.youtube_noapi import base_youtube_url, embedded_url
  17. from searx.url_utils import urlencode
  18. paging = True
  19. results_xpath = '//script[@id="initial-state"]'
  20. search_url = 'https://www.findx.com/{category}?{q}'
  21. type_map = {
  22. 'none': 'web',
  23. 'general': 'web',
  24. 'images': 'images',
  25. 'videos': 'videos',
  26. }
  27. def request(query, params):
  28. params['url'] = search_url.format(
  29. category=type_map[params['category']],
  30. q=urlencode({
  31. 'q': query,
  32. 'page': params['pageno']
  33. })
  34. )
  35. return params
  36. def response(resp):
  37. dom = html.fromstring(resp.text)
  38. results_raw_json = dom.xpath(results_xpath)
  39. results_json = loads(extract_text(results_raw_json))
  40. if len(results_json['web']['results']) > 0:
  41. return _general_results(results_json['web']['results']['webSearch']['results'])
  42. if len(results_json['images']['results']) > 0:
  43. return _images_results(results_json['images']['results'])
  44. if len(results_json['video']['results']) > 0:
  45. return _videos_results(results_json['video']['results'])
  46. return []
  47. def _general_results(general_results):
  48. results = []
  49. for result in general_results:
  50. results.append({
  51. 'url': result['url'],
  52. 'title': result['title'],
  53. 'content': result['sum'],
  54. })
  55. return results
  56. def _images_results(image_results):
  57. results = []
  58. for result in image_results:
  59. results.append({
  60. 'url': result['sourceURL'],
  61. 'title': result['title'],
  62. 'content': result['source'],
  63. 'thumbnail_src': _extract_url(result['assets']['thumb']['url']),
  64. 'img_src': _extract_url(result['assets']['file']['url']),
  65. 'template': 'images.html',
  66. })
  67. return results
  68. def _videos_results(video_results):
  69. results = []
  70. for result in video_results:
  71. if not result['kind'].startswith('youtube'):
  72. logger.warn('Unknown video kind in findx: {}'.format(result['kind']))
  73. continue
  74. description = result['snippet']['description']
  75. if len(description) > 300:
  76. description = description[:300] + '...'
  77. results.append({
  78. 'url': base_youtube_url + result['id'],
  79. 'title': result['snippet']['title'],
  80. 'content': description,
  81. 'thumbnail': _extract_url(result['snippet']['thumbnails']['default']['url']),
  82. 'publishedDate': parser.parse(result['snippet']['publishedAt']),
  83. 'embedded': embedded_url.format(videoid=result['id']),
  84. 'template': 'videos.html',
  85. })
  86. return results
  87. def _extract_url(url):
  88. matching = re.search('(/https?://[^)]+)', url)
  89. if matching:
  90. return matching.group(0)[1:]
  91. return ''