soundcloud.py 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107
  1. """
  2. Soundcloud (Music)
  3. @website https://soundcloud.com
  4. @provide-api yes (https://developers.soundcloud.com/)
  5. @using-api yes
  6. @results JSON
  7. @stable yes
  8. @parse url, title, content, publishedDate, embedded
  9. """
  10. import re
  11. from json import loads
  12. from lxml import html
  13. from dateutil import parser
  14. from urllib.parse import quote_plus, urlencode
  15. from searx import logger
  16. from searx.poolrequests import get as http_get
  # Engine configuration: result category and paging support.
  categories = ['music']
  paging = True

  # Search URL template for the SoundCloud v2 API.
  # Attributes not sent with the request: user_id, app_version, app_locale
  url = 'https://api-v2.soundcloud.com/'
  search_url = ''.join((
      url,
      'search?{query}',
      '&variant_ids=',
      '&facet=model',
      '&limit=20',
      '&offset={offset}',
      '&linked_partitioning=1',
      '&client_id={client_id}',
  ))

  # HTML snippet for the embedded player; ``{uri}`` receives the URL-quoted
  # resource URI of a result.
  embedded_url = (
      '<iframe width="100%" height="166" '
      'scrolling="no" frameborder="no" '
      'data-src="https://w.soundcloud.com/player/?url={uri}"></iframe>'
  )

  # Captures the value of a ``client_id:"..."`` assignment in a script body.
  cid_re = re.compile(r'client_id:"([^"]*)"', re.IGNORECASE | re.UNICODE)

  # Guest client id sent with every search request; empty until init() sets it.
  guest_client_id = ''
  def get_client_id():
      """Scrape a guest ``client_id`` from SoundCloud's public web assets.

      Fetches the SoundCloud home page, collects the ``src`` of every
      ``<script>`` element whose ``src`` contains ``/assets/`` and downloads
      those scripts one by one, in the order the XPath query returned them,
      until one matches ``cid_re``.

      Returns the first captured client id. If the home page request fails or
      no script yields a match, a warning is logged and ``""`` is returned.
      """
      homepage = http_get("https://soundcloud.com")

      if homepage.ok:
          document = html.fromstring(homepage.content)
          # The scripts moved from the /assets/app/ path to /assets/; the
          # client_id was seen in
          # https://a-v2.sndcdn.com/assets/49-a0c01933-3.js
          script_urls = [
              node.get('src')
              for node in document.xpath("//script[contains(@src, '/assets/')]")
          ]

          for script_url in script_urls:
              # Download the script and look for the client id in its body.
              script = http_get(script_url)
              if not script.ok:
                  continue
              match = cid_re.search(script.content.decode())
              if match is not None:
                  return match.group(1)

      logger.warning("Unable to fetch guest client_id from SoundCloud, check parser!")
      return ""
  def init(engine_settings=None):
      """Look up the guest client id and store it in ``guest_client_id``.

      ``engine_settings`` is accepted but not read by this function.
      """
      global guest_client_id

      # The scraped guest client id serves as the API key for search requests.
      guest_client_id = get_client_id()
  57. # do search-request
  def request(query, params):
      """Set ``params['url']`` to the search URL for ``query`` and return ``params``.

      The offset is derived from the 1-based ``params['pageno']`` with a page
      size of 20, matching the ``limit=20`` hard-coded in ``search_url``.
      """
      page_size = 20
      start = page_size * (params['pageno'] - 1)
      query_string = urlencode({'q': query})

      params['url'] = search_url.format(
          query=query_string,
          offset=start,
          client_id=guest_client_id,
      )
      return params
  64. # get response from search-request
  def response(resp):
      """Turn a search response into a list of result dicts.

      ``resp.text`` is parsed as JSON. Every entry of its ``collection`` list
      (treated as empty when the key is absent) whose ``kind`` is ``'track'``
      or ``'playlist'`` becomes one dict with the keys ``url``, ``title``,
      ``publishedDate``, ``embedded`` and ``content``. Other kinds are skipped.
      """
      payload = loads(resp.text)
      wanted_kinds = ('track', 'playlist')
      results = []

      for item in payload.get('collection', []):
          if item['kind'] not in wanted_kinds:
              continue

          title = item['title']
          # A null/empty description is normalised to an empty string.
          content = item['description'] or ''
          published = parser.parse(item['last_modified'])
          # The resource URI is URL-quoted before it goes into the player URL.
          player_html = embedded_url.format(uri=quote_plus(item['uri']))

          results.append({
              'url': item['permalink_url'],
              'title': title,
              'publishedDate': published,
              'embedded': player_html,
              'content': content,
          })

      return results