soundcloud.py 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104
  1. """
  2. Soundcloud (Music)
  3. @website https://soundcloud.com
  4. @provide-api yes (https://developers.soundcloud.com/)
  5. @using-api yes
  6. @results JSON
  7. @stable yes
  8. @parse url, title, content, publishedDate, embedded
  9. """
  10. import re
  11. from json import loads
  12. from lxml import html
  13. from dateutil import parser
  14. from searx import logger
  15. from searx.poolrequests import get as http_get
  16. from searx.url_utils import quote_plus, urlencode
  17. try:
  18. from cStringIO import StringIO
  19. except:
  20. from io import StringIO
  21. # engine dependent config
  22. categories = ['music']
  23. paging = True
  24. # search-url
  25. url = 'https://api.soundcloud.com/'
  26. search_url = url + 'search?{query}'\
  27. '&facet=model'\
  28. '&limit=20'\
  29. '&offset={offset}'\
  30. '&linked_partitioning=1'\
  31. '&client_id={client_id}' # noqa
  32. embedded_url = '<iframe width="100%" height="166" ' +\
  33. 'scrolling="no" frameborder="no" ' +\
  34. 'data-src="https://w.soundcloud.com/player/?url={uri}"></iframe>'
  35. cid_re = re.compile(r'client_id:"([^"]*)"', re.I | re.U)
  36. def get_client_id():
  37. response = http_get("https://soundcloud.com")
  38. if response.ok:
  39. tree = html.fromstring(response.content)
  40. script_tags = tree.xpath("//script[contains(@src, '/assets/app')]")
  41. app_js_urls = [script_tag.get('src') for script_tag in script_tags if script_tag is not None]
  42. # extracts valid app_js urls from soundcloud.com content
  43. for app_js_url in app_js_urls:
  44. # gets app_js and searches for the clientid
  45. response = http_get(app_js_url)
  46. if response.ok:
  47. cids = cid_re.search(response.text)
  48. if cids is not None and len(cids.groups()):
  49. return cids.groups()[0]
  50. logger.warning("Unable to fetch guest client_id from SoundCloud, check parser!")
  51. return ""
# api-key: guest client id, scraped once when this module is imported;
# it is the empty string when get_client_id() could not find one.
guest_client_id = get_client_id()
  54. # do search-request
  55. def request(query, params):
  56. offset = (params['pageno'] - 1) * 20
  57. params['url'] = search_url.format(query=urlencode({'q': query}),
  58. offset=offset,
  59. client_id=guest_client_id)
  60. return params
  61. # get response from search-request
  62. def response(resp):
  63. results = []
  64. search_res = loads(resp.text)
  65. # parse results
  66. for result in search_res.get('collection', []):
  67. if result['kind'] in ('track', 'playlist'):
  68. title = result['title']
  69. content = result['description']
  70. publishedDate = parser.parse(result['last_modified'])
  71. uri = quote_plus(result['uri'])
  72. embedded = embedded_url.format(uri=uri)
  73. # append result
  74. results.append({'url': result['permalink_url'],
  75. 'title': title,
  76. 'publishedDate': publishedDate,
  77. 'embedded': embedded,
  78. 'content': content})
  79. # return results
  80. return results