soundcloud.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """SoundCloud is a German audio streaming service."""
  3. from __future__ import annotations
  4. import re
  5. import typing
  6. import datetime
  7. from urllib.parse import quote_plus, urlencode
  8. from dateutil import parser
  9. from lxml import html
  10. from searx.network import get as http_get
  11. from searx.enginelib import EngineCache
  if typing.TYPE_CHECKING:
      import logging

      # Declared for static type checkers only; nothing is bound to ``logger``
      # here at runtime (presumably it is provided from outside this module --
      # TODO confirm).
      logger: logging.Logger

  # Descriptive metadata about the engine and the service behind it.
  about = {
      "website": "https://soundcloud.com",
      "wikidata_id": "Q568769",
      "official_api_documentation": "https://developers.soundcloud.com/docs/api/guide",
      "use_official_api": False,
      "require_api_key": False,
      "results": "JSON",
  }

  categories = ["music"]
  paging = True

  search_url = "https://api-v2.soundcloud.com/search"
  """Search endpoint of the API that the HTML frontend of the regular web site
  talks to.  It is *not* the official (developer) API URL.
  """

  # Captures the value of a ``client_id:"..."`` assignment (case-insensitive).
  cid_re = re.compile(r'client_id:"([^"]*)"', re.IGNORECASE | re.UNICODE)

  # Page size; sent as ``limit`` and used to derive ``offset`` in ``request``.
  results_per_page = 10

  # Value sent as the ``facet`` query argument.
  soundcloud_facet = "model"

  # Primary language subtag -> value sent as the ``app_locale`` query argument.
  # Subtags that are not listed here fall back to "en" in ``request``.
  app_locale_map = {
      "de": "de",
      "en": "en",
      "es": "es",
      "fr": "fr",
      "oc": "fr",
      "it": "it",
      "nl": "nl",
      "pl": "pl",
      "szl": "pl",
      "pt": "pt_BR",
      "pap": "pt_BR",
      "sv": "sv",
  }

  CACHE: EngineCache
  """Persistent (SQLite) key/value cache whose values are deleted after
  ``expire`` seconds.  Only declared here; ``init`` binds the actual instance."""
  def request(query, params):
      """Store the SoundCloud search URL for *query* in ``params['url']``.

      The guest ``client_id`` is looked up in ``CACHE`` under the key
      ``guest_client_id``.  On a cache miss it is obtained from
      ``get_client_id()`` and written back to the cache, but only when a
      non-empty id was found.

      :param query: search terms, sent as the ``q`` argument.
      :param params: request parameters; ``pageno`` and ``language`` are read,
          ``url`` is written.
      :return: the same *params* dict.
      """
      # missing attributes: user_id, app_version
      # - user_id=451561-497874-703312-310156
      # - app_version=1740727428

      client_id = CACHE.get("guest_client_id")
      if client_id is None:
          client_id = get_client_id()
          if client_id:
              CACHE.set(key="guest_client_id", value=client_id)

      offset = (params['pageno'] - 1) * results_per_page
      # Only the part before the first "-" of the language tag is looked up.
      language = params["language"].split("-")[0]

      query_string = urlencode(
          {
              "q": query,
              "offset": offset,
              "limit": results_per_page,
              "facet": soundcloud_facet,
              "client_id": client_id,
              "app_locale": app_locale_map.get(language, "en"),
          }
      )
      params['url'] = search_url + "?" + query_string
      return params
  def response(resp):
      """Turn a SoundCloud search response into a list of result dicts.

      Only entries of the JSON ``collection`` whose ``kind`` is ``track`` or
      ``playlist`` and that have a non-empty ``permalink_url`` become results;
      all other entries are skipped.

      Optional fields (``kind``, ``uri``, ``artwork_url``, ``user``,
      ``duration``, ``playback_count``, ``description``, ``label_name``) may be
      missing or ``null`` without raising: the related result key is then set
      to ``None`` or, for ``iframe_src`` and ``length``, left out.

      :param resp: response object providing ``json()``.
      :return: list of result dicts (possibly empty).
      :raises KeyError: if an accepted entry lacks ``title`` or
          ``last_modified``.
      """
      results = []

      for item in resp.json().get("collection", []):
          # ``.get`` so that an entry without "kind" is skipped, not fatal.
          if item.get("kind") not in ("track", "playlist"):
              continue

          url = item.get("permalink_url")
          if not url:
              continue

          # "user" may be absent or null; normalise once for thumbnail/author.
          user = item.get("user") or {}
          content = (item.get("description"), item.get("label_name"))

          res = {
              "url": url,
              "title": item["title"],
              "content": " / ".join(c for c in content if c),
              "publishedDate": parser.parse(item["last_modified"]),
              # Fall back to the uploader's avatar when there is no artwork.
              "thumbnail": item.get("artwork_url") or user.get("avatar_url") or None,
              # Zero / missing play counts are reported as None.
              "views": item.get("playback_count") or None,
              "author": user.get("full_name") or None,
          }

          # The embedded player needs the entry's API uri; omit it if unknown.
          uri = item.get("uri")
          if uri:
              res["iframe_src"] = "https://w.soundcloud.com/player/?url=" + quote_plus(uri)

          # "duration" is divided by 1000 to obtain whole seconds; a null or
          # zero duration produces no "length" key.
          seconds = int((item.get("duration") or 0) / 1000)
          if seconds:
              res["length"] = datetime.timedelta(seconds=seconds)

          results.append(res)

      return results
  def init(engine_settings):
      """Create the module-level ``CACHE`` from the engine's ``name`` setting.

      :param engine_settings: mapping of engine settings; only ``name`` is read.
      """
      global CACHE  # pylint: disable=global-statement
      engine_name = engine_settings["name"]
      CACHE = EngineCache(engine_name)  # type:ignore
  def get_client_id() -> str | None:
      """Scrape a ``client_id`` from the SoundCloud web site.

      Fetches ``https://soundcloud.com``, collects the ``<script>`` tags whose
      ``src`` contains ``/assets/`` and downloads those scripts, last one first,
      until one of them matches ``cid_re`` with a non-empty value.

      :return: the client id, or ``None`` when the start page could not be
          fetched or no script contained a non-empty id.
      """
      start_url = "https://soundcloud.com"
      resp = http_get(start_url, timeout=10)
      if not resp.ok:
          logger.error("init: GET %s failed", start_url)
          # None (not "") so every failure path honours the declared contract.
          return None

      tree = html.fromstring(resp.content)
      script_tags = tree.xpath("//script[contains(@src, '/assets/')]")

      client_id = None
      # The scripts are tried in reverse document order.
      for tag in reversed(script_tags):
          app_js_url = tag.get("src")
          app_js = http_get(app_js_url)
          if not app_js.ok:
              logger.error("init: app_js GET %s failed", app_js_url)
              continue

          # Undecodable bytes are replaced so that one odd asset cannot abort
          # the whole lookup with UnicodeDecodeError.
          match = cid_re.search(app_js.content.decode("utf-8", errors="replace"))
          # An empty capture (``client_id:""``) is useless; keep looking.
          if match and match.group(1):
              client_id = match.group(1)
              break

      if client_id:
          logger.info("using client_id '%s' for soundclud queries", client_id)
      else:
          logger.warning("missing valid client_id for soundclud queries")
      return client_id