2023-04-20 19:21:59 +02:00
|
|
|
"""
|
|
|
|
Apple Music.
|
|
|
|
|
|
|
|
Scraping the website directly.
|
|
|
|
|
|
|
|
- Why not use the Apple Music API?
|
|
|
|
- It requires an Apple Developer Program membership ($99 per year) to obtain a token.
|
|
|
|
|
|
|
|
"""
|
2024-06-02 14:50:07 -04:00
|
|
|
|
2023-04-20 19:21:59 +02:00
|
|
|
import json
|
2025-03-09 11:23:58 -04:00
|
|
|
from datetime import timedelta
|
2023-08-01 23:35:22 -04:00
|
|
|
|
2025-03-09 11:23:58 -04:00
|
|
|
from django.utils.dateparse import parse_duration
|
2025-02-22 17:11:35 -05:00
|
|
|
from loguru import logger
|
2023-04-20 19:21:59 +02:00
|
|
|
|
2023-08-01 23:35:22 -04:00
|
|
|
from catalog.common import *
|
|
|
|
from catalog.models import *
|
2024-07-13 00:16:47 -04:00
|
|
|
from common.models.lang import (
|
2024-11-30 13:23:21 -05:00
|
|
|
SITE_DEFAULT_LANGUAGE,
|
|
|
|
SITE_PREFERRED_LANGUAGES,
|
2024-07-13 00:16:47 -04:00
|
|
|
)
|
|
|
|
from common.models.misc import uniq
|
2023-08-01 23:35:22 -04:00
|
|
|
|
|
|
|
from .douban import *
|
2023-04-20 19:21:59 +02:00
|
|
|
|
|
|
|
|
|
|
|
@SiteManager.register
class AppleMusic(AbstractSite):
    """Site adapter that scrapes album metadata from the Apple Music website.

    Album pages are fetched per locale and the JSON embedded in the page's
    ``<script>`` tags is parsed; no Apple Music API token is involved.
    """

    SITE_NAME = SiteName.AppleMusic
    ID_TYPE = IdType.AppleMusic
    URL_PATTERNS = [
        r"https://music\.apple\.com/[a-z]{2}/album/[\w%-]+/(\d+)",
        r"https://music\.apple\.com/[a-z]{2}/album/(\d+)",
        r"https://music\.apple\.com/album/(\d+)",
    ]
    WIKI_PROPERTY_ID = "?"
    DEFAULT_MODEL = Album
    # Base request headers; scrape() adds a per-request Accept-Language.
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:107.0) Gecko/20100101 Firefox/107.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "DNT": "1",
        "Upgrade-Insecure-Requests": "1",
        "Cache-Control": "no-cache",
    }

    @classmethod
    def id_to_url(cls, id_value):
        """Return the locale-less album URL for ``id_value``."""
        return f"https://music.apple.com/album/{id_value}"

    def get_locales(self):
        """Map each supported preferred language to the storefronts to try.

        Returns:
            A dict of language code -> list of two-letter storefront codes,
            ordered as in ``SITE_PREFERRED_LANGUAGES``. Languages without a
            known storefront are skipped; if none remain, the result falls
            back to ``{"en": ["us"]}``.
        """
        storefronts = {
            "zh": ["cn", "tw", "hk", "sg"],
            "en": ["us", "gb", "ca"],
            "ja": ["jp"],
            "ko": ["kr"],
            "fr": ["fr", "ca"],
        }
        locales = {
            lang: list(storefronts[lang])
            for lang in SITE_PREFERRED_LANGUAGES
            if lang in storefronts
        }
        if not locales:
            locales = {"en": ["us"]}
        return locales

    def scrape(self):
        """Scrape the album page in each preferred language and merge the result.

        For every language the candidate storefronts are tried in order until
        one page can be downloaded and its ``schema:music-album`` JSON parsed.
        Titles and descriptions are collected per language; the remaining
        metadata comes from the schema data of the site default language, or
        of the first language that succeeded.

        Returns:
            A ``ResourceContent`` whose metadata holds the localized titles
            and descriptions, artist, genre, release date, track list,
            total duration in milliseconds and cover image URL.

        Raises:
            ParseError: if no locale yielded usable schema data.
        """
        matched_schema_data = None
        localized_title = []
        localized_desc = []
        for lang, locales in self.get_locales().items():
            for loc in locales:  # waterfall thru all locales
                url = f"https://music.apple.com/{loc}/album/{self.id_value}"
                tl = f"{lang}-{loc}" if lang == "zh" else lang
                # Per-request copy so the class-level headers are never mutated;
                # entries from self.headers win on conflict, as before.
                headers = {"Accept-Language": tl, **self.headers}
                try:
                    # Send the localized headers (previously self.headers was
                    # passed, so Accept-Language never reached the server).
                    content = BasicDownloader(url, headers=headers).download().html()
                    logger.debug(f"got localized content from {url}")
                    schema_txt: str = content.xpath(
                        "//script[@id='schema:music-album']/text()"
                    )[0]  # type:ignore
                    schema_data = json.loads(schema_txt)
                    title = schema_data["name"]
                except Exception as e:
                    # Best effort: record why this locale failed, try the next.
                    logger.debug(f"failed to get localized content from {url}: {e}")
                    continue
                if title:
                    localized_title.append({"lang": tl, "text": title})
                try:
                    server_txt: str = content.xpath(
                        "//script[@id='serialized-server-data']/text()"
                    )[0]  # type:ignore
                    server_data = json.loads(server_txt)
                    brief = server_data[0]["data"]["sections"][0]["items"][0][
                        "modalPresentationDescriptor"
                    ]["paragraphText"]
                except Exception:
                    # The description is best-effort; any failure here simply
                    # leaves this language without one.
                    brief = None
                if brief:
                    localized_desc.append({"lang": tl, "text": brief})
                if lang == SITE_DEFAULT_LANGUAGE or not matched_schema_data:
                    matched_schema_data = schema_data
                # One successful storefront per language is enough.
                break
        if matched_schema_data is None:  # no schema data found
            raise ParseError(self, f"localized content for {self.url}")
        artist = [a["name"] for a in matched_schema_data.get("byArtist", [])]
        release_date = matched_schema_data.get("datePublished", None)
        genre = matched_schema_data.get("genre", [])
        image_url = matched_schema_data.get("image", None)
        tracks = matched_schema_data.get("tracks", [])
        track_list = [t["name"] for t in tracks]
        total_ms = 0
        for t in tracks:
            raw_duration = t.get("duration")
            # A missing or unparseable duration counts as zero instead of
            # aborting the whole scrape.
            parsed = parse_duration(raw_duration) if raw_duration else None
            total_ms += (parsed or timedelta()).total_seconds() * 1000
        duration = round(total_ms)
        pd = ResourceContent(
            metadata={
                "localized_title": uniq(localized_title),
                "localized_description": uniq(localized_desc),
                "artist": artist,
                "genre": genre,
                "release_date": release_date,
                "track_list": "\n".join(track_list),
                "duration": duration,
                "cover_image_url": image_url,
            }
        )
        return pd
|