fix apple music

This commit is contained in:
mein Name 2025-03-09 11:23:58 -04:00 committed by Henri Dickson
parent 4758af58a6
commit e45980a85a
8 changed files with 977 additions and 1874 deletions

View file

@ -449,6 +449,7 @@ LANGUAGE_CODE, PREFERRED_LANGUAGES = _init_language_settings(
if TESTING: # force en if testing if TESTING: # force en if testing
LANGUAGE_CODE = "en" LANGUAGE_CODE = "en"
PREFERRED_LANGUAGES = ["en"]
LOCALE_PATHS = [os.path.join(BASE_DIR, "locale")] LOCALE_PATHS = [os.path.join(BASE_DIR, "locale")]
@ -580,7 +581,7 @@ SEARCH_INDEX_NEW_ONLY = False
INDEX_ALIASES = env("INDEX_ALIASES") INDEX_ALIASES = env("INDEX_ALIASES")
DOWNLOADER_SAVEDIR = env("NEODB_DOWNLOADER_SAVE_DIR", default="/tmp") # type: ignore DOWNLOADER_SAVEDIR = env("NEODB_DOWNLOADER_SAVE_DIR", default="") # type: ignore
DISABLE_MODEL_SIGNAL = False # disable index and social feeds during importing/etc DISABLE_MODEL_SIGNAL = False # disable index and social feeds during importing/etc

View file

@ -6,6 +6,8 @@ from catalog.music.utils import *
class BasicMusicTest(TestCase): class BasicMusicTest(TestCase):
databases = "__all__"
def test_gtin(self): def test_gtin(self):
self.assertIsNone(upc_to_gtin_13("018771208112X")) self.assertIsNone(upc_to_gtin_13("018771208112X"))
self.assertIsNone(upc_to_gtin_13("999018771208112")) self.assertIsNone(upc_to_gtin_13("999018771208112"))
@ -15,6 +17,8 @@ class BasicMusicTest(TestCase):
class SpotifyTestCase(TestCase): class SpotifyTestCase(TestCase):
databases = "__all__"
def test_parse(self): def test_parse(self):
t_id_type = IdType.Spotify_Album t_id_type = IdType.Spotify_Album
t_id_value = "65KwtzkJXw7oT819NFWmEP" t_id_value = "65KwtzkJXw7oT819NFWmEP"
@ -48,6 +52,8 @@ class SpotifyTestCase(TestCase):
class DoubanMusicTestCase(TestCase): class DoubanMusicTestCase(TestCase):
databases = "__all__"
def test_parse(self): def test_parse(self):
t_id_type = IdType.DoubanMusic t_id_type = IdType.DoubanMusic
t_id_value = "33551231" t_id_value = "33551231"
@ -74,6 +80,8 @@ class DoubanMusicTestCase(TestCase):
class MultiMusicSitesTestCase(TestCase): class MultiMusicSitesTestCase(TestCase):
databases = "__all__"
@use_local_response @use_local_response
def test_albums(self): def test_albums(self):
url1 = "https://music.douban.com/subject/33551231/" url1 = "https://music.douban.com/subject/33551231/"
@ -92,6 +100,8 @@ class MultiMusicSitesTestCase(TestCase):
class BandcampTestCase(TestCase): class BandcampTestCase(TestCase):
databases = "__all__"
def test_parse(self): def test_parse(self):
t_id_type = IdType.Bandcamp t_id_type = IdType.Bandcamp
t_id_value = "intlanthem.bandcamp.com/album/in-these-times" t_id_value = "intlanthem.bandcamp.com/album/in-these-times"
@ -119,6 +129,8 @@ class BandcampTestCase(TestCase):
class DiscogsReleaseTestCase(TestCase): class DiscogsReleaseTestCase(TestCase):
databases = "__all__"
def test_parse(self): def test_parse(self):
t_id_type = IdType.Discogs_Release t_id_type = IdType.Discogs_Release
t_id_value = "25829341" t_id_value = "25829341"
@ -155,6 +167,8 @@ class DiscogsReleaseTestCase(TestCase):
class DiscogsMasterTestCase(TestCase): class DiscogsMasterTestCase(TestCase):
databases = "__all__"
def test_parse(self): def test_parse(self):
t_id_type = IdType.Discogs_Master t_id_type = IdType.Discogs_Master
t_id_value = "469004" t_id_value = "469004"
@ -182,6 +196,8 @@ class DiscogsMasterTestCase(TestCase):
class AppleMusicTestCase(TestCase): class AppleMusicTestCase(TestCase):
databases = "__all__"
def test_parse(self): def test_parse(self):
t_id_type = IdType.AppleMusic t_id_type = IdType.AppleMusic
t_id_value = "1284391545" t_id_value = "1284391545"
@ -201,8 +217,10 @@ class AppleMusicTestCase(TestCase):
self.assertEqual(site.ready, False) self.assertEqual(site.ready, False)
site.get_resource_ready() site.get_resource_ready()
self.assertEqual(site.ready, True) self.assertEqual(site.ready, True)
self.assertEqual(site.resource.metadata["title"], "Kids Only") self.assertEqual(
site.resource.metadata["localized_title"][0]["text"], "Kids Only"
)
self.assertEqual(site.resource.metadata["artist"], ["Leah Dou"]) self.assertEqual(site.resource.metadata["artist"], ["Leah Dou"])
self.assertIsInstance(site.resource.item, Album) self.assertIsInstance(site.resource.item, Album)
self.assertEqual(site.resource.item.genre, ["Pop"]) self.assertEqual(site.resource.item.genre, ["Pop", "Music"])
self.assertEqual(site.resource.item.duration, 2371628) self.assertEqual(site.resource.item.duration, 2368000)

View file

@ -9,8 +9,9 @@ Scraping the website directly.
""" """
import json import json
from datetime import timedelta
import dateparser from django.utils.dateparse import parse_duration
from loguru import logger from loguru import logger
from catalog.common import * from catalog.common import *
@ -18,7 +19,6 @@ from catalog.models import *
from common.models.lang import ( from common.models.lang import (
SITE_DEFAULT_LANGUAGE, SITE_DEFAULT_LANGUAGE,
SITE_PREFERRED_LANGUAGES, SITE_PREFERRED_LANGUAGES,
detect_language,
) )
from common.models.misc import uniq from common.models.misc import uniq
@ -39,7 +39,6 @@ class AppleMusic(AbstractSite):
headers = { headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:107.0) Gecko/20100101 Firefox/107.0", "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:107.0) Gecko/20100101 Firefox/107.0",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": BasicDownloader.get_accept_language(),
"Accept-Encoding": "gzip, deflate", "Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive", "Connection": "keep-alive",
"DNT": "1", "DNT": "1",
@ -70,80 +69,63 @@ class AppleMusic(AbstractSite):
return locales return locales
def scrape(self): def scrape(self):
matched_content = None matched_schema_data = None
localized_title = [] localized_title = []
localized_desc = [] localized_desc = []
for lang, locales in self.get_locales().items(): for lang, locales in self.get_locales().items():
for loc in locales: # waterfall thru all locales for loc in locales: # waterfall thru all locales
url = f"https://music.apple.com/{loc}/album/{self.id_value}" url = f"https://music.apple.com/{loc}/album/{self.id_value}"
try: try:
tl = f"{lang}-{loc}" if lang == "zh" else lang
headers = {
"Accept-Language": tl,
}
headers.update(self.headers)
content = ( content = (
BasicDownloader(url, headers=self.headers).download().html() BasicDownloader(url, headers=self.headers).download().html()
) )
logger.info(f"got localized content from {url}") logger.debug(f"got localized content from {url}")
elem = content.xpath( txt: str = content.xpath(
"//script[@id='serialized-server-data']/text()" "//script[@id='schema:music-album']/text()"
) )[0] # type:ignore
txt: str = elem[0] # type:ignore schema_data = json.loads(txt)
page_data = json.loads(txt)[0] title = schema_data["name"]
album_data = page_data["data"]["sections"][0]["items"][0] if title:
title = album_data["title"]
brief = album_data.get("modalPresentationDescriptor", {}).get(
"paragraphText", ""
)
tl = detect_language(title + " " + brief)
localized_title.append({"lang": tl, "text": title}) localized_title.append({"lang": tl, "text": title})
try:
txt: str = content.xpath(
"//script[@id='serialized-server-data']/text()"
)[0] # type:ignore
server_data = json.loads(txt)
brief = server_data[0]["data"]["sections"][0]["items"][0][
"modalPresentationDescriptor"
]["paragraphText"]
if brief: if brief:
localized_desc.append({"lang": tl, "text": brief}) localized_desc.append({"lang": tl, "text": brief})
if lang == SITE_DEFAULT_LANGUAGE or not matched_content: except Exception:
matched_content = content server_data = brief = None
if lang == SITE_DEFAULT_LANGUAGE or not matched_schema_data:
matched_schema_data = schema_data
break break
except Exception: except Exception:
pass pass
if matched_content is None: if matched_schema_data is None: # no schema data found
raise ParseError(self, f"localized content for {self.url}") raise ParseError(self, f"localized content for {self.url}")
elem = matched_content.xpath("//script[@id='serialized-server-data']/text()") artist = [a["name"] for a in matched_schema_data.get("byArtist", [])]
txt: str = elem[0] # type:ignore release_date = matched_schema_data.get("datePublished", None)
page_data = json.loads(txt)[0] genre = matched_schema_data.get("genre", [])
album_data = page_data["data"]["sections"][0]["items"][0] image_url = matched_schema_data.get("image", None)
title = album_data["title"] track_list = [t["name"] for t in matched_schema_data.get("tracks", [])]
brief = album_data.get("modalPresentationDescriptor") duration = round(
brief = brief.get("paragraphText") if brief else None sum(
artist_list = album_data["subtitleLinks"] (parse_duration(t["duration"]) or timedelta()).total_seconds() * 1000
artist = [item["title"] for item in artist_list] for t in matched_schema_data.get("tracks", [])
)
track_data = page_data["data"]["seoData"]
date_elem = track_data.get("musicReleaseDate")
release_datetime = dateparser.parse(date_elem.strip()) if date_elem else None
release_date = (
release_datetime.strftime("%Y-%m-%d") if release_datetime else None
) )
track_list = [
f"{i}. {track['attributes']['name']}"
for i, track in enumerate(track_data["ogSongs"], 1)
]
duration_list = [
track["attributes"].get("durationInMillis", 0)
for track in track_data["ogSongs"]
]
duration = int(sum(duration_list))
genre = track_data["schemaContent"].get("genre")
if genre:
genre = [
genre[0]
] # apple treat "Music" as a genre. Thus, only the first genre is obtained.
images = matched_content.xpath("//source[@type='image/jpeg']/@srcset")
image_elem: str = images[0] if images else "" # type:ignore
image_url = image_elem.split(" ")[0] if image_elem else None
pd = ResourceContent( pd = ResourceContent(
metadata={ metadata={
"localized_title": uniq(localized_title), "localized_title": uniq(localized_title),
"localized_description": uniq(localized_desc), "localized_description": uniq(localized_desc),
"title": title,
"brief": brief,
"artist": artist, "artist": artist,
"genre": genre, "genre": genre,
"release_date": release_date, "release_date": release_date,

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long