# catalog/sites/rss.py
import logging
import pickle
import urllib.request
from datetime import datetime

import bleach
import podcastparser
from django.conf import settings
from django.core.cache import cache
from django.core.validators import URLValidator
from django.utils.timezone import make_aware

from catalog.common import *
from catalog.common.downloaders import (
    _local_response_path,
    get_mock_file,
    get_mock_mode,
)
from catalog.models import *
from catalog.podcast.models import PodcastEpisode
from common.models.lang import detect_language
from journal.models.renderers import html_to_text
# Module-level logger following the standard per-module logging pattern.
_logger = logging.getLogger(__name__)
@SiteManager.register
class RSS(AbstractSite):
    """Site adapter that scrapes podcast metadata from an RSS/XML feed URL."""

    SITE_NAME = SiteName.RSS
    ID_TYPE = IdType.RSS
    DEFAULT_MODEL = Podcast
    URL_PATTERNS = [r".+[./](rss|xml)"]

    @staticmethod
    def parse_feed_from_url(url):
        """Fetch and parse the podcast feed at ``url``.

        Parsed feeds are cached under ``rss:<url>``. In mock mode a pickled
        local fixture is loaded instead of hitting the network. A failed
        https:// fetch is retried once over plain http://. Returns the
        parsed feed dict, or None when no feed could be obtained.
        """
        if not url:
            return None
        cache_key = f"rss:{url}"
        feed = cache.get(cache_key)
        if feed:
            return feed
        if get_mock_mode():
            # NOTE: pickle on a local test fixture only — trusted input.
            # Use a context manager so the file handle is not leaked.
            with open(_local_response_path + get_mock_file(url), "rb") as f:
                feed = pickle.load(f)
        else:
            req = urllib.request.Request(url)
            req.add_header("User-Agent", settings.NEODB_USER_AGENT)
            try:
                feed = podcastparser.parse(url, urllib.request.urlopen(req, timeout=3))
            except Exception:
                # Some feeds are only reachable over plain http (broken TLS);
                # retry once with the scheme downgraded.
                url = url.replace("https://", "http://")
                req = urllib.request.Request(url)
                req.add_header("User-Agent", settings.NEODB_USER_AGENT)
                try:
                    feed = podcastparser.parse(
                        url, urllib.request.urlopen(req, timeout=3)
                    )
                except Exception:
                    return None
            if settings.DOWNLOADER_SAVEDIR:
                # Save a pickled copy so the fetch can be replayed in mock mode.
                with open(
                    settings.DOWNLOADER_SAVEDIR + "/" + get_mock_file(url), "wb"
                ) as f:
                    pickle.dump(feed, f)
        cache.set(cache_key, feed, timeout=settings.DOWNLOADER_CACHE_TIMEOUT)
        return feed

    @classmethod
    def id_to_url(cls, id_value):
        """Reconstruct a canonical https URL from the stored id value."""
        return f"https://{id_value}"

    @classmethod
    def url_to_id(cls, url: str):
        """Strip the scheme so http/https variants map to the same id."""
        return url.split("://")[1]

    @classmethod
    def validate_url_fallback(cls, url):
        """Return True when ``url`` is syntactically valid and parses as a feed."""
        val = URLValidator()
        try:
            val(url)
            return cls.parse_feed_from_url(url) is not None
        except Exception:
            return False

    def scrape(self):
        """Build ResourceContent metadata for the podcast from its feed.

        Raises:
            ValueError: when the site has no url or the feed cannot be fetched.
            ParseError: when the feed has no usable title.
        """
        if not self.url:
            # fixed typo in message: "avaialble" -> "available"
            raise ValueError("no url available in RSS site")
        feed = self.parse_feed_from_url(self.url)
        if not feed:
            raise ValueError(f"no feed available in {self.url}")
        title = feed["title"].strip()
        if not title:
            raise ParseError(self, "title")
        desc = html_to_text(feed["description"])
        lang = detect_language(title + " " + desc)
        pd = ResourceContent(
            metadata={
                "title": title,
                "brief": desc,
                "localized_title": [{"lang": lang, "text": title}],
                "localized_description": [{"lang": lang, "text": desc}] if desc else [],
                "host": (
                    [feed.get("itunes_author")] if feed.get("itunes_author") else []
                ),
                "official_site": feed.get("link"),
                "cover_image_url": feed.get("cover_url"),
                # guard against a present-but-empty category list, which would
                # raise IndexError with .get("itunes_categories", [None])[0]
                "genre": (feed.get("itunes_categories") or [None])[0],
            }
        )
        pd.lookup_ids[IdType.RSS] = RSS.url_to_id(self.url)
        return pd

    def scrape_additional_data(self):
        """Create (or reuse) a PodcastEpisode row per episode in the feed."""
        item = self.get_item()
        feed = self.parse_feed_from_url(self.url)
        if not feed:
            return
        for episode in feed["episodes"]:
            PodcastEpisode.objects.get_or_create(
                program=item,
                guid=episode.get("guid"),
                defaults={
                    "title": episode["title"],
                    # episodes may omit description; bleach.clean(None) raises
                    "brief": bleach.clean(
                        episode.get("description") or "", strip=True
                    ),
                    "description_html": episode.get("description_html"),
                    "cover_url": episode.get("episode_art_url"),
                    "media_url": (
                        episode.get("enclosures")[0].get("url")
                        if episode.get("enclosures")
                        else None
                    ),
                    "pub_date": make_aware(
                        datetime.fromtimestamp(episode.get("published"))
                    ),
                    "duration": episode.get("duration"),
                    "link": episode.get("link"),
                },
            )