diff --git a/catalog/book/models.py b/catalog/book/models.py index 18051b21..cee21862 100644 --- a/catalog/book/models.py +++ b/catalog/book/models.py @@ -90,6 +90,10 @@ class Edition(Item): # if not work: # _logger.info(f'Unable to find link for {w["url"]}') + def get_related_books(self): + # TODO + return [] + class Work(Item): category = ItemCategory.Book diff --git a/catalog/models.py b/catalog/models.py index 212b3789..ff405708 100644 --- a/catalog/models.py +++ b/catalog/models.py @@ -70,6 +70,8 @@ def all_categories(): def init_catalog_search_models(): + if settings.DISABLE_MODEL_SIGNAL: + return Indexer.update_model_indexable(Edition) Indexer.update_model_indexable(Work) Indexer.update_model_indexable(Movie) diff --git a/catalog/search/external.py b/catalog/search/external.py new file mode 100644 index 00000000..8884e268 --- /dev/null +++ b/catalog/search/external.py @@ -0,0 +1,259 @@ +from urllib.parse import quote_plus +from django.conf import settings +from catalog.common import * +from catalog.models import * +from catalog.sites.spotify import get_spotify_token +import requests +from lxml import html +import logging + +SEARCH_PAGE_SIZE = 5 # not all apis support page size +logger = logging.getLogger(__name__) + + +class SearchResultItem: + def __init__( + self, category, source_site, source_url, title, subtitle, brief, cover_url + ): + self.category = category + self.source_site = source_site + self.source_url = source_url + self.title = title + self.subtitle = subtitle + self.brief = brief + self.cover_url = cover_url + + @property + def verbose_category_name(self): + return self.category.label + + @property + def link(self): + return f"/search?q={quote_plus(self.source_url)}" + + @property + def scraped(self): + return False + + +class ProxiedRequest: + @classmethod + def get(cls, url): + u = f"http://api.scraperapi.com?api_key={settings.SCRAPERAPI_KEY}&url={quote_plus(url)}" + return requests.get(u, timeout=10) + + +class Goodreads: + @classmethod + def search(cls, q, page=1): + results = [] + try: + search_url = ( + f"https://www.goodreads.com/search?page={page}&q={quote_plus(q)}" + ) + r = requests.get(search_url) + if r.url.startswith("https://www.goodreads.com/book/show/"): + # Goodreads will 302 if only one result matches ISBN + res = SiteManager.get_site_by_url(r.url).get_resource_ready() + subtitle = f"{res.metadata['pub_year']} {', '.join(res.metadata['author'])} {', '.join(res.metadata['translator'] if res.metadata['translator'] else [])}" + results.append( + SearchResultItem( + ItemCategory.Book, + SiteName.Goodreads, + res.url, + res.metadata["title"], + subtitle, + res.metadata["brief"], + res.metadata["cover_image_url"], + ) + ) + else: + h = html.fromstring(r.content.decode("utf-8")) + for c in h.xpath('//tr[@itemtype="http://schema.org/Book"]'): + el_cover = c.xpath('.//img[@class="bookCover"]/@src') + cover = el_cover[0] if el_cover else None + el_title = c.xpath('.//a[@class="bookTitle"]//text()') + title = "".join(el_title).strip() if el_title else None + el_url = c.xpath('.//a[@class="bookTitle"]/@href') + url = "https://www.goodreads.com" + el_url[0] if el_url else None + el_authors = c.xpath('.//a[@class="authorName"]//text()') + subtitle = ", ".join(el_authors) if el_authors else None + results.append( + SearchResultItem( + ItemCategory.Book, + SiteName.Goodreads, + url, + title, + subtitle, + "", + cover, + ) + ) + except Exception as e: + logger.error(f"Goodreads search '{q}' error: {e}") + return results + + +class GoogleBooks: + @classmethod + def 
search(cls, q, page=1):
+        results = []
+        try:
+            api_url = f"https://www.googleapis.com/books/v1/volumes?country=us&q={quote_plus(q)}&startIndex={SEARCH_PAGE_SIZE*(page-1)}&maxResults={SEARCH_PAGE_SIZE}&maxAllowedMaturityRating=MATURE"
+            j = requests.get(api_url).json()
+            if "items" in j:
+                for b in j["items"]:
+                    if "title" not in b["volumeInfo"]:
+                        continue
+                    title = b["volumeInfo"]["title"]
+                    subtitle = ""
+                    if "publishedDate" in b["volumeInfo"]:
+                        subtitle += b["volumeInfo"]["publishedDate"] + " "
+                    if "authors" in b["volumeInfo"]:
+                        subtitle += ", ".join(b["volumeInfo"]["authors"])
+                    if "description" in b["volumeInfo"]:
+                        brief = b["volumeInfo"]["description"]
+                    elif "searchInfo" in b and "textSnippet" in b["searchInfo"]:
+                        brief = b["searchInfo"]["textSnippet"]
+                    else:
+                        brief = ""
+                    category = ItemCategory.Book
+                    # b['volumeInfo']['infoLink'].replace('http:', 'https:')
+                    url = "https://books.google.com/books?id=" + b["id"]
+                    cover = (
+                        b["volumeInfo"]["imageLinks"]["thumbnail"]
+                        if "imageLinks" in b["volumeInfo"]
+                        else None
+                    )
+                    results.append(
+                        SearchResultItem(
+                            category,
+                            SiteName.GoogleBooks,
+                            url,
+                            title,
+                            subtitle,
+                            brief,
+                            cover,
+                        )
+                    )
+        except Exception as e:
+            logger.error(f"GoogleBooks search '{q}' error: {e}")
+        return results
+
+
+class TheMovieDatabase:
+    @classmethod
+    def search(cls, q, page=1):
+        results = []
+        try:
+            api_url = f"https://api.themoviedb.org/3/search/multi?query={quote_plus(q)}&page={page}&api_key={settings.TMDB_API3_KEY}&language=zh-CN&include_adult=true"
+            j = requests.get(api_url).json()
+            for m in j["results"]:
+                if m["media_type"] in ["tv", "movie"]:
+                    url = f"https://www.themoviedb.org/{m['media_type']}/{m['id']}"
+                    if m["media_type"] == "tv":
+                        cat = ItemCategory.TV
+                        title = m["name"]
+                        subtitle = f"{m.get('first_air_date')} {m.get('original_name')}"
+                    else:
+                        cat = ItemCategory.Movie
+                        title = m["title"]
+                        subtitle = f"{m.get('release_date')} {m.get('original_title')}"
+                    cover = f"https://image.tmdb.org/t/p/w500/{m.get('poster_path')}"
+                    results.append(
+                        SearchResultItem(
+                            cat,
+                            SiteName.TMDB,
+                            url,
+                            title,
+                            subtitle,
+                            m.get("overview"),
+                            cover,
+                        )
+                    )
+        except Exception as e:
+            logger.error(f"TMDb search '{q}' error: {e}")
+        return results
+
+
+class Spotify:
+    @classmethod
+    def search(cls, q, page=1):
+        results = []
+        try:
+            api_url = f"https://api.spotify.com/v1/search?q={quote_plus(q)}&type=album&limit={SEARCH_PAGE_SIZE}&offset={(page-1)*SEARCH_PAGE_SIZE}"
+            headers = {"Authorization": f"Bearer {get_spotify_token()}"}
+            j = requests.get(api_url, headers=headers).json()
+            for a in j["albums"]["items"]:
+                title = a["name"]
+                subtitle = a["release_date"]
+                for artist in a["artists"]:
+                    subtitle += " " + artist["name"]
+                url = a["external_urls"]["spotify"]
+                cover = a["images"][0]["url"]
+                results.append(
+                    SearchResultItem(
+                        ItemCategory.Music,
+                        SiteName.Spotify,
+                        url,
+                        title,
+                        subtitle,
+                        "",
+                        cover,
+                    )
+                )
+        except Exception as e:
+            logger.error(f"Spotify search '{q}' error: {e}")
+        return results
+
+
+class Bandcamp:
+    @classmethod
+    def search(cls, q, page=1):
+        results = []
+        try:
+            search_url = f"https://bandcamp.com/search?from=results&item_type=a&page={page}&q={quote_plus(q)}"
+            r = requests.get(search_url)
+            h = html.fromstring(r.content.decode("utf-8"))
+            for c in h.xpath('//li[@class="searchresult data-search"]'):
+                el_cover = c.xpath('.//div[@class="art"]/img/@src')
+                cover = el_cover[0] if el_cover else None
+                el_title = c.xpath('.//div[@class="heading"]//text()')
+                title = "".join(el_title).strip() if el_title else None
+                el_url = c.xpath('.//div[@class="itemurl"]/a/@href')
+                url = el_url[0] if el_url else None
+                el_authors = c.xpath('.//div[@class="subhead"]//text()')
+                subtitle = ", ".join(el_authors) if el_authors else None
+                results.append(
+                    SearchResultItem(
+                        ItemCategory.Music,
+                        SiteName.Bandcamp,
+                        url,
+                        title,
+                        subtitle,
+                        "",
+                        cover,
+                    )
+                )
+        except Exception as e:
+            logger.error(f"Bandcamp search '{q}' error: {e}")
+        return results
+
+
+class ExternalSources:
+    @classmethod
+    def search(cls, c, q, page=1):
+        if not q:
+            return []
+        results = []
+        if c == "" or c is None:
+            c = "all"
+        if c == "all" or c == "movie":
+            results.extend(TheMovieDatabase.search(q, page))
+        if c == "all" or c == "book":
+            results.extend(GoogleBooks.search(q, page))
+            results.extend(Goodreads.search(q, page))
+        if c == "all" or c == "music":
+            results.extend(Spotify.search(q, page))
+            results.extend(Bandcamp.search(q, page))
+        return results
diff --git a/catalog/search/typesense.py b/catalog/search/typesense.py
index 13467f5a..0570a433 100644
--- a/catalog/search/typesense.py
+++ b/catalog/search/typesense.py
@@ -4,7 +4,7 @@ import typesense
 from typesense.exceptions import ObjectNotFound
 from django.conf import settings
 from django.db.models.signals import post_save, post_delete
-
+from catalog.models import Item
 
 INDEX_NAME = "catalog"
 SEARCHABLE_ATTRIBUTES = [
@@ -257,7 +257,8 @@ class Indexer:
     @classmethod
     def item_to_obj(cls, item):
         try:
-            return cls.class_map[item["class_name"]].get_by_url(item["id"])
+            return Item.get_by_url(item["id"])
         except Exception as e:
+            print(e)
             logger.error(f"unable to load search result item from db:\n{item}")
             return None
diff --git a/catalog/templates/album.html b/catalog/templates/album.html
index f87161f4..bd4f19d0 100644
--- a/catalog/templates/album.html
+++ b/catalog/templates/album.html
@@ -97,13 +97,13 @@
 {% if item.last_editor and item.last_editor.preference.show_last_edit %}
-
{% trans '最近编辑者:' %}{{ item.last_editor | default:"" }}
+
{% trans '最近编辑者:' %}{{ item.last_editor | default:"" }}
{% endif %}
- {% trans '编辑这个作品' %} + {% trans '编辑这张专辑' %} {% if user.is_staff %} - / {% trans '删除' %} + / {% trans '删除' %} {% endif %}
diff --git a/catalog/templates/edition.html b/catalog/templates/edition.html index abe4a2fe..2ef25ef2 100644 --- a/catalog/templates/edition.html +++ b/catalog/templates/edition.html @@ -63,13 +63,13 @@ {% if item.last_editor and item.last_editor.preference.show_last_edit %} -
{% trans '最近编辑者:' %}{{ item.last_editor | default:"" }}
+
{% trans '最近编辑者:' %}{{ item.last_editor | default:"" }}
{% endif %}
- {% trans '编辑这本书' %} + {% trans '编辑这本书' %} {% if user.is_staff %} - / {% trans '删除' %} + / {% trans '删除' %} {% endif %}
@@ -84,7 +84,7 @@
{% for b in item.get_related_books %}

- {{ b.title }} + {{ b.title }} ({{ b.pub_house }} {{ b.pub_year }}) {{ b.get_source_site_display }}

diff --git a/common/templates/common/external_search_result.html b/catalog/templates/external_search_results.html similarity index 96% rename from common/templates/common/external_search_result.html rename to catalog/templates/external_search_results.html index 9d2132bb..b0317db4 100644 --- a/common/templates/common/external_search_result.html +++ b/catalog/templates/external_search_results.html @@ -13,7 +13,9 @@
  • diff --git a/catalog/templates/game.html b/catalog/templates/game.html index abc5b518..ddc50206 100644 --- a/catalog/templates/game.html +++ b/catalog/templates/game.html @@ -92,14 +92,13 @@ {% if item.last_editor and item.last_editor.preference.show_last_edit %} -
    {% trans '最近编辑者:' %}{{ item.last_editor | default:"" }}
    +
    {% trans '最近编辑者:' %}{{ item.last_editor | default:"" }}
    {% endif %}
    - {% trans '编辑这个游戏' %} - + {% trans '编辑这个游戏' %} {% if user.is_staff %} - / {% trans '删除' %} + / {% trans '删除' %} {% endif %}
    diff --git a/catalog/templates/item_base.html b/catalog/templates/item_base.html index e121ffb8..caade775 100644 --- a/catalog/templates/item_base.html +++ b/catalog/templates/item_base.html @@ -79,7 +79,7 @@
    {% for tag in item.tags %} - {{ tag }} + {{ tag }} {% endfor %}
    @@ -127,7 +127,7 @@
  • diff --git a/catalog/templates/search_results.html b/catalog/templates/search_results.html index 1db59beb..88bda511 100644 --- a/catalog/templates/search_results.html +++ b/catalog/templates/search_results.html @@ -54,7 +54,7 @@ {% endfor %} {% if request.GET.q and user.is_authenticated %} -
  • +
  • {% trans '正在实时搜索站外条目' %}
    @@ -111,117 +111,22 @@
    {% trans '没有想要的结果?' %}
    - {% if request.GET.c and request.GET.c in categories %} - - {% if request.GET.c|lower == 'book' %} - - - - - - {% elif request.GET.c|lower == 'movie' %} - - - - - - {% elif request.GET.c|lower == 'music' %} - - - - - - - - - {% elif request.GET.c|lower == 'game' %} - - - - - - {% endif %} - - {% else %} - - - - - - - - - - - - - - - - {% endif %} + + + + + + + + + + + + + + +
    - -
    diff --git a/catalog/templates/tvseason.html b/catalog/templates/tvseason.html index 26083034..4420a4d9 100644 --- a/catalog/templates/tvseason.html +++ b/catalog/templates/tvseason.html @@ -156,13 +156,13 @@ {% if item.last_editor and item.last_editor.preference.show_last_edit %} -
    {% trans '最近编辑者:' %}{{ item.last_editor | default:"" }}
    +
    {% trans '最近编辑者:' %}{{ item.last_editor | default:"" }}
    {% endif %}
    - {% trans '编辑这部电影' %} + {% trans '编辑这部剧集' %} {% if user.is_staff %} - / {% trans '删除' %} + / {% trans '删除' %} {% endif %}
    diff --git a/catalog/templates/tvshow.html b/catalog/templates/tvshow.html index c23c99bc..4a45a1ee 100644 --- a/catalog/templates/tvshow.html +++ b/catalog/templates/tvshow.html @@ -156,13 +156,13 @@ {% if item.last_editor and item.last_editor.preference.show_last_edit %} -
    {% trans '最近编辑者:' %}{{ item.last_editor | default:"" }}
    +
    {% trans '最近编辑者:' %}{{ item.last_editor | default:"" }}
    {% endif %}
    - {% trans '编辑这部电影' %} + {% trans '编辑这部剧集' %} {% if user.is_staff %} - / {% trans '删除' %} + / {% trans '删除' %} {% endif %}
    diff --git a/catalog/urls.py b/catalog/urls.py index 4001c66a..3363eb93 100644 --- a/catalog/urls.py +++ b/catalog/urls.py @@ -29,6 +29,20 @@ urlpatterns = [ retrieve, name="retrieve", ), + re_path( + r"^(?P" + + _get_all_url_paths() + + ")/(?P[A-Za-z0-9]{21,22})/edit$", + edit, + name="edit", + ), + re_path( + r"^(?P" + + _get_all_url_paths() + + ")/(?P[A-Za-z0-9]{21,22})/delete$", + delete, + name="delete", + ), re_path( r"^(?P" + _get_all_url_paths() @@ -43,7 +57,8 @@ urlpatterns = [ mark_list, name="mark_list", ), - path("search2/", search, name="search"), + path("search/", search, name="search"), + path("search/external/", external_search, name="external_search"), path("fetch_refresh/", fetch_refresh, name="fetch_refresh"), path("api/", api.urls), ] diff --git a/catalog/views.py b/catalog/views.py index 21f69018..77182bcb 100644 --- a/catalog/views.py +++ b/catalog/views.py @@ -1,6 +1,6 @@ import uuid import logging -from django.shortcuts import render, get_object_or_404, redirect, reverse +from django.shortcuts import render, get_object_or_404, redirect from django.contrib.auth.decorators import login_required, permission_required from django.utils.translation import gettext_lazy as _ from django.http import ( @@ -8,8 +8,10 @@ from django.http import ( HttpResponseServerError, HttpResponse, HttpResponseRedirect, + HttpResponseNotFound, ) -from django.core.exceptions import BadRequest, ObjectDoesNotExist, PermissionDenied +from django.contrib.auth.decorators import login_required, permission_required +from django.core.exceptions import ObjectDoesNotExist, PermissionDenied from django.db import IntegrityError, transaction from django.db.models import Count from django.utils import timezone @@ -21,15 +23,15 @@ from mastodon.models import MastodonApplication from mastodon.api import share_mark, share_review from .models import * from django.conf import settings -from common.scraper import get_scraper_by_url, get_normalized_url from django.utils.baseconv import base62 from journal.models import Mark, ShelfMember, Review from journal.models import query_visible, query_following from common.utils import PageLinksGenerator -from common.views import PAGE_LINK_NUMBER +from common.config import PAGE_LINK_NUMBER from journal.models import ShelfTypeNames import django_rq from rq.job import Job +from .search.external import ExternalSources _logger = logging.getLogger(__name__) @@ -107,10 +109,11 @@ def retrieve(request, item_path, item_uuid): return HttpResponseBadRequest() +@login_required def mark_list(request, item_path, item_uuid, following_only=False): item = get_object_or_404(Item, uid=base62.decode(item_uuid)) if not item: - return HttpResponseNotFound("item not found") + return HttpResponseNotFound(b"item not found") queryset = ShelfMember.objects.filter(item=item).order_by("-created_time") if following_only: queryset = queryset.filter(query_following(request.user)) @@ -135,7 +138,7 @@ def mark_list(request, item_path, item_uuid, following_only=False): def review_list(request, item_path, item_uuid): item = get_object_or_404(Item, uid=base62.decode(item_uuid)) if not item: - return HttpResponseNotFound("item not found") + return HttpResponseNotFound(b"item not found") queryset = Review.objects.filter(item=item).order_by("-created_time") queryset = queryset.filter(query_visible(request.user)) paginator = Paginator(queryset, NUM_REVIEWS_ON_LIST_PAGE) @@ -164,6 +167,7 @@ def fetch_task(url): return "-" +@login_required def fetch_refresh(request, job_id): retry = request.GET job = 
Job.fetch(id=job_id, connection=django_rq.get_connection("fetch")) @@ -185,9 +189,10 @@ def fetch_refresh(request, job_id): ) +@login_required def fetch(request, url, site: AbstractSite = None): if not site: - site = SiteManager.get_site_by_url(keywords) + site = SiteManager.get_site_by_url(url) if not site: return HttpResponseBadRequest() item = site.get_item() @@ -250,13 +255,13 @@ def search(request): items.append(i) for res in i.external_resources.all(): urls.append(res.url) - if request.path.endswith(".json/"): - return JsonResponse( - { - "num_pages": result.num_pages, - "items": list(map(lambda i: i.get_json(), items)), - } - ) + # if request.path.endswith(".json/"): + # return JsonResponse( + # { + # "num_pages": result.num_pages, + # "items": list(map(lambda i: i.get_json(), items)), + # } + # ) request.session["search_dedupe_urls"] = urls return render( request, @@ -270,3 +275,33 @@ def search(request): "hide_category": category is not None, }, ) + + +@login_required +def external_search(request): + category = request.GET.get("c", default="all").strip().lower() + if category == "all": + category = None + keywords = request.GET.get("q", default="").strip() + page_number = int(request.GET.get("page", default=1)) + items = ExternalSources.search(category, keywords, page_number) if keywords else [] + dedupe_urls = request.session.get("search_dedupe_urls", []) + items = [i for i in items if i.source_url not in dedupe_urls] + + return render( + request, + "external_search_results.html", + { + "external_items": items, + }, + ) + + +@login_required +def edit(request, item_uuid): + return HttpResponseBadRequest() + + +@login_required +def delete(request, item_uuid): + return HttpResponseBadRequest() diff --git a/common/importers/douban.py b/common/importers/douban.py deleted file mode 100644 index 321cea8f..00000000 --- a/common/importers/douban.py +++ /dev/null @@ -1,270 +0,0 @@ -import openpyxl -import requests -import re -from lxml import html -from markdownify import markdownify as md -from datetime import datetime -from common.scraper import get_scraper_by_url -import logging -import pytz -from django.conf import settings -from django.core.exceptions import ObjectDoesNotExist -from user_messages import api as msg -import django_rq -from common.utils import GenerateDateUUIDMediaFilePath -import os -from books.models import BookReview, Book, BookMark, BookTag -from movies.models import MovieReview, Movie, MovieMark, MovieTag -from music.models import AlbumReview, Album, AlbumMark, AlbumTag -from games.models import GameReview, Game, GameMark, GameTag -from common.scraper import DoubanAlbumScraper, DoubanBookScraper, DoubanGameScraper, DoubanMovieScraper -from PIL import Image -from io import BytesIO -import filetype -from common.models import MarkStatusEnum - - -logger = logging.getLogger(__name__) -tz_sh = pytz.timezone('Asia/Shanghai') - - -def fetch_remote_image(url): - try: - print(f'fetching remote image {url}') - raw_img = None - ext = None - if settings.SCRAPESTACK_KEY is not None: - dl_url = f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={url}' - elif settings.SCRAPERAPI_KEY is not None: - dl_url = f'http://api.scraperapi.com?api_key={settings.SCRAPERAPI_KEY}&url={url}' - else: - dl_url = url - img_response = requests.get(dl_url, timeout=settings.SCRAPING_TIMEOUT) - raw_img = img_response.content - img = Image.open(BytesIO(raw_img)) - img.load() # corrupted image will trigger exception - content_type = img_response.headers.get('Content-Type') - 
ext = filetype.get_type(mime=content_type.partition(';')[0].strip()).extension - f = GenerateDateUUIDMediaFilePath(None, "x." + ext, settings.MARKDOWNX_MEDIA_PATH) - file = settings.MEDIA_ROOT + f - local_url = settings.MEDIA_URL + f - os.makedirs(os.path.dirname(file), exist_ok=True) - img.save(file) - # print(f'remote image saved as {local_url}') - return local_url - except Exception: - print(f'unable to fetch remote image {url}') - return url - - -class DoubanImporter: - total = 0 - processed = 0 - skipped = 0 - imported = 0 - failed = [] - user = None - visibility = 0 - file = None - - def __init__(self, user, visibility): - self.user = user - self.visibility = visibility - - def update_user_import_status(self, status): - self.user.preference.import_status['douban_pending'] = status - self.user.preference.import_status['douban_file'] = self.file - self.user.preference.import_status['douban_visibility'] = self.visibility - self.user.preference.import_status['douban_total'] = self.total - self.user.preference.import_status['douban_processed'] = self.processed - self.user.preference.import_status['douban_skipped'] = self.skipped - self.user.preference.import_status['douban_imported'] = self.imported - self.user.preference.import_status['douban_failed'] = self.failed - self.user.preference.save(update_fields=['import_status']) - - def import_from_file(self, uploaded_file): - try: - wb = openpyxl.open(uploaded_file, read_only=True, data_only=True, keep_links=False) - wb.close() - file = settings.MEDIA_ROOT + GenerateDateUUIDMediaFilePath(None, "x.xlsx", settings.SYNC_FILE_PATH_ROOT) - os.makedirs(os.path.dirname(file), exist_ok=True) - with open(file, 'wb') as destination: - for chunk in uploaded_file.chunks(): - destination.write(chunk) - self.file = file - self.update_user_import_status(2) - jid = f'Douban_{self.user.id}_{os.path.basename(self.file)}' - django_rq.get_queue('doufen').enqueue(self.import_from_file_task, job_id=jid) - except Exception: - return False - # self.import_from_file_task(file, user, visibility) - return True - - mark_sheet_config = { - '想读': [MarkStatusEnum.WISH, DoubanBookScraper, Book, BookMark, BookTag], - '在读': [MarkStatusEnum.DO, DoubanBookScraper, Book, BookMark, BookTag], - '读过': [MarkStatusEnum.COLLECT, DoubanBookScraper, Book, BookMark, BookTag], - '想看': [MarkStatusEnum.WISH, DoubanMovieScraper, Movie, MovieMark, MovieTag], - '在看': [MarkStatusEnum.DO, DoubanMovieScraper, Movie, MovieMark, MovieTag], - '想看': [MarkStatusEnum.COLLECT, DoubanMovieScraper, Movie, MovieMark, MovieTag], - '想听': [MarkStatusEnum.WISH, DoubanAlbumScraper, Album, AlbumMark, AlbumTag], - '在听': [MarkStatusEnum.DO, DoubanAlbumScraper, Album, AlbumMark, AlbumTag], - '听过': [MarkStatusEnum.COLLECT, DoubanAlbumScraper, Album, AlbumMark, AlbumTag], - '想玩': [MarkStatusEnum.WISH, DoubanGameScraper, Game, GameMark, GameTag], - '在玩': [MarkStatusEnum.DO, DoubanGameScraper, Game, GameMark, GameTag], - '玩过': [MarkStatusEnum.COLLECT, DoubanGameScraper, Game, GameMark, GameTag], - } - review_sheet_config = { - '书评': [DoubanBookScraper, Book, BookReview], - '影评': [DoubanMovieScraper, Movie, MovieReview], - '乐评': [DoubanAlbumScraper, Album, AlbumReview], - '游戏评论&攻略': [DoubanGameScraper, Game, GameReview], - } - mark_data = {} - review_data = {} - entity_lookup = {} - - def load_sheets(self): - f = open(self.file, 'rb') - wb = openpyxl.load_workbook(f, read_only=True, data_only=True, keep_links=False) - for data, config in [(self.mark_data, self.mark_sheet_config), (self.review_data, 
self.review_sheet_config)]: - for name in config: - data[name] = [] - if name in wb: - print(f'{self.user} parsing {name}') - for row in wb[name].iter_rows(min_row=2, values_only=True): - cells = [cell for cell in row] - if len(cells) > 6: - data[name].append(cells) - for sheet in self.mark_data.values(): - for cells in sheet: - # entity_lookup["title|rating"] = [(url, time), ...] - k = f'{cells[0]}|{cells[5]}' - v = (cells[3], cells[4]) - if k in self.entity_lookup: - self.entity_lookup[k].append(v) - else: - self.entity_lookup[k] = [v] - self.total = sum(map(lambda a: len(a), self.review_data.values())) - - def guess_entity_url(self, title, rating, timestamp): - k = f'{title}|{rating}' - if k not in self.entity_lookup: - return None - v = self.entity_lookup[k] - if len(v) > 1: - v.sort(key=lambda c: abs(timestamp - (datetime.strptime(c[1], "%Y-%m-%d %H:%M:%S") if type(c[1])==str else c[1]).replace(tzinfo=tz_sh))) - return v[0][0] - # for sheet in self.mark_data.values(): - # for cells in sheet: - # if cells[0] == title and cells[5] == rating: - # return cells[3] - - def import_from_file_task(self): - print(f'{self.user} import start') - msg.info(self.user, f'开始导入豆瓣评论') - self.update_user_import_status(1) - self.load_sheets() - print(f'{self.user} sheet loaded, {self.total} lines total') - self.update_user_import_status(1) - for name, param in self.review_sheet_config.items(): - self.import_review_sheet(self.review_data[name], param[0], param[1], param[2]) - self.update_user_import_status(0) - msg.success(self.user, f'豆瓣评论导入完成,共处理{self.total}篇,已存在{self.skipped}篇,新增{self.imported}篇。') - if len(self.failed): - msg.error(self.user, f'豆瓣评论导入时未能处理以下网址:\n{" , ".join(self.failed)}') - - def import_review_sheet(self, worksheet, scraper, entity_class, review_class): - prefix = f'{self.user} |' - if worksheet is None: # or worksheet.max_row < 2: - print(f'{prefix} {review_class.__name__} empty sheet') - return - for cells in worksheet: - if len(cells) < 6: - continue - title = cells[0] - entity_title = re.sub('^《', '', re.sub('》$', '', cells[1])) - review_url = cells[2] - time = cells[3] - rating = cells[4] - content = cells[6] - self.processed += 1 - if time: - if type(time) == str: - time = datetime.strptime(time, "%Y-%m-%d %H:%M:%S") - time = time.replace(tzinfo=tz_sh) - else: - time = None - if not content: - content = "" - if not title: - title = "" - r = self.import_review(entity_title, rating, title, review_url, content, time, scraper, entity_class, review_class) - if r == 1: - self.imported += 1 - elif r == 2: - self.skipped += 1 - else: - self.failed.append(review_url) - self.update_user_import_status(1) - - def import_review(self, entity_title, rating, title, review_url, content, time, scraper, entity_class, review_class): - # return 1: done / 2: skipped / None: failed - prefix = f'{self.user} |' - url = self.guess_entity_url(entity_title, rating, time) - if url is None: - print(f'{prefix} fetching {review_url}') - try: - if settings.SCRAPESTACK_KEY is not None: - _review_url = f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={review_url}' - else: - _review_url = review_url - r = requests.get(_review_url, timeout=settings.SCRAPING_TIMEOUT) - if r.status_code != 200: - print(f'{prefix} fetching error {review_url} {r.status_code}') - return - h = html.fromstring(r.content.decode('utf-8')) - for u in h.xpath("//header[@class='main-hd']/a/@href"): - if '.douban.com/subject/' in u: - url = u - if not url: - print(f'{prefix} fetching error {review_url} unable to 
locate entity url') - return - except Exception: - print(f'{prefix} fetching exception {review_url}') - return - try: - entity = entity_class.objects.get(source_url=url) - print(f'{prefix} matched {url}') - except ObjectDoesNotExist: - try: - print(f'{prefix} scraping {url}') - scraper.scrape(url) - form = scraper.save(request_user=self.user) - entity = form.instance - except Exception as e: - print(f"{prefix} scrape failed: {url} {e}") - logger.error(f"{prefix} scrape failed: {url}", exc_info=e) - return - params = { - 'owner': self.user, - entity_class.__name__.lower(): entity - } - if review_class.objects.filter(**params).exists(): - return 2 - content = re.sub(r'([^<]+)', r'\1', content) - content = re.sub(r'(]+>)', r'\1
    ', content) - content = re.sub(r'
    ([^<]+)
    ', r'
    \1
    ', content) - content = md(content) - content = re.sub(r'(?<=!\[\]\()([^)]+)(?=\))', lambda x: fetch_remote_image(x[1]), content) - params = { - 'owner': self.user, - 'created_time': time, - 'edited_time': time, - 'title': title, - 'content': content, - 'visibility': self.visibility, - entity_class.__name__.lower(): entity, - } - review_class.objects.create(**params) - return 1 diff --git a/common/importers/goodreads.py b/common/importers/goodreads.py deleted file mode 100644 index 5a2263c7..00000000 --- a/common/importers/goodreads.py +++ /dev/null @@ -1,202 +0,0 @@ -import re -import requests -from lxml import html -from datetime import datetime -# from common.scrapers.goodreads import GoodreadsScraper -from common.scraper import get_scraper_by_url -from books.models import Book, BookMark -from collection.models import Collection -from common.models import MarkStatusEnum -from django.conf import settings -from user_messages import api as msg -import django_rq -from django.utils.timezone import make_aware - - -re_list = r'^https://www.goodreads.com/list/show/\d+' -re_shelf = r'^https://www.goodreads.com/review/list/\d+[^?]*\?shelf=[^&]+' -re_profile = r'^https://www.goodreads.com/user/show/(\d+)' -gr_rating = { - 'did not like it': 2, - 'it was ok': 4, - 'liked it': 6, - 'really liked it': 8, - 'it was amazing': 10 -} - - -class GoodreadsImporter: - @classmethod - def import_from_url(self, raw_url, user): - match_list = re.match(re_list, raw_url) - match_shelf = re.match(re_shelf, raw_url) - match_profile = re.match(re_profile, raw_url) - if match_profile or match_shelf or match_list: - django_rq.get_queue('doufen').enqueue(self.import_from_url_task, raw_url, user) - return True - else: - return False - - @classmethod - def import_from_url_task(cls, url, user): - match_list = re.match(re_list, url) - match_shelf = re.match(re_shelf, url) - match_profile = re.match(re_profile, url) - total = 0 - if match_list or match_shelf: - shelf = cls.parse_shelf(match_shelf[0], user) if match_shelf else cls.parse_list(match_list[0], user) - if shelf['title'] and shelf['books']: - collection = Collection.objects.create(title=shelf['title'], - description=shelf['description'] + '\n\nImported from [Goodreads](' + url + ')', - owner=user) - for book in shelf['books']: - collection.append_item(book['book'], book['review']) - total += 1 - collection.save() - msg.success(user, f'成功从Goodreads导入包含{total}本书的收藏单{shelf["title"]}。') - elif match_profile: - uid = match_profile[1] - shelves = { - MarkStatusEnum.WISH: f'https://www.goodreads.com/review/list/{uid}?shelf=to-read', - MarkStatusEnum.DO: f'https://www.goodreads.com/review/list/{uid}?shelf=currently-reading', - MarkStatusEnum.COLLECT: f'https://www.goodreads.com/review/list/{uid}?shelf=read', - } - for status in shelves: - shelf_url = shelves.get(status) - shelf = cls.parse_shelf(shelf_url, user) - for book in shelf['books']: - params = { - 'owner': user, - 'rating': book['rating'], - 'text': book['review'], - 'status': status, - 'visibility': user.preference.default_visibility, - 'book': book['book'], - } - if book['last_updated']: - params['created_time'] = book['last_updated'] - params['edited_time'] = book['last_updated'] - try: - mark = BookMark.objects.create(**params) - mark.book.update_rating(None, mark.rating) - except Exception: - print(f'Skip mark for {book["book"]}') - pass - total += 1 - msg.success(user, f'成功从Goodreads用户主页导入{total}个标记。') - - @classmethod - def parse_shelf(cls, url, user): # return {'title': 'abc', books: [{'book': obj, 
'rating': 10, 'review': 'txt'}, ...]} - title = None - books = [] - url_shelf = url + '&view=table' - while url_shelf: - print(f'Shelf loading {url_shelf}') - r = requests.get(url_shelf, timeout=settings.SCRAPING_TIMEOUT) - if r.status_code != 200: - print(f'Shelf loading error {url_shelf}') - break - url_shelf = None - content = html.fromstring(r.content.decode('utf-8')) - title_elem = content.xpath("//span[@class='h1Shelf']/text()") - if not title_elem: - print(f'Shelf parsing error {url_shelf}') - break - title = title_elem[0].strip() - print("Shelf title: " + title) - for cell in content.xpath("//tbody[@id='booksBody']/tr"): - url_book = 'https://www.goodreads.com' + \ - cell.xpath( - ".//td[@class='field title']//a/@href")[0].strip() - # has_review = cell.xpath( - # ".//td[@class='field actions']//a/text()")[0].strip() == 'view (with text)' - rating_elem = cell.xpath( - ".//td[@class='field rating']//span/@title") - rating = gr_rating.get( - rating_elem[0].strip()) if rating_elem else None - url_review = 'https://www.goodreads.com' + \ - cell.xpath( - ".//td[@class='field actions']//a/@href")[0].strip() - review = '' - last_updated = None - try: - r2 = requests.get( - url_review, timeout=settings.SCRAPING_TIMEOUT) - if r2.status_code == 200: - c2 = html.fromstring(r2.content.decode('utf-8')) - review_elem = c2.xpath( - "//div[@itemprop='reviewBody']/text()") - review = '\n'.join( - p.strip() for p in review_elem) if review_elem else '' - date_elem = c2.xpath( - "//div[@class='readingTimeline__text']/text()") - for d in date_elem: - date_matched = re.search(r'(\w+)\s+(\d+),\s+(\d+)', d) - if date_matched: - last_updated = make_aware(datetime.strptime(date_matched[1] + ' ' + date_matched[2] + ' ' + date_matched[3], '%B %d %Y')) - else: - print(f"Error loading review{url_review}, ignored") - scraper = get_scraper_by_url(url_book) - url_book = scraper.get_effective_url(url_book) - book = Book.objects.filter(source_url=url_book).first() - if not book: - print("add new book " + url_book) - scraper.scrape(url_book) - form = scraper.save(request_user=user) - book = form.instance - books.append({ - 'url': url_book, - 'book': book, - 'rating': rating, - 'review': review, - 'last_updated': last_updated - }) - except Exception: - print("Error adding " + url_book) - pass # likely just download error - next_elem = content.xpath("//a[@class='next_page']/@href") - url_shelf = ('https://www.goodreads.com' + next_elem[0].strip()) if next_elem else None - return {'title': title, 'description': '', 'books': books} - - @classmethod - def parse_list(cls, url, user): # return {'title': 'abc', books: [{'book': obj, 'rating': 10, 'review': 'txt'}, ...]} - title = None - books = [] - url_shelf = url - while url_shelf: - print(f'List loading {url_shelf}') - r = requests.get(url_shelf, timeout=settings.SCRAPING_TIMEOUT) - if r.status_code != 200: - print(f'List loading error {url_shelf}') - break - url_shelf = None - content = html.fromstring(r.content.decode('utf-8')) - title_elem = content.xpath('//h1[@class="gr-h1 gr-h1--serif"]/text()') - if not title_elem: - print(f'List parsing error {url_shelf}') - break - title = title_elem[0].strip() - description = content.xpath('//div[@class="mediumText"]/text()')[0].strip() - print("List title: " + title) - for link in content.xpath('//a[@class="bookTitle"]/@href'): - url_book = 'https://www.goodreads.com' + link - try: - scraper = get_scraper_by_url(url_book) - url_book = scraper.get_effective_url(url_book) - book = Book.objects.filter(source_url=url_book).first() - 
if not book: - print("add new book " + url_book) - scraper.scrape(url_book) - form = scraper.save(request_user=user) - book = form.instance - books.append({ - 'url': url_book, - 'book': book, - 'review': '', - }) - except Exception: - print("Error adding " + url_book) - pass # likely just download error - next_elem = content.xpath("//a[@class='next_page']/@href") - url_shelf = ('https://www.goodreads.com' + next_elem[0].strip()) if next_elem else None - return {'title': title, 'description': description, 'books': books} diff --git a/common/management/commands/index_stats.py b/common/management/commands/index_stats.py deleted file mode 100644 index 28a9f07e..00000000 --- a/common/management/commands/index_stats.py +++ /dev/null @@ -1,40 +0,0 @@ -from django.core.management.base import BaseCommand -from common.index import Indexer -from django.conf import settings -from movies.models import Movie -from books.models import Book -from games.models import Game -from music.models import Album, Song -from django.core.paginator import Paginator -from tqdm import tqdm -from time import sleep -from datetime import timedelta -from django.utils import timezone - - -class Command(BaseCommand): - help = 'Check search index' - - def handle(self, *args, **options): - print(f'Connecting to search server') - stats = Indexer.get_stats() - print(stats) - st = Indexer.instance().get_all_update_status() - cnt = {"enqueued": [0, 0], "processing": [0, 0], "processed": [0, 0], "failed": [0, 0]} - lastEnq = {"enqueuedAt": ""} - lastProc = {"enqueuedAt": ""} - for s in st: - n = s["type"].get("number") - cnt[s["status"]][0] += 1 - cnt[s["status"]][1] += n if n else 0 - if s["status"] == "processing": - print(s) - elif s["status"] == "enqueued": - if s["enqueuedAt"] > lastEnq["enqueuedAt"]: - lastEnq = s - elif s["status"] == "processed": - if s["enqueuedAt"] > lastProc["enqueuedAt"]: - lastProc = s - print(lastEnq) - print(lastProc) - print(cnt) diff --git a/common/management/commands/init_index.py b/common/management/commands/init_index.py deleted file mode 100644 index 797d32e4..00000000 --- a/common/management/commands/init_index.py +++ /dev/null @@ -1,18 +0,0 @@ -from django.core.management.base import BaseCommand -from common.index import Indexer -from django.conf import settings - - -class Command(BaseCommand): - help = 'Initialize the search index' - - def handle(self, *args, **options): - print(f'Connecting to search server') - Indexer.init() - self.stdout.write(self.style.SUCCESS('Index created.')) - # try: - # Indexer.init() - # self.stdout.write(self.style.SUCCESS('Index created.')) - # except Exception: - # Indexer.update_settings() - # self.stdout.write(self.style.SUCCESS('Index settings updated.')) diff --git a/common/management/commands/reindex.py b/common/management/commands/reindex.py deleted file mode 100644 index 5dcc766f..00000000 --- a/common/management/commands/reindex.py +++ /dev/null @@ -1,40 +0,0 @@ -from django.core.management.base import BaseCommand -from common.index import Indexer -from django.conf import settings -from movies.models import Movie -from books.models import Book -from games.models import Game -from music.models import Album, Song -from django.core.paginator import Paginator -from tqdm import tqdm -from time import sleep -from datetime import timedelta -from django.utils import timezone - - -BATCH_SIZE = 1000 - - -class Command(BaseCommand): - help = 'Regenerate the search index' - - # def add_arguments(self, parser): - # parser.add_argument('hours', type=int, 
help='Re-index items modified in last N hours, 0 to reindex all') - - def handle(self, *args, **options): - # h = int(options['hours']) - print(f'Connecting to search server') - if Indexer.busy(): - print('Please wait for previous updates') - # Indexer.update_settings() - # self.stdout.write(self.style.SUCCESS('Index settings updated.')) - for c in [Book, Song, Album, Game, Movie]: - print(f'Re-indexing {c}') - qs = c.objects.all() # if h == 0 else c.objects.filter(edited_time__gt=timezone.now() - timedelta(hours=h)) - pg = Paginator(qs.order_by('id'), BATCH_SIZE) - for p in tqdm(pg.page_range): - items = list(map(lambda o: Indexer.obj_to_dict(o), pg.get_page(p).object_list)) - if items: - Indexer.replace_batch(items) - while Indexer.busy(): - sleep(0.5) diff --git a/common/scraper.py b/common/scraper.py deleted file mode 100644 index 56a8a763..00000000 --- a/common/scraper.py +++ /dev/null @@ -1,265 +0,0 @@ -import requests -import functools -import random -import logging -import re -import dateparser -import datetime -import time -import filetype -import dns.resolver -import urllib.parse -from lxml import html -from threading import Thread -from django.utils import timezone -from django.utils.translation import ugettext_lazy as _ -from django.core.exceptions import ObjectDoesNotExist, ValidationError -from django.core.files.uploadedfile import SimpleUploadedFile -from common.models import SourceSiteEnum -from django.conf import settings -from django.core.exceptions import ValidationError - - -RE_NUMBERS = re.compile(r"\d+\d*") -RE_WHITESPACES = re.compile(r"\s+") - - -DEFAULT_REQUEST_HEADERS = { - 'Host': '', - 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:70.0) Gecko/20100101 Firefox/70.0', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2', - # well, since brotli lib is so bothering, remove `br` - 'Accept-Encoding': 'gzip, deflate', - 'Connection': 'keep-alive', - 'DNT': '1', - 'Upgrade-Insecure-Requests': '1', - 'Cache-Control': 'no-cache', -} - - -# luminati account credentials -PORT = 22225 - - -logger = logging.getLogger(__name__) - - -# register all implemented scraper in form of {host: scraper_class,} -scraper_registry = {} - - -def get_normalized_url(raw_url): - url = re.sub(r'//m.douban.com/(\w+)/', r'//\1.douban.com/', raw_url) - url = re.sub(r'//www.google.com/books/edition/_/([A-Za-z0-9_\-]+)[\?]*', r'//books.google.com/books?id=\1&', url) - return url - - -def log_url(func): - """ - Catch exceptions and log then pass the exceptions. - First postion argument (except cls/self) of decorated function must be the url. - """ - @functools.wraps(func) - def wrapper(*args, **kwargs): - try: - return func(*args, **kwargs) - except Exception as e: - # log the url and trace stack - logger.error(f"Scrape Failed URL: {args[1]}\n{e}") - if settings.DEBUG: - logger.error("Expections during scraping:", exc_info=e) - raise e - - return wrapper - - -def parse_date(raw_str): - return dateparser.parse( - raw_str, - settings={ - "RELATIVE_BASE": datetime.datetime(1900, 1, 1) - } - ) - - -class AbstractScraper: - """ - Scrape entities. The entities means those defined in the models.py file, - like Book, Movie...... 
- """ - - # subclasses must specify those two variables - # site means general sites, like amazon/douban etc - site_name = None - # host means technically hostname - host = None - # corresponding data class - data_class = None - # corresponding form class - form_class = None - # used to extract effective url - regex = None - # scraped raw image - raw_img = None - # scraped raw data - raw_data = {} - - def __init_subclass__(cls, **kwargs): - # this statement initialize the subclasses - super().__init_subclass__(**kwargs) - assert cls.site_name is not None, "class variable `site_name` must be specified" - assert bool(cls.host), "class variable `host` must be specified" - assert cls.data_class is not None, "class variable `data_class` must be specified" - assert cls.form_class is not None, "class variable `form_class` must be specified" - assert cls.regex is not None, "class variable `regex` must be specified" - assert isinstance(cls.host, str) or (isinstance(cls.host, list) and isinstance( - cls.host[0], str)), "`host` must be type str or list" - assert cls.site_name in SourceSiteEnum, "`site_name` must be one of `SourceSiteEnum` value" - assert hasattr(cls, 'scrape') and callable( - cls.scrape), "scaper must have method `.scrape()`" - - # decorate the scrape method - cls.scrape = classmethod(log_url(cls.scrape)) - - # register scraper - if isinstance(cls.host, list): - for host in cls.host: - scraper_registry[host] = cls - else: - scraper_registry[cls.host] = cls - - def scrape(self, url): - """ - Scrape/request model schema specified data from given url and return it. - Implementations of subclasses to this method would be decorated as class method. - return (data_dict, image) - Should set the `raw_data` and the `raw_img` - """ - raise NotImplementedError("Subclass should implement this method") - - @classmethod - def get_effective_url(cls, raw_url): - """ - The return value should be identical with that saved in DB as `source_url` - """ - url = cls.regex.findall(raw_url.replace('http:', 'https:')) # force all http to be https - if not url: - raise ValueError(f"not valid url: {raw_url}") - return url[0] - - @classmethod - def download_page(cls, url, headers): - url = cls.get_effective_url(url) - - if settings.LUMINATI_USERNAME is None: - proxies = None - if settings.PROXYCRAWL_KEY is not None: - url = f'https://api.proxycrawl.com/?token={settings.PROXYCRAWL_KEY}&url={url}' - # if settings.SCRAPESTACK_KEY is not None: - # url = f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={url}' - else: - session_id = random.random() - proxy_url = ('http://%s-country-cn-session-%s:%s@zproxy.lum-superproxy.io:%d' % - (settings.LUMINATI_USERNAME, session_id, settings.LUMINATI_PASSWORD, PORT)) - proxies = { - 'http': proxy_url, - 'https': proxy_url, - } - - r = requests.get(url, proxies=proxies, - headers=headers, timeout=settings.SCRAPING_TIMEOUT) - - if r.status_code != 200: - raise RuntimeError(f"download page failed, status code {r.status_code}") - # with open('temp.html', 'w', encoding='utf-8') as fp: - # fp.write(r.content.decode('utf-8')) - return html.fromstring(r.content.decode('utf-8')) - - @classmethod - def download_image(cls, url, item_url=None): - if url is None: - return None, None - raw_img = None - session_id = random.random() - proxy_url = ('http://%s-country-cn-session-%s:%s@zproxy.lum-superproxy.io:%d' % - (settings.LUMINATI_USERNAME, session_id, settings.LUMINATI_PASSWORD, PORT)) - proxies = { - 'http': proxy_url, - 'https': proxy_url, - } - if 
settings.LUMINATI_USERNAME is None: - proxies = None - if url: - img_response = requests.get( - url, - headers={ - 'accept': 'image/webp,image/apng,image/*,*/*;q=0.8', - 'accept-encoding': 'gzip, deflate', - 'accept-language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,fr-FR;q=0.6,fr;q=0.5,zh-TW;q=0.4', - 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36 Edg/81.0.416.72', - 'cache-control': 'no-cache', - 'dnt': '1', - }, - proxies=proxies, - timeout=settings.SCRAPING_TIMEOUT, - ) - if img_response.status_code == 200: - raw_img = img_response.content - content_type = img_response.headers.get('Content-Type') - ext = filetype.get_type(mime=content_type.partition(';')[0].strip()).extension - else: - ext = None - return raw_img, ext - - @classmethod - def save(cls, request_user, instance=None): - entity_cover = { - 'cover': SimpleUploadedFile('temp.' + cls.img_ext, cls.raw_img) - } if cls.img_ext is not None else None - form = cls.form_class(data=cls.raw_data, files=entity_cover, instance=instance) - if form.is_valid(): - form.instance.last_editor = request_user - form.instance._change_reason = 'scrape' - form.save() - cls.instance = form.instance - else: - logger.error(str(form.errors)) - raise ValidationError("Form invalid.") - return form - - -from common.scrapers.bandcamp import BandcampAlbumScraper -from common.scrapers.goodreads import GoodreadsScraper -from common.scrapers.google import GoogleBooksScraper -from common.scrapers.tmdb import TmdbMovieScraper -from common.scrapers.steam import SteamGameScraper -from common.scrapers.imdb import ImdbMovieScraper -from common.scrapers.igdb import IgdbGameScraper -from common.scrapers.spotify import SpotifyAlbumScraper, SpotifyTrackScraper -from common.scrapers.douban import DoubanAlbumScraper, DoubanBookScraper, DoubanGameScraper, DoubanMovieScraper -from common.scrapers.bangumi import BangumiScraper - - -def get_scraper_by_url(url): - parsed_url = urllib.parse.urlparse(url) - hostname = parsed_url.netloc - for host in scraper_registry: - if host in url: - return scraper_registry[host] - # TODO move this logic to scraper class - try: - answers = dns.resolver.query(hostname, 'CNAME') - for rdata in answers: - if str(rdata.target) == 'dom.bandcamp.com.': - return BandcampAlbumScraper - except Exception as e: - pass - try: - answers = dns.resolver.query(hostname, 'A') - for rdata in answers: - if str(rdata.address) == '35.241.62.186': - return BandcampAlbumScraper - except Exception as e: - pass - return None diff --git a/common/scrapers/bandcamp.py b/common/scrapers/bandcamp.py deleted file mode 100644 index 5f939da6..00000000 --- a/common/scrapers/bandcamp.py +++ /dev/null @@ -1,71 +0,0 @@ -import re -import dateparser -import json -from lxml import html -from common.models import SourceSiteEnum -from common.scraper import AbstractScraper -from music.models import Album -from music.forms import AlbumForm - - -class BandcampAlbumScraper(AbstractScraper): - site_name = SourceSiteEnum.BANDCAMP.value - # API URL - host = '.bandcamp.com/' - data_class = Album - form_class = AlbumForm - - regex = re.compile(r"https://[a-zA-Z0-9\-\.]+/album/[^?#]+") - - def scrape(self, url, response=None): - effective_url = self.get_effective_url(url) - if effective_url is None: - raise ValueError("not valid url") - if response is not None: - content = html.fromstring(response.content.decode('utf-8')) - else: - content = self.download_page(url, {}) - try: - title = 
content.xpath("//h2[@class='trackTitle']/text()")[0].strip() - artist = [content.xpath("//div[@id='name-section']/h3/span/a/text()")[0].strip()] - except IndexError: - raise ValueError("given url contains no valid info") - - genre = [] # TODO: parse tags - track_list = [] - release_nodes = content.xpath("//div[@class='tralbumData tralbum-credits']/text()") - release_date = dateparser.parse(re.sub(r'releas\w+ ', '', release_nodes[0].strip())) if release_nodes else None - duration = None - company = None - brief_nodes = content.xpath("//div[@class='tralbumData tralbum-about']/text()") - brief = "".join(brief_nodes) if brief_nodes else None - cover_url = content.xpath("//div[@id='tralbumArt']/a/@href")[0].strip() - bandcamp_page_data = json.loads(content.xpath( - "//meta[@name='bc-page-properties']/@content")[0].strip()) - other_info = {} - other_info['bandcamp_album_id'] = bandcamp_page_data['item_id'] - - raw_img, ext = self.download_image(cover_url, url) - - data = { - 'title': title, - 'artist': artist, - 'genre': genre, - 'track_list': track_list, - 'release_date': release_date, - 'duration': duration, - 'company': company, - 'brief': brief, - 'other_info': other_info, - 'source_site': self.site_name, - 'source_url': effective_url, - 'cover_url': cover_url, - } - - self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext - return data, raw_img - - @classmethod - def get_effective_url(cls, raw_url): - url = cls.regex.findall(raw_url) - return url[0] if len(url) > 0 else None diff --git a/common/scrapers/bangumi.py b/common/scrapers/bangumi.py deleted file mode 100644 index 498ba849..00000000 --- a/common/scrapers/bangumi.py +++ /dev/null @@ -1,199 +0,0 @@ -import re -from common.models import SourceSiteEnum -from movies.models import Movie, MovieGenreEnum -from movies.forms import MovieForm -from books.models import Book -from books.forms import BookForm -from music.models import Album, Song -from music.forms import AlbumForm, SongForm -from games.models import Game -from games.forms import GameForm -from common.scraper import * -from django.core.exceptions import ObjectDoesNotExist - - -def find_entity(source_url): - """ - for bangumi - """ - # to be added when new scrape method is implemented - result = Game.objects.filter(source_url=source_url) - if result: - return result[0] - else: - raise ObjectDoesNotExist - - -class BangumiScraper(AbstractScraper): - site_name = SourceSiteEnum.BANGUMI.value - host = 'bgm.tv' - - # for interface coherence - data_class = type("FakeDataClass", (object,), {})() - data_class.objects = type("FakeObjectsClass", (object,), {})() - data_class.objects.get = find_entity - # should be set at scrape_* method - form_class = '' - - regex = re.compile(r"https{0,1}://bgm\.tv/subject/\d+") - - def scrape(self, url): - """ - This is the scraping portal - """ - headers = DEFAULT_REQUEST_HEADERS.copy() - headers['Host'] = self.host - content = self.download_page(url, headers) - - # download image - img_url = 'http:' + content.xpath("//div[@class='infobox']//img[1]/@src")[0] - raw_img, ext = self.download_image(img_url, url) - - # Test category - category_code = content.xpath("//div[@id='headerSearch']//option[@selected]/@value")[0] - handler_map = { - '1': self.scrape_book, - '2': self.scrape_movie, - '3': self.scrape_album, - '4': self.scrape_game - } - data = handler_map[category_code](self, content) - data['source_url'] = self.get_effective_url(url) - - self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext - return data, raw_img - - def 
scrape_game(self, content): - self.data_class = Game - self.form_class = GameForm - - title_elem = content.xpath("//a[@property='v:itemreviewed']/text()") - if not title_elem: - raise ValueError("no game info found on this page") - title = None - else: - title = title_elem[0].strip() - - other_title_elem = content.xpath( - "//ul[@id='infobox']/li[child::span[contains(text(),'别名')]]/text()") - if not other_title_elem: - other_title_elem = content.xpath( - "//ul[@id='infobox']/li[child::span[contains(text(),'别名')]]/a/text()") - other_title = other_title_elem if other_title_elem else [] - - chinese_name_elem = content.xpath( - "//ul[@id='infobox']/li[child::span[contains(text(),'中文')]]/text()") - if not chinese_name_elem: - chinese_name_elem = content.xpath( - "//ul[@id='infobox']/li[child::span[contains(text(),'中文')]]/a/text()") - if chinese_name_elem: - chinese_name = chinese_name_elem[0] - # switch chinese name with original name - title, chinese_name = chinese_name, title - # actually the name appended is original - other_title.append(chinese_name) - - developer_elem = content.xpath( - "//ul[@id='infobox']/li[child::span[contains(text(),'开发')]]/text()") - if not developer_elem: - developer_elem = content.xpath( - "//ul[@id='infobox']/li[child::span[contains(text(),'开发')]]/a/text()") - developer = developer_elem if developer_elem else None - - publisher_elem = content.xpath( - "//ul[@id='infobox']/li[child::span[contains(text(),'发行:')]]/text()") - if not publisher_elem: - publisher_elem = content.xpath( - "//ul[@id='infobox']/li[child::span[contains(text(),'发行:')]]/a/text()") - publisher = publisher_elem if publisher_elem else None - - platform_elem = content.xpath( - "//ul[@id='infobox']/li[child::span[contains(text(),'平台')]]/text()") - if not platform_elem: - platform_elem = content.xpath( - "//ul[@id='infobox']/li[child::span[contains(text(),'平台')]]/a/text()") - platform = platform_elem if platform_elem else None - - genre_elem = content.xpath( - "//ul[@id='infobox']/li[child::span[contains(text(),'类型')]]/text()") - if not genre_elem: - genre_elem = content.xpath( - "//ul[@id='infobox']/li[child::span[contains(text(),'类型')]]/a/text()") - genre = genre_elem if genre_elem else None - - date_elem = content.xpath( - "//ul[@id='infobox']/li[child::span[contains(text(),'发行日期')]]/text()") - if not date_elem: - date_elem = content.xpath( - "//ul[@id='infobox']/li[child::span[contains(text(),'发行日期')]]/a/text()") - release_date = parse_date(date_elem[0]) if date_elem else None - - brief = ''.join(content.xpath("//div[@property='v:summary']/text()")) - - other_info = {} - other_elem = content.xpath( - "//ul[@id='infobox']/li[child::span[contains(text(),'人数')]]/text()") - if other_elem: - other_info['游玩人数'] = other_elem[0] - other_elem = content.xpath( - "//ul[@id='infobox']/li[child::span[contains(text(),'引擎')]]/text()") - if other_elem: - other_info['引擎'] = ' '.join(other_elem) - other_elem = content.xpath( - "//ul[@id='infobox']/li[child::span[contains(text(),'售价')]]/text()") - if other_elem: - other_info['售价'] = ' '.join(other_elem) - other_elem = content.xpath( - "//ul[@id='infobox']/li[child::span[contains(text(),'官方网站')]]/text()") - if other_elem: - other_info['网站'] = other_elem[0] - other_elem = content.xpath( - "//ul[@id='infobox']/li[child::span[contains(text(),'剧本')]]/a/text()") or content.xpath( - "//ul[@id='infobox']/li[child::span[contains(text(),'剧本')]]/text()") - if other_elem: - other_info['剧本'] = ' '.join(other_elem) - other_elem = content.xpath( - 
"//ul[@id='infobox']/li[child::span[contains(text(),'编剧')]]/a/text()") or content.xpath( - "//ul[@id='infobox']/li[child::span[contains(text(),'编剧')]]/text()") - if other_elem: - other_info['编剧'] = ' '.join(other_elem) - other_elem = content.xpath( - "//ul[@id='infobox']/li[child::span[contains(text(),'音乐')]]/a/text()") or content.xpath( - "//ul[@id='infobox']/li[child::span[contains(text(),'音乐')]]/text()") - if other_elem: - other_info['音乐'] = ' '.join(other_elem) - other_elem = content.xpath( - "//ul[@id='infobox']/li[child::span[contains(text(),'美术')]]/a/text()") or content.xpath( - "//ul[@id='infobox']/li[child::span[contains(text(),'美术')]]/text()") - if other_elem: - other_info['美术'] = ' '.join(other_elem) - - data = { - 'title': title, - 'other_title': None, - 'developer': developer, - 'publisher': publisher, - 'release_date': release_date, - 'genre': genre, - 'platform': platform, - 'brief': brief, - 'other_info': other_info, - 'source_site': self.site_name, - } - - return data - - def scrape_movie(self, content): - self.data_class = Movie - self.form_class = MovieForm - raise NotImplementedError - - def scrape_book(self, content): - self.data_class = Book - self.form_class = BookForm - raise NotImplementedError - - def scrape_album(self, content): - self.data_class = Album - self.form_class = AlbumForm - raise NotImplementedError diff --git a/common/scrapers/douban.py b/common/scrapers/douban.py deleted file mode 100644 index 1cfd7310..00000000 --- a/common/scrapers/douban.py +++ /dev/null @@ -1,714 +0,0 @@ -import requests -import re -import filetype -from lxml import html -from common.models import SourceSiteEnum -from movies.models import Movie, MovieGenreEnum -from movies.forms import MovieForm -from books.models import Book -from books.forms import BookForm -from music.models import Album -from music.forms import AlbumForm -from games.models import Game -from games.forms import GameForm -from django.core.validators import URLValidator -from django.conf import settings -from PIL import Image -from io import BytesIO -from common.scraper import * - - -class DoubanScrapperMixin: - @classmethod - def download_page(cls, url, headers): - url = cls.get_effective_url(url) - r = None - error = 'DoubanScrapper: error occured when downloading ' + url - content = None - last_error = None - - def get(url): - nonlocal r - # print('Douban GET ' + url) - try: - r = requests.get(url, timeout=settings.SCRAPING_TIMEOUT) - except Exception as e: - r = requests.Response() - r.status_code = f"Exception when GET {url} {e}" + url - # print('Douban CODE ' + str(r.status_code)) - return r - - def check_content(): - nonlocal r, error, content, last_error - content = None - last_error = None - if r.status_code == 200: - content = r.content.decode('utf-8') - if content.find('关于豆瓣') == -1: - if content.find('你的 IP 发出') == -1: - error = error + 'Content not authentic' # response is garbage - else: - error = error + 'IP banned' - content = None - last_error = 'network' - elif content.find('页面不存在') != -1 or content.find('呃... 
你想访问的条目豆瓣不收录。') != -1: # re.search('不存在[^<]+', content, re.MULTILINE): - content = None - last_error = 'censorship' - error = error + 'Not found or hidden by Douban' - elif r.status_code == 204: - content = None - last_error = 'censorship' - error = error + 'Not found or hidden by Douban' - else: - content = None - last_error = 'network' - error = error + str(r.status_code) - - def fix_wayback_links(): - nonlocal content - # fix links - content = re.sub(r'href="http[^"]+http', r'href="http', content) - # https://img9.doubanio.com/view/subject/{l|m|s}/public/s1234.jpg - content = re.sub(r'src="[^"]+/(s\d+\.\w+)"', - r'src="https://img9.doubanio.com/view/subject/m/public/\1"', content) - # https://img9.doubanio.com/view/photo/s_ratio_poster/public/p2681329386.jpg - # https://img9.doubanio.com/view/photo/{l|m|s}/public/p1234.webp - content = re.sub(r'src="[^"]+/(p\d+\.\w+)"', - r'src="https://img9.doubanio.com/view/photo/m/public/\1"', content) - - # Wayback Machine: get latest available - def wayback(): - nonlocal r, error, content - error = error + '\nWayback: ' - get('http://archive.org/wayback/available?url=' + url) - if r.status_code == 200: - w = r.json() - if w['archived_snapshots'] and w['archived_snapshots']['closest']: - get(w['archived_snapshots']['closest']['url']) - check_content() - if content is not None: - fix_wayback_links() - else: - error = error + 'No snapshot available' - else: - error = error + str(r.status_code) - - # Wayback Machine: guess via CDX API - def wayback_cdx(): - nonlocal r, error, content - error = error + '\nWayback: ' - get('http://web.archive.org/cdx/search/cdx?url=' + url) - if r.status_code == 200: - dates = re.findall(r'[^\s]+\s+(\d+)\s+[^\s]+\s+[^\s]+\s+\d+\s+[^\s]+\s+\d{5,}', - r.content.decode('utf-8')) - # assume snapshots whose size >9999 contain real content, use the latest one of them - if len(dates) > 0: - get('http://web.archive.org/web/' + dates[-1] + '/' + url) - check_content() - if content is not None: - fix_wayback_links() - else: - error = error + 'No snapshot available' - else: - error = error + str(r.status_code) - - def latest(): - nonlocal r, error, content - if settings.SCRAPESTACK_KEY is not None: - error = error + '\nScrapeStack: ' - get(f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={url}') - elif settings.SCRAPERAPI_KEY is not None: - error = error + '\nScraperAPI: ' - get(f'http://api.scraperapi.com?api_key={settings.SCRAPERAPI_KEY}&url={url}') - else: - error = error + '\nDirect: ' - get(url) - check_content() - if last_error == 'network' and settings.PROXYCRAWL_KEY is not None: - error = error + '\nProxyCrawl: ' - get(f'https://api.proxycrawl.com/?token={settings.PROXYCRAWL_KEY}&url={url}') - check_content() - if last_error == 'censorship' and settings.LOCAL_PROXY is not None: - error = error + '\nLocal: ' - get(f'{settings.LOCAL_PROXY}?url={url}') - check_content() - - latest() - if content is None: - wayback_cdx() - - if content is None: - raise RuntimeError(error) - # with open('/tmp/temp.html', 'w', encoding='utf-8') as fp: - # fp.write(content) - return html.fromstring(content) - - @classmethod - def download_image(cls, url, item_url=None): - raw_img = None - ext = None - - if settings.SCRAPESTACK_KEY is not None: - dl_url = f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={url}' - elif settings.SCRAPERAPI_KEY is not None: - dl_url = f'http://api.scraperapi.com?api_key={settings.SCRAPERAPI_KEY}&url={url}' - else: - dl_url = url - - try: - img_response = 
requests.get(dl_url, timeout=settings.SCRAPING_TIMEOUT) - if img_response.status_code == 200: - raw_img = img_response.content - img = Image.open(BytesIO(raw_img)) - img.load() # corrupted image will trigger exception - content_type = img_response.headers.get('Content-Type') - ext = filetype.get_type(mime=content_type.partition(';')[0].strip()).extension - else: - logger.error(f"Douban: download image failed {img_response.status_code} {dl_url} {item_url}") - # raise RuntimeError(f"Douban: download image failed {img_response.status_code} {dl_url}") - except Exception as e: - raw_img = None - ext = None - logger.error(f"Douban: download image failed {e} {dl_url} {item_url}") - - if raw_img is None and settings.PROXYCRAWL_KEY is not None: - try: - dl_url = f'https://api.proxycrawl.com/?token={settings.PROXYCRAWL_KEY}&url={url}' - img_response = requests.get(dl_url, timeout=settings.SCRAPING_TIMEOUT) - if img_response.status_code == 200: - raw_img = img_response.content - img = Image.open(BytesIO(raw_img)) - img.load() # corrupted image will trigger exception - content_type = img_response.headers.get('Content-Type') - ext = filetype.get_type(mime=content_type.partition(';')[0].strip()).extension - else: - logger.error(f"Douban: download image failed {img_response.status_code} {dl_url} {item_url}") - except Exception as e: - raw_img = None - ext = None - logger.error(f"Douban: download image failed {e} {dl_url} {item_url}") - return raw_img, ext - - -class DoubanBookScraper(DoubanScrapperMixin, AbstractScraper): - site_name = SourceSiteEnum.DOUBAN.value - host = "book.douban.com" - data_class = Book - form_class = BookForm - - regex = re.compile(r"https://book\.douban\.com/subject/\d+/{0,1}") - - def scrape(self, url): - headers = DEFAULT_REQUEST_HEADERS.copy() - headers['Host'] = self.host - content = self.download_page(url, headers) - - isbn_elem = content.xpath("//div[@id='info']//span[text()='ISBN:']/following::text()") - isbn = isbn_elem[0].strip() if isbn_elem else None - title_elem = content.xpath("/html/body//h1/span/text()") - title = title_elem[0].strip() if title_elem else None - if not title: - if isbn: - title = 'isbn: ' + isbn - else: - raise ValueError("given url contains no book title or isbn") - - subtitle_elem = content.xpath( - "//div[@id='info']//span[text()='副标题:']/following::text()") - subtitle = subtitle_elem[0].strip()[:500] if subtitle_elem else None - - orig_title_elem = content.xpath( - "//div[@id='info']//span[text()='原作名:']/following::text()") - orig_title = orig_title_elem[0].strip()[:500] if orig_title_elem else None - - language_elem = content.xpath( - "//div[@id='info']//span[text()='语言:']/following::text()") - language = language_elem[0].strip() if language_elem else None - - pub_house_elem = content.xpath( - "//div[@id='info']//span[text()='出版社:']/following::text()") - pub_house = pub_house_elem[0].strip() if pub_house_elem else None - - pub_date_elem = content.xpath( - "//div[@id='info']//span[text()='出版年:']/following::text()") - pub_date = pub_date_elem[0].strip() if pub_date_elem else '' - year_month_day = RE_NUMBERS.findall(pub_date) - if len(year_month_day) in (2, 3): - pub_year = int(year_month_day[0]) - pub_month = int(year_month_day[1]) - elif len(year_month_day) == 1: - pub_year = int(year_month_day[0]) - pub_month = None - else: - pub_year = None - pub_month = None - if pub_year and pub_month and pub_year < pub_month: - pub_year, pub_month = pub_month, pub_year - pub_year = None if pub_year is not None and pub_year not in range( - 0, 3000) else 
pub_year - pub_month = None if pub_month is not None and pub_month not in range( - 1, 12) else pub_month - - binding_elem = content.xpath( - "//div[@id='info']//span[text()='装帧:']/following::text()") - binding = binding_elem[0].strip() if binding_elem else None - - price_elem = content.xpath( - "//div[@id='info']//span[text()='定价:']/following::text()") - price = price_elem[0].strip() if price_elem else None - - pages_elem = content.xpath( - "//div[@id='info']//span[text()='页数:']/following::text()") - pages = pages_elem[0].strip() if pages_elem else None - if pages is not None: - pages = int(RE_NUMBERS.findall(pages)[ - 0]) if RE_NUMBERS.findall(pages) else None - if pages and (pages > 999999 or pages < 1): - pages = None - - brief_elem = content.xpath( - "//h2/span[text()='内容简介']/../following-sibling::div[1]//div[@class='intro'][not(ancestor::span[@class='short'])]/p/text()") - brief = '\n'.join(p.strip() - for p in brief_elem) if brief_elem else None - - contents = None - try: - contents_elem = content.xpath( - "//h2/span[text()='目录']/../following-sibling::div[1]")[0] - # if next the id of next sibling contains `dir`, that would be the full contents - if "dir" in contents_elem.getnext().xpath("@id")[0]: - contents_elem = contents_elem.getnext() - contents = '\n'.join(p.strip() for p in contents_elem.xpath( - "text()")[:-2]) if contents_elem else None - else: - contents = '\n'.join(p.strip() for p in contents_elem.xpath( - "text()")) if contents_elem else None - except Exception: - pass - - img_url_elem = content.xpath("//*[@id='mainpic']/a/img/@src") - img_url = img_url_elem[0].strip() if img_url_elem else None - raw_img, ext = self.download_image(img_url, url) - - # there are two html formats for authors and translators - authors_elem = content.xpath("""//div[@id='info']//span[text()='作者:']/following-sibling::br[1]/ - preceding-sibling::a[preceding-sibling::span[text()='作者:']]/text()""") - if not authors_elem: - authors_elem = content.xpath( - """//div[@id='info']//span[text()=' 作者']/following-sibling::a/text()""") - if authors_elem: - authors = [] - for author in authors_elem: - authors.append(RE_WHITESPACES.sub(' ', author.strip())[:200]) - else: - authors = None - - translators_elem = content.xpath("""//div[@id='info']//span[text()='译者:']/following-sibling::br[1]/ - preceding-sibling::a[preceding-sibling::span[text()='译者:']]/text()""") - if not translators_elem: - translators_elem = content.xpath( - """//div[@id='info']//span[text()=' 译者']/following-sibling::a/text()""") - if translators_elem: - translators = [] - for translator in translators_elem: - translators.append(RE_WHITESPACES.sub(' ', translator.strip())) - else: - translators = None - - other = {} - cncode_elem = content.xpath( - "//div[@id='info']//span[text()='统一书号:']/following::text()") - if cncode_elem: - other['统一书号'] = cncode_elem[0].strip() - series_elem = content.xpath( - "//div[@id='info']//span[text()='丛书:']/following-sibling::a[1]/text()") - if series_elem: - other['丛书'] = series_elem[0].strip() - imprint_elem = content.xpath( - "//div[@id='info']//span[text()='出品方:']/following-sibling::a[1]/text()") - if imprint_elem: - other['出品方'] = imprint_elem[0].strip() - - data = { - 'title': title, - 'subtitle': subtitle, - 'orig_title': orig_title, - 'author': authors, - 'translator': translators, - 'language': language, - 'pub_house': pub_house, - 'pub_year': pub_year, - 'pub_month': pub_month, - 'binding': binding, - 'price': price, - 'pages': pages, - 'isbn': isbn, - 'brief': brief, - 'contents': contents, - 
'other_info': other, - 'source_site': self.site_name, - 'source_url': self.get_effective_url(url), - } - self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext - return data, raw_img - - -class DoubanMovieScraper(DoubanScrapperMixin, AbstractScraper): - site_name = SourceSiteEnum.DOUBAN.value - host = 'movie.douban.com' - data_class = Movie - form_class = MovieForm - - regex = re.compile(r"https://movie\.douban\.com/subject/\d+/{0,1}") - - def scrape(self, url): - headers = DEFAULT_REQUEST_HEADERS.copy() - headers['Host'] = self.host - content = self.download_page(url, headers) - - # parsing starts here - try: - raw_title = content.xpath( - "//span[@property='v:itemreviewed']/text()")[0].strip() - except IndexError: - raise ValueError("given url contains no movie info") - - orig_title = content.xpath( - "//img[@rel='v:image']/@alt")[0].strip() - title = raw_title.split(orig_title)[0].strip() - # if has no chinese title - if title == '': - title = orig_title - - if title == orig_title: - orig_title = None - - # there are two html formats for authors and translators - other_title_elem = content.xpath( - "//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]") - other_title = other_title_elem[0].strip().split( - ' / ') if other_title_elem else None - - imdb_elem = content.xpath( - "//div[@id='info']//span[text()='IMDb链接:']/following-sibling::a[1]/text()") - if not imdb_elem: - imdb_elem = content.xpath( - "//div[@id='info']//span[text()='IMDb:']/following-sibling::text()[1]") - imdb_code = imdb_elem[0].strip() if imdb_elem else None - - director_elem = content.xpath( - "//div[@id='info']//span[text()='导演']/following-sibling::span[1]/a/text()") - director = director_elem if director_elem else None - - playwright_elem = content.xpath( - "//div[@id='info']//span[text()='编剧']/following-sibling::span[1]/a/text()") - playwright = list(map(lambda a: a[:200], playwright_elem)) if playwright_elem else None - - actor_elem = content.xpath( - "//div[@id='info']//span[text()='主演']/following-sibling::span[1]/a/text()") - actor = list(map(lambda a: a[:200], actor_elem)) if actor_elem else None - - # construct genre translator - genre_translator = {} - attrs = [attr for attr in dir(MovieGenreEnum) if '__' not in attr] - for attr in attrs: - genre_translator[getattr(MovieGenreEnum, attr).label] = getattr( - MovieGenreEnum, attr).value - - genre_elem = content.xpath("//span[@property='v:genre']/text()") - if genre_elem: - genre = [] - for g in genre_elem: - g = g.split(' ')[0] - if g == '紀錄片': # likely some original data on douban was corrupted - g = '纪录片' - elif g == '鬼怪': - g = '惊悚' - if g in genre_translator: - genre.append(genre_translator[g]) - elif g in genre_translator.values(): - genre.append(g) - else: - logger.error(f'unable to map genre {g}') - else: - genre = None - - showtime_elem = content.xpath( - "//span[@property='v:initialReleaseDate']/text()") - if showtime_elem: - showtime = [] - for st in showtime_elem: - parts = st.split('(') - if len(parts) == 1: - time = st.split('(')[0] - region = '' - else: - time = st.split('(')[0] - region = st.split('(')[1][0:-1] - showtime.append({time: region}) - else: - showtime = None - - site_elem = content.xpath( - "//div[@id='info']//span[text()='官方网站:']/following-sibling::a[1]/@href") - site = site_elem[0].strip()[:200] if site_elem else None - try: - validator = URLValidator() - validator(site) - except ValidationError: - site = None - - area_elem = content.xpath( - 
"//div[@id='info']//span[text()='制片国家/地区:']/following-sibling::text()[1]") - if area_elem: - area = [a.strip()[:100] for a in area_elem[0].split('/')] - else: - area = None - - language_elem = content.xpath( - "//div[@id='info']//span[text()='语言:']/following-sibling::text()[1]") - if language_elem: - language = [a.strip() for a in language_elem[0].split(' / ')] - else: - language = None - - year_elem = content.xpath("//span[@class='year']/text()") - year = int(re.search(r'\d+', year_elem[0])[0]) if year_elem and re.search(r'\d+', year_elem[0]) else None - - duration_elem = content.xpath("//span[@property='v:runtime']/text()") - other_duration_elem = content.xpath( - "//span[@property='v:runtime']/following-sibling::text()[1]") - if duration_elem: - duration = duration_elem[0].strip() - if other_duration_elem: - duration += other_duration_elem[0].rstrip() - duration = duration.split('/')[0].strip() - else: - duration = None - - season_elem = content.xpath( - "//*[@id='season']/option[@selected='selected']/text()") - if not season_elem: - season_elem = content.xpath( - "//div[@id='info']//span[text()='季数:']/following-sibling::text()[1]") - season = int(season_elem[0].strip()) if season_elem else None - else: - season = int(season_elem[0].strip()) - - episodes_elem = content.xpath( - "//div[@id='info']//span[text()='集数:']/following-sibling::text()[1]") - episodes = int(episodes_elem[0].strip()) if episodes_elem and episodes_elem[0].strip().isdigit() else None - - single_episode_length_elem = content.xpath( - "//div[@id='info']//span[text()='单集片长:']/following-sibling::text()[1]") - single_episode_length = single_episode_length_elem[0].strip( - )[:100] if single_episode_length_elem else None - - # if has field `episodes` not none then must be series - is_series = True if episodes else False - - brief_elem = content.xpath("//span[@class='all hidden']") - if not brief_elem: - brief_elem = content.xpath("//span[@property='v:summary']") - brief = '\n'.join([e.strip() for e in brief_elem[0].xpath( - './text()')]) if brief_elem else None - - img_url_elem = content.xpath("//img[@rel='v:image']/@src") - img_url = img_url_elem[0].strip() if img_url_elem else None - raw_img, ext = self.download_image(img_url, url) - - data = { - 'title': title, - 'orig_title': orig_title, - 'other_title': other_title, - 'imdb_code': imdb_code, - 'director': director, - 'playwright': playwright, - 'actor': actor, - 'genre': genre, - 'showtime': showtime, - 'site': site, - 'area': area, - 'language': language, - 'year': year, - 'duration': duration, - 'season': season, - 'episodes': episodes, - 'single_episode_length': single_episode_length, - 'brief': brief, - 'is_series': is_series, - 'source_site': self.site_name, - 'source_url': self.get_effective_url(url), - } - self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext - return data, raw_img - - -class DoubanAlbumScraper(DoubanScrapperMixin, AbstractScraper): - site_name = SourceSiteEnum.DOUBAN.value - host = 'music.douban.com' - data_class = Album - form_class = AlbumForm - - regex = re.compile(r"https://music\.douban\.com/subject/\d+/{0,1}") - - def scrape(self, url): - headers = DEFAULT_REQUEST_HEADERS.copy() - headers['Host'] = self.host - content = self.download_page(url, headers) - - # parsing starts here - try: - title = content.xpath("//h1/span/text()")[0].strip() - except IndexError: - raise ValueError("given url contains no album info") - if not title: - raise ValueError("given url contains no album info") - - artists_elem = 
content.xpath("//div[@id='info']/span/span[@class='pl']/a/text()") - artist = None if not artists_elem else list(map(lambda a: a[:200], artists_elem)) - - genre_elem = content.xpath( - "//div[@id='info']//span[text()='流派:']/following::text()[1]") - genre = genre_elem[0].strip() if genre_elem else None - - date_elem = content.xpath( - "//div[@id='info']//span[text()='发行时间:']/following::text()[1]") - release_date = parse_date(date_elem[0].strip()) if date_elem else None - - company_elem = content.xpath( - "//div[@id='info']//span[text()='出版者:']/following::text()[1]") - company = company_elem[0].strip() if company_elem else None - - track_list_elem = content.xpath( - "//div[@class='track-list']/div[@class='indent']/div/text()" - ) - if track_list_elem: - track_list = '\n'.join([track.strip() for track in track_list_elem]) - else: - track_list = None - - brief_elem = content.xpath("//span[@class='all hidden']") - if not brief_elem: - brief_elem = content.xpath("//span[@property='v:summary']") - brief = '\n'.join([e.strip() for e in brief_elem[0].xpath( - './text()')]) if brief_elem else None - - other_info = {} - other_elem = content.xpath( - "//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]") - if other_elem: - other_info['又名'] = other_elem[0].strip() - other_elem = content.xpath( - "//div[@id='info']//span[text()='专辑类型:']/following-sibling::text()[1]") - if other_elem: - other_info['专辑类型'] = other_elem[0].strip() - other_elem = content.xpath( - "//div[@id='info']//span[text()='介质:']/following-sibling::text()[1]") - if other_elem: - other_info['介质'] = other_elem[0].strip() - other_elem = content.xpath( - "//div[@id='info']//span[text()='ISRC:']/following-sibling::text()[1]") - if other_elem: - other_info['ISRC'] = other_elem[0].strip() - other_elem = content.xpath( - "//div[@id='info']//span[text()='条形码:']/following-sibling::text()[1]") - if other_elem: - other_info['条形码'] = other_elem[0].strip() - other_elem = content.xpath( - "//div[@id='info']//span[text()='碟片数:']/following-sibling::text()[1]") - if other_elem: - other_info['碟片数'] = other_elem[0].strip() - - img_url_elem = content.xpath("//div[@id='mainpic']//img/@src") - img_url = img_url_elem[0].strip() if img_url_elem else None - raw_img, ext = self.download_image(img_url, url) - - data = { - 'title': title, - 'artist': artist, - 'genre': genre, - 'release_date': release_date, - 'duration': None, - 'company': company, - 'track_list': track_list, - 'brief': brief, - 'other_info': other_info, - 'source_site': self.site_name, - 'source_url': self.get_effective_url(url), - } - self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext - return data, raw_img - - -class DoubanGameScraper(DoubanScrapperMixin, AbstractScraper): - site_name = SourceSiteEnum.DOUBAN.value - host = 'www.douban.com/game/' - data_class = Game - form_class = GameForm - - regex = re.compile(r"https://www\.douban\.com/game/\d+/{0,1}") - - def scrape(self, url): - headers = DEFAULT_REQUEST_HEADERS.copy() - headers['Host'] = 'www.douban.com' - content = self.download_page(url, headers) - - try: - raw_title = content.xpath( - "//div[@id='content']/h1/text()")[0].strip() - except IndexError: - raise ValueError("given url contains no game info") - - title = raw_title - - other_title_elem = content.xpath( - "//dl[@class='game-attr']//dt[text()='别名:']/following-sibling::dd[1]/text()") - other_title = other_title_elem[0].strip().split(' / ') if other_title_elem else None - - developer_elem = content.xpath( - 
"//dl[@class='game-attr']//dt[text()='开发商:']/following-sibling::dd[1]/text()") - developer = developer_elem[0].strip().split(' / ') if developer_elem else None - - publisher_elem = content.xpath( - "//dl[@class='game-attr']//dt[text()='发行商:']/following-sibling::dd[1]/text()") - publisher = publisher_elem[0].strip().split(' / ') if publisher_elem else None - - platform_elem = content.xpath( - "//dl[@class='game-attr']//dt[text()='平台:']/following-sibling::dd[1]/a/text()") - platform = platform_elem if platform_elem else None - - genre_elem = content.xpath( - "//dl[@class='game-attr']//dt[text()='类型:']/following-sibling::dd[1]/a/text()") - genre = None - if genre_elem: - genre = [g for g in genre_elem if g != '游戏'] - - date_elem = content.xpath( - "//dl[@class='game-attr']//dt[text()='发行日期:']/following-sibling::dd[1]/text()") - release_date = parse_date(date_elem[0].strip()) if date_elem else None - - brief_elem = content.xpath("//div[@class='mod item-desc']/p/text()") - brief = '\n'.join(brief_elem) if brief_elem else None - - img_url_elem = content.xpath( - "//div[@class='item-subject-info']/div[@class='pic']//img/@src") - img_url = img_url_elem[0].strip() if img_url_elem else None - raw_img, ext = self.download_image(img_url, url) - - data = { - 'title': title, - 'other_title': other_title, - 'developer': developer, - 'publisher': publisher, - 'release_date': release_date, - 'genre': genre, - 'platform': platform, - 'brief': brief, - 'other_info': None, - 'source_site': self.site_name, - 'source_url': self.get_effective_url(url), - } - - self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext - return data, raw_img diff --git a/common/scrapers/goodreads.py b/common/scrapers/goodreads.py deleted file mode 100644 index b4a51687..00000000 --- a/common/scrapers/goodreads.py +++ /dev/null @@ -1,157 +0,0 @@ -import requests -import re -import filetype -from lxml import html -from common.models import SourceSiteEnum -from movies.models import Movie, MovieGenreEnum -from movies.forms import MovieForm -from books.models import Book -from books.forms import BookForm -from music.models import Album, Song -from music.forms import AlbumForm, SongForm -from games.models import Game -from games.forms import GameForm -from django.conf import settings -from PIL import Image -from io import BytesIO -from common.scraper import * - - -class GoodreadsScraper(AbstractScraper): - site_name = SourceSiteEnum.GOODREADS.value - host = "www.goodreads.com" - data_class = Book - form_class = BookForm - regex = re.compile(r"https://www\.goodreads\.com/book/show/\d+") - - @classmethod - def get_effective_url(cls, raw_url): - u = re.match(r".+/book/show/(\d+)", raw_url) - if not u: - u = re.match(r".+book/(\d+)", raw_url) - return "https://www.goodreads.com/book/show/" + u[1] if u else None - - def scrape(self, url, response=None): - """ - This is the scraping portal - """ - if response is not None: - content = html.fromstring(response.content.decode('utf-8')) - else: - headers = None # DEFAULT_REQUEST_HEADERS.copy() - content = self.download_page(url, headers) - - try: - title = content.xpath("//h1/text()")[0].strip() - except IndexError: - raise ValueError("given url contains no book info") - - subtitle = None - - orig_title_elem = content.xpath("//div[@id='bookDataBox']//div[text()='Original Title']/following-sibling::div/text()") - orig_title = orig_title_elem[0].strip() if orig_title_elem else None - - language_elem = content.xpath('//div[@itemprop="inLanguage"]/text()') - language = language_elem[0].strip() 
if language_elem else None - - pub_house_elem = content.xpath("//div[contains(text(), 'Published') and @class='row']/text()") - try: - months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'] - r = re.compile('.*Published.*(' + '|'.join(months) + ').*(\\d\\d\\d\\d).+by\\s*(.+)\\s*', re.DOTALL) - pub = r.match(pub_house_elem[0]) - pub_year = pub[2] - pub_month = months.index(pub[1]) + 1 - pub_house = pub[3].strip() - except Exception: - pub_year = None - pub_month = None - pub_house = None - - pub_house_elem = content.xpath("//nobr[contains(text(), 'first published')]/text()") - try: - pub = re.match(r'.*first published\s+(.+\d\d\d\d).*', pub_house_elem[0], re.DOTALL) - first_pub = pub[1] - except Exception: - first_pub = None - - binding_elem = content.xpath('//span[@itemprop="bookFormat"]/text()') - binding = binding_elem[0].strip() if binding_elem else None - - pages_elem = content.xpath('//span[@itemprop="numberOfPages"]/text()') - pages = pages_elem[0].strip() if pages_elem else None - if pages is not None: - pages = int(RE_NUMBERS.findall(pages)[ - 0]) if RE_NUMBERS.findall(pages) else None - - isbn_elem = content.xpath('//span[@itemprop="isbn"]/text()') - if not isbn_elem: - isbn_elem = content.xpath('//div[@itemprop="isbn"]/text()') # this is likely ASIN - isbn = isbn_elem[0].strip() if isbn_elem else None - - brief_elem = content.xpath('//div[@id="description"]/span[@style="display:none"]/text()') - if brief_elem: - brief = '\n'.join(p.strip() for p in brief_elem) - else: - brief_elem = content.xpath('//div[@id="description"]/span/text()') - brief = '\n'.join(p.strip() for p in brief_elem) if brief_elem else None - - genre = content.xpath('//div[@class="bigBoxBody"]/div/div/div/a/text()') - genre = genre[0] if genre else None - book_title = re.sub('\n', '', content.xpath('//h1[@id="bookTitle"]/text()')[0]).strip() - author = content.xpath('//a[@class="authorName"]/span/text()')[0] - contents = None - - img_url_elem = content.xpath("//img[@id='coverImage']/@src") - img_url = img_url_elem[0].strip() if img_url_elem else None - raw_img, ext = self.download_image(img_url, url) - - authors_elem = content.xpath("//a[@class='authorName'][not(../span[@class='authorName greyText smallText role'])]/span/text()") - if authors_elem: - authors = [] - for author in authors_elem: - authors.append(RE_WHITESPACES.sub(' ', author.strip())) - else: - authors = None - - translators = None - authors_elem = content.xpath("//a[@class='authorName'][../span/text()='(Translator)']/span/text()") - if authors_elem: - translators = [] - for translator in authors_elem: - translators.append(RE_WHITESPACES.sub(' ', translator.strip())) - else: - translators = None - - other = {} - if first_pub: - other['首版时间'] = first_pub - if genre: - other['分类'] = genre - series_elem = content.xpath("//h2[@id='bookSeries']/a/text()") - if series_elem: - other['丛书'] = re.sub(r'\(\s*(.+[^\s])\s*#.*\)', '\\1', series_elem[0].strip()) - - data = { - 'title': title, - 'subtitle': subtitle, - 'orig_title': orig_title, - 'author': authors, - 'translator': translators, - 'language': language, - 'pub_house': pub_house, - 'pub_year': pub_year, - 'pub_month': pub_month, - 'binding': binding, - 'pages': pages, - 'isbn': isbn, - 'brief': brief, - 'contents': contents, - 'other_info': other, - 'cover_url': img_url, - 'source_site': self.site_name, - 'source_url': self.get_effective_url(url), - } - data['source_url'] = self.get_effective_url(url) - - self.raw_data, 
self.raw_img, self.img_ext = data, raw_img, ext - return data, raw_img diff --git a/common/scrapers/google.py b/common/scrapers/google.py deleted file mode 100644 index 0082fb3e..00000000 --- a/common/scrapers/google.py +++ /dev/null @@ -1,102 +0,0 @@ -import requests -import re -import filetype -from lxml import html -from common.models import SourceSiteEnum -from movies.models import Movie, MovieGenreEnum -from movies.forms import MovieForm -from books.models import Book -from books.forms import BookForm -from music.models import Album, Song -from music.forms import AlbumForm, SongForm -from games.models import Game -from games.forms import GameForm -from django.conf import settings -from PIL import Image -from io import BytesIO -from common.scraper import * - - -# https://developers.google.com/youtube/v3/docs/?apix=true -# https://developers.google.com/books/docs/v1/using -class GoogleBooksScraper(AbstractScraper): - site_name = SourceSiteEnum.GOOGLEBOOKS.value - host = ["books.google.com", "www.google.com/books"] - data_class = Book - form_class = BookForm - regex = re.compile(r"https://books\.google\.com/books\?id=([^&#]+)") - - @classmethod - def get_effective_url(cls, raw_url): - # https://books.google.com/books?id=wUHxzgEACAAJ - # https://books.google.com/books/about/%E7%8F%BE%E5%A0%B4%E6%AD%B7%E5%8F%B2.html?id=nvNoAAAAIAAJ - # https://www.google.com/books/edition/_/nvNoAAAAIAAJ?hl=en&gbpv=1 - u = re.match(r"https://books\.google\.com/books.*id=([^&#]+)", raw_url) - if not u: - u = re.match(r"https://www\.google\.com/books/edition/[^/]+/([^&#?]+)", raw_url) - return 'https://books.google.com/books?id=' + u[1] if u else None - - def scrape(self, url, response=None): - url = self.get_effective_url(url) - m = self.regex.match(url) - if m: - api_url = f'https://www.googleapis.com/books/v1/volumes/{m[1]}' - else: - raise ValueError("not valid url") - b = requests.get(api_url).json() - other = {} - title = b['volumeInfo']['title'] - subtitle = b['volumeInfo']['subtitle'] if 'subtitle' in b['volumeInfo'] else None - pub_year = None - pub_month = None - if 'publishedDate' in b['volumeInfo']: - pub_date = b['volumeInfo']['publishedDate'].split('-') - pub_year = pub_date[0] - pub_month = pub_date[1] if len(pub_date) > 1 else None - pub_house = b['volumeInfo']['publisher'] if 'publisher' in b['volumeInfo'] else None - language = b['volumeInfo']['language'] if 'language' in b['volumeInfo'] else None - pages = b['volumeInfo']['pageCount'] if 'pageCount' in b['volumeInfo'] else None - if 'mainCategory' in b['volumeInfo']: - other['分类'] = b['volumeInfo']['mainCategory'] - authors = b['volumeInfo']['authors'] if 'authors' in b['volumeInfo'] else None - if 'description' in b['volumeInfo']: - brief = b['volumeInfo']['description'] - elif 'textSnippet' in b['volumeInfo']: - brief = b["volumeInfo"]["textSnippet"]["searchInfo"] - else: - brief = '' - brief = re.sub(r'<.*?>', '', brief.replace(' 1: - # more than one disc - track_list.append(str( - track['disc_number']) + '-' + str(track['track_number']) + '. ' + track['name']) - else: - track_list.append(str(track['track_number']) + '. 
' + track['name']) - track_list = '\n'.join(track_list) - - release_date = parse_date(res_data['release_date']) - - other_info = {} - if res_data['external_ids'].get('upc'): - # bar code - other_info['UPC'] = res_data['external_ids']['upc'] - - raw_img, ext = self.download_image(res_data['images'][0]['url'], url) - - data = { - 'title': title, - 'artist': artist, - 'genre': genre, - 'track_list': track_list, - 'release_date': release_date, - 'duration': duration, - 'company': company, - 'brief': None, - 'other_info': other_info, - 'source_site': self.site_name, - 'source_url': effective_url, - } - - # set tracks_data, used for adding tracks - self.track_urls = track_urls - - self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext - return data, raw_img - - @classmethod - def get_effective_url(cls, raw_url): - code = cls.regex.findall(raw_url) - if code: - return f"https://open.spotify.com/album/{code[0]}" - else: - return None - - # @classmethod - # def save(cls, request_user): - # form = super().save(request_user) - # task = Thread( - # target=cls.add_tracks, - # args=(form.instance, request_user), - # daemon=True - # ) - # task.start() - # return form - - @classmethod - def get_api_url(cls, url): - return "https://api.spotify.com/v1/albums/" + cls.regex.findall(url)[0] - - @classmethod - def add_tracks(cls, album: Album, request_user): - to_be_updated_tracks = [] - for track_url in cls.track_urls: - track = cls.get_track_or_none(track_url) - # seems lik if fire too many requests at the same time - # spotify would limit access - if track is None: - task = Thread( - target=cls.scrape_and_save_track, - args=(track_url, album, request_user), - daemon=True - ) - task.start() - task.join() - else: - to_be_updated_tracks.append(track) - cls.bulk_update_track_album(to_be_updated_tracks, album, request_user) - - @classmethod - def get_track_or_none(cls, track_url: str): - try: - instance = Song.objects.get(source_url=track_url) - return instance - except ObjectDoesNotExist: - return None - - @classmethod - def scrape_and_save_track(cls, url: str, album: Album, request_user): - data, img = SpotifyTrackScraper.scrape(url) - SpotifyTrackScraper.raw_data['album'] = album - SpotifyTrackScraper.save(request_user) - - @classmethod - def bulk_update_track_album(cls, tracks, album, request_user): - for track in tracks: - track.last_editor = request_user - track.edited_time = timezone.now() - track.album = album - Song.objects.bulk_update(tracks, [ - 'last_editor', - 'edited_time', - 'album' - ]) - - -def get_spotify_token(): - global spotify_token, spotify_token_expire_time - if spotify_token is None or is_spotify_token_expired(): - invoke_spotify_token() - return spotify_token - - -def is_spotify_token_expired(): - global spotify_token_expire_time - return True if spotify_token_expire_time <= time.time() else False - - -def invoke_spotify_token(): - global spotify_token, spotify_token_expire_time - r = requests.post( - "https://accounts.spotify.com/api/token", - data={ - "grant_type": "client_credentials" - }, - headers={ - "Authorization": f"Basic {settings.SPOTIFY_CREDENTIAL}" - } - ) - data = r.json() - if r.status_code == 401: - # token expired, try one more time - # this maybe caused by external operations, - # for example debugging using a http client - r = requests.post( - "https://accounts.spotify.com/api/token", - data={ - "grant_type": "client_credentials" - }, - headers={ - "Authorization": f"Basic {settings.SPOTIFY_CREDENTIAL}" - } - ) - data = r.json() - elif r.status_code != 200: - 
raise Exception(f"Request to spotify API fails. Reason: {r.reason}") - # minus 2 for execution time error - spotify_token_expire_time = int(data['expires_in']) + time.time() - 2 - spotify_token = data['access_token'] diff --git a/common/scrapers/steam.py b/common/scrapers/steam.py deleted file mode 100644 index 43f1c76b..00000000 --- a/common/scrapers/steam.py +++ /dev/null @@ -1,92 +0,0 @@ -import re -from common.models import SourceSiteEnum -from games.models import Game -from games.forms import GameForm -from common.scraper import * -from common.scrapers.igdb import IgdbGameScraper - - -class SteamGameScraper(AbstractScraper): - site_name = SourceSiteEnum.STEAM.value - host = 'store.steampowered.com' - data_class = Game - form_class = GameForm - - regex = re.compile(r"https://store\.steampowered\.com/app/\d+") - - def scrape(self, url): - m = self.regex.match(url) - if m: - effective_url = m[0] - else: - raise ValueError("not valid url") - try: - s = IgdbGameScraper() - s.scrape_steam(effective_url) - self.raw_data = s.raw_data - self.raw_img = s.raw_img - self.img_ext = s.img_ext - self.raw_data['source_site'] = self.site_name - self.raw_data['source_url'] = effective_url - # return self.raw_data, self.raw_img - except: - self.raw_img = None - self.raw_data = {} - headers = DEFAULT_REQUEST_HEADERS.copy() - headers['Host'] = self.host - headers['Cookie'] = "wants_mature_content=1; birthtime=754700401;" - content = self.download_page(url, headers) - - title = content.xpath("//div[@class='apphub_AppName']/text()")[0] - developer = content.xpath("//div[@id='developers_list']/a/text()") - publisher = content.xpath("//div[@class='glance_ctn']//div[@class='dev_row'][2]//a/text()") - release_date = parse_date( - content.xpath( - "//div[@class='release_date']/div[@class='date']/text()")[0] - ) - - genre = content.xpath( - "//div[@class='details_block']/b[2]/following-sibling::a/text()") - - platform = ['PC'] - - brief = content.xpath( - "//div[@class='game_description_snippet']/text()")[0].strip() - - img_url = content.xpath( - "//img[@class='game_header_image_full']/@src" - )[0].replace("header.jpg", "library_600x900.jpg") - raw_img, img_ext = self.download_image(img_url, url) - - # no 600x900 picture - if raw_img is None: - img_url = content.xpath("//img[@class='game_header_image_full']/@src")[0] - raw_img, img_ext = self.download_image(img_url, url) - - if raw_img is not None: - self.raw_img = raw_img - self.img_ext = img_ext - - data = { - 'title': title if title else self.raw_data['title'], - 'other_title': None, - 'developer': developer if 'developer' not in self.raw_data else self.raw_data['developer'], - 'publisher': publisher if 'publisher' not in self.raw_data else self.raw_data['publisher'], - 'release_date': release_date if 'release_date' not in self.raw_data else self.raw_data['release_date'], - 'genre': genre if 'genre' not in self.raw_data else self.raw_data['genre'], - 'platform': platform if 'platform' not in self.raw_data else self.raw_data['platform'], - 'brief': brief if brief else self.raw_data['brief'], - 'other_info': None if 'other_info' not in self.raw_data else self.raw_data['other_info'], - 'source_site': self.site_name, - 'source_url': effective_url - } - self.raw_data = data - return self.raw_data, self.raw_img - - @classmethod - def get_effective_url(cls, raw_url): - m = cls.regex.match(raw_url) - if m: - return m[0] - else: - return None diff --git a/common/scrapers/tmdb.py b/common/scrapers/tmdb.py deleted file mode 100644 index 15072add..00000000 --- 
a/common/scrapers/tmdb.py +++ /dev/null @@ -1,150 +0,0 @@ -import requests -import re -from common.models import SourceSiteEnum -from movies.models import Movie -from movies.forms import MovieForm -from django.conf import settings -from common.scraper import * - - -class TmdbMovieScraper(AbstractScraper): - site_name = SourceSiteEnum.TMDB.value - host = 'https://www.themoviedb.org/' - data_class = Movie - form_class = MovieForm - regex = re.compile(r"https://www\.themoviedb\.org/(movie|tv)/([a-zA-Z0-9]+)") - # http://api.themoviedb.org/3/genre/movie/list?api_key=&language=zh - # http://api.themoviedb.org/3/genre/tv/list?api_key=&language=zh - genre_map = { - 'Sci-Fi & Fantasy': 'Sci-Fi', - 'War & Politics': 'War', - '儿童': 'Kids', - '冒险': 'Adventure', - '剧情': 'Drama', - '动作': 'Action', - '动作冒险': 'Action', - '动画': 'Animation', - '历史': 'History', - '喜剧': 'Comedy', - '奇幻': 'Fantasy', - '家庭': 'Family', - '恐怖': 'Horror', - '悬疑': 'Mystery', - '惊悚': 'Thriller', - '战争': 'War', - '新闻': 'News', - '爱情': 'Romance', - '犯罪': 'Crime', - '电视电影': 'TV Movie', - '真人秀': 'Reality-TV', - '科幻': 'Sci-Fi', - '纪录': 'Documentary', - '肥皂剧': 'Soap', - '脱口秀': 'Talk-Show', - '西部': 'Western', - '音乐': 'Music', - } - - def scrape_imdb(self, imdb_code): - api_url = f"https://api.themoviedb.org/3/find/{imdb_code}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&external_source=imdb_id" - r = requests.get(api_url) - res_data = r.json() - if 'movie_results' in res_data and len(res_data['movie_results']) > 0: - url = f"https://www.themoviedb.org/movie/{res_data['movie_results'][0]['id']}" - elif 'tv_results' in res_data and len(res_data['tv_results']) > 0: - url = f"https://www.themoviedb.org/tv/{res_data['tv_results'][0]['id']}" - else: - raise ValueError("Cannot find IMDb ID in TMDB") - return self.scrape(url) - - def scrape(self, url): - m = self.regex.match(url) - if m: - effective_url = m[0] - else: - raise ValueError("not valid url") - effective_url = m[0] - is_series = m[1] == 'tv' - id = m[2] - if is_series: - api_url = f"https://api.themoviedb.org/3/tv/{id}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits" - else: - api_url = f"https://api.themoviedb.org/3/movie/{id}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits" - r = requests.get(api_url) - res_data = r.json() - - if is_series: - title = res_data['name'] - orig_title = res_data['original_name'] - year = int(res_data['first_air_date'].split('-')[0]) if res_data['first_air_date'] else None - imdb_code = res_data['external_ids']['imdb_id'] - showtime = [{res_data['first_air_date']: "首播日期"}] if res_data['first_air_date'] else None - duration = None - else: - title = res_data['title'] - orig_title = res_data['original_title'] - year = int(res_data['release_date'].split('-')[0]) if res_data['release_date'] else None - showtime = [{res_data['release_date']: "发布日期"}] if res_data['release_date'] else None - imdb_code = res_data['imdb_id'] - duration = res_data['runtime'] if res_data['runtime'] else None # in minutes - - genre = list(map(lambda x: self.genre_map[x['name']] if x['name'] in self.genre_map else 'Other', res_data['genres'])) - language = list(map(lambda x: x['name'], res_data['spoken_languages'])) - brief = res_data['overview'] - - if is_series: - director = list(map(lambda x: x['name'], res_data['created_by'])) - else: - director = list(map(lambda x: x['name'], filter(lambda c: c['job'] == 'Director', res_data['credits']['crew']))) - playwright = list(map(lambda x: x['name'], 
filter(lambda c: c['job'] == 'Screenplay', res_data['credits']['crew']))) - actor = list(map(lambda x: x['name'], res_data['credits']['cast'])) - area = [] - - other_info = {} - other_info['TMDB评分'] = res_data['vote_average'] - # other_info['分级'] = res_data['contentRating'] - # other_info['Metacritic评分'] = res_data['metacriticRating'] - # other_info['奖项'] = res_data['awards'] - other_info['TMDB_ID'] = id - if is_series: - other_info['Seasons'] = res_data['number_of_seasons'] - other_info['Episodes'] = res_data['number_of_episodes'] - - img_url = ('https://image.tmdb.org/t/p/original/' + res_data['poster_path']) if res_data['poster_path'] is not None else None - # TODO: use GET /configuration to get base url - raw_img, ext = self.download_image(img_url, url) - - data = { - 'title': title, - 'orig_title': orig_title, - 'other_title': None, - 'imdb_code': imdb_code, - 'director': director, - 'playwright': playwright, - 'actor': actor, - 'genre': genre, - 'showtime': showtime, - 'site': None, - 'area': area, - 'language': language, - 'year': year, - 'duration': duration, - 'season': None, - 'episodes': None, - 'single_episode_length': None, - 'brief': brief, - 'is_series': is_series, - 'other_info': other_info, - 'source_site': self.site_name, - 'source_url': effective_url, - } - self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext - return data, raw_img - - @classmethod - def get_effective_url(cls, raw_url): - m = cls.regex.match(raw_url) - if raw_url: - return m[0] - else: - return None diff --git a/common/searcher.py b/common/searcher.py deleted file mode 100644 index 48c43af9..00000000 --- a/common/searcher.py +++ /dev/null @@ -1,209 +0,0 @@ -from urllib.parse import quote_plus -from enum import Enum -from common.models import SourceSiteEnum -from django.conf import settings -from common.scrapers.goodreads import GoodreadsScraper -from common.scrapers.spotify import get_spotify_token -import requests -from lxml import html -import logging - -SEARCH_PAGE_SIZE = 5 # not all apis support page size -logger = logging.getLogger(__name__) - - -class Category(Enum): - Book = '书籍' - Movie = '电影' - Music = '音乐' - Game = '游戏' - TV = '剧集' - - -class SearchResultItem: - def __init__(self, category, source_site, source_url, title, subtitle, brief, cover_url): - self.category = category - self.source_site = source_site - self.source_url = source_url - self.title = title - self.subtitle = subtitle - self.brief = brief - self.cover_url = cover_url - - @property - def verbose_category_name(self): - return self.category.value - - @property - def link(self): - return f"/search?q={quote_plus(self.source_url)}" - - @property - def scraped(self): - return False - - -class ProxiedRequest: - @classmethod - def get(cls, url): - u = f'http://api.scraperapi.com?api_key={settings.SCRAPERAPI_KEY}&url={quote_plus(url)}' - return requests.get(u, timeout=10) - - -class Goodreads: - @classmethod - def search(self, q, page=1): - results = [] - try: - search_url = f'https://www.goodreads.com/search?page={page}&q={quote_plus(q)}' - r = requests.get(search_url) - if r.url.startswith('https://www.goodreads.com/book/show/'): - # Goodreads will 302 if only one result matches ISBN - data, img = GoodreadsScraper.scrape(r.url, r) - subtitle = f"{data['pub_year']} {', '.join(data['author'])} {', '.join(data['translator'] if data['translator'] else [])}" - results.append(SearchResultItem(Category.Book, SourceSiteEnum.GOODREADS, - data['source_url'], data['title'], subtitle, - data['brief'], data['cover_url'])) - else: - h = 
html.fromstring(r.content.decode('utf-8')) - for c in h.xpath('//tr[@itemtype="http://schema.org/Book"]'): - el_cover = c.xpath('.//img[@class="bookCover"]/@src') - cover = el_cover[0] if el_cover else None - el_title = c.xpath('.//a[@class="bookTitle"]//text()') - title = ''.join(el_title).strip() if el_title else None - el_url = c.xpath('.//a[@class="bookTitle"]/@href') - url = 'https://www.goodreads.com' + \ - el_url[0] if el_url else None - el_authors = c.xpath('.//a[@class="authorName"]//text()') - subtitle = ', '.join(el_authors) if el_authors else None - results.append(SearchResultItem( - Category.Book, SourceSiteEnum.GOODREADS, url, title, subtitle, '', cover)) - except Exception as e: - logger.error(f"Goodreads search '{q}' error: {e}") - return results - - -class GoogleBooks: - @classmethod - def search(self, q, page=1): - results = [] - try: - api_url = f'https://www.googleapis.com/books/v1/volumes?country=us&q={quote_plus(q)}&startIndex={SEARCH_PAGE_SIZE*(page-1)}&maxResults={SEARCH_PAGE_SIZE}&maxAllowedMaturityRating=MATURE' - j = requests.get(api_url).json() - if 'items' in j: - for b in j['items']: - if 'title' not in b['volumeInfo']: - continue - title = b['volumeInfo']['title'] - subtitle = '' - if 'publishedDate' in b['volumeInfo']: - subtitle += b['volumeInfo']['publishedDate'] + ' ' - if 'authors' in b['volumeInfo']: - subtitle += ', '.join(b['volumeInfo']['authors']) - if 'description' in b['volumeInfo']: - brief = b['volumeInfo']['description'] - elif 'textSnippet' in b['volumeInfo']: - brief = b["volumeInfo"]["textSnippet"]["searchInfo"] - else: - brief = '' - category = Category.Book - # b['volumeInfo']['infoLink'].replace('http:', 'https:') - url = 'https://books.google.com/books?id=' + b['id'] - cover = b['volumeInfo']['imageLinks']['thumbnail'] if 'imageLinks' in b['volumeInfo'] else None - results.append(SearchResultItem( - category, SourceSiteEnum.GOOGLEBOOKS, url, title, subtitle, brief, cover)) - except Exception as e: - logger.error(f"GoogleBooks search '{q}' error: {e}") - return results - - -class TheMovieDatabase: - @classmethod - def search(self, q, page=1): - results = [] - try: - api_url = f'https://api.themoviedb.org/3/search/multi?query={quote_plus(q)}&page={page}&api_key={settings.TMDB_API3_KEY}&language=zh-CN&include_adult=true' - j = requests.get(api_url).json() - for m in j['results']: - if m['media_type'] in ['tv', 'movie']: - url = f"https://www.themoviedb.org/{m['media_type']}/{m['id']}" - if m['media_type'] == 'tv': - cat = Category.TV - title = m['name'] - subtitle = f"{m.get('first_air_date')} {m.get('original_name')}" - else: - cat = Category.Movie - title = m['title'] - subtitle = f"{m.get('release_date')} {m.get('original_name')}" - cover = f"https://image.tmdb.org/t/p/w500/{m.get('poster_path')}" - results.append(SearchResultItem( - cat, SourceSiteEnum.TMDB, url, title, subtitle, m.get('overview'), cover)) - except Exception as e: - logger.error(f"TMDb search '{q}' error: {e}") - return results - - -class Spotify: - @classmethod - def search(self, q, page=1): - results = [] - try: - api_url = f"https://api.spotify.com/v1/search?q={q}&type=album&limit={SEARCH_PAGE_SIZE}&offset={page*SEARCH_PAGE_SIZE}" - headers = { - 'Authorization': f"Bearer {get_spotify_token()}" - } - j = requests.get(api_url, headers=headers).json() - for a in j['albums']['items']: - title = a['name'] - subtitle = a['release_date'] - for artist in a['artists']: - subtitle += ' ' + artist['name'] - url = a['external_urls']['spotify'] - cover = a['images'][0]['url'] - 
results.append(SearchResultItem( - Category.Music, SourceSiteEnum.SPOTIFY, url, title, subtitle, '', cover)) - except Exception as e: - logger.error(f"Spotify search '{q}' error: {e}") - return results - - -class Bandcamp: - @classmethod - def search(self, q, page=1): - results = [] - try: - search_url = f'https://bandcamp.com/search?from=results&item_type=a&page={page}&q={quote_plus(q)}' - r = requests.get(search_url) - h = html.fromstring(r.content.decode('utf-8')) - for c in h.xpath('//li[@class="searchresult data-search"]'): - el_cover = c.xpath('.//div[@class="art"]/img/@src') - cover = el_cover[0] if el_cover else None - el_title = c.xpath('.//div[@class="heading"]//text()') - title = ''.join(el_title).strip() if el_title else None - el_url = c.xpath('..//div[@class="itemurl"]/a/@href') - url = el_url[0] if el_url else None - el_authors = c.xpath('.//div[@class="subhead"]//text()') - subtitle = ', '.join(el_authors) if el_authors else None - results.append(SearchResultItem(Category.Music, SourceSiteEnum.BANDCAMP, url, title, subtitle, '', cover)) - except Exception as e: - logger.error(f"Goodreads search '{q}' error: {e}") - return results - - -class ExternalSources: - @classmethod - def search(self, c, q, page=1): - if not q: - return [] - results = [] - if c == '' or c is None: - c = 'all' - if c == 'all' or c == 'movie': - results.extend(TheMovieDatabase.search(q, page)) - if c == 'all' or c == 'book': - results.extend(GoogleBooks.search(q, page)) - results.extend(Goodreads.search(q, page)) - if c == 'all' or c == 'music': - results.extend(Spotify.search(q, page)) - results.extend(Bandcamp.search(q, page)) - return results diff --git a/common/templates/common/search_result.html b/common/templates/common/search_result.html deleted file mode 100644 index d3fc5417..00000000 --- a/common/templates/common/search_result.html +++ /dev/null @@ -1,243 +0,0 @@ -{% load static %} -{% load i18n %} -{% load l10n %} -{% load humanize %} -{% load admin_url %} -{% load mastodon %} -{% load oauth_token %} -{% load truncate %} -{% load highlight %} -{% load thumb %} - - - - - - - {{ site_name }} - {% trans '搜索结果' %} - - - - - - - - - - -
[The remaining ~200 lines of this deleted template are garbled here: the HTML markup was stripped in extraction. Recoverable fragments, in order: {% include 'partial/_navbar.html' %}; the “{{ request.GET.q }}” {% trans '的搜索结果' %} and {% trans '含有标签' %} “{{ request.GET.tag }}” {% trans '的结果' %} headings; a {% for item in items %} loop over partial/list_item.html with a {% trans '无站内条目匹配' %} empty state; {% trans '正在实时搜索站外条目' %} placeholders shown while external sources are queried; a {% trans '没有想要的结果?' %} section with per-category ({% if request.GET.c|lower == 'book' / 'movie' / 'music' / 'game' %}) external links; and {% include 'partial/_footer.html' %}.]
diff --git a/common/templates/partial/_navbar.html b/common/templates/partial/_navbar.html
index a2e9c989..8d387103 100644
--- a/common/templates/partial/_navbar.html
+++ b/common/templates/partial/_navbar.html
@@ -1,7 +1,7 @@
 {% load static %}
 {% load i18n %}
 {% load admin_url %}
[the single removed line and single added line of this hunk were HTML markup lost in extraction]
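For orientation, the deleted common/searcher.py above was driven through its ExternalSources aggregator: each source class exposes a classmethod search(q, page) returning SearchResultItem objects, and ExternalSources.search() fans a query out per category. A minimal usage sketch against the pre-refactor tree (the module no longer exists after this diff, so this only runs on the old code):

# Usage sketch for the deleted common/searcher.py aggregator (pre-refactor tree).
# Category "book" fans out to GoogleBooks and Goodreads; "all" hits every source.
from common.searcher import ExternalSources

results = ExternalSources.search("book", "Dune", page=1)
for item in results:
    # item.link URL-encodes source_url into the site's own /search?q=... route
    print(item.source_site, item.title, item.subtitle, item.link)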
    diff --git a/journal/templates/list_item_base.html b/journal/templates/list_item_base.html index 3bc8c2c2..336a6da3 100644 --- a/journal/templates/list_item_base.html +++ b/journal/templates/list_item_base.html @@ -70,7 +70,7 @@
    {% for tag_dict in item.tags %} - {{ tag_dict }} + {{ tag_dict }} {% endfor %}
    diff --git a/journal/templates/piece_delete.html b/journal/templates/piece_delete.html index 188bcd28..4bb1eecd 100644 --- a/journal/templates/piece_delete.html +++ b/journal/templates/piece_delete.html @@ -37,7 +37,7 @@ {% endif %} diff --git a/journal/templates/review.html b/journal/templates/review.html index 561e6ffe..a3fa4dfb 100644 --- a/journal/templates/review.html +++ b/journal/templates/review.html @@ -42,7 +42,7 @@
    - {{ review.owner.username }} + {{ review.owner.username }} {% if mark %} diff --git a/journal/urls.py b/journal/urls.py index 82615097..1f7ef520 100644 --- a/journal/urls.py +++ b/journal/urls.py @@ -1,5 +1,6 @@ from django.urls import path, re_path from .views import * +from .feeds import ReviewFeed from catalog.models import * @@ -65,7 +66,7 @@ urlpatterns = [ name="collection_update_item_note", ), re_path( - r"^user/(?P[A-Za-z0-9_\-.@]+)/(?P" + r"^users/(?P[A-Za-z0-9_\-.@]+)/(?P" + _get_all_shelf_types() + ")/(?P" + _get_all_categories() @@ -74,31 +75,32 @@ urlpatterns = [ name="user_mark_list", ), re_path( - r"^user/(?P[A-Za-z0-9_\-.@]+)/reviews/(?P" + r"^users/(?P[A-Za-z0-9_\-.@]+)/reviews/(?P" + _get_all_categories() + ")/$", user_review_list, name="user_review_list", ), re_path( - r"^user/(?P[A-Za-z0-9_\-.@]+)/tags/(?P[^/]+)/$", + r"^users/(?P[A-Za-z0-9_\-.@]+)/tags/(?P[^/]+)/$", user_tag_member_list, name="user_tag_member_list", ), re_path( - r"^user/(?P[A-Za-z0-9_\-.@]+)/collections/$", + r"^users/(?P[A-Za-z0-9_\-.@]+)/collections/$", user_collection_list, name="user_collection_list", ), re_path( - r"^user/(?P[A-Za-z0-9_\-.@]+)/like/collections/$", + r"^users/(?P[A-Za-z0-9_\-.@]+)/like/collections/$", user_liked_collection_list, name="user_liked_collection_list", ), re_path( - r"^user/(?P[A-Za-z0-9_\-.@]+)/tags/$", + r"^users/(?P[A-Za-z0-9_\-.@]+)/tags/$", user_tag_list, name="user_tag_list", ), - re_path(r"^user/(?P[A-Za-z0-9_\-.@]+)/$", home, name="user_profile"), + re_path(r"^users/(?P[A-Za-z0-9_\-.@]+)/$", profile, name="user_profile"), + path("users//feed/reviews/", ReviewFeed(), name="review_feed"), ] diff --git a/journal/views.py b/journal/views.py index 007a8e52..5a57320a 100644 --- a/journal/views.py +++ b/journal/views.py @@ -1,11 +1,11 @@ import logging -from django.shortcuts import render, get_object_or_404, redirect, reverse +from django.shortcuts import render, get_object_or_404, redirect +from django.urls import reverse from django.contrib.auth.decorators import login_required, permission_required from django.utils.translation import gettext_lazy as _ from django.http import ( HttpResponse, HttpResponseBadRequest, - HttpResponseServerError, HttpResponseNotFound, ) from django.core.exceptions import ObjectDoesNotExist, PermissionDenied @@ -14,10 +14,8 @@ from django.utils import timezone from django.core.paginator import Paginator from .models import * from django.conf import settings -import re from django.http import HttpResponseRedirect from django.db.models import Q -import time from management.models import Announcement from django.utils.baseconv import base62 from .forms import * @@ -157,6 +155,7 @@ def mark(request, item_uuid): except Exception: return render_relogin(request) return HttpResponseRedirect(request.META.get("HTTP_REFERER")) + return HttpResponseBadRequest() def collection_retrieve(request, collection_uuid): @@ -436,7 +435,6 @@ def user_tag_list(request, user_name): ): return render_user_blocked(request) tags = Tag.objects.filter(owner=user) - tags = user.tag_set.all() if user != request.user: tags = tags.filter(visibility=0) tags = tags.values("title").annotate(total=Count("members")).order_by("-total") @@ -497,7 +495,7 @@ def user_liked_collection_list(request, user_name): ) -def home_anonymous(request, id): +def profile_anonymous(request, id): login_url = settings.LOGIN_URL + "?next=" + request.get_full_path() try: username = id.split("@")[0] @@ -515,15 +513,14 @@ def home_anonymous(request, id): return redirect(login_url) -def 
home(request, user_name): - if not request.user.is_authenticated: - return home_anonymous(request, user_name) +def profile(request, user_name): if request.method != "GET": return HttpResponseBadRequest() user = User.get(user_name) if user is None: return render_user_not_found(request) - + if not request.user.is_authenticated and user.get_preference().no_anonymous_view: + return profile_anonymous(request, user_name) # access one's own home page if user == request.user: reports = Report.objects.order_by("-submitted_time").filter(is_read=False) @@ -538,7 +535,7 @@ def home(request, user_name): pass # visit other's home page else: - if request.user.is_blocked_by(user) or request.user.is_blocking(user): + if user.is_blocked_by(request.user) or user.is_blocking(request.user): return render_user_blocked(request) # no these value on other's home page reports = None @@ -583,12 +580,13 @@ def home(request, user_name): if user != request.user: liked_collections = liked_collections.filter(query_visible(request.user)) - layout = user.get_preference().get_serialized_home_layout() + layout = user.get_preference().get_serialized_profile_layout() return render( request, "profile.html", { "user": user, + "top_tags": user.tag_manager.all_tags[:10], "shelf_list": shelf_list, "collections": collections[:5], "collections_count": collections.count(), diff --git a/social/models.py b/social/models.py index e076b6cd..b9fb820e 100644 --- a/social/models.py +++ b/social/models.py @@ -91,7 +91,7 @@ class DataSignalManager: @staticmethod def add_handler_for_model(model): - if not settings.DISABLE_SOCIAL: + if not settings.DISABLE_MODEL_SIGNAL: post_save.connect(DataSignalManager.save_handler, sender=model) pre_delete.connect(DataSignalManager.delete_handler, sender=model) diff --git a/social/templates/activity/create_collection.html b/social/templates/activity/create_collection.html index b9d31f3e..fa4f1b3a 100644 --- a/social/templates/activity/create_collection.html +++ b/social/templates/activity/create_collection.html @@ -31,7 +31,7 @@
    - {{ activity.owner.display_name }} {% trans '创建了收藏单' %} + {{ activity.owner.display_name }} {% trans '创建了收藏单' %} - {{ activity.owner.display_name }} 关注了 - {{ activity.action_object.target.owner.display_name }} + {{ activity.owner.display_name }} 关注了 + {{ activity.action_object.target.owner.display_name }} 的收藏单
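For context on the social/models.py hunk above, which renames the guard from DISABLE_SOCIAL to DISABLE_MODEL_SIGNAL: the sketch below shows the guarded signal wiring under that flag, assuming placeholder handler bodies; only add_handler_for_model mirrors the patch, the save/delete handler internals are not part of this diff.

from django.conf import settings
from django.db.models.signals import post_save, pre_delete


class DataSignalManager:
    @staticmethod
    def save_handler(sender, instance, created, **kwargs):
        # placeholder: create or refresh the activity derived from the saved object
        pass

    @staticmethod
    def delete_handler(sender, instance, **kwargs):
        # placeholder: drop activities tied to the object being deleted
        pass

    @staticmethod
    def add_handler_for_model(model):
        # When DISABLE_MODEL_SIGNAL is set, no handlers are attached,
        # so model saves and deletes produce no activities.
        if not settings.DISABLE_MODEL_SIGNAL:
            post_save.connect(DataSignalManager.save_handler, sender=model)
            pre_delete.connect(DataSignalManager.delete_handler, sender=model)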
    diff --git a/social/templates/activity/mark_item.html b/social/templates/activity/mark_item.html index 461678c3..427f4519 100644 --- a/social/templates/activity/mark_item.html +++ b/social/templates/activity/mark_item.html @@ -32,7 +32,7 @@
    - {{ activity.owner.display_name }} {{ activity.action_object.parent.shelf_label }} + {{ activity.owner.display_name }} {{ activity.action_object.parent.shelf_label }} - {{ activity.owner.display_name }} {% trans '评论了' %} + {{ activity.owner.display_name }} {% trans '评论了' %} {{ activity.action_object.item.title }} {% if activity.action_object.item.year %}({{ activity.action_object.item.year }}){% endif %} diff --git a/social/templates/feed_data.html b/social/templates/feed_data.html index a95e95d9..24199a4b 100644 --- a/social/templates/feed_data.html +++ b/social/templates/feed_data.html @@ -76,7 +76,11 @@ hx-swap="outerHTML">
 {% endif %}
 {% empty %}
+{% if request.GET.last %}
 {% trans '目前没有更多内容了' %}
+{% else %}
+{% trans '在NeoDB导入或标记一些书影音,去联邦宇宙(长毛象)关注一些正在使用NeoDB的用户,这里就会显示你和她们的近期动态。' %}
+{% endif %}
 {% endfor %}
-
-
-
-
-
-
-
    -
    - {% include "partial/_navbar.html" with current="timeline" %} - -
    -
    -
    -
    -
    - - -
      -
      -
    -
    -
    -
    - - {% include "partial/_sidebar.html" %} -
    -
    -
    - {% include "partial/_footer.html" %} -
    - - - -{% if unread_announcements %} -{% include "partial/_announcement.html" %} -{% endif %} - - diff --git a/timeline/templates/timeline_data.html b/timeline/templates/timeline_data.html deleted file mode 100644 index 02adc8aa..00000000 --- a/timeline/templates/timeline_data.html +++ /dev/null @@ -1,125 +0,0 @@ -{% load static %} -{% load i18n %} -{% load l10n %} -{% load admin_url %} -{% load mastodon %} -{% load oauth_token %} -{% load truncate %} -{% load thumb %} -{% load prettydate %} -{% load user_item %} - -{% for activity in activities %} -{% current_user_marked_item activity.target.item as marked %} -
  • -
    - - - - {% if not marked %} - - {% endif %} -
    -
    -
    - - {% if activity.target.shared_link %} - - - {{ activity.target.created_time|prettydate }} - {% else %} - {{ activity.target.created_time|prettydate }} - {% endif %} - -
    - - {{ activity.owner.display_name }} {{ activity.target.translated_status }} - - -

    - {% if activity.review %} - {{ activity.review.title }} - {% endif %} - {% if activity.mark %} - {% if activity.mark.rating %} - - {% endif %} - - {% if activity.mark.text %} -

    {{ activity.mark.text }}

    - {% endif %} - {% endif %} -

    -
    -
  • -{% if forloop.last %} -
    - - - - - - - - - - - - - - - - - - - - - - -
    -{% endif %} -{% empty %} -
    {% trans '目前没有更多内容了' %}
    -{% endfor %} diff --git a/timeline/tests.py b/timeline/tests.py deleted file mode 100644 index 7ce503c2..00000000 --- a/timeline/tests.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.test import TestCase - -# Create your tests here. diff --git a/timeline/urls.py b/timeline/urls.py deleted file mode 100644 index b501f1e8..00000000 --- a/timeline/urls.py +++ /dev/null @@ -1,9 +0,0 @@ -from django.urls import path, re_path -from .views import * - - -app_name = 'timeline' -urlpatterns = [ - path('', timeline, name='timeline'), - path('data', data, name='data'), -] diff --git a/timeline/views.py b/timeline/views.py deleted file mode 100644 index 08d936a3..00000000 --- a/timeline/views.py +++ /dev/null @@ -1,70 +0,0 @@ -import logging -from django.shortcuts import render, get_object_or_404, redirect, reverse -from django.contrib.auth.decorators import login_required, permission_required -from django.utils.translation import gettext_lazy as _ -from django.http import HttpResponseBadRequest, HttpResponseServerError -from django.core.exceptions import ObjectDoesNotExist, PermissionDenied -from django.db import IntegrityError, transaction -from django.db.models import Count -from django.utils import timezone -from django.core.paginator import Paginator -from mastodon import mastodon_request_included -from mastodon.models import MastodonApplication -from common.utils import PageLinksGenerator -from .models import * -from books.models import BookTag -from movies.models import MovieTag -from games.models import GameTag -from music.models import AlbumTag -from django.conf import settings -import re -from users.models import User -from django.http import HttpResponseRedirect -from django.db.models import Q -import time -from management.models import Announcement - - -logger = logging.getLogger(__name__) -mastodon_logger = logging.getLogger("django.mastodon") -PAGE_SIZE = 20 - - -@login_required -def timeline(request): - if request.method != 'GET': - return - user = request.user - unread = Announcement.objects.filter(pk__gt=user.read_announcement_index).order_by('-pk') - if unread: - user.read_announcement_index = Announcement.objects.latest('pk').pk - user.save(update_fields=['read_announcement_index']) - return render( - request, - 'timeline.html', - { - 'book_tags': BookTag.all_by_user(user)[:10], - 'movie_tags': MovieTag.all_by_user(user)[:10], - 'music_tags': AlbumTag.all_by_user(user)[:10], - 'game_tags': GameTag.all_by_user(user)[:10], - 'unread_announcements': unread, - } - ) - - -@login_required -def data(request): - if request.method != 'GET': - return - q = Q(owner_id__in=request.user.following, visibility__lt=2) | Q(owner_id=request.user.id) - last = request.GET.get('last') - if last: - q = q & Q(created_time__lt=last) - activities = Activity.objects.filter(q).order_by('-created_time')[:PAGE_SIZE] - return render( - request, - 'timeline_data.html', - { - 'activities': activities, - } - ) diff --git a/users/account.py b/users/account.py index 6bc38e71..f8ace2cd 100644 --- a/users/account.py +++ b/users/account.py @@ -12,17 +12,8 @@ from .forms import ReportForm from mastodon.api import * from mastodon import mastodon_request_included from common.config import * -from common.models import MarkStatusEnum from common.utils import PageLinksGenerator from management.models import Announcement -from books.models import * -from movies.models import * -from music.models import * -from games.models import * -from books.forms import BookMarkStatusTranslator -from movies.forms import 
MovieMarkStatusTranslator -from music.forms import MusicMarkStatusTranslator -from games.forms import GameMarkStatusTranslator from mastodon.models import MastodonApplication from mastodon.api import verify_account from django.conf import settings @@ -34,35 +25,28 @@ from datetime import timedelta from django.utils import timezone import json from django.contrib import messages -from books.models import BookMark, BookReview -from movies.models import MovieMark, MovieReview -from games.models import GameMark, GameReview -from music.models import AlbumMark, SongMark, AlbumReview, SongReview -from collection.models import Collection, CollectionMark -from common.importers.goodreads import GoodreadsImporter -from common.importers.douban import DoubanImporter - +from journal.models import remove_data_by_user # the 'login' page that user can see def login(request): - if request.method == 'GET': - selected_site = request.GET.get('site', default='') + if request.method == "GET": + selected_site = request.GET.get("site", default="") sites = MastodonApplication.objects.all().order_by("domain_name") # store redirect url in the cookie - if request.GET.get('next'): - request.session['next_url'] = request.GET.get('next') + if request.GET.get("next"): + request.session["next_url"] = request.GET.get("next") return render( request, - 'users/login.html', + "users/login.html", { - 'sites': sites, - 'scope': quote(settings.MASTODON_CLIENT_SCOPE), - 'selected_site': selected_site, - 'allow_any_site': settings.MASTODON_ALLOW_ANY_SITE, - } + "sites": sites, + "scope": quote(settings.MASTODON_CLIENT_SCOPE), + "selected_site": selected_site, + "allow_any_site": settings.MASTODON_ALLOW_ANY_SITE, + }, ) else: return HttpResponseBadRequest() @@ -70,70 +54,80 @@ def login(request): # connect will redirect to mastodon server def connect(request): - login_domain = request.session['swap_domain'] if request.session.get('swap_login') else request.GET.get('domain') + login_domain = ( + request.session["swap_domain"] + if request.session.get("swap_login") + else request.GET.get("domain") + ) if not login_domain: - return render(request, 'common/error.html', {'msg': '未指定实例域名', 'secondary_msg': "", }) - login_domain = login_domain.strip().lower().split('//')[-1].split('/')[0].split('@')[-1] + return render( + request, + "common/error.html", + { + "msg": "未指定实例域名", + "secondary_msg": "", + }, + ) + login_domain = ( + login_domain.strip().lower().split("//")[-1].split("/")[0].split("@")[-1] + ) domain, version = get_instance_info(login_domain) app, error_msg = get_mastodon_application(domain) if app is None: - return render(request, 'common/error.html', {'msg': error_msg, 'secondary_msg': "", }) + return render( + request, + "common/error.html", + { + "msg": error_msg, + "secondary_msg": "", + }, + ) else: login_url = get_mastodon_login_url(app, login_domain, version, request) resp = redirect(login_url) - resp.set_cookie('mastodon_domain', domain) + resp.set_cookie("mastodon_domain", domain) return resp # mastodon server redirect back to here @mastodon_request_included def OAuth2_login(request): - if request.method != 'GET': + if request.method != "GET": return HttpResponseBadRequest() - code = request.GET.get('code') - site = request.COOKIES.get('mastodon_domain') + code = request.GET.get("code") + site = request.COOKIES.get("mastodon_domain") try: token, refresh_token = obtain_token(site, request, code) except ObjectDoesNotExist: return HttpResponseBadRequest("Mastodon site not registered") if not token: - return render( - 
request, - 'common/error.html', - { - 'msg': _("认证失败😫") - } - ) + return render(request, "common/error.html", {"msg": _("认证失败😫")}) - if request.session.get('swap_login', False) and request.user.is_authenticated: # swap login for existing user + if ( + request.session.get("swap_login", False) and request.user.is_authenticated + ): # swap login for existing user return swap_login(request, token, site, refresh_token) user = authenticate(request, token=token, site=site) if user: # existing user user.mastodon_token = token user.mastodon_refresh_token = refresh_token - user.save(update_fields=['mastodon_token', 'mastodon_refresh_token']) + user.save(update_fields=["mastodon_token", "mastodon_refresh_token"]) auth_login(request, user) - if request.session.get('next_url') is not None: - response = redirect(request.session.get('next_url')) - del request.session['next_url'] + if request.session.get("next_url") is not None: + response = redirect(request.session.get("next_url")) + del request.session["next_url"] else: - response = redirect(reverse('common:home')) + response = redirect(reverse("common:home")) return response else: # newly registered user code, user_data = verify_account(site, token) if code != 200 or user_data is None: - return render( - request, - 'common/error.html', - { - 'msg': _("联邦网络访问失败😫") - } - ) + return render(request, "common/error.html", {"msg": _("联邦网络访问失败😫")}) new_user = User( - username=user_data['username'], - mastodon_id=user_data['id'], + username=user_data["username"], + mastodon_id=user_data["id"], mastodon_site=site, mastodon_token=token, mastodon_refresh_token=refresh_token, @@ -141,15 +135,15 @@ def OAuth2_login(request): ) new_user.save() Preference.objects.create(user=new_user) - request.session['new_user'] = True + request.session["new_user"] = True auth_login(request, new_user) - return redirect(reverse('users:register')) + return redirect(reverse("users:register")) @mastodon_request_included @login_required def logout(request): - if request.method == 'GET': + if request.method == "GET": # revoke_token(request.user.mastodon_site, request.user.mastodon_token) auth_logout(request) return redirect(reverse("users:login")) @@ -160,9 +154,9 @@ def logout(request): @mastodon_request_included @login_required def reconnect(request): - if request.method == 'POST': - request.session['swap_login'] = True - request.session['swap_domain'] = request.POST['domain'] + if request.method == "POST": + request.session["swap_login"] = True + request.session["swap_domain"] = request.POST["domain"] return connect(request) else: return HttpResponseBadRequest() @@ -170,76 +164,85 @@ def reconnect(request): @mastodon_request_included def register(request): - if request.session.get('new_user'): - del request.session['new_user'] - return render(request, 'users/register.html') + if request.session.get("new_user"): + del request.session["new_user"] + return render(request, "users/register.html") else: - return redirect(reverse('common:home')) + return redirect(reverse("common:home")) def swap_login(request, token, site, refresh_token): - del request.session['swap_login'] - del request.session['swap_domain'] + del request.session["swap_login"] + del request.session["swap_domain"] code, data = verify_account(site, token) current_user = request.user if code == 200 and data is not None: - username = data['username'] + username = data["username"] if username == current_user.username and site == current_user.mastodon_site: - messages.add_message(request, messages.ERROR, _(f'该身份 {username}@{site} 
与当前账号相同。')) + messages.add_message( + request, messages.ERROR, _(f"该身份 {username}@{site} 与当前账号相同。") + ) else: try: existing_user = User.objects.get(username=username, mastodon_site=site) - messages.add_message(request, messages.ERROR, _(f'该身份 {username}@{site} 已被用于其它账号。')) + messages.add_message( + request, messages.ERROR, _(f"该身份 {username}@{site} 已被用于其它账号。") + ) except ObjectDoesNotExist: current_user.username = username - current_user.mastodon_id = data['id'] + current_user.mastodon_id = data["id"] current_user.mastodon_site = site current_user.mastodon_token = token current_user.mastodon_refresh_token = refresh_token current_user.mastodon_account = data - current_user.save(update_fields=['username', 'mastodon_id', 'mastodon_site', 'mastodon_token', 'mastodon_refresh_token', 'mastodon_account']) - django_rq.get_queue('mastodon').enqueue(refresh_mastodon_data_task, current_user, token) - messages.add_message(request, messages.INFO, _(f'账号身份已更新为 {username}@{site}。')) + current_user.save( + update_fields=[ + "username", + "mastodon_id", + "mastodon_site", + "mastodon_token", + "mastodon_refresh_token", + "mastodon_account", + ] + ) + django_rq.get_queue("mastodon").enqueue( + refresh_mastodon_data_task, current_user, token + ) + messages.add_message( + request, messages.INFO, _(f"账号身份已更新为 {username}@{site}。") + ) else: - messages.add_message(request, messages.ERROR, _('连接联邦网络获取身份信息失败。')) - return redirect(reverse('users:data')) + messages.add_message(request, messages.ERROR, _("连接联邦网络获取身份信息失败。")) + return redirect(reverse("users:data")) def auth_login(request, user): - """ Decorates django ``login()``. Attach token to session.""" + """Decorates django ``login()``. Attach token to session.""" auth.login(request, user) - if user.mastodon_last_refresh < timezone.now() - timedelta(hours=1) or user.mastodon_account == {}: - django_rq.get_queue('mastodon').enqueue(refresh_mastodon_data_task, user) + if ( + user.mastodon_last_refresh < timezone.now() - timedelta(hours=1) + or user.mastodon_account == {} + ): + django_rq.get_queue("mastodon").enqueue(refresh_mastodon_data_task, user) def auth_logout(request): - """ Decorates django ``logout()``. Release token in session.""" + """Decorates django ``logout()``. 
Release token in session.""" auth.logout(request) @login_required def clear_data(request): - if request.method == 'POST': - if request.POST.get('verification') == request.user.mastodon_username: - BookMark.objects.filter(owner=request.user).delete() - MovieMark.objects.filter(owner=request.user).delete() - GameMark.objects.filter(owner=request.user).delete() - AlbumMark.objects.filter(owner=request.user).delete() - SongMark.objects.filter(owner=request.user).delete() - BookReview.objects.filter(owner=request.user).delete() - MovieReview.objects.filter(owner=request.user).delete() - GameReview.objects.filter(owner=request.user).delete() - AlbumReview.objects.filter(owner=request.user).delete() - SongReview.objects.filter(owner=request.user).delete() - CollectionMark.objects.filter(owner=request.user).delete() - Collection.objects.filter(owner=request.user).delete() + if request.method == "POST": + if request.POST.get("verification") == request.user.mastodon_username: + remove_data_by_user(request.user) request.user.first_name = request.user.username request.user.last_name = request.user.mastodon_site request.user.is_active = False - request.user.username = 'removed_' + str(request.user.id) + request.user.username = "removed_" + str(request.user.id) request.user.mastodon_id = 0 - request.user.mastodon_site = 'removed' - request.user.mastodon_token = '' + request.user.mastodon_site = "removed" + request.user.mastodon_token = "" request.user.mastodon_locked = False request.user.mastodon_followers = [] request.user.mastodon_following = [] @@ -251,5 +254,5 @@ def clear_data(request): auth_logout(request) return redirect(reverse("users:login")) else: - messages.add_message(request, messages.ERROR, _('验证信息不符。')) + messages.add_message(request, messages.ERROR, _("验证信息不符。")) return redirect(reverse("users:data")) diff --git a/users/data.py b/users/data.py index 563af02e..23b64408 100644 --- a/users/data.py +++ b/users/data.py @@ -12,17 +12,8 @@ from .forms import ReportForm from mastodon.api import * from mastodon import mastodon_request_included from common.config import * -from common.models import MarkStatusEnum from common.utils import PageLinksGenerator from management.models import Announcement -from books.models import * -from movies.models import * -from music.models import * -from games.models import * -from books.forms import BookMarkStatusTranslator -from movies.forms import MovieMarkStatusTranslator -from music.forms import MusicMarkStatusTranslator -from games.forms import GameMarkStatusTranslator from mastodon.models import MastodonApplication from mastodon.api import verify_account from django.conf import settings @@ -34,20 +25,10 @@ from datetime import timedelta from django.utils import timezone import json from django.contrib import messages -from books.models import BookMark, BookReview -from movies.models import MovieMark, MovieReview -from games.models import GameMark, GameReview -from music.models import AlbumMark, SongMark, AlbumReview, SongReview -from timeline.models import Activity -from collection.models import Collection -if settings.ENABLE_NEW_MODEL: - from journal.importers.douban import DoubanImporter - from journal.importers.goodreads import GoodreadsImporter - from journal.models import reset_visibility_for_user -else: - from common.importers.douban import DoubanImporter - from common.importers.goodreads import GoodreadsImporter +from journal.importers.douban import DoubanImporter +from journal.importers.goodreads import GoodreadsImporter +from journal.models import 
reset_visibility_for_user @mastodon_request_included @@ -84,7 +65,6 @@ def data(request): "users/data.html", { "allow_any_site": settings.MASTODON_ALLOW_ANY_SITE, - "latest_task": request.user.user_synctasks.order_by("-id").first(), "import_status": request.user.get_preference().import_status, "export_status": request.user.get_preference().export_status, }, diff --git a/users/models.py b/users/models.py index 206ce8b1..86bccd8a 100644 --- a/users/models.py +++ b/users/models.py @@ -8,27 +8,31 @@ from django.utils.translation import gettext_lazy as _ from common.utils import GenerateDateUUIDMediaFilePath from django.conf import settings from mastodon.api import * -from django.shortcuts import reverse +from django.urls import reverse def report_image_path(instance, filename): - return GenerateDateUUIDMediaFilePath(instance, filename, settings.REPORT_MEDIA_PATH_ROOT) + return GenerateDateUUIDMediaFilePath( + instance, filename, settings.REPORT_MEDIA_PATH_ROOT + ) class User(AbstractUser): if settings.MASTODON_ALLOW_ANY_SITE: username = models.CharField( - _('username'), + _("username"), max_length=150, unique=False, - help_text=_('Required. 150 characters or fewer. Letters, digits and @/./+/-/_ only.'), + help_text=_( + "Required. 150 characters or fewer. Letters, digits and @/./+/-/_ only." + ), ) following = models.JSONField(default=list) mastodon_id = models.CharField(max_length=100, blank=False) # mastodon domain name, eg donotban.com mastodon_site = models.CharField(max_length=100, blank=False) - mastodon_token = models.CharField(max_length=2048, default='') - mastodon_refresh_token = models.CharField(max_length=2048, default='') + mastodon_token = models.CharField(max_length=2048, default="") + mastodon_refresh_token = models.CharField(max_length=2048, default="") mastodon_locked = models.BooleanField(default=False) mastodon_followers = models.JSONField(default=list) mastodon_following = models.JSONField(default=list) @@ -44,7 +48,8 @@ class User(AbstractUser): class Meta: constraints = [ models.UniqueConstraint( - fields=['username', 'mastodon_site'], name="unique_user_identity") + fields=["username", "mastodon_site"], name="unique_user_identity" + ) ] # def save(self, *args, **kwargs): @@ -54,15 +59,21 @@ class User(AbstractUser): @property def mastodon_username(self): - return self.username + '@' + self.mastodon_site + return self.username + "@" + self.mastodon_site @property def display_name(self): - return self.mastodon_account['display_name'] if self.mastodon_account and 'display_name' in self.mastodon_account and self.mastodon_account['display_name'] else self.mastodon_username + return ( + self.mastodon_account["display_name"] + if self.mastodon_account + and "display_name" in self.mastodon_account + and self.mastodon_account["display_name"] + else self.mastodon_username + ) @property def url(self): - return reverse("users:home", args=[self.mastodon_username]) + return reverse("journal:user_profile", args=[self.mastodon_username]) def __str__(self): return self.mastodon_username @@ -74,59 +85,92 @@ class User(AbstractUser): return pref def refresh_mastodon_data(self): - """ Try refresh account data from mastodon server, return true if refreshed successfully, note it will not save to db """ + """Try refresh account data from mastodon server, return true if refreshed successfully, note it will not save to db""" self.mastodon_last_refresh = timezone.now() code, mastodon_account = verify_account(self.mastodon_site, self.mastodon_token) if code == 401 and self.mastodon_refresh_token: 
- self.mastodon_token = refresh_access_token(self.mastodon_site, self.mastodon_refresh_token) + self.mastodon_token = refresh_access_token( + self.mastodon_site, self.mastodon_refresh_token + ) if self.mastodon_token: - code, mastodon_account = verify_account(self.mastodon_site, self.mastodon_token) + code, mastodon_account = verify_account( + self.mastodon_site, self.mastodon_token + ) updated = False if mastodon_account: self.mastodon_account = mastodon_account - self.mastodon_locked = mastodon_account['locked'] - if self.username != mastodon_account['username']: + self.mastodon_locked = mastodon_account["locked"] + if self.username != mastodon_account["username"]: print(f"username changed from {self} to {mastodon_account['username']}") - self.username = mastodon_account['username'] + self.username = mastodon_account["username"] # self.mastodon_token = token # user.mastodon_id = mastodon_account['id'] - self.mastodon_followers = get_related_acct_list(self.mastodon_site, self.mastodon_token, f'/api/v1/accounts/{self.mastodon_id}/followers') - self.mastodon_following = get_related_acct_list(self.mastodon_site, self.mastodon_token, f'/api/v1/accounts/{self.mastodon_id}/following') - self.mastodon_mutes = get_related_acct_list(self.mastodon_site, self.mastodon_token, '/api/v1/mutes') - self.mastodon_blocks = get_related_acct_list(self.mastodon_site, self.mastodon_token, '/api/v1/blocks') - self.mastodon_domain_blocks = get_related_acct_list(self.mastodon_site, self.mastodon_token, '/api/v1/domain_blocks') + self.mastodon_followers = get_related_acct_list( + self.mastodon_site, + self.mastodon_token, + f"/api/v1/accounts/{self.mastodon_id}/followers", + ) + self.mastodon_following = get_related_acct_list( + self.mastodon_site, + self.mastodon_token, + f"/api/v1/accounts/{self.mastodon_id}/following", + ) + self.mastodon_mutes = get_related_acct_list( + self.mastodon_site, self.mastodon_token, "/api/v1/mutes" + ) + self.mastodon_blocks = get_related_acct_list( + self.mastodon_site, self.mastodon_token, "/api/v1/blocks" + ) + self.mastodon_domain_blocks = get_related_acct_list( + self.mastodon_site, self.mastodon_token, "/api/v1/domain_blocks" + ) self.following = self.get_following_ids() updated = True elif code == 401: - print(f'401 {self}') - self.mastodon_token = '' + print(f"401 {self}") + self.mastodon_token = "" return updated def get_following_ids(self): fl = [] for m in self.mastodon_following: target = User.get(m) - if target and ((not target.mastodon_locked) or self.mastodon_username in target.mastodon_followers): - fl.append(target.id) + if target and ( + (not target.mastodon_locked) + or self.mastodon_username in target.mastodon_followers + ): + fl.append(target.pk) return fl def is_blocking(self, target): - return target.mastodon_username in self.mastodon_blocks or target.mastodon_site in self.mastodon_domain_blocks + return ( + ( + target.mastodon_username in self.mastodon_blocks + or target.mastodon_site in self.mastodon_domain_blocks + ) + if target.is_authenticated + else self.preference.no_anonymous_view + ) def is_blocked_by(self, target): - return target.is_blocking(self) + return target.is_authenticated and target.is_blocking(self) def is_muting(self, target): return target.mastodon_username in self.mastodon_mutes def is_following(self, target): - return self.mastodon_username in target.mastodon_followers if target.mastodon_locked else self.mastodon_username in target.mastodon_followers or target.mastodon_username in self.mastodon_following + return ( + 
self.mastodon_username in target.mastodon_followers + if target.mastodon_locked + else self.mastodon_username in target.mastodon_followers + or target.mastodon_username in self.mastodon_following + ) def is_followed_by(self, target): return target.is_following(self) def get_mark_for_item(self, item): - params = {item.__class__.__name__.lower() + '_id': item.id, 'owner': self} + params = {item.__class__.__name__.lower() + "_id": item.id, "owner": self} mark = item.mark_class.objects.filter(**params).first() return mark @@ -143,16 +187,16 @@ class User(AbstractUser): return 0 @classmethod - def get(self, id): + def get(cls, id): if isinstance(id, str): try: - username = id.split('@')[0] - site = id.split('@')[1] - except IndexError as e: + username = id.split("@")[0] + site = id.split("@")[1] + except IndexError: return None - query_kwargs = {'username': username, 'mastodon_site': site} + query_kwargs = {"username": username, "mastodon_site": site} elif isinstance(id, int): - query_kwargs = {'pk': id} + query_kwargs = {"pk": id} else: return None return User.objects.filter(**query_kwargs).first() @@ -160,30 +204,44 @@ class User(AbstractUser): class Preference(models.Model): user = models.OneToOneField(User, models.CASCADE, primary_key=True) - home_layout = postgres.ArrayField( - postgres.HStoreField(), + profile_layout = models.JSONField( blank=True, default=list, ) - export_status = models.JSONField(blank=True, null=True, encoder=DjangoJSONEncoder, default=dict) - import_status = models.JSONField(blank=True, null=True, encoder=DjangoJSONEncoder, default=dict) + export_status = models.JSONField( + blank=True, null=True, encoder=DjangoJSONEncoder, default=dict + ) + import_status = models.JSONField( + blank=True, null=True, encoder=DjangoJSONEncoder, default=dict + ) default_visibility = models.PositiveSmallIntegerField(default=0) classic_homepage = models.BooleanField(null=False, default=False) mastodon_publish_public = models.BooleanField(null=False, default=False) - mastodon_append_tag = models.CharField(max_length=2048, default='') + mastodon_append_tag = models.CharField(max_length=2048, default="") show_last_edit = models.PositiveSmallIntegerField(default=0) + no_anonymous_view = models.PositiveSmallIntegerField(default=0) - def get_serialized_home_layout(self): - return str(self.home_layout).replace("\'", "\"") + def get_serialized_profile_layout(self): + return str(self.profile_layout).replace("'", '"') def __str__(self): return str(self.user) class Report(models.Model): - submit_user = models.ForeignKey(User, on_delete=models.SET_NULL, related_name='sumbitted_reports', null=True) - reported_user = models.ForeignKey(User, on_delete=models.SET_NULL, related_name='accused_reports', null=True) - image = models.ImageField(upload_to=report_image_path, height_field=None, width_field=None, blank=True, default='') + submit_user = models.ForeignKey( + User, on_delete=models.SET_NULL, related_name="sumbitted_reports", null=True + ) + reported_user = models.ForeignKey( + User, on_delete=models.SET_NULL, related_name="accused_reports", null=True + ) + image = models.ImageField( + upload_to=report_image_path, + height_field=None, + width_field=None, + blank=True, + default="", + ) is_read = models.BooleanField(default=False) submitted_time = models.DateTimeField(auto_now_add=True) message = models.CharField(max_length=1000) diff --git a/users/tasks.py b/users/tasks.py index 945a3037..7ba83a41 100644 --- a/users/tasks.py +++ b/users/tasks.py @@ -12,17 +12,8 @@ from .forms import ReportForm from 
mastodon.api import * from mastodon import mastodon_request_included from common.config import * -from common.models import MarkStatusEnum from common.utils import PageLinksGenerator from management.models import Announcement -from books.models import * -from movies.models import * -from music.models import * -from games.models import * -from books.forms import BookMarkStatusTranslator -from movies.forms import MovieMarkStatusTranslator -from music.forms import MusicMarkStatusTranslator -from games.forms import GameMarkStatusTranslator from mastodon.models import MastodonApplication from django.conf import settings from urllib.parse import quote @@ -40,216 +31,3 @@ def refresh_mastodon_data_task(user, token=None): print(f"{user} mastodon data refreshed") else: print(f"{user} mastodon data refresh failed") - - -def export_marks_task(user): - user.preference.export_status["marks_pending"] = True - user.preference.save(update_fields=["export_status"]) - filename = GenerateDateUUIDMediaFilePath( - None, "f.xlsx", settings.MEDIA_ROOT + settings.EXPORT_FILE_PATH_ROOT - ) - if not os.path.exists(os.path.dirname(filename)): - os.makedirs(os.path.dirname(filename)) - heading = ["标题", "简介", "豆瓣评分", "链接", "创建时间", "我的评分", "标签", "评论", "NeoDB链接", "其它ID"] - wb = ( - Workbook() - ) # adding write_only=True will speed up but corrupt the xlsx and won't be importable - for status, label in [("collect", "看过"), ("do", "在看"), ("wish", "想看")]: - ws = wb.create_sheet(title=label) - marks = MovieMark.objects.filter(owner=user, status=status).order_by( - "-edited_time" - ) - ws.append(heading) - for mark in marks: - movie = mark.movie - title = movie.title - summary = ( - str(movie.year) - + " / " - + ",".join(movie.area) - + " / " - + ",".join(map(lambda x: str(MovieGenreTranslator[x]), movie.genre)) - + " / " - + ",".join(movie.director) - + " / " - + ",".join(movie.actor) - ) - tags = ",".join(list(map(lambda m: m.content, mark.tags))) - world_rating = (movie.rating / 2) if movie.rating else None - timestamp = mark.edited_time.strftime("%Y-%m-%d %H:%M:%S") - my_rating = (mark.rating / 2) if mark.rating else None - text = mark.text - source_url = movie.source_url - url = settings.APP_WEBSITE + movie.get_absolute_url() - line = [ - title, - summary, - world_rating, - source_url, - timestamp, - my_rating, - tags, - text, - url, - movie.imdb_code, - ] - ws.append(line) - - for status, label in [("collect", "听过"), ("do", "在听"), ("wish", "想听")]: - ws = wb.create_sheet(title=label) - marks = AlbumMark.objects.filter(owner=user, status=status).order_by( - "-edited_time" - ) - ws.append(heading) - for mark in marks: - album = mark.album - title = album.title - summary = ( - ",".join(album.artist) - + " / " - + (album.release_date.strftime("%Y") if album.release_date else "") - ) - tags = ",".join(list(map(lambda m: m.content, mark.tags))) - world_rating = (album.rating / 2) if album.rating else None - timestamp = mark.edited_time.strftime("%Y-%m-%d %H:%M:%S") - my_rating = (mark.rating / 2) if mark.rating else None - text = mark.text - source_url = album.source_url - url = settings.APP_WEBSITE + album.get_absolute_url() - line = [ - title, - summary, - world_rating, - source_url, - timestamp, - my_rating, - tags, - text, - url, - "", - ] - ws.append(line) - - for status, label in [("collect", "读过"), ("do", "在读"), ("wish", "想读")]: - ws = wb.create_sheet(title=label) - marks = BookMark.objects.filter(owner=user, status=status).order_by( - "-edited_time" - ) - ws.append(heading) - for mark in marks: - book = mark.book - 
title = book.title - summary = ( - ",".join(book.author) - + " / " - + str(book.pub_year) - + " / " - + book.pub_house - ) - tags = ",".join(list(map(lambda m: m.content, mark.tags))) - world_rating = (book.rating / 2) if book.rating else None - timestamp = mark.edited_time.strftime("%Y-%m-%d %H:%M:%S") - my_rating = (mark.rating / 2) if mark.rating else None - text = mark.text - source_url = book.source_url - url = settings.APP_WEBSITE + book.get_absolute_url() - line = [ - title, - summary, - world_rating, - source_url, - timestamp, - my_rating, - tags, - text, - url, - book.isbn, - ] - ws.append(line) - - for status, label in [("collect", "玩过"), ("do", "在玩"), ("wish", "想玩")]: - ws = wb.create_sheet(title=label) - marks = GameMark.objects.filter(owner=user, status=status).order_by( - "-edited_time" - ) - ws.append(heading) - for mark in marks: - game = mark.game - title = game.title - summary = ( - ",".join(game.genre) - + " / " - + ",".join(game.platform) - + " / " - + (game.release_date.strftime("%Y-%m-%d") if game.release_date else "") - ) - tags = ",".join(list(map(lambda m: m.content, mark.tags))) - world_rating = (game.rating / 2) if game.rating else None - timestamp = mark.edited_time.strftime("%Y-%m-%d %H:%M:%S") - my_rating = (mark.rating / 2) if mark.rating else None - text = mark.text - source_url = game.source_url - url = settings.APP_WEBSITE + game.get_absolute_url() - line = [ - title, - summary, - world_rating, - source_url, - timestamp, - my_rating, - tags, - text, - url, - "", - ] - ws.append(line) - - review_heading = [ - "标题", - "评论对象", - "链接", - "创建时间", - "我的评分", - "类型", - "内容", - "评论对象原始链接", - "评论对象NeoDB链接", - ] - for ReviewModel, label in [ - (MovieReview, "影评"), - (BookReview, "书评"), - (AlbumReview, "乐评"), - (GameReview, "游戏评论"), - ]: - ws = wb.create_sheet(title=label) - reviews = ReviewModel.objects.filter(owner=user).order_by("-edited_time") - ws.append(review_heading) - for review in reviews: - title = review.title - target = "《" + review.item.title + "》" - url = review.url - timestamp = review.edited_time.strftime("%Y-%m-%d %H:%M:%S") - my_rating = None # (mark.rating / 2) if mark.rating else None - content = review.content - target_source_url = review.item.source_url - target_url = review.item.absolute_url - line = [ - title, - target, - url, - timestamp, - my_rating, - label, - content, - target_source_url, - target_url, - ] - ws.append(line) - - wb.save(filename=filename) - user.preference.export_status["marks_pending"] = False - user.preference.export_status["marks_file"] = filename - user.preference.export_status["marks_date"] = datetime.now().strftime( - "%Y-%m-%d %H:%M" - ) - user.preference.save(update_fields=["export_status"]) diff --git a/users/templates/users/data.html b/users/templates/users/data.html index 3740de3b..99708aba 100644 --- a/users/templates/users/data.html +++ b/users/templates/users/data.html @@ -25,131 +25,25 @@
    + {% if messages %}
    - {% if messages %}
      {% for message in messages %} {{ message }} {% endfor %}
    - {% endif %}
    + {% endif %}
    -
    {% trans '导入豆瓣标记和短评' %}
    -
    -
    - {% csrf_token %} - - {% trans '导入:' %} -
    - - -
    -
    - - -
    -
    - - -
    -
    - - -
    -
    - {% trans '覆盖:' %} -
    - - -
    -
    - {% trans '可见性:' %} -
    - - -
    -
    -
    - 从豆伴(豆坟)备份导出的.xlsx文件,请勿手动修改该文件: - -
    - -
    - - -
    -
    -
    -
    - -
    -
    -
    -
    {% trans '导入豆瓣评论' %}
    +
    {% trans '导入豆瓣标记和评论' %}
    {% csrf_token %}
- 请在豆伴(豆坟)导出时勾选「书影音游剧」和「评论」;已经存在的评论不会被覆盖。
+ 请在豆伴(豆坟)导出时勾选「书影音游剧」和「评论」。正向变化(想读->在读->已读)的标记会更新,其它已经存在的标记和评论不会被覆盖。

    豆伴(豆坟)备份导出的.xlsx文件: @@ -323,8 +217,6 @@ {% include "partial/_footer.html" %}

    -
    -
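The updated help text in the data.html hunk above states that only forward transitions (想读 -> 在读 -> 已读) update an existing mark during a Douban import. A hypothetical sketch of that rule follows; the real logic lives in journal/importers/douban.py and is not shown in this diff, and SHELF_ORDER, is_forward and merge_status are names used only for illustration.

SHELF_ORDER = ["wishlist", "progress", "complete"]  # 想读 -> 在读 -> 已读


def is_forward(old_status, new_status):
    """True when the imported status moves the mark forward along SHELF_ORDER."""
    return SHELF_ORDER.index(new_status) > SHELF_ORDER.index(old_status)


def merge_status(existing_status, imported_status):
    # Only progress updates an existing mark; reviews and other existing
    # marks are left untouched, per the help text.
    if existing_status is None or is_forward(existing_status, imported_status):
        return imported_status
    return existing_status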
    diff --git a/users/templates/users/home.html b/users/templates/users/home.html deleted file mode 100644 index f605486e..00000000 --- a/users/templates/users/home.html +++ /dev/null @@ -1,674 +0,0 @@ -{% load static %} -{% load i18n %} -{% load admin_url %} -{% load mastodon %} -{% load oauth_token %} -{% load truncate %} -{% load thumb %} - - - - - - - {% if user == request.user %} - {{ site_name }} - {% trans '我的个人主页' %} - {% else %} - {{ site_name }} - {{user.display_name}} - {% endif %} - - - {% include "partial/_common_libs.html" with jquery=1 %} - - - - - - -
    -
    - {% include "partial/_navbar.html" with current="home" %} - -
    -
    -
    - -
    - -
    -
    - {% trans '想读的书' %} -
    - - {{ wish_book_count }} - - {% if wish_book_more %} - {% trans '更多' %} - {% endif %} - -
    - -
    -
    - {% trans '在读的书' %} -
    - - {{ do_book_count }} - - {% if do_book_more %} - {% trans '更多' %} - {% endif %} - - -
    - -
    -
    - {% trans '读过的书' %} -
    - - {{ collect_book_count }} - - {% if collect_book_more %} - {% trans '更多' %} - {% endif %} - - -
    - -
    -
    - {% trans '评论过的书籍' %} -
    - - {{ book_reviews_count }} - - {% if book_reviews_more %} - {% trans '更多' %} - {% endif %} - - -
    - -
    -
    - {% trans '想看的电影/剧集' %} -
    - - {{ wish_movie_count }} - - {% if wish_movie_more %} - {% trans '更多' %} - {% endif %} - - -
    - -
    -
    - {% trans '在看的电影/剧集' %} -
    - - {{ do_movie_count }} - - {% if do_movie_more %} - {% trans '更多' %} - {% endif %} - - -
    - -
    -
    - {% trans '看过的电影/剧集' %} -
    - - {{ collect_movie_count }} - - {% if collect_movie_more %} - {% trans '更多' %} - {% endif %} - - -
    - -
    -
    - {% trans '评论过的电影/剧集' %} -
    - - {{ movie_reviews_count }} - - {% if movie_reviews_more %} - {% trans '更多' %} - {% endif %} - - -
    - -
    -
    - {% trans '想听的音乐' %} -
    - - {{ wish_music_count }} - - {% if wish_music_more %} - {% trans '更多' %} - {% endif %} - - -
    - -
    -
    - {% trans '在听的音乐' %} -
    - - {{ do_music_count }} - - {% if do_music_more %} - {% trans '更多' %} - {% endif %} - - -
    - -
    -
    - {% trans '听过的音乐' %} -
    - - {{ collect_music_count }} - - {% if collect_music_more %} - {% trans '更多' %} - {% endif %} - - -
    - -
    -
    - {% trans '评论过的音乐' %} -
    - - {{ music_reviews_count }} - - {% if music_reviews_more %} - {% trans '更多' %} - {% endif %} - - -
    - -
    -
    - {% trans '想玩的游戏' %} -
    - - {{ wish_game_count }} - - {% if wish_game_more %} - {% trans '更多' %} - {% endif %} - - -
    - -
    -
    - {% trans '在玩的游戏' %} -
    - - {{ do_game_count }} - - {% if do_game_more %} - {% trans '更多' %} - {% endif %} - - -
    - -
    -
    - {% trans '玩过的游戏' %} -
    - - {{ collect_game_count }} - - {% if collect_game_more %} - {% trans '更多' %} - {% endif %} - - -
    - -
    -
    - {% trans '评论过的游戏' %} -
    - - {{ game_reviews_count }} - - {% if game_reviews_more %} - {% trans '更多' %} - {% endif %} - - -
    - -
    -
    - {% trans '创建的收藏单' %} -
    - - {{ collections_count }} - - {% if collections_more %} - {% trans '更多' %} - {% endif %} - {% if user == request.user %} - {% trans '新建' %} - {% endif %} - - -
    - -
    -
    - {% trans '关注的收藏单' %} -
    - - {{ marked_collections_count }} - - {% if marked_collections_more %} - {% trans '更多' %} - {% endif %} - - -
    - -
    - - {% if user == request.user %} - -
    -
    - - {% trans '编辑布局' %} - - - - - - - - - -
    - -
    - - - {% csrf_token %} - - - - - {% endif %} - - -
    - - {% include "partial/_sidebar.html" %} -
    -
    -
    - {% include "partial/_footer.html" %} -
    - - {% if unread_announcements %} - {% include "partial/_announcement.html" %} - {% endif %} - - \ No newline at end of file diff --git a/users/templates/users/item_list.html b/users/templates/users/item_list.html deleted file mode 100644 index 595071dc..00000000 --- a/users/templates/users/item_list.html +++ /dev/null @@ -1,94 +0,0 @@ -{% load static %} -{% load i18n %} -{% load l10n %} -{% load admin_url %} -{% load mastodon %} -{% load oauth_token %} -{% load truncate %} -{% load thumb %} - - - - - - - {{ site_name }} - {{ user.mastodon_username }} {{ list_title }} - - - - - - - - - - - - -
    -
    - {% include "partial/_navbar.html" %} - -
    -
    -
    -
    -
    - -
    -
    - {{ user.mastodon_username }} {{ list_title }} -
    -
    -
      - {% for mark in marks %} - {% include "partial/list_item.html" with item=mark.item hide_category=True %} - {% empty %} -
      {% trans '无结果' %}
      - {% endfor %} -
    -
    - -
    -
    - - {% include "partial/_sidebar.html" %} -
    -
    -
    - {% include "partial/_footer.html" %} -
    - - - - - - diff --git a/users/templates/users/manage_report.html b/users/templates/users/manage_report.html index ee2270eb..1d8f4822 100644 --- a/users/templates/users/manage_report.html +++ b/users/templates/users/manage_report.html @@ -27,9 +27,9 @@ {% for report in reports %}
    - {{ report.submit_user.username }} + {{ report.submit_user.username }} {% trans '举报了' %} - {{ report.reported_user.username }} + {{ report.reported_user.username }} @{{ report.submitted_time }} {% if report.image %} diff --git a/users/templates/users/preferences.html b/users/templates/users/preferences.html index 42e8274c..88526deb 100644 --- a/users/templates/users/preferences.html +++ b/users/templates/users/preferences.html @@ -45,7 +45,13 @@ {% trans '登录后显示个人主页:' %}
    - + +
    +
    + {% trans '不允许未登录用户访问个人主页和RSS:' %} +
    + +

 {% trans '显示最近编辑者:' %}
diff --git a/users/templates/users/relation_list.html b/users/templates/users/relation_list.html
index ae14e2ea..00e77da9 100644
--- a/users/templates/users/relation_list.html
+++ b/users/templates/users/relation_list.html
@@ -64,7 +64,7 @@
@@ -142,7 +142,7 @@
 {% else %}
 {% endif %}
-
+
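Taken together, the no_anonymous_view additions in users/models.py and preferences.html and the rewritten profile view in journal/views.py gate profile access roughly as sketched below; this is a condensed illustration, not the project's exact code, and can_view_profile is a helper name used only here.

def can_view_profile(viewer, owner):
    """Gate profile access the way the rewritten profile() view does."""
    if not viewer.is_authenticated:
        # Anonymous visitors are turned away when the owner enables
        # "不允许未登录用户访问个人主页和RSS" (Preference.no_anonymous_view).
        return not owner.get_preference().no_anonymous_view
    # For logged-in visitors, blocking in either direction hides the page.
    return not (owner.is_blocked_by(viewer) or owner.is_blocking(viewer))

The preference label suggests the same flag is meant to gate the new ReviewFeed RSS route as well, though that check is not visible in this diff.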