add backend support for multiple entity sources

This commit is contained in:
doubaniux 2020-11-23 23:18:14 +01:00
parent 36b85a3b00
commit 56a66332fa
17 changed files with 275 additions and 95 deletions

View file

@ -24,6 +24,8 @@ class BookForm(forms.ModelForm):
fields = [
'id',
'title',
'source_site',
'source_url',
'isbn',
'author',
'pub_house',

View file

@ -3,7 +3,8 @@ import django.contrib.postgres.fields as postgres
from django.utils.translation import ugettext_lazy as _
from django.db import models
from django.core.serializers.json import DjangoJSONEncoder
from common.models import Resource, Mark, Review, Tag
from django.shortcuts import reverse
from common.models import Entity, Mark, Review, Tag
from boofilsic.settings import BOOK_MEDIA_PATH_ROOT, DEFAULT_BOOK_IMAGE
from django.utils import timezone
@ -19,7 +20,7 @@ def book_cover_path(instance, filename):
return root + timezone.now().strftime('%Y/%m/%d') + f'{filename}'
class Book(Resource):
class Book(Entity):
# widely recognized name, usually in Chinese
title = models.CharField(_("title"), max_length=200)
subtitle = models.CharField(_("subtitle"), blank=True, default='', max_length=200)
@ -46,7 +47,7 @@ class Book(Resource):
# since data origin is not formatted and might be CNY USD or other currency, use char instead
price = models.CharField(_("pricing"), blank=True, default='', max_length=50)
pages = models.PositiveIntegerField(_("pages"), null=True, blank=True)
isbn = models.CharField(_("ISBN"), blank=True, null=True, max_length=20, unique=True, db_index=True)
isbn = models.CharField(_("ISBN"), blank=True, null=True, max_length=20, db_index=True)
# to store previously scrapped data
cover = models.ImageField(_("cover picture"), upload_to=book_cover_path, default=DEFAULT_BOOK_IMAGE, blank=True)
contents = models.TextField(blank=True, default="")
@ -66,6 +67,9 @@ class Book(Resource):
def __str__(self):
return self.title
def get_absolute_url(self):
return reverse("books:retrieve", args=[self.id])
def get_tags_manager(self):
return self.book_tags
@ -76,8 +80,6 @@ class Book(Resource):
class BookMark(Mark):
# maybe this is the better solution, for it has less complex index
# book = models.ForeignKey(Book, on_delete=models.CASCADE, related_name='book_marks', null=True, unique=True)
book = models.ForeignKey(Book, on_delete=models.CASCADE, related_name='book_marks', null=True)
class Meta:
constraints = [

View file

@ -50,9 +50,30 @@
<script>
// mark required
$("#content input[required]").each(function () {
$("#content *[required]").each(function () {
$(this).prev().prepend("*");
})
});
// when source site is this site, hide url input box and populate it with fake url
// the backend would update this field
if ($("select[name='source_site']").val() == {{ this_site_enum_value }}) {
$("input[name='source_url']").hide();
$("label[for='id_source_url']").hide();
$("input[name='source_url']").val("https://www.temp.com/" + Date.now() + Math.random());
}
$("select[name='source_site']").change(function () {
let value = $(this).val();
if (value == {{ this_site_enum_value }}) {
$("input[name='source_url']").hide();
$("label[for='id_source_url']").hide();
$("input[name='source_url']").val("https://www.temp.com/" + Date.now() + Math.random());
} else {
$("input[name='source_url']").show();
$("label[for='id_source_url']").show();
$("input[name='source_url']").val("");
}
});
</script>
</body>

View file

@ -99,7 +99,7 @@ ISBN: 9787020104345
</div>
<script>
// mark required
$("#content input[required]").each(function () {
$("#content *[required]").each(function () {
$(this).prev().prepend("*");
});
$('form').submit(function () {

View file

@ -14,6 +14,7 @@ from mastodon.api import check_visibility, post_toot, TootVisibilityEnum
from mastodon.utils import rating_to_emoji
from common.utils import PageLinksGenerator
from common.views import PAGE_LINK_NUMBER
from common.models import SourceSiteEnum
from .models import *
from .forms import *
from .forms import BookMarkStatusTranslator
@ -48,7 +49,9 @@ def create(request):
{
'form': form,
'title': _('添加书籍'),
'submit_url': reverse("books:create")
'submit_url': reverse("books:create"),
# provided for frontend js
'this_site_enum_value': SourceSiteEnum.IN_SITE.value,
}
)
elif request.method == 'POST':
@ -57,7 +60,16 @@ def create(request):
form = BookForm(request.POST, request.FILES)
if form.is_valid():
form.instance.last_editor = request.user
form.save()
try:
with transaction.atomic():
form.save()
if form.instance.source_site == SourceSiteEnum.IN_SITE.value:
real_url = form.instance.get_absolute_url()
form.instance.source_url = real_url
form.instance.save()
except IntegrityError as e:
logger.error(e.__str__())
return HttpResponseServerError("integrity error")
return redirect(reverse("books:retrieve", args=[form.instance.id]))
else:
return render(
@ -66,7 +78,9 @@ def create(request):
{
'form': form,
'title': _('添加书籍'),
'submit_url': reverse("books:create")
'submit_url': reverse("books:create"),
# provided for frontend js
'this_site_enum_value': SourceSiteEnum.IN_SITE.value,
}
)
else:
@ -86,7 +100,9 @@ def update(request, id):
{
'form': form,
'title': _('修改书籍'),
'submit_url': reverse("books:update", args=[book.id])
'submit_url': reverse("books:update", args=[book.id]),
# provided for frontend js
'this_site_enum_value': SourceSiteEnum.IN_SITE.value,
}
)
elif request.method == 'POST':
@ -95,7 +111,16 @@ def update(request, id):
if form.is_valid():
form.instance.last_editor = request.user
form.instance.edited_time = timezone.now()
form.save()
try:
with transaction.atomic():
form.save()
if form.instance.source_site == SourceSiteEnum.IN_SITE.value:
real_url = form.instance.get_absolute_url()
form.instance.source_url = real_url
form.instance.save()
except IntegrityError as e:
logger.error(e.__str__())
return HttpResponseServerError("integrity error")
else:
return render(
request,
@ -103,7 +128,9 @@ def update(request, id):
{
'form': form,
'title': _('修改书籍'),
'submit_url': reverse("books:update", args=[book.id])
'submit_url': reverse("books:update", args=[book.id]),
# provided for frontend js
'this_site_enum_value': SourceSiteEnum.IN_SITE.value,
}
)
return redirect(reverse("books:retrieve", args=[form.instance.id]))

View file

@ -7,16 +7,17 @@ from django.db.models import Q
from markdownx.models import MarkdownxField
from users.models import User
from mastodon.api import get_relationships, get_cross_site_id
from .utils import clean_url
from boofilsic.settings import CLIENT_NAME
# abstract base classes
###################################
class SourceSiteEnum(models.IntegerChoices):
DOUBAN = 1, _("豆瓣")
IN_SITE = 1, CLIENT_NAME
DOUBAN = 2, _("豆瓣")
class Resource(models.Model):
class Entity(models.Model):
rating_total_score = models.PositiveIntegerField(null=True, blank=True)
rating_number = models.PositiveIntegerField(null=True, blank=True)
@ -29,8 +30,9 @@ class Resource(models.Model):
brief = models.TextField(blank=True, default="")
other_info = postgres.JSONField(
blank=True, null=True, encoder=DjangoJSONEncoder, default=dict)
# source_url = models.URLField(max_length=500)
# source_site = models.SmallIntegerField()
# source_url should include the scheme, which is normally https://
source_url = models.URLField(_("URL"), max_length=500, unique=True)
source_site = models.SmallIntegerField(_("源网站"), choices=SourceSiteEnum.choices)
class Meta:
abstract = True
@ -41,6 +43,10 @@ class Resource(models.Model):
rating__lte=10), name='%(class)s_rating_upperbound'),
]
def get_absolute_url(self):
raise NotImplementedError("Subclass should implement this method")
def save(self, *args, **kwargs):
""" update rating and strip source url scheme & querystring before save to db """
if self.rating_number and self.rating_total_score:
@ -50,7 +56,6 @@ class Resource(models.Model):
self.rating = None
else:
raise IntegrityError()
# self.source = clean_url(self.source)
super().save(*args, **kwargs)
def calculate_rating(self, old_rating, new_rating):
@ -92,7 +97,7 @@ class Resource(models.Model):
def get_tags_manager(self):
"""
Since relation between tag and resource is foreign key, and related name has to be unique,
Since relation between tag and entity is foreign key, and related name has to be unique,
this method works like interface.
"""
raise NotImplementedError("Subclass should implement this method.")
@ -138,19 +143,19 @@ class UserOwnedEntity(models.Model):
abstract = True
@classmethod
def get_available(cls, resource, request_user, token):
def get_available(cls, entity, request_user, token):
# TODO add amount limit for once query
"""
Returns all avaliable user-owned entities related to given resource.
Returns all avaliable user-owned entities related to given entity.
This method handles mute/block relationships and private/public visibilities.
"""
# the foreign key field that points to resource
# has to be named as the lower case name of that resource
query_kwargs = {resource.__class__.__name__.lower(): resource}
# the foreign key field that points to entity
# has to be named as the lower case name of that entity
query_kwargs = {entity.__class__.__name__.lower(): entity}
user_owned_entities = cls.objects.filter(
**query_kwargs).order_by("-edited_time")
# every user should only be abled to have one user owned entity for each resource
# every user should only be abled to have one user owned entity for each entity
# this is guaranteed by models
id_list = []

View file

@ -6,8 +6,11 @@ from lxml import html
import re
from boofilsic.settings import LUMINATI_USERNAME, LUMINATI_PASSWORD, DEBUG
from django.utils.translation import ugettext_lazy as _
from movies.models import MovieGenreEnum
from common.models import SourceSiteEnum
from movies.models import Movie, MovieGenreEnum
from movies.forms import MovieForm
from books.models import Book
from books.forms import BookForm
RE_NUMBERS = re.compile(r"\d+\d*")
@ -36,8 +39,8 @@ PORT = 22225
logger = logging.getLogger(__name__)
# register all implemented scraper in form of {host: class,}
registry = {}
# register all implemented scraper in form of {host: scraper_class,}
scraper_registry = {}
def log_url(func):
@ -61,36 +64,52 @@ def log_url(func):
class AbstractScraper:
# subclasses must specify those two variables
# site means general sites, like amazon/douban etc
site = None
# host means technically hostname
host = None
# corresponding data class
data_class = None
# corresponding form class
form_class = None
# used to extract effective url
regex = None
def __init_subclass__(cls, **kwargs):
    """
    Validate and register a scraper subclass when the class is created.

    Checks that the subclass has set the class variables `site`, `host`,
    `data_class`, `form_class` and `regex`, wraps its `scrape` method with
    `log_url` and turns it into a class method, then stores the class in
    `scraper_registry` under its `host`.
    """
    super().__init_subclass__(**kwargs)
    # NOTE: these checks are `assert` statements, so they are skipped
    # when Python runs with optimisation enabled (-O)
    assert cls.site is not None, "class variable `site` must be specified"
    assert cls.host is not None, "class variable `host` must be specified"
    assert cls.data_class is not None, "class variable `data_class` must be specified"
    assert cls.form_class is not None, "class variable `form_class` must be specified"
    assert cls.regex is not None, "class variable `regex` must be specified"
    assert isinstance(cls.host, str), "`host` must be type str"
    assert isinstance(cls.site, int), "`site` must be type int"
    assert hasattr(cls, 'scrape') and callable(cls.scrape), "scaper must have method `.scrape()`"
    # decorate the scrape method
    cls.scrape = classmethod(log_url(cls.scrape))
    # the module-level registry is named `scraper_registry`
    scraper_registry[cls.host] = cls
def scrape(self, url):
"""
Scrape/request model schema specified data from given url and return it.
Implementations of subclasses to this method would be decorated as class method.
return (data_dict, image)
"""
raise NotImplementedError("Subclass should implement this method")
@classmethod
def get_effective_url(cls, raw_url):
    """
    Extract the effective url from `raw_url` using the class variable `regex`.

    Returns the first substring of `raw_url` matched by `cls.regex`.
    Raises ValueError if `cls.regex` matches nothing in `raw_url`.
    """
    matches = cls.regex.findall(raw_url)
    # `findall` returns an empty list when nothing matches, so test for
    # emptiness before indexing; otherwise an IndexError would escape
    # instead of the intended ValueError
    if not matches:
        raise ValueError("not valid url")
    return matches[0]
@classmethod
def download_page(cls, url, headers):
url = cls.get_effective_url(url)
session_id = random.random()
proxy_url = ('http://%s-country-cn-session-%s:%s@zproxy.lum-superproxy.io:%d' %
@ -142,13 +161,15 @@ class AbstractScraper:
class DoubanBookScraper(AbstractScraper):
site = SourceSiteEnum.DOUBAN.value
host = "book.douban.com"
regex = re.compile(r"https://book.douban.com/subject/\d+")
data_class = Book
form_class = BookForm
regex = re.compile(r"https://book.douban.com/subject/\d+/{0,1}")
def scrape(self, url):
regex = self.regex
headers = DEFAULT_REQUEST_HEADERS.copy()
headers['Host'] = self.host
content = self.download_page(url, regex, headers)
content = self.download_page(url, headers)
# parsing starts here
try:
@ -289,7 +310,9 @@ class DoubanBookScraper(AbstractScraper):
'isbn': isbn,
'brief': brief,
'contents': contents,
'other_info': other
'other_info': other,
'source_site': self.site,
'source_url': self.get_effective_url(url),
}
return data, raw_img
@ -297,13 +320,15 @@ class DoubanBookScraper(AbstractScraper):
class DoubanMovieScraper(AbstractScraper):
site = SourceSiteEnum.DOUBAN.value
host = 'movie.douban.com'
regex = re.compile(r"https://movie.douban.com/subject/\d+")
data_class = Movie
form_class = MovieForm
regex = re.compile(r"https://movie.douban.com/subject/\d+/{0,1}")
def scrape(self, url):
regex = self.regex
headers = DEFAULT_REQUEST_HEADERS.copy()
headers['Host'] = 'movie.douban.com'
content = self.download_page(url, regex, headers)
content = self.download_page(url, headers)
# parsing starts here
try:
@ -458,6 +483,8 @@ class DoubanMovieScraper(AbstractScraper):
'single_episode_length': single_episode_length,
'brief': brief,
'is_series': is_series,
'source_site': self.site,
'source_url': self.get_effective_url(url),
}
return data, raw_img

View file

@ -144,6 +144,10 @@ textarea {
width: 100%;
}
select {
width: 100%;
}
label,
legend {
display: block;

File diff suppressed because one or more lines are too long

View file

@ -15,8 +15,8 @@ textarea
min-height: 6.5rem
width: 100%
// input, select
// width: 100%
select
width: 100%
label,
legend

View file

@ -62,17 +62,4 @@ def ChoicesDictGenerator(choices_enum):
for attr in dir(choices_enum):
if not '__' in attr:
choices_dict[getattr(choices_enum, attr).value] = getattr(choices_enum, attr).label
return choices_dict
def clean_url(url):
"""
strip scheme and querystring of the url.
"""
if url.startswith("https://"):
url = url.lstrip("https://")
elif url.startswith("http://"):
url = url.lstrip("http://")
if url.endswith('/'):
url = url.rstrip("/")
url = url.source.split('?')[0].split('#')[0]
return choices_dict

View file

@ -1,15 +1,23 @@
import operator
import logging
from difflib import SequenceMatcher
from django.shortcuts import render
from urllib.parse import urlparse
from django.shortcuts import render, redirect, reverse
from django.contrib.auth.decorators import login_required
from books.models import Book
from movies.models import Movie
from common.models import MarkStatusEnum
from common.utils import PageLinksGenerator
from users.models import Report, User
from django.utils.translation import gettext_lazy as _
from django.core.paginator import Paginator
from django.core.validators import URLValidator
from django.core.exceptions import ValidationError, ObjectDoesNotExist
from django.core.files.uploadedfile import SimpleUploadedFile
from django.db.models import Q, Count
from django.http import HttpResponseBadRequest
from books.models import Book
from movies.models import Movie
from users.models import Report, User
from mastodon.decorators import mastodon_request_included
from common.models import MarkStatusEnum
from common.utils import PageLinksGenerator
from common.scraper import scraper_registry
# how many books have in each set at the home page
@ -27,6 +35,7 @@ PAGE_LINK_NUMBER = 7
# max tags on list page
TAG_NUMBER_ON_LIST = 5
logger = logging.getLogger(__name__)
@login_required
def home(request):
@ -87,10 +96,21 @@ def home(request):
def search(request):
if request.method == 'GET':
# test if the input search string is empty or not, excluding param ?c=
empty_querystring_criteria = {k: v for k, v in request.GET.items() if k != 'c'}
if not len(empty_querystring_criteria):
return HttpResponseBadRequest()
# test if user input an URL, if so jump to URL handling function
url_validator = URLValidator()
input_string = request.GET.get('q').strip()
try:
url_validator(input_string)
# validation success
return jump_or_scrape(request, input_string)
except ValidationError as e:
pass
# category, book/movie/record etc
category = request.GET.get("c", default='').strip().lower()
@ -212,3 +232,52 @@ def search(request):
else:
return HttpResponseBadRequest()
@login_required
@mastodon_request_included
def jump_or_scrape(request, url):
    """
    Turn a URL into a redirect to the matching entity detail page.

    1. if the url points at this site, redirect to it directly
    2. match the url's hostname to a registered scraper
    3. try to find the url in the db; if it exists then jump,
       else scrape, save the scraped entity and jump

    `request.user` is recorded as `last_editor` of a newly scraped entity.
    An error page is rendered when no scraper matches, when the url is not
    one the scraper accepts, when scraping raises, or when the scraped data
    does not pass form validation.
    """
    parsed_url = urlparse(url)

    # redirect to this site
    # compare the parsed host rather than doing a substring test, so that a
    # foreign url which merely contains our host name in its path or
    # querystring is not redirected to
    if parsed_url.netloc.lower() == request.get_host().lower():
        return redirect(url)

    # match url to registered sites by exact hostname
    scraper = scraper_registry.get(parsed_url.hostname)
    if scraper is None:
        # invalid url
        return render(request, 'common/error.html', {'msg': _("链接非法,查询失败")})

    # scraped entities are stored under their effective url, so the lookup
    # has to use the same form or an already scraped page would never be found
    try:
        effective_url = scraper.get_effective_url(url)
    except (ValueError, IndexError):
        # host is known but the url is not one the scraper accepts
        return render(request, 'common/error.html', {'msg': _("链接非法,查询失败")})

    try:
        entity = scraper.data_class.objects.get(source_url=effective_url)
    except ObjectDoesNotExist:
        # not in the db yet, fall through to scraping
        pass
    else:
        # if exists then jump to detail page
        return redirect(entity)

    # scrape if not exists
    try:
        scraped_entity, raw_cover = scraper.scrape(url)
    except Exception:
        logger.exception("failed to scrape %s", url)
        return render(request, 'common/error.html', {'msg': _("爬取数据失败😫")})

    scraped_cover = {
        'cover': SimpleUploadedFile('temp.jpg', raw_cover)}
    form = scraper.form_class(scraped_entity, scraped_cover)
    if not form.is_valid():
        msg = _("爬取数据失败😫")
        logger.error(str(form.errors))
        return render(request, 'common/error.html', {'msg': msg})

    form.instance.last_editor = request.user
    form.save()
    return redirect(form.instance)

View file

@ -45,6 +45,8 @@ class MovieForm(forms.ModelForm):
fields = [
'id',
'title',
'source_site',
'source_url',
'orig_title',
'other_title',
'imdb_code',

View file

@ -3,7 +3,8 @@ import django.contrib.postgres.fields as postgres
from django.utils.translation import ugettext_lazy as _
from django.db import models
from django.core.serializers.json import DjangoJSONEncoder
from common.models import Resource, Mark, Review, Tag
from django.shortcuts import reverse
from common.models import Entity, Mark, Review, Tag
from common.utils import ChoicesDictGenerator
from boofilsic.settings import MOVIE_MEDIA_PATH_ROOT, DEFAULT_MOVIE_IMAGE
from django.utils import timezone
@ -58,7 +59,7 @@ class MovieGenreEnum(models.TextChoices):
MovieGenreTranslator = ChoicesDictGenerator(MovieGenreEnum)
class Movie(Resource):
class Movie(Entity):
'''
Can either be movie or series.
'''
@ -75,7 +76,7 @@ class Movie(Resource):
default=list,
)
imdb_code = models.CharField(
blank=True, max_length=10, null=True, unique=True, db_index=True)
blank=True, max_length=10, null=True, db_index=True)
director = postgres.ArrayField(
models.CharField(_("director"), blank=True,
default='', max_length=100),
@ -170,6 +171,9 @@ class Movie(Resource):
return self.title
def get_absolute_url(self):
return reverse("movies:retrieve", args=[self.id])
def get_tags_manager(self):
return self.movie_tags

View file

@ -63,28 +63,31 @@
{% endcomment %}
<script>
// mark required
$("#content input[required]").each(function () {
// mark required
$("#content *[required]").each(function () {
$(this).prev().prepend("*");
})
// `is_seires` checkbox
$("#id_season, label[for='id_season']").hide();
$("#id_episodes, label[for='id_episodes']").hide();
$("#id_single_episode_length, label[for='id_single_episode_length']").hide();
$('#id_is_series').change(
function () {
if ($(this).is(':checked')) {
$("#id_season, label[for='id_season']").show();
$("#id_episodes, label[for='id_episodes']").show();
$("#id_single_episode_length, label[for='id_single_episode_length']").show();
} else {
$("#id_season, label[for='id_season']").hide();
$("#id_episodes, label[for='id_episodes']").hide();
$("#id_single_episode_length, label[for='id_single_episode_length']").hide();
}
});
// when source site is this site, hide url input box and populate it with fake url
// the backend would update this field
if ($("select[name='source_site']").val() == {{ this_site_enum_value }}) {
$("input[name='source_url']").hide();
$("label[for='id_source_url']").hide();
$("input[name='source_url']").val("https://www.temp.com/" + Date.now() + Math.random());
}
$("select[name='source_site']").change(function () {
let value = $(this).val();
if (value == {{ this_site_enum_value }}) {
$("input[name='source_url']").hide();
$("label[for='id_source_url']").hide();
$("input[name='source_url']").val("https://www.temp.com/" + Date.now() + Math.random());
} else {
$("input[name='source_url']").show();
$("label[for='id_source_url']").show();
$("input[name='source_url']").val("");
}
);
});
</script>
</body>

View file

@ -97,7 +97,7 @@
</div>
<script>
// mark required
$("#content input[required]").each(function () {
$("#content *[required]").each(function () {
$(this).prev().prepend("*");
});
$('form').submit(function () {

View file

@ -14,6 +14,7 @@ from mastodon.api import check_visibility, post_toot, TootVisibilityEnum
from mastodon.utils import rating_to_emoji
from common.utils import PageLinksGenerator
from common.views import PAGE_LINK_NUMBER
from common.models import SourceSiteEnum
from .models import *
from .forms import *
from .forms import MovieMarkStatusTranslator
@ -48,7 +49,9 @@ def create(request):
{
'form': form,
'title': _('添加电影/剧集'),
'submit_url': reverse("movies:create")
'submit_url': reverse("movies:create"),
# provided for frontend js
'this_site_enum_value': SourceSiteEnum.IN_SITE.value,
}
)
elif request.method == 'POST':
@ -57,7 +60,16 @@ def create(request):
form = MovieForm(request.POST, request.FILES)
if form.is_valid():
form.instance.last_editor = request.user
form.save()
try:
with transaction.atomic():
form.save()
if form.instance.source_site == SourceSiteEnum.IN_SITE.value:
real_url = form.instance.get_absolute_url()
form.instance.source_url = real_url
form.instance.save()
except IntegrityError as e:
logger.error(e.__str__())
return HttpResponseServerError("integrity error")
return redirect(reverse("movies:retrieve", args=[form.instance.id]))
else:
return render(
@ -66,7 +78,9 @@ def create(request):
{
'form': form,
'title': _('添加电影/剧集'),
'submit_url': reverse("movies:create")
'submit_url': reverse("movies:create"),
# provided for frontend js
'this_site_enum_value': SourceSiteEnum.IN_SITE.value,
}
)
else:
@ -87,7 +101,9 @@ def update(request, id):
{
'form': form,
'title': page_title,
'submit_url': reverse("movies:update", args=[movie.id])
'submit_url': reverse("movies:update", args=[movie.id]),
# provided for frontend js
'this_site_enum_value': SourceSiteEnum.IN_SITE.value,
}
)
elif request.method == 'POST':
@ -97,7 +113,16 @@ def update(request, id):
if form.is_valid():
form.instance.last_editor = request.user
form.instance.edited_time = timezone.now()
form.save()
try:
with transaction.atomic():
form.save()
if form.instance.source_site == SourceSiteEnum.IN_SITE.value:
real_url = form.instance.get_absolute_url()
form.instance.source_url = real_url
form.instance.save()
except IntegrityError as e:
logger.error(e.__str__())
return HttpResponseServerError("integrity error")
else:
return render(
request,
@ -105,7 +130,9 @@ def update(request, id):
{
'form': form,
'title': page_title,
'submit_url': reverse("movies:update", args=[movie.id])
'submit_url': reverse("movies:update", args=[movie.id]),
# provided for frontend js
'this_site_enum_value': SourceSiteEnum.IN_SITE.value,
}
)
return redirect(reverse("movies:retrieve", args=[form.instance.id]))