def get_normalized_url(raw_url):
    """Rewrite known alternate URL forms into their canonical equivalents.

    Two rewrites are applied, in order:

    * Mobile Douban links, ``//m.douban.com/<section>/...``, become
      ``//<section>.douban.com/...``.
    * New-style Google Books links,
      ``//www.google.com/books/edition/<title-or-_>/<id>[?query]``, become
      ``//books.google.com/books?id=<id>&[query]``.

    URLs matching neither form are returned unchanged.

    Args:
        raw_url: The URL string to normalize.

    Returns:
        The normalized URL string.
    """
    # Dots are escaped so that only the literal host matches; an unescaped
    # "." would also accept look-alike hosts such as "mxdouban.com".
    url = re.sub(r'//m\.douban\.com/(\w+)/', r'//\1.douban.com/', raw_url)

    # The path segment before the id is either the "_" placeholder or a
    # title slug, so any single non-empty segment is accepted there.
    # Leading "?" characters of the query are consumed and replaced by the
    # trailing "&", so the original query parameters (if any) simply follow
    # the new "id" parameter.
    url = re.sub(
        r'//www\.google\.com/books/edition/[^/?#]+/([A-Za-z0-9_\-]+)\?*',
        r'//books.google.com/books?id=\1&',
        url,
    )
    return url