From 665884a958ae5478277e969221047c24ffc3526e Mon Sep 17 00:00:00 2001
From: matoro
Date: Thu, 5 Mar 2020 08:08:37 -0500
Subject: [PATCH] Add Manganelo scraper

---
 cum/db.py                       |   4 +-
 cum/scrapers/__init__.py        |   3 +
 cum/scrapers/manganelo.py       | 124 ++++++++++++++++++++++++++
 tests/test_scraper_manganelo.py | 149 ++++++++++++++++++++++++++++++++
 4 files changed, 279 insertions(+), 1 deletion(-)
 create mode 100644 cum/scrapers/manganelo.py
 create mode 100644 tests/test_scraper_manganelo.py

diff --git a/cum/db.py b/cum/db.py
index db1af2a..9b1bf66 100644
--- a/cum/db.py
+++ b/cum/db.py
@@ -248,7 +248,9 @@ def to_object(self):
         if parse.netloc in ('www.mangahere.cc', 'm.mangahere.cc'):
             from cum.scrapers.mangahere import MangahereChapter
             return MangahereChapter(**kwargs)
-
+        if parse.netloc == 'manganelo.com':
+            from cum.scrapers.manganelo import ManganeloChapter
+            return ManganeloChapter(**kwargs)
 
 class Group(Base):
     __tablename__ = 'groups'

diff --git a/cum/scrapers/__init__.py b/cum/scrapers/__init__.py
index b7a68d9..4823bf9 100644
--- a/cum/scrapers/__init__.py
+++ b/cum/scrapers/__init__.py
@@ -2,6 +2,7 @@
 from cum.scrapers.dynastyscans import DynastyScansChapter, DynastyScansSeries
 from cum.scrapers.madokami import MadokamiChapter, MadokamiSeries
 from cum.scrapers.mangadex import MangadexSeries, MangadexChapter
+from cum.scrapers.manganelo import ManganeloSeries, ManganeloChapter
 from cum.scrapers.mangasee import MangaseeSeries, MangaseeChapter
 from cum.scrapers.mangahere import MangahereSeries, MangahereChapter
 from cum.scrapers.yuriism import YuriismChapter, YuriismSeries
@@ -11,6 +12,7 @@
     DynastyScansSeries,
     MadokamiSeries,
     MangadexSeries,
+    ManganeloSeries,
     MangaseeSeries,
     MangahereSeries,
     YuriismSeries,
@@ -20,6 +22,7 @@
     DynastyScansChapter,
     MadokamiChapter,
     MangadexChapter,
+    ManganeloChapter,
     MangaseeChapter,
     MangahereChapter,
     YuriismChapter,

diff --git a/cum/scrapers/manganelo.py b/cum/scrapers/manganelo.py
new file mode 100644
index 0000000..35f7ac9
--- /dev/null
+++ b/cum/scrapers/manganelo.py
@@ -0,0 +1,124 @@
+from bs4 import BeautifulSoup
+from cum import config, exceptions, output
+from cum.scrapers.base import BaseChapter, BaseSeries, download_pool
+from functools import partial
+from warnings import filterwarnings
+import concurrent.futures
+import re
+import requests
+
+
+class ManganeloSeries(BaseSeries):
+    url_re = re.compile(r'https?://manganelo\.com/manga/.+')
+
+    def __init__(self, url, **kwargs):
+        super().__init__(url, **kwargs)
+        filterwarnings(action="ignore", message="unclosed",
+                       category=ResourceWarning)
+        spage = requests.get(url)
+        if spage.status_code == 404:
+            raise exceptions.ScrapingError
+        self.soup = BeautifulSoup(spage.text, config.get().html_parser)
+        # 404 pages actually return HTTP 200
+        if self.soup.find("title").text == "404 Not Found":
+            raise exceptions.ScrapingError
+        self.chapters = self.get_chapters()
+
+    def get_chapters(self):
+        try:
+            rows = self.soup.find_all("li", class_="a-h")
+        except AttributeError:
+            raise exceptions.ScrapingError
+        chapters = []
+        for row in rows:
+            chap_url = row.find("a")["href"]
+            chap_num = re.match(r"https?://manganelo\.com/chapter/"
+                                r".+/chapter_([0-9.]+)",
+                                chap_url).groups()[0]
+            chap_name = row.find("a")["title"]
+            chap_date = row.find_all("span")[1]["title"]
+            result = ManganeloChapter(name=self.name,
+                                      alias=self.alias,
+                                      chapter=chap_num,
+                                      url=chap_url,
+                                      title=chap_name,
+                                      groups=[],
+                                      upload_date=chap_date)
+            chapters.append(result)
+        return chapters
+
+    @property
+    def name(self):
+        try:
+            return re.match(r"(.+) Manga Online Free - Manganelo",
+                            self.soup.find("title").text).groups()[0]
+        except AttributeError:
+            raise exceptions.ScrapingError
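
As a sanity check on the row parsing in get_chapters(), here is the same extraction run standalone against a hand-made snippet shaped like Manganelo's chapter list (the markup below is illustrative, not captured from the live site):

    from bs4 import BeautifulSoup
    import re

    html = '''<li class="a-h">
    <a href="https://manganelo.com/chapter/aria/chapter_10.5"
       title="Aria Chapter 10.5">Chapter 10.5</a>
    <span></span><span title="Mar 05,2020 08:08"></span>
    </li>'''

    row = BeautifulSoup(html, 'html.parser').find('li', class_='a-h')
    chap_url = row.find('a')['href']
    chap_num = re.match(r'https?://manganelo\.com/chapter/.+/chapter_([0-9.]+)',
                        chap_url).groups()[0]
    print(chap_num)                          # 10.5
    print(row.find('a')['title'])            # Aria Chapter 10.5
    print(row.find_all('span')[1]['title'])  # Mar 05,2020 08:08

The chapter number is taken from the URL rather than the link text, since the URL slug is the stable part of the row.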
+
+
+class ManganeloChapter(BaseChapter):
+    url_re = re.compile(r'https?://manganelo\.com/'
+                        r'chapter/.+/chapter_[0-9.]+')
+    upload_date = None
+    uses_pages = True
+
+    # 404 pages actually return HTTP 200,
+    # thus this method override
+    def available(self):
+        if not getattr(self, "cpage", None):
+            self.cpage = requests.get(self.url)
+        if not getattr(self, "soup", None):
+            self.soup = BeautifulSoup(self.cpage.text,
+                                      config.get().html_parser)
+        return self.soup.find("title").text != "404 Not Found"
+
+    def download(self):
+        if not getattr(self, "cpage", None):
+            self.cpage = requests.get(self.url)
+        if not getattr(self, "soup", None):
+            self.soup = BeautifulSoup(self.cpage.text,
+                                      config.get().html_parser)
+        # 404 pages actually return HTTP 200
+        if self.soup.find("title").text == "404 Not Found":
+            raise exceptions.ScrapingError
+        reader = self.soup.find("div", class_="container-chapter-reader")
+        pages = [image["src"] for image in reader.find_all("img")]
+
+        futures = []
+        files = [None] * len(pages)
+        req_session = requests.Session()
+        with self.progress_bar(pages) as bar:
+            for i, page in enumerate(pages):
+                r = None
+                retries = 0
+                while retries < 10:
+                    try:
+                        r = req_session.get(page, stream=True)
+                        if r.status_code == 200:
+                            break
+                        output.warning('Failed to fetch page with status '
+                                       '{}, retrying #{}'
+                                       .format(r.status_code, retries))
+                    except requests.exceptions.ConnectionError:
+                        pass
+                    retries += 1
+                if r is None or r.status_code != 200:
+                    output.error('Failed to fetch page {}, giving up'
+                                 .format(page))
+                    raise ValueError
+                fut = download_pool.submit(self.page_download_task, i, r)
+                fut.add_done_callback(partial(self.page_download_finish,
+                                              bar, files))
+                futures.append(fut)
+            concurrent.futures.wait(futures)
+            self.create_zip(files)
+        req_session.close()
+
+    def from_url(url):
+        iname = re.match(r"https?://manganelo\.com/chapter/(.+)/"
+                         r"chapter_[0-9.]+", url).groups()[0]
+        series = ManganeloSeries("https://manganelo.com/manga/" + iname)
+        for chapter in series.chapters:
+            if chapter.url == url:
+                return chapter
+        return None
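
The download() loop above leans on cum's shared download_pool plus a completion callback to keep pages in order. The same pattern in isolation, with a plain ThreadPoolExecutor and stand-in names (page_task and finish are illustrative, not cum's API):

    import concurrent.futures
    from functools import partial

    def page_task(index, data):
        # stand-in for page_download_task: return the slot and the payload
        return index, data.upper()

    def finish(results, fut):
        # stand-in for page_download_finish: file each result by its index
        index, value = fut.result()
        results[index] = value

    pages = ['a', 'b', 'c']
    results = [None] * len(pages)
    futures = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
        for i, page in enumerate(pages):
            fut = pool.submit(page_task, i, page)
            fut.add_done_callback(partial(finish, results))
            futures.append(fut)
        concurrent.futures.wait(futures)
    print(results)  # ['A', 'B', 'C'], order preserved regardless of timing

Carrying the index through the task is what lets pages finish in any order while the zip is still assembled in reading order.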
diff --git a/tests/test_scraper_manganelo.py b/tests/test_scraper_manganelo.py
new file mode 100644
index 0000000..f28a69c
--- /dev/null
+++ b/tests/test_scraper_manganelo.py
@@ -0,0 +1,149 @@
+from bs4 import BeautifulSoup
+from cum import config, exceptions
+from nose.tools import nottest
+from warnings import filterwarnings
+import cumtest
+import os
+import requests
+import unittest
+import zipfile
+
+
+class TestManganelo(cumtest.CumTest):
+    MANGANELO_URL = 'https://manganelo.com/genre-all'
+
+    def setUp(self):
+        super().setUp()
+        global manganelo
+        filterwarnings(action="ignore", message="unclosed",
+                       category=ResourceWarning)
+        from cum.scrapers import manganelo
+
+    def tearDown(self):
+        self.directory.cleanup()
+
+    def get_five_latest_releases(self):
+        r = requests.get(self.MANGANELO_URL)
+        soup = BeautifulSoup(r.text, config.get().html_parser)
+        chapters = soup.find_all("a", class_="genres-item-chap")
+        links = [x["href"] for x in chapters]
+        return links[:5]
+
+    @nottest
+    def series_information_tester(self, data):
+        series = manganelo.ManganeloSeries(data['url'])
+        self.assertEqual(series.name, data['name'])
+        self.assertEqual(series.alias, data['alias'])
+        self.assertEqual(series.url, data['url'])
+        self.assertIs(series.directory, None)
+        self.assertEqual(len(series.chapters), len(data['chapters']))
+        for chapter in series.chapters:
+            self.assertEqual(chapter.name, data['name'])
+            self.assertEqual(chapter.alias, data['alias'])
+            self.assertIn(chapter.chapter, data['chapters'])
+            data['chapters'].remove(chapter.chapter)
+            self.assertIs(chapter.directory, None)
+        self.assertEqual(len(data['chapters']), 0)
+
+    # This test is disabled temporarily due to the architecture of
+    # the chapter.from_url method, which assumes that any chapter
+    # that exists is listed on its series page.  Manganelo violates
+    # this assumption: some chapters are reachable from the "latest
+    # chapters" page without being listed on their respective
+    # series' pages, at least not immediately.
+    # TODO: come back to this test and find a way to construct a
+    # chapter without requiring metadata from the series page
+    def _test_chapter_download_latest(self):
+        latest_releases = self.get_five_latest_releases()
+        for release in latest_releases:
+            try:
+                chapter = manganelo.ManganeloChapter.from_url(release)
+            except exceptions.ScrapingError as e:
+                print('scraping error for {} - {}'.format(release, e))
+                continue
+            else:
+                chapter.get(use_db=False)
+
+    def test_chapter_filename_decimal(self):
+        URL = "https://manganelo.com/chapter/citrus_saburo_uta/chapter_24.6"
+        chapter = manganelo.ManganeloChapter.from_url(URL)
+        path = os.path.join(self.directory.name, 'Citrus Saburo Uta',
+                            'Citrus Saburo Uta - c024 x6 [Unknown].zip')
+        self.assertEqual(chapter.chapter, '24.6')
+        self.assertEqual(chapter.filename, path)
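
The expected filename above encodes the decimal chapter '24.6' as 'c024 x6'. That convention comes from cum's BaseChapter filename logic, not this patch; a rough sketch of the whole-part/fraction split it implies:

    chapter = '24.6'
    whole, _, frac = chapter.partition('.')
    token = 'c{:0>3}'.format(whole) + (' x{}'.format(frac) if frac else '')
    print(token)  # c024 x6   (a chapter of '18' would give c018)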
= "https://manganelo.com/chapter/oyasumi_punpun/chapter_999" + chapter = manganelo.ManganeloChapter(url=URL) + self.assertFalse(chapter.available()) + + def test_series_oneword(self): + data = {'alias': 'aria', + 'chapters': ['1', '2', '3', '4', '5', '6', '7', '8', + '9', '10', '10.5', '11', '12', '13', '14', '15', + '16', '17', '18', '19', '20', '21', '22', '23', + '24', '25', '26', '27', '28', '29', '30', '30.5', + '31', '32', '33', '34', '35', '35.5', '36', + '37', '37.5', '38', '39', '40', '41', '42', '43', + '44', '45', '45.5', '46', '47', '48', '49', + '50', '50.5', '51', '52', '53', '54', '55', '56', + '57', '57.5', '58', '59', '60', '60.1'], + 'name': 'Aria', + 'url': 'https://manganelo.com/manga/aria'} + self.series_information_tester(data) + + def test_series_multiplewords(self): + data = {'alias': 'prunus-girl', + 'chapters': ['1', '1.5', '2', '3', '4', '5', '5.5', '6', '7', '8', + '9', '10', '11', '11.5', '12', '13', '14', '15', + '16', '16.5', '17', '18', '19', '20', '21', '22', + '23', '24', '25', '26', '27', '28', '29', '30', + '31', '32', '32.5', '33', '34', '35', '36', '37', + '38', '39', '40', '41', '42', '42.5'], + 'name': 'Prunus Girl', + 'url': 'https://manganelo.com/manga/prunus_girl'} + self.series_information_tester(data) + +if __name__ == '__main__': + unittest.main()