Showing 4 changed files with 279 additions and 1 deletion.
@@ -0,0 +1,124 @@
from bs4 import BeautifulSoup
from cum import config, exceptions, output
from cum.scrapers.base import BaseChapter, BaseSeries, download_pool
from functools import partial
from warnings import filterwarnings
import concurrent.futures
import re
import requests


class ManganeloSeries(BaseSeries):
    url_re = re.compile(r'https?://manganelo\.com/manga/.+')

    def __init__(self, url, **kwargs):
        super().__init__(url, **kwargs)
        # requests keeps pooled sockets open; silence the resulting
        # "unclosed" ResourceWarning noise.
        filterwarnings(action="ignore", message="unclosed",
                       category=ResourceWarning)
        spage = requests.get(url)
        if spage.status_code == 404:
            raise exceptions.ScrapingError
        self.soup = BeautifulSoup(spage.text, config.get().html_parser)
        # 404 pages actually return HTTP 200, so check the title as well.
        if self.soup.find("title").text == "404 Not Found":
            raise exceptions.ScrapingError
        self.chapters = self.get_chapters()

    def get_chapters(self):
        try:
            rows = self.soup.find_all("li", class_="a-h")
        except AttributeError:
            raise exceptions.ScrapingError()
        chapters = []
        for row in rows:
            chap_num = re.match(
                r"https?://manganelo\.com/chapter/.+/?chapter_([0-9\.]+)",
                row.find("a")["href"]).groups()[0]
            chap_url = row.find("a")["href"]
            chap_name = row.find("a")["title"]
            chap_date = row.find_all("span")[1]["title"]
            result = ManganeloChapter(name=self.name,
                                      alias=self.alias,
                                      chapter=chap_num,
                                      url=chap_url,
                                      title=chap_name,
                                      groups=[],
                                      upload_date=chap_date)
            chapters.append(result)
        return chapters

    @property
    def name(self):
        try:
            return re.match(r"(.+) Manga Online Free - Manganelo",
                            self.soup.find("title").text).groups()[0]
        except AttributeError:
            raise exceptions.ScrapingError


class ManganeloChapter(BaseChapter):
    url_re = re.compile(r'https?://manganelo\.com/'
                        r'chapter/.+/chapter_[0-9\.]+')
    upload_date = None
    uses_pages = True

    # 404 pages actually return HTTP 200, hence this override of the
    # default availability check.
    def available(self):
        if not getattr(self, "cpage", None):
            self.cpage = requests.get(self.url)
        if not getattr(self, "soup", None):
            self.soup = BeautifulSoup(self.cpage.text,
                                      config.get().html_parser)
        return self.soup.find("title").text != "404 Not Found"

    def download(self):
        if not getattr(self, "cpage", None):
            self.cpage = requests.get(self.url)
        if not getattr(self, "soup", None):
            self.soup = BeautifulSoup(self.cpage.text,
                                      config.get().html_parser)

        # 404 pages actually return HTTP 200.
        if self.soup.find("title").text == "404 Not Found":
            raise exceptions.ScrapingError
        pages = [image["src"] for image in
                 self.soup.find("div", class_="container-chapter-reader")
                          .find_all("img")]

        futures = []
        files = [None] * len(pages)
        req_session = requests.Session()
        with self.progress_bar(pages) as bar:
            for i, page in enumerate(pages):
                r = None
                retries = 0
                while retries < 10:
                    try:
                        r = req_session.get(page, stream=True)
                        if r.status_code != 200:
                            output.warning(
                                'Failed to fetch page with status {}, '
                                'retrying #{}'.format(r.status_code, retries))
                            retries += 1
                        else:
                            break
                    except requests.exceptions.ConnectionError:
                        # Connection errors count against the retry budget
                        # too; r stays None if no response ever arrived.
                        retries += 1
                if r is None or r.status_code != 200:
                    output.error(
                        'Failed to fetch page with status {}, giving up'
                        .format(r.status_code if r is not None else 'none'))
                    raise ValueError
                fut = download_pool.submit(self.page_download_task, i, r)
                fut.add_done_callback(partial(self.page_download_finish,
                                              bar, files))
                futures.append(fut)
            concurrent.futures.wait(futures)
            self.create_zip(files)
        req_session.close()

    def from_url(url):
        # Derive the series slug from the chapter URL, then look the
        # chapter up on its series page.
        iname = re.match(
            r"https?://manganelo\.com/chapter/(.+)/chapter_[0-9\.]+",
            url).groups()[0]
        series = ManganeloSeries("https://manganelo.com/manga/" + iname)
        for chapter in series.chapters:
            if chapter.url == url:
                return chapter
        return None
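
For context, a minimal usage sketch of the new scraper, assuming the cum package is installed and manganelo.com is reachable; the series and chapter URLs below are placeholders taken from the test suite:

from cum.scrapers.manganelo import ManganeloChapter, ManganeloSeries

# Scrape a series page and enumerate its chapters.
series = ManganeloSeries("https://manganelo.com/manga/aria")
print(series.name, len(series.chapters))

# Resolve a single chapter from its URL and download it as a zip.
chapter = ManganeloChapter.from_url(
    "https://manganelo.com/chapter/aria/chapter_1")
if chapter is not None and chapter.available():
    chapter.download()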
@@ -0,0 +1,149 @@
from bs4 import BeautifulSoup
from cum import config, exceptions
from nose.tools import nottest
from warnings import filterwarnings
import cumtest
import os
import requests
import unittest
import zipfile


class TestManganelo(cumtest.CumTest):
    MANGANELO_URL = 'https://manganelo.com/genre-all'

    def setUp(self):
        super().setUp()
        global manganelo
        filterwarnings(action="ignore", message="unclosed",
                       category=ResourceWarning)
        from cum.scrapers import manganelo

    def tearDown(self):
        self.directory.cleanup()

    def get_five_latest_releases(self):
        r = requests.get(self.MANGANELO_URL)
        soup = BeautifulSoup(r.text, config.get().html_parser)
        chapters = soup.find_all("a", class_="genres-item-chap")
        links = [x["href"] for x in chapters]
        return links[:5]

    @nottest
    def series_information_tester(self, data):
        series = manganelo.ManganeloSeries(data['url'])
        self.assertEqual(series.name, data['name'])
        self.assertEqual(series.alias, data['alias'])
        self.assertEqual(series.url, data['url'])
        self.assertIs(series.directory, None)
        self.assertEqual(len(series.chapters), len(data['chapters']))
        for chapter in series.chapters:
            self.assertEqual(chapter.name, data['name'])
            self.assertEqual(chapter.alias, data['alias'])
            self.assertIn(chapter.chapter, data['chapters'])
            data['chapters'].remove(chapter.chapter)
            self.assertIs(chapter.directory, None)
        self.assertEqual(len(data['chapters']), 0)

    # This test is disabled temporarily due to the architecture of the
    # chapter.from_url method, which assumes that if a chapter exists,
    # it will be listed on the series page. Manganelo seems to violate
    # this assumption: some chapters are accessible from the "latest
    # chapters" page but are not listed on their respective series'
    # pages, at least not immediately.
    # TODO: come back to this test and find a way to construct a
    # chapter without requiring metadata from the series page (see the
    # sketch after this file).
    def _test_chapter_download_latest(self):
        latest_releases = self.get_five_latest_releases()
        for release in latest_releases:
            try:
                chapter = manganelo.ManganeloChapter.from_url(release)
            except exceptions.ScrapingError as e:
                print('scraping error for {} - {}'.format(release, e))
                continue
            else:
                chapter.get(use_db=False)

    def test_chapter_filename_decimal(self):
        URL = "https://manganelo.com/chapter/citrus_saburo_uta/chapter_24.6"
        chapter = manganelo.ManganeloChapter.from_url(URL)
        path = os.path.join(self.directory.name, 'Citrus Saburo Uta',
                            'Citrus Saburo Uta - c024 x6 [Unknown].zip')
        self.assertEqual(chapter.chapter, '24.6')
        self.assertEqual(chapter.filename, path)

    def test_chapter_information_normal(self):
        URL = "https://manganelo.com/chapter/ramen_daisuki_koizumisan/chapter_18"
        chapter = manganelo.ManganeloChapter.from_url(URL)
        self.assertEqual(chapter.alias, 'ramen-daisuki-koizumi-san')
        self.assertTrue(chapter.available())
        self.assertEqual(chapter.chapter, '18')
        self.assertEqual(chapter.name, 'Ramen Daisuki Koizumi-San')
        self.assertEqual(chapter.title, 'Ramen Daisuki Koizumi-san Chapter 18')
        path = os.path.join(self.directory.name,
                            'Ramen Daisuki Koizumi-San',
                            'Ramen Daisuki Koizumi-San - c018 [Unknown].zip')
        self.assertEqual(chapter.filename, path)
        chapter.download()
        self.assertTrue(os.path.isfile(path))
        with zipfile.ZipFile(path) as chapter_zip:
            files = chapter_zip.infolist()
            self.assertEqual(len(files), 8)

    def test_chapter_information_chapterzero(self):
        URL = "https://manganelo.com/chapter/inu_to_hasami_wa_tsukaiyou/chapter_0"
        chapter = manganelo.ManganeloChapter.from_url(URL)
        self.assertEqual(chapter.alias, 'inu-to-hasami-wa-tsukaiyou')
        self.assertEqual(chapter.chapter, '0')
        self.assertEqual(chapter.name, 'Inu To Hasami Wa Tsukaiyou')
        self.assertEqual(chapter.title,
                         'Inu to Hasami wa Tsukaiyou Vol.1 Chapter 0')
        path = os.path.join(
            self.directory.name, 'Inu To Hasami Wa Tsukaiyou',
            'Inu To Hasami Wa Tsukaiyou - c000 [Unknown].zip')
        self.assertEqual(chapter.filename, path)
        chapter.download()
        self.assertTrue(os.path.isfile(path))
        with zipfile.ZipFile(path) as chapter_zip:
            files = chapter_zip.infolist()
            self.assertEqual(len(files), 32)

    def test_series_invalid(self):
        URL = "https://manganelo.com/manga/test_bad_manga_name"
        with self.assertRaises(exceptions.ScrapingError):
            manganelo.ManganeloSeries(url=URL)

    def test_chapter_unavailable(self):
        URL = "https://manganelo.com/chapter/oyasumi_punpun/chapter_999"
        chapter = manganelo.ManganeloChapter(url=URL)
        self.assertFalse(chapter.available())

    def test_series_oneword(self):
        data = {'alias': 'aria',
                'chapters': ['1', '2', '3', '4', '5', '6', '7', '8',
                             '9', '10', '10.5', '11', '12', '13', '14', '15',
                             '16', '17', '18', '19', '20', '21', '22', '23',
                             '24', '25', '26', '27', '28', '29', '30', '30.5',
                             '31', '32', '33', '34', '35', '35.5', '36',
                             '37', '37.5', '38', '39', '40', '41', '42', '43',
                             '44', '45', '45.5', '46', '47', '48', '49',
                             '50', '50.5', '51', '52', '53', '54', '55', '56',
                             '57', '57.5', '58', '59', '60', '60.1'],
                'name': 'Aria',
                'url': 'https://manganelo.com/manga/aria'}
        self.series_information_tester(data)

    def test_series_multiplewords(self):
        data = {'alias': 'prunus-girl',
                'chapters': ['1', '1.5', '2', '3', '4', '5', '5.5', '6', '7',
                             '8', '9', '10', '11', '11.5', '12', '13', '14',
                             '15', '16', '16.5', '17', '18', '19', '20', '21',
                             '22', '23', '24', '25', '26', '27', '28', '29',
                             '30', '31', '32', '32.5', '33', '34', '35', '36',
                             '37', '38', '39', '40', '41', '42', '42.5'],
                'name': 'Prunus Girl',
                'url': 'https://manganelo.com/manga/prunus_girl'}
        self.series_information_tester(data)


if __name__ == '__main__':
    unittest.main()
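
As a starting point for the TODO above, here is a hedged sketch of pulling chapter metadata from the chapter page alone, without consulting the series listing. The helper name is hypothetical and not part of the committed scraper, and the assumption that the chapter page's <title> text carries a usable title is unverified:

from bs4 import BeautifulSoup
from cum import config
import re
import requests

def chapter_info_from_page(url):
    # Hypothetical helper: the chapter number is recoverable from the
    # URL itself, so no series-page metadata is required.
    chapter = re.match(
        r"https?://manganelo\.com/chapter/.+/chapter_([0-9\.]+)",
        url).groups()[0]
    page = requests.get(url)
    soup = BeautifulSoup(page.text, config.get().html_parser)
    # Assumption: the page <title> contains a human-readable chapter title.
    title = soup.find("title").text
    return {'chapter': chapter, 'title': title, 'url': url}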