add mangahere scraper; numerous misc fixes
remove debugging imports, add more tests to mangasee scraper,
add support for multi-volume/multi-season titles, fix 404
detection on mangasee scraper, change beautifulsoup element
parsing to find() instead of find_all()
matoro committed Feb 15, 2019
1 parent 9d4325c commit 2e1de8d
Showing 5 changed files with 409 additions and 8 deletions.
5 changes: 5 additions & 0 deletions cum/scrapers/base.py
@@ -195,6 +195,11 @@ def filename(self):
        elif match(r'[0-9]*\.[0-9]*$', self.chapter):
            number, decimal = self.chapter.split('.')
            chapter = 'c{:0>3} x{}'.format(number, decimal)
        # Individually numbered chapter with double-decimal (e.g. '2.164.5').
        # Used by titles with multiple volumes/seasons and special chapters.
        elif match(r'[0-9]*(\.[0-9]*){2}$', self.chapter):
            volume, number, decimal = self.chapter.split('.')
            chapter = 'c{:0>3} x{:0>3}.{}'.format(volume, number, decimal)
        # Failing all else, e.g. 'Special'. Becomes 'c000 [Special]'.
        else:
            chapter = 'c000 [{}]'.format(self.chapter)
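
As a quick illustration of the new branch, here is a minimal standalone sketch (the format_chapter helper is hypothetical and covers only the branches visible in this hunk):

from re import match

def format_chapter(chapter):
    # Single-decimal chapter, e.g. '164.5' -> 'c164 x5'.
    if match(r'[0-9]*\.[0-9]*$', chapter):
        number, decimal = chapter.split('.')
        return 'c{:0>3} x{}'.format(number, decimal)
    # New double-decimal branch, e.g. '2.164.5' -> 'c002 x164.5'.
    elif match(r'[0-9]*(\.[0-9]*){2}$', chapter):
        volume, number, decimal = chapter.split('.')
        return 'c{:0>3} x{:0>3}.{}'.format(volume, number, decimal)
    # Fallback, e.g. 'Special' -> 'c000 [Special]'.
    return 'c000 [{}]'.format(chapter)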
129 changes: 129 additions & 0 deletions cum/scrapers/mangahere.py
@@ -0,0 +1,129 @@
from bs4 import BeautifulSoup
from cum import config, exceptions
from cum.scrapers.base import BaseChapter, BaseSeries, download_pool
from functools import partial
import concurrent.futures
import re
import requests


class MangahereSeries(BaseSeries):
    url_re = re.compile(r'https?://((www|m)\.)?mangahere\.cc/manga/.+')

    def __init__(self, url, **kwargs):
        super().__init__(url, **kwargs)
        # convert mobile link to desktop
        spage = requests.get(url.replace("m.", "www."))
        if spage.status_code == 404:
            raise exceptions.ScrapingError
        self.soup = BeautifulSoup(spage.text, config.get().html_parser)
        self.chapters = self.get_chapters()

    def get_chapters(self):
        try:
            rows = self.soup.find("ul", class_="detail-main-list")\
                .find_all("li")
        except AttributeError:
            raise exceptions.ScrapingError()
        chapters = []
        for i, row in enumerate(rows):
            chap_num = re.match((r"/manga/[^/]+((/v[0-9]+)?"
                                 r"/c[0-9\.]+)/[0-9]+\.html$"),
                                row.find("a")["href"]).groups()[0]\
                .replace("/", "")
            if "v" in chap_num:
                chap_num = chap_num.replace("v", "").replace("c", ".")
            else:
                chap_num = chap_num.replace("c", "")
            if chap_num == "000":
                chap_num = "0"
            else:
                chap_num = chap_num.lstrip("0")
            # convert mobile link to desktop
            chap_url = "https://www.mangahere.cc" + \
                row.find("a")["href"].replace("/roll_manga/", "/manga/")
            chap_name = row.find("p", class_="title3").text
            chap_date = row.find("p", class_="title2").text
            result = MangahereChapter(name=self.name,
                                      alias=self.alias,
                                      chapter=chap_num,
                                      url=chap_url,
                                      title=chap_name,
                                      groups=[],
                                      upload_date=chap_date)
            chapters.append(result)
        return chapters

    @property
    def name(self):
        try:
            return re.match(r".+ - Read (.+) Online at MangaHere$",
                            self.soup.find("title").text).groups()[0]
        except AttributeError:
            raise exceptions.ScrapingError


class MangahereChapter(BaseChapter):
    url_re = re.compile((r'https?://((www|m)\.)?mangahere\.cc'
                         r'/(roll_)?manga(/v[0-9]+)?/c[0-9\.]+/[0-9]+\.html$'))
    upload_date = None
    uses_pages = True

    def download(self):
        if not getattr(self, "cpage", None):
            self.cpage = requests.get(self.url.replace("www.", "m.")
                                      .replace("/manga/", "/roll_manga/"))
        if not getattr(self, "soup", None):
            self.soup = BeautifulSoup(self.cpage.text,
                                      config.get().html_parser)

        image_list = self.soup.find("div", class_="mangaread-img")\
            .find_all("img")
        pages = []
        for image in image_list:
            pages.append(image["data-original"].replace("http://", "https://"))

        futures = []
        files = [None] * len(pages)
        req_session = requests.Session()
        with self.progress_bar(pages) as bar:
            for i, page in enumerate(pages):
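                # retry transient connection errors up to 10 times per page;
                # a non-200 response closes the request and raises ValueError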
                retries = 0
                while retries < 10:
                    try:
                        r = req_session.get(page, stream=True)
                        break
                    except requests.exceptions.ConnectionError:
                        retries += 1
                if r.status_code != 200:
                    r.close()
                    raise ValueError
                fut = download_pool.submit(self.page_download_task, i, r)
                fut.add_done_callback(partial(self.page_download_finish,
                                              bar, files))
                futures.append(fut)
            concurrent.futures.wait(futures)
            self.create_zip(files)

    def from_url(url):
        chap_num = re.match((r"https?://((www|m)\.)?mangahere\.cc/(roll_)?"
                             r"manga/[^/]+((/v[0-9]+)?/c[0-9\.]+)"
                             r"/[0-9]+\.html"), url)\
            .groups()[3].replace("/", "")
        if "v" in chap_num:
            chap_num = chap_num.replace("v", "").replace("c", ".")
        else:
            chap_num = chap_num.replace("c", "")
        if chap_num == "000":
            chap_num = "0"
        else:
            chap_num = chap_num.lstrip("0")
        parent_url = re.match((r"(https?://((www|m)\.)?mangahere\.cc/(roll_)?"
                               r"manga/[^/]+)(/v[0-9]+)?/"
                               r"c[0-9\.]+/[0-9]+\.html"),
                              url).groups()[0]
        series = MangahereSeries(parent_url)
        for chapter in series.chapters:
            if chapter.chapter == str(chap_num):
                return chapter
        return None
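
For reference, the chapter-number normalization shared by get_chapters and from_url can be sketched standalone like this (normalize_chapter and the example slugs are hypothetical, not part of the scraper):

import re

def normalize_chapter(href):
    # '/manga/<slug>/v02/c164.5/1.html' -> '2.164.5'
    # '/manga/<slug>/c045/1.html'       -> '45'
    # '/manga/<slug>/c000/1.html'       -> '0'
    chap_num = re.match(r"/manga/[^/]+((/v[0-9]+)?/c[0-9\.]+)/[0-9]+\.html$",
                        href).groups()[0].replace("/", "")
    if "v" in chap_num:
        # volume-prefixed: 'v02c164.5' -> '02.164.5'
        chap_num = chap_num.replace("v", "").replace("c", ".")
    else:
        chap_num = chap_num.replace("c", "")
    return "0" if chap_num == "000" else chap_num.lstrip("0")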
42 changes: 34 additions & 8 deletions cum/scrapers/mangasee.py
@@ -6,18 +6,32 @@
import json
import re
import requests
-import traceback


class MangaseeSeries(BaseSeries):
    url_re = re.compile(r'https?://mangaseeonline\.us/manga/.+')
+    multi_season_regex = re.compile((r"(https?://mangaseeonline\.us)"
+                                     r"?/read-online/"
+                                     r".+-chapter-[0-9\.]+-index-"
+                                     r"([0-9]+)-page-[0-9]+\.html"))

    def __init__(self, url, **kwargs):
        super().__init__(url, **kwargs)
        spage = requests.get(url)
        if spage.status_code == 404:
            raise exceptions.ScrapingError
        self.soup = BeautifulSoup(spage.text, config.get().html_parser)
        self.chapters = self.get_chapters()

+    def _get_chapnum_multiseason_series(self, url, chap_num):
+        if not re.match(self.multi_season_regex, url):
+            # chapter is from season 1
+            return "01." + chap_num.zfill(3)
+        else:
+            # chapter is from season >1
+            season = re.match(self.multi_season_regex, url).groups()[1]
+            return season.zfill(2) + "." + chap_num.zfill(3)
+
    def get_chapters(self):
        try:
            rows = self.soup.find_all("a", class_="list-group-item")
@@ -27,9 +41,12 @@ def get_chapters(self):
        for i, row in enumerate(rows):
            chap_num = re.match(r"Read .+ Chapter ([0-9\.]+) For Free Online",
                                row["title"]).groups()[0]
+            if not hasattr(self, "is_multi_season"):
+                if re.match(self.multi_season_regex, row["href"]):
+                    self.is_multi_season = True
            chap_url = "https://mangaseeonline.us" + row["href"]
-            chap_name = row.find_all("span")[0].text
-            chap_date = row.find_all("time")[0].text
+            chap_name = row.find("span").text
+            chap_date = row.find("time").text
            result = MangaseeChapter(name=self.name,
                                     alias=self.alias,
                                     chapter=chap_num,
@@ -38,15 +55,24 @@ def download(self):
                                     groups=[],
                                     upload_date=chap_date)
            chapters.append(result)
+        # the chapters in the first season of a multi-season title
+        # are indistinguishable from a non-multi-season title. thus
+        # we must retroactively reanalyze all chapters and adjust
+        # chapter numbers if *any* are multi-season
+        if hasattr(self, "is_multi_season"):
+            for chapter in chapters:
+                chapter.chapter = self.\
+                    _get_chapnum_multiseason_series(chapter.url,
+                                                    chapter.chapter)
+
        return chapters

    @property
    def name(self):
        try:
            return re.match(r"Read (.+) Man[a-z]+ For Free \| MangaSee",
-                            self.soup.find_all("title")[0].text).groups()[0]
+                            self.soup.find("title").text).groups()[0]
        except AttributeError:
-            print(traceback.format_exc())
            raise exceptions.ScrapingError

@@ -106,10 +132,10 @@ def download(self):
    def from_url(url):
        cpage = requests.get(url)
        soup = BeautifulSoup(cpage.text, config.get().html_parser)
-        chap_num = soup.find_all("span", class_="CurChapter")[0].text
-        iname = soup.find_all("a", class_="list-link")[0]["href"]
+        # chap_num = soup.find("span", class_="CurChapter").text
+        iname = soup.find("a", class_="list-link")["href"]
        series = MangaseeSeries("https://mangaseeonline.us" + iname)
        for chapter in series.chapters:
-            if chapter.chapter == str(chap_num):
+            if chapter.url == url:
                return chapter
        return None
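
A rough sketch of the new multi-season chapter numbering (the chapnum helper and the URLs below are made up for illustration; it restates _get_chapnum_multiseason_series, whose output feeds the filename formatting in base.py):

import re

multi_season_regex = re.compile(r"(https?://mangaseeonline\.us)?/read-online/"
                                r".+-chapter-[0-9\.]+-index-"
                                r"([0-9]+)-page-[0-9]+\.html")

def chapnum(url, chap_num):
    m = re.match(multi_season_regex, url)
    if not m:
        # season-1 links carry no '-index-N-' component
        return "01." + chap_num.zfill(3)
    return m.groups()[1].zfill(2) + "." + chap_num.zfill(3)

chapnum("/read-online/Example-chapter-5-index-2-page-1.html", "5")   # '02.005'
chapnum("/read-online/Example-chapter-12-page-1.html", "12")         # '01.012'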