add mangahere scraper; numerous misc fixes
remove debugging imports, add more tests to mangasee scraper,
add support for multi-volume/multi-season titles, fix 404
detection on mangasee scraper, change beautifulsoup element
parsing to find() instead of find_all()
matoro committed Feb 15, 2019
1 parent 9d4325c commit 2e1de8d
Showing 5 changed files with 409 additions and 8 deletions.
5 changes: 5 additions & 0 deletions cum/scrapers/base.py
@@ -195,6 +195,11 @@ def filename(self):
        elif match(r'[0-9]*\.[0-9]*$', self.chapter):
            number, decimal = self.chapter.split('.')
            chapter = 'c{:0>3} x{}'.format(number, decimal)
        # Individually numbered chapter with double-decimal (e.g. '2.164.5').
        # Used by titles with multiple volumes/seasons and special chapters.
        elif match(r'[0-9]*(\.[0-9]*){2}$', self.chapter):
            volume, number, decimal = self.chapter.split('.')
            chapter = 'c{:0>3} x{:0>3}.{}'.format(volume, number, decimal)
        # Failing all else, e.g. 'Special'. Becomes 'c000 [Special]'.
        else:
            chapter = 'c000 [{}]'.format(self.chapter)
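
As a quick illustration of the new branch, here is a minimal standalone sketch (the format_chapter helper is hypothetical and covers only the branches visible in this hunk):

from re import match

def format_chapter(chapter):
    # Single-decimal chapter, e.g. '164.5' -> 'c164 x5'.
    if match(r'[0-9]*\.[0-9]*$', chapter):
        number, decimal = chapter.split('.')
        return 'c{:0>3} x{}'.format(number, decimal)
    # New double-decimal branch, e.g. '2.164.5' -> 'c002 x164.5'.
    elif match(r'[0-9]*(\.[0-9]*){2}$', chapter):
        volume, number, decimal = chapter.split('.')
        return 'c{:0>3} x{:0>3}.{}'.format(volume, number, decimal)
    # Fallback, e.g. 'Special' -> 'c000 [Special]'.
    return 'c000 [{}]'.format(chapter)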
129 changes: 129 additions & 0 deletions cum/scrapers/mangahere.py
@@ -0,0 +1,129 @@
from bs4 import BeautifulSoup
from cum import config, exceptions
from cum.scrapers.base import BaseChapter, BaseSeries, download_pool
from functools import partial
import concurrent.futures
import re
import requests


class MangahereSeries(BaseSeries):
    url_re = re.compile(r'https?://((www|m)\.)?mangahere\.cc/manga/.+')

    def __init__(self, url, **kwargs):
        super().__init__(url, **kwargs)
        # convert mobile link to desktop
        spage = requests.get(url.replace("m.", "www."))
        if spage.status_code == 404:
            raise exceptions.ScrapingError
        self.soup = BeautifulSoup(spage.text, config.get().html_parser)
        self.chapters = self.get_chapters()

    def get_chapters(self):
        try:
            rows = self.soup.find("ul", class_="detail-main-list")\
                .find_all("li")
        except AttributeError:
            raise exceptions.ScrapingError()
        chapters = []
        for i, row in enumerate(rows):
            chap_num = re.match((r"/manga/[^/]+((/v[0-9]+)?"
                                 r"/c[0-9\.]+)/[0-9]+\.html$"),
                                row.find("a")["href"]).groups()[0]\
                .replace("/", "")
            if "v" in chap_num:
                chap_num = chap_num.replace("v", "").replace("c", ".")
            else:
                chap_num = chap_num.replace("c", "")
            if chap_num == "000":
                chap_num = "0"
            else:
                chap_num = chap_num.lstrip("0")
            # convert mobile link to desktop
            chap_url = "https://www.mangahere.cc" + \
                row.find("a")["href"].replace("/roll_manga/", "/manga/")
            chap_name = row.find("p", class_="title3").text
            chap_date = row.find("p", class_="title2").text
            result = MangahereChapter(name=self.name,
                                      alias=self.alias,
                                      chapter=chap_num,
                                      url=chap_url,
                                      title=chap_name,
                                      groups=[],
                                      upload_date=chap_date)
            chapters.append(result)
        return chapters

    @property
    def name(self):
        try:
            return re.match(r".+ - Read (.+) Online at MangaHere$",
                            self.soup.find("title").text).groups()[0]
        except AttributeError:
            raise exceptions.ScrapingError


class MangahereChapter(BaseChapter):
    url_re = re.compile((r'https?://((www|m)\.)?mangahere\.cc'
                         r'/(roll_)?manga(/v[0-9]+)?/c[0-9\.]+/[0-9]+\.html$'))
    upload_date = None
    uses_pages = True

    def download(self):
        if not getattr(self, "cpage", None):
            self.cpage = requests.get(self.url.replace("www.", "m.")
                                      .replace("/manga/", "/roll_manga/"))
        if not getattr(self, "soup", None):
            self.soup = BeautifulSoup(self.cpage.text,
                                      config.get().html_parser)

        image_list = self.soup.find("div", class_="mangaread-img")\
            .find_all("img")
        pages = []
        for image in image_list:
            pages.append(image["data-original"].replace("http://", "https://"))

        futures = []
        files = [None] * len(pages)
        req_session = requests.Session()
        with self.progress_bar(pages) as bar:
            for i, page in enumerate(pages):
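                # retry transient connection errors up to 10 times per page;
                # a non-200 response closes the request and raises ValueError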
                retries = 0
                while retries < 10:
                    try:
                        r = req_session.get(page, stream=True)
                        break
                    except requests.exceptions.ConnectionError:
                        retries += 1
                if r.status_code != 200:
                    r.close()
                    raise ValueError
                fut = download_pool.submit(self.page_download_task, i, r)
                fut.add_done_callback(partial(self.page_download_finish,
                                              bar, files))
                futures.append(fut)
            concurrent.futures.wait(futures)
            self.create_zip(files)

    def from_url(url):
        chap_num = re.match((r"https?://((www|m)\.)?mangahere\.cc/(roll_)?"
                             r"manga/[^/]+((/v[0-9]+)?/c[0-9\.]+)"
                             r"/[0-9]+\.html"), url)\
            .groups()[3].replace("/", "")
        if "v" in chap_num:
            chap_num = chap_num.replace("v", "").replace("c", ".")
        else:
            chap_num = chap_num.replace("c", "")
        if chap_num == "000":
            chap_num = "0"
        else:
            chap_num = chap_num.lstrip("0")
        parent_url = re.match((r"(https?://((www|m)\.)?mangahere\.cc/(roll_)?"
                               r"manga/[^/]+)(/v[0-9]+)?/"
                               r"c[0-9\.]+/[0-9]+\.html"),
                              url).groups()[0]
        series = MangahereSeries(parent_url)
        for chapter in series.chapters:
            if chapter.chapter == str(chap_num):
                return chapter
        return None
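
For reference, the chapter-number normalization shared by get_chapters and from_url can be sketched standalone like this (normalize_chapter and the example slugs are hypothetical, not part of the scraper):

import re

def normalize_chapter(href):
    # '/manga/<slug>/v02/c164.5/1.html' -> '2.164.5'
    # '/manga/<slug>/c045/1.html'       -> '45'
    # '/manga/<slug>/c000/1.html'       -> '0'
    chap_num = re.match(r"/manga/[^/]+((/v[0-9]+)?/c[0-9\.]+)/[0-9]+\.html$",
                        href).groups()[0].replace("/", "")
    if "v" in chap_num:
        # volume-prefixed: 'v02c164.5' -> '02.164.5'
        chap_num = chap_num.replace("v", "").replace("c", ".")
    else:
        chap_num = chap_num.replace("c", "")
    return "0" if chap_num == "000" else chap_num.lstrip("0")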
42 changes: 34 additions & 8 deletions cum/scrapers/mangasee.py
@@ -6,18 +6,32 @@
import json
import re
import requests
-import traceback


class MangaseeSeries(BaseSeries):
    url_re = re.compile(r'https?://mangaseeonline\.us/manga/.+')
+    multi_season_regex = re.compile((r"(https?://mangaseeonline\.us)"
+                                     r"?/read-online/"
+                                     r".+-chapter-[0-9\.]+-index-"
+                                     r"([0-9]+)-page-[0-9]+\.html"))

    def __init__(self, url, **kwargs):
        super().__init__(url, **kwargs)
        spage = requests.get(url)
        if spage.status_code == 404:
            raise exceptions.ScrapingError
        self.soup = BeautifulSoup(spage.text, config.get().html_parser)
        self.chapters = self.get_chapters()

+    def _get_chapnum_multiseason_series(self, url, chap_num):
+        if not re.match(self.multi_season_regex, url):
+            # chapter is from season 1
+            return "01." + chap_num.zfill(3)
+        else:
+            # chapter is from season >1
+            season = re.match(self.multi_season_regex, url).groups()[1]
+            return season.zfill(2) + "." + chap_num.zfill(3)
+
    def get_chapters(self):
        try:
            rows = self.soup.find_all("a", class_="list-group-item")
@@ -27,9 +41,12 @@ def get_chapters(self):
        for i, row in enumerate(rows):
            chap_num = re.match(r"Read .+ Chapter ([0-9\.]+) For Free Online",
                                row["title"]).groups()[0]
+            if not hasattr(self, "is_multi_season"):
+                if re.match(self.multi_season_regex, row["href"]):
+                    self.is_multi_season = True
            chap_url = "https://mangaseeonline.us" + row["href"]
-            chap_name = row.find_all("span")[0].text
-            chap_date = row.find_all("time")[0].text
+            chap_name = row.find("span").text
+            chap_date = row.find("time").text
            result = MangaseeChapter(name=self.name,
                                     alias=self.alias,
                                     chapter=chap_num,
@@ -38,15 +55,24 @@ def download(self):
                                     groups=[],
                                     upload_date=chap_date)
            chapters.append(result)
+        # the chapters in the first season of a multi-season title
+        # are indistinguishable from a non-multi-season title. thus
+        # we must retroactively reanalyze all chapters and adjust
+        # chapter numbers if *any* are multi-season
+        if hasattr(self, "is_multi_season"):
+            for chapter in chapters:
+                chapter.chapter = self.\
+                    _get_chapnum_multiseason_series(chapter.url,
+                                                    chapter.chapter)
+
        return chapters

    @property
    def name(self):
        try:
            return re.match(r"Read (.+) Man[a-z]+ For Free \| MangaSee",
-                            self.soup.find_all("title")[0].text).groups()[0]
+                            self.soup.find("title").text).groups()[0]
        except AttributeError:
-            print(traceback.format_exc())
            raise exceptions.ScrapingError

@@ -106,10 +132,10 @@ def download(self):
    def from_url(url):
        cpage = requests.get(url)
        soup = BeautifulSoup(cpage.text, config.get().html_parser)
-        chap_num = soup.find_all("span", class_="CurChapter")[0].text
-        iname = soup.find_all("a", class_="list-link")[0]["href"]
+        # chap_num = soup.find("span", class_="CurChapter").text
+        iname = soup.find("a", class_="list-link")["href"]
        series = MangaseeSeries("https://mangaseeonline.us" + iname)
        for chapter in series.chapters:
-            if chapter.chapter == str(chap_num):
+            if chapter.url == url:
                return chapter
        return None
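
A rough sketch of the new multi-season chapter numbering (the chapnum helper and the URLs below are made up for illustration; it restates _get_chapnum_multiseason_series, whose output feeds the filename formatting in base.py):

import re

multi_season_regex = re.compile(r"(https?://mangaseeonline\.us)?/read-online/"
                                r".+-chapter-[0-9\.]+-index-"
                                r"([0-9]+)-page-[0-9]+\.html")

def chapnum(url, chap_num):
    m = re.match(multi_season_regex, url)
    if not m:
        # season-1 links carry no '-index-N-' component
        return "01." + chap_num.zfill(3)
    return m.groups()[1].zfill(2) + "." + chap_num.zfill(3)

chapnum("/read-online/Example-chapter-5-index-2-page-1.html", "5")   # '02.005'
chapnum("/read-online/Example-chapter-12-page-1.html", "12")         # '01.012'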