Add Mangasee scraper
Adds support for Mangasee (mangaseeonline.us)
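
A quick way to exercise the new scraper from a Python shell, assuming cum's config has already been initialized (the series URL below is only an example):

    from cum.scrapers.mangasee import MangaseeSeries

    # Any URL matching MangaseeSeries.url_re works here.
    series = MangaseeSeries('https://mangaseeonline.us/manga/Aria')
    print(series.name)           # 'Aria'
    print(len(series.chapters))  # chapters found on the series page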
matoro committed Feb 8, 2019
1 parent 8bec439 commit f04f80e
Showing 4 changed files with 258 additions and 0 deletions.
cum/db.py: 3 additions & 0 deletions
@@ -242,6 +242,9 @@ def to_object(self):
        if parse.netloc == 'www.yuri-ism.net':
            from cum.scrapers.yuriism import YuriismChapter
            return YuriismChapter(**kwargs)
        if parse.netloc == 'mangaseeonline.us':
            from cum.scrapers.mangasee import MangaseeChapter
            return MangaseeChapter(**kwargs)


class Group(Base):
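For context, to_object dispatches on the chapter URL's host. A minimal sketch of that dispatch for a stored Mangasee chapter, with an illustrative URL and the kwargs reduced to the URL alone (in db.py they are built from the row's columns):

    from urllib.parse import urlparse

    url = 'https://mangaseeonline.us/read-online/Aria-chapter-1-page-1.html'
    parse = urlparse(url)
    if parse.netloc == 'mangaseeonline.us':
        from cum.scrapers.mangasee import MangaseeChapter
        chapter = MangaseeChapter(url=url)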
cum/scrapers/__init__.py: 3 additions & 0 deletions
@@ -2,19 +2,22 @@
from cum.scrapers.dynastyscans import DynastyScansChapter, DynastyScansSeries
from cum.scrapers.madokami import MadokamiChapter, MadokamiSeries
from cum.scrapers.mangadex import MangadexSeries, MangadexChapter
from cum.scrapers.mangasee import MangaseeSeries, MangaseeChapter
from cum.scrapers.yuriism import YuriismChapter, YuriismSeries

series_scrapers = [
    DokiReaderSeries,
    DynastyScansSeries,
    MadokamiSeries,
    MangadexSeries,
    MangaseeSeries,
    YuriismSeries,
]
chapter_scrapers = [
    DokiReaderChapter,
    DynastyScansChapter,
    MadokamiChapter,
    MangadexChapter,
    MangaseeChapter,
    YuriismChapter,
]
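
These registries are what lets the rest of cum resolve a URL to a scraper by testing each class's url_re. A hedged sketch of that lookup (find_chapter_scraper is a hypothetical helper, not part of this commit):

    def find_chapter_scraper(url):
        # Return the first chapter scraper whose URL regex matches, else None.
        for scraper in chapter_scrapers:
            if scraper.url_re.match(url):
                return scraper
        return None

    # Resolves to MangaseeChapter for a Mangasee reader URL.
    find_chapter_scraper('https://mangaseeonline.us/read-online/'
                         'Aria-chapter-1-page-1.html')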
cum/scrapers/mangasee.py: 114 additions & 0 deletions
@@ -0,0 +1,114 @@
from bs4 import BeautifulSoup
from cum import config, exceptions
from cum.scrapers.base import BaseChapter, BaseSeries, download_pool
from functools import partial
import concurrent.futures
import json
import re
import requests
import traceback


class MangaseeSeries(BaseSeries):
    url_re = re.compile(r'https?://mangaseeonline\.us/manga/.+')

    def __init__(self, url, **kwargs):
        super().__init__(url, **kwargs)
        spage = requests.get(url)
        self.soup = BeautifulSoup(spage.text, config.get().html_parser)
        self.chapters = self.get_chapters()

    def get_chapters(self):
        try:
            rows = self.soup.find_all("a", class_="list-group-item")
        except AttributeError:
            raise exceptions.ScrapingError()
        chapters = []
        for i, row in enumerate(rows):
            chap_num = re.match(r"Read .+ Chapter ([0-9\.]+) For Free Online",
                                row["title"]).groups()[0]
            chap_url = "https://mangaseeonline.us" + row["href"]
            chap_name = row.find_all("span")[0].text
            chap_date = row.find_all("time")[0].text
            result = MangaseeChapter(name=self.name,
                                     alias=self.alias,
                                     chapter=chap_num,
                                     url=chap_url,
                                     title=chap_name,
                                     groups=[],
                                     upload_date=chap_date)
            chapters.append(result)
        return chapters

    @property
    def name(self):
        try:
            return re.match(r"Read (.+) Man[a-z]+ For Free \| MangaSee",
                            self.soup.find_all("title")[0].text).groups()[0]
        except AttributeError:
            print(traceback.format_exc())
            raise exceptions.ScrapingError


class MangaseeChapter(BaseChapter):
    url_re = re.compile(r'https?://mangaseeonline\.us/'
                        r'read-online/.+-chapter-[0-9\.]+-page-[0-9]+\.html')
    upload_date = None
    uses_pages = True

    def download(self):
        if not getattr(self, "cpage", None):
            self.cpage = requests.get(self.url)
        if not getattr(self, "soup", None):
            self.soup = BeautifulSoup(self.cpage.text,
                                      config.get().html_parser)

        # The reader page embeds two inline JS assignments, ChapterArr and
        # PageArr; find the script element that carries them.
        image_list = None
        for script in self.soup.find_all("script"):
            if re.match("\n\tChapterArr=.+", script.text):
                image_list = script.text
                break
        if image_list is None:
            raise exceptions.ScrapingError

        # Rewrite the two JS assignments into one JSON array and keep only
        # PageArr (index 1), a mapping of page numbers to image URLs.
        image_list = re.sub("\n\tChapterArr=", "", image_list)
        image_list = re.sub(";\n\t?", "", image_list)
        image_list = re.sub("PageArr=", ",", image_list)
        image_list = "[" + image_list + "]"
        image_list = json.loads(image_list)[1]
        pages = []
        for image in image_list:
            if image != "CurPage":
                # Prefer HTTPS for Blogspot-hosted images.
                if re.match(".+blogspot.+", image_list[image]):
                    image_list[image] = image_list[image].\
                        replace("http://", "https://")
                pages.append(image_list[image])

        futures = []
        files = [None] * len(pages)
        with self.progress_bar(pages) as bar:
            for i, page in enumerate(pages):
                # Retry transient connection errors up to three times; bail
                # out if every attempt failed or the response is not 200.
                r = None
                retries = 0
                while retries < 3:
                    try:
                        r = requests.get(page, stream=True)
                        break
                    except requests.exceptions.ConnectionError:
                        retries += 1
                if r is None or r.status_code != 200:
                    if r is not None:
                        r.close()
                    raise ValueError
                fut = download_pool.submit(self.page_download_task, i, r)
                fut.add_done_callback(partial(self.page_download_finish,
                                              bar, files))
                futures.append(fut)
            concurrent.futures.wait(futures)
            self.create_zip(files)

    @staticmethod
    def from_url(url):
        cpage = requests.get(url)
        soup = BeautifulSoup(cpage.text, config.get().html_parser)
        chap_num = soup.find_all("span", class_="CurChapter")[0].text
        iname = soup.find_all("a", class_="list-link")[0]["href"]
        series = MangaseeSeries("https://mangaseeonline.us" + iname)
        for chapter in series.chapters:
            if chapter.chapter == str(chap_num):
                return chapter
        return None
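
To make the ChapterArr/PageArr munging in download() concrete, here is a worked example on a made-up script body shaped like the real reader pages (the image URL is fabricated):

    import json
    import re

    script = ('\n\tChapterArr={"CurChapter":"18"};'
              '\n\tPageArr={"CurPage":1,"1":"https://example.test/1.png"};\n\t')
    s = re.sub("\n\tChapterArr=", "", script)  # drop the first assignment head
    s = re.sub(";\n\t?", "", s)                # strip trailing semicolons
    s = re.sub("PageArr=", ",", s)             # splice the objects with a comma
    pages = json.loads("[" + s + "]")[1]       # index 1 selects PageArr
    print([v for k, v in pages.items() if k != "CurPage"])
    # ['https://example.test/1.png']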
tests/test_scraper_mangasee.py: 138 additions & 0 deletions
@@ -0,0 +1,138 @@
from bs4 import BeautifulSoup
from cum import config, exceptions
from nose.tools import nottest
from urllib.parse import urljoin
import cumtest
import os
import requests
import unittest
import zipfile


class TestMangasee(cumtest.CumTest):
    MANGASEE_URL = 'https://mangaseeonline.us/'

    def setUp(self):
        super().setUp()
        global mangasee
        from cum.scrapers import mangasee

    def tearDown(self):
        self.directory.cleanup()

    def get_five_latest_releases(self):
        r = requests.get(self.MANGASEE_URL)
        soup = BeautifulSoup(r.text, config.get().html_parser)
        chapters = soup.find_all("a", class_="latestSeries")
        links = [urljoin(self.MANGASEE_URL, x.get("href")) for x in chapters]
        return links[:5]

    @nottest
    def series_information_tester(self, data):
        series = mangasee.MangaseeSeries(data['url'])
        self.assertEqual(series.name, data['name'])
        self.assertEqual(series.alias, data['alias'])
        self.assertEqual(series.url, data['url'])
        self.assertIs(series.directory, None)
        self.assertEqual(len(series.chapters), len(data['chapters']))
        for chapter in series.chapters:
            self.assertEqual(chapter.name, data['name'])
            self.assertEqual(chapter.alias, data['alias'])
            self.assertIn(chapter.chapter, data['chapters'])
            data['chapters'].remove(chapter.chapter)
            self.assertIs(chapter.directory, None)
        self.assertEqual(len(data['chapters']), 0)

    def test_chapter_download_latest(self):
        latest_releases = self.get_five_latest_releases()
        for release in latest_releases:
            try:
                chapter = mangasee.MangaseeChapter.from_url(release)
            except exceptions.ScrapingError as e:
                print('scraping error for {} - {}'.format(release, e))
                continue
            else:
                chapter.get(use_db=False)

    def test_chapter_filename_decimal(self):
        URL = "https://mangaseeonline.us/read-online/" + \
              "Citrus-S-A-B-U-R-O-Uta-chapter-20.5-page-1.html"
        chapter = mangasee.MangaseeChapter.from_url(URL)
        path = os.path.join(self.directory.name, 'Citrus SABURO Uta',
                            'Citrus SABURO Uta - c020 x5 [Unknown].zip')
        self.assertEqual(chapter.chapter, '20.5')
        self.assertEqual(chapter.filename, path)

    def test_chapter_information_normal(self):
        URL = "https://mangaseeonline.us/read-online/" + \
              "Ramen-Daisuki-Koizumi-San-chapter-18-page-1.html"
        chapter = mangasee.MangaseeChapter.from_url(URL)
        self.assertEqual(chapter.alias, 'ramen-daisuki-koizumi-san')
        self.assertTrue(chapter.available())
        self.assertEqual(chapter.chapter, '18')
        self.assertEqual(chapter.name, 'Ramen Daisuki Koizumi-san')
        self.assertEqual(chapter.title, 'Chapter 18')
        path = os.path.join(self.directory.name,
                            'Ramen Daisuki Koizumi-san',
                            'Ramen Daisuki Koizumi-san - c018 [Unknown].zip')
        self.assertEqual(chapter.filename, path)
        chapter.download()
        self.assertTrue(os.path.isfile(path))
        with zipfile.ZipFile(path) as chapter_zip:
            files = chapter_zip.infolist()
            self.assertEqual(len(files), 8)

    def test_chapter_information_chapterzero(self):
        URL = "https://mangaseeonline.us/read-online/" + \
              "Inu-To-Hasami-Wa-Tsukaiyou-chapter-0-page-1.html"
        chapter = mangasee.MangaseeChapter.from_url(URL)
        self.assertEqual(chapter.alias, 'inu-to-hasami-wa-tsukaiyou')
        self.assertEqual(chapter.chapter, '0')
        self.assertEqual(chapter.name, 'Inu to Hasami wa Tsukaiyou')
        self.assertEqual(chapter.title, 'Chapter 0')
        path = os.path.join(
            self.directory.name, 'Inu to Hasami wa Tsukaiyou',
            'Inu to Hasami wa Tsukaiyou - c000 [Unknown].zip')
        self.assertEqual(chapter.filename, path)
        chapter.download()
        self.assertTrue(os.path.isfile(path))
        with zipfile.ZipFile(path) as chapter_zip:
            files = chapter_zip.infolist()
            self.assertEqual(len(files), 51)

    def test_chapter_unavailable(self):
        URL = "https://mangaseeonline.us/read-online/" + \
              "Oyasumi-Punpun-chapter-999-page-1.html"
        chapter = mangasee.MangaseeChapter(url=URL)
        self.assertFalse(chapter.available())

    def test_series_oneword(self):
        data = {'alias': 'aria',
                'chapters': ['1', '2', '3', '4', '5', '6', '7', '8',
                             '9', '10', '10.5', '11', '12', '13', '14', '15',
                             '16', '17', '18', '19', '20', '21', '22', '23',
                             '24', '25', '26', '27', '28', '29', '30', '30.5',
                             '31', '32', '33', '34', '35', '35.5', '36',
                             '37', '37.5', '38', '39', '40', '41', '42', '43',
                             '44', '45', '45.5', '46', '47', '48', '49',
                             '50', '50.5', '51', '52', '53', '54', '55', '56',
                             '57', '57.5', '58', '59', '60', '60.5'],
                'name': 'Aria',
                'url': 'https://mangaseeonline.us/manga/Aria'}
        self.series_information_tester(data)

    def test_series_multiplewords(self):
        data = {'alias': 'prunus-girl',
                'chapters': ['1', '2', '3', '4', '5', '6', '7', '8',
                             '9', '10', '11', '12', '13', '14', '15',
                             '16', '17', '18', '19', '20', '21', '22',
                             '23', '24', '25', '26', '27', '28', '29', '30',
                             '31', '32', '32.5', '33', '34', '35', '36', '37',
                             '38', '39', '40', '41', '42', '43'],
                'name': 'Prunus Girl',
                'url': 'https://mangaseeonline.us/manga/Prunus-Girl'}
        self.series_information_tester(data)


if __name__ == '__main__':
    unittest.main()
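
These tests hit the live site. A hedged way to run just this module programmatically, mirroring the entry point above (the usual route is simply python -m unittest tests.test_scraper_mangasee):

    import unittest

    suite = unittest.defaultTestLoader.loadTestsFromName(
        'tests.test_scraper_mangasee')
    unittest.TextTestRunner(verbosity=2).run(suite)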
