Commit 665884a

Add Manganelo scraper

matoro committed Mar 7, 2020
1 parent b313be1 commit 665884a

Showing 4 changed files with 279 additions and 1 deletion.
4 changes: 3 additions & 1 deletion cum/db.py
@@ -248,7 +248,9 @@ def to_object(self):
if parse.netloc in ('www.mangahere.cc', 'm.mangahere.cc'):
from cum.scrapers.mangahere import MangahereChapter
return MangahereChapter(**kwargs)

if parse.netloc == 'manganelo.com':
from cum.scrapers.manganelo import ManganeloChapter
return ManganeloChapter(**kwargs)

class Group(Base):
__tablename__ = 'groups'
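For reference, the hunk above extends the URL dispatch in to_object, which keys on the URL's network location. A minimal standalone sketch of the same pattern (scraper_class_for is a hypothetical helper; the URL is illustrative):

from urllib.parse import urlparse

def scraper_class_for(url):
    # Mirrors the netloc check added to Chapter.to_object above.
    parse = urlparse(url)
    if parse.netloc == 'manganelo.com':
        from cum.scrapers.manganelo import ManganeloChapter
        return ManganeloChapter
    return None

print(scraper_class_for('https://manganelo.com/chapter/aria/chapter_1'))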
3 changes: 3 additions & 0 deletions cum/scrapers/__init__.py
@@ -2,6 +2,7 @@
from cum.scrapers.dynastyscans import DynastyScansChapter, DynastyScansSeries
from cum.scrapers.madokami import MadokamiChapter, MadokamiSeries
from cum.scrapers.mangadex import MangadexSeries, MangadexChapter
from cum.scrapers.manganelo import ManganeloSeries, ManganeloChapter
from cum.scrapers.mangasee import MangaseeSeries, MangaseeChapter
from cum.scrapers.mangahere import MangahereSeries, MangahereChapter
from cum.scrapers.yuriism import YuriismChapter, YuriismSeries
@@ -11,6 +12,7 @@
DynastyScansSeries,
MadokamiSeries,
MangadexSeries,
ManganeloSeries,
MangaseeSeries,
MangahereSeries,
YuriismSeries,
@@ -20,6 +22,7 @@
DynastyScansChapter,
MadokamiChapter,
MangadexChapter,
ManganeloChapter,
MangaseeChapter,
MangahereChapter,
YuriismChapter,
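As a sketch of how these registries are typically consumed: each scraper class exposes a url_re, and the matcher walks the list until one accepts the URL. The export name series_scrapers is assumed from context, and match_series_scraper is a hypothetical helper:

from cum.scrapers import series_scrapers  # assumed list name

def match_series_scraper(url):
    # First scraper whose url_re accepts the URL wins.
    for scraper in series_scrapers:
        if scraper.url_re.match(url):
            return scraper
    return None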
124 changes: 124 additions & 0 deletions cum/scrapers/manganelo.py
@@ -0,0 +1,124 @@
from bs4 import BeautifulSoup
from cum import config, exceptions, output
from cum.scrapers.base import BaseChapter, BaseSeries, download_pool
from functools import partial
from warnings import filterwarnings
import concurrent.futures
import json
import re
import requests


class ManganeloSeries(BaseSeries):
url_re = re.compile(r'https?://manganelo\.com/manga/.+')

def __init__(self, url, **kwargs):
super().__init__(url, **kwargs)
        filterwarnings(action="ignore", message="unclosed",
                       category=ResourceWarning)
spage = requests.get(url)
if spage.status_code == 404:
raise exceptions.ScrapingError
self.soup = BeautifulSoup(spage.text, config.get().html_parser)
# 404 pages actually return HTTP 200
if self.soup.find("title").text == "404 Not Found":
raise exceptions.ScrapingError
self.chapters = self.get_chapters()

def get_chapters(self):
try:
rows = self.soup.find_all("li", class_="a-h")
except AttributeError:
raise exceptions.ScrapingError()
chapters = []
for i, row in enumerate(rows):
            chap_num = re.match(
                r"https?://manganelo\.com/chapter/.+/chapter_([0-9.]+)",
                row.find("a")["href"]).groups()[0]
chap_url = row.find("a")["href"]
chap_name = row.find("a")["title"]
chap_date = row.find_all("span")[1]["title"]
result = ManganeloChapter(name=self.name,
alias=self.alias,
chapter=chap_num,
url=chap_url,
title=chap_name,
groups=[],
upload_date=chap_date)
chapters.append(result)
return chapters

@property
def name(self):
try:
return re.match(r"(.+) Manga Online Free - Manganelo",
self.soup.find("title").text).groups()[0]
except AttributeError:
raise exceptions.ScrapingError


class ManganeloChapter(BaseChapter):
    url_re = re.compile(r'https?://manganelo\.com/'
                        r'chapter/.+/chapter_[0-9.]+')
upload_date = None
uses_pages = True

# 404 pages actually return HTTP 200
# thus this method override
def available(self):
if not getattr(self, "cpage", None):
self.cpage = requests.get(self.url)
if not getattr(self, "soup", None):
self.soup = BeautifulSoup(self.cpage.text,
config.get().html_parser)
return self.soup.find("title").text != "404 Not Found"

def download(self):
if not getattr(self, "cpage", None):
self.cpage = requests.get(self.url)
if not getattr(self, "soup", None):
self.soup = BeautifulSoup(self.cpage.text,
config.get().html_parser)

# 404 pages actually return HTTP 200
if self.soup.find("title").text == "404 Not Found":
raise exceptions.ScrapingError
        reader = self.soup.find("div", class_="container-chapter-reader")
        pages = [image["src"] for image in reader.find_all("img")]

futures = []
files = [None] * len(pages)
req_session = requests.Session()
with self.progress_bar(pages) as bar:
for i, page in enumerate(pages):
                retries = 0
                r = None
                while retries < 10:
                    try:
                        r = req_session.get(page, stream=True)
                        if r.status_code != 200:
                            output.warning('Failed to fetch page with status {}, retrying #{}'
                                           .format(r.status_code, retries))
                            retries += 1
                        else:
                            break
                    except requests.exceptions.ConnectionError:
                        retries += 1
                # r is None only if every attempt raised ConnectionError
                if r is None or r.status_code != 200:
                    status = r.status_code if r is not None else 'no response'
                    output.error('Failed to fetch page with status {}, giving up'
                                 .format(status))
                    raise ValueError
fut = download_pool.submit(self.page_download_task, i, r)
fut.add_done_callback(partial(self.page_download_finish,
bar, files))
futures.append(fut)
concurrent.futures.wait(futures)
self.create_zip(files)
req_session.close()

    def from_url(url):
        iname = re.match(
            r"https?://manganelo\.com/chapter/(.+)/chapter_[0-9.]+",
            url).groups()[0]
series = ManganeloSeries("https://manganelo.com/manga/" + iname)
for chapter in series.chapters:
if chapter.url == url:
return chapter
return None
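A quick usage sketch of the new module (requires network access; the Aria URLs are taken from the tests below):

from cum.scrapers.manganelo import ManganeloSeries, ManganeloChapter

series = ManganeloSeries('https://manganelo.com/manga/aria')
print(series.name, len(series.chapters))  # 'Aria' and its chapter count

chapter = ManganeloChapter.from_url(
    'https://manganelo.com/chapter/aria/chapter_1')
if chapter is not None and chapter.available():
    chapter.download()  # writes a zip via BaseChapter.create_zip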
149 changes: 149 additions & 0 deletions tests/test_scraper_manganelo.py
@@ -0,0 +1,149 @@
from bs4 import BeautifulSoup
from cum import config, exceptions
from nose.tools import nottest
from urllib.parse import urljoin
from warnings import filterwarnings
import cumtest
import os
import requests
import unittest
import zipfile


class TestManganelo(cumtest.CumTest):
MANGANELO_URL = 'https://manganelo.com/genre-all'

def setUp(self):
super().setUp()
global manganelo
        filterwarnings(action="ignore", message="unclosed",
                       category=ResourceWarning)
from cum.scrapers import manganelo

def tearDown(self):
self.directory.cleanup()

def get_five_latest_releases(self):
r = requests.get(self.MANGANELO_URL)
soup = BeautifulSoup(r.text, config.get().html_parser)
chapters = soup.find_all("a", class_="genres-item-chap")
links = [x["href"] for x in chapters]
return links[:5]

@nottest
def series_information_tester(self, data):
series = manganelo.ManganeloSeries(data['url'])
self.assertEqual(series.name, data['name'])
self.assertEqual(series.alias, data['alias'])
self.assertEqual(series.url, data['url'])
self.assertIs(series.directory, None)
self.assertEqual(len(series.chapters), len(data['chapters']))
for chapter in series.chapters:
self.assertEqual(chapter.name, data['name'])
self.assertEqual(chapter.alias, data['alias'])
self.assertIn(chapter.chapter, data['chapters'])
data['chapters'].remove(chapter.chapter)
self.assertIs(chapter.directory, None)
self.assertEqual(len(data['chapters']), 0)

# This test is disabled temporarily due to the architecture of
# the chapter.from_url method, which assumes that if a chapter
# exists then it will be listed on the series page. Manganelo
# seems to violate this assumption, in that there are chapters
# which are accessible from the "latest chapters" page but which
# are not listed on their respective series' pages, at least
# not immediately.
# TODO: come back to this test and find a way to construct a
# chapter without requiring metadata from the series page
def _test_chapter_download_latest(self):
latest_releases = self.get_five_latest_releases()
for release in latest_releases:
try:
chapter = manganelo.ManganeloChapter.from_url(release)
except exceptions.ScrapingError as e:
print('scraping error for {} - {}'.format(release, e))
continue
else:
chapter.get(use_db=False)

def test_chapter_filename_decimal(self):
URL = "https://manganelo.com/chapter/citrus_saburo_uta/chapter_24.6"
chapter = manganelo.ManganeloChapter.from_url(URL)
path = os.path.join(self.directory.name, 'Citrus Saburo Uta',
'Citrus Saburo Uta - c024 x6 [Unknown].zip')
self.assertEqual(chapter.chapter, '24.6')
self.assertEqual(chapter.filename, path)

def test_chapter_information_normal(self):
URL = "https://manganelo.com/chapter/ramen_daisuki_koizumisan/chapter_18"
chapter = manganelo.ManganeloChapter.from_url(URL)
self.assertEqual(chapter.alias, 'ramen-daisuki-koizumi-san')
self.assertTrue(chapter.available())
self.assertEqual(chapter.chapter, '18')
self.assertEqual(chapter.name, 'Ramen Daisuki Koizumi-San')
self.assertEqual(chapter.title, 'Ramen Daisuki Koizumi-san Chapter 18')
path = os.path.join(self.directory.name,
'Ramen Daisuki Koizumi-San',
'Ramen Daisuki Koizumi-San - c018 [Unknown].zip')
self.assertEqual(chapter.filename, path)
chapter.download()
self.assertTrue(os.path.isfile(path))
with zipfile.ZipFile(path) as chapter_zip:
files = chapter_zip.infolist()
self.assertEqual(len(files), 8)

def test_chapter_information_chapterzero(self):
URL = "https://manganelo.com/chapter/inu_to_hasami_wa_tsukaiyou/chapter_0"
chapter = manganelo.ManganeloChapter.from_url(URL)
self.assertEqual(chapter.alias, 'inu-to-hasami-wa-tsukaiyou')
self.assertEqual(chapter.chapter, '0')
self.assertEqual(chapter.name, 'Inu To Hasami Wa Tsukaiyou')
self.assertEqual(chapter.title, 'Inu to Hasami wa Tsukaiyou Vol.1 Chapter 0')
path = os.path.join(
self.directory.name, 'Inu To Hasami Wa Tsukaiyou',
'Inu To Hasami Wa Tsukaiyou - c000 [Unknown].zip')
self.assertEqual(chapter.filename, path)
chapter.download()
self.assertTrue(os.path.isfile(path))
with zipfile.ZipFile(path) as chapter_zip:
files = chapter_zip.infolist()
self.assertEqual(len(files), 32)

def test_series_invalid(self):
URL = "https://manganelo.com/manga/test_bad_manga_name"
with self.assertRaises(exceptions.ScrapingError):
            manganelo.ManganeloSeries(url=URL)

def test_chapter_unavailable(self):
URL = "https://manganelo.com/chapter/oyasumi_punpun/chapter_999"
chapter = manganelo.ManganeloChapter(url=URL)
self.assertFalse(chapter.available())

def test_series_oneword(self):
data = {'alias': 'aria',
'chapters': ['1', '2', '3', '4', '5', '6', '7', '8',
'9', '10', '10.5', '11', '12', '13', '14', '15',
'16', '17', '18', '19', '20', '21', '22', '23',
'24', '25', '26', '27', '28', '29', '30', '30.5',
'31', '32', '33', '34', '35', '35.5', '36',
'37', '37.5', '38', '39', '40', '41', '42', '43',
'44', '45', '45.5', '46', '47', '48', '49',
'50', '50.5', '51', '52', '53', '54', '55', '56',
'57', '57.5', '58', '59', '60', '60.1'],
'name': 'Aria',
'url': 'https://manganelo.com/manga/aria'}
self.series_information_tester(data)

def test_series_multiplewords(self):
data = {'alias': 'prunus-girl',
'chapters': ['1', '1.5', '2', '3', '4', '5', '5.5', '6', '7', '8',
'9', '10', '11', '11.5', '12', '13', '14', '15',
'16', '16.5', '17', '18', '19', '20', '21', '22',
'23', '24', '25', '26', '27', '28', '29', '30',
'31', '32', '32.5', '33', '34', '35', '36', '37',
'38', '39', '40', '41', '42', '42.5'],
'name': 'Prunus Girl',
'url': 'https://manganelo.com/manga/prunus_girl'}
self.series_information_tester(data)

if __name__ == '__main__':
unittest.main()
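To run just this suite from the repository root (commands are a sketch; the tests import the local cumtest helper, so the working directory matters):

python tests/test_scraper_manganelo.py     # uses the unittest.main() hook above
nosetests tests/test_scraper_manganelo.py  # matches the nose.tools import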
