feat: support manually download as cbz
everpcpc committed Mar 7, 2024
1 parent adc2a26 commit cf0a0c6
Showing 8 changed files with 673 additions and 636 deletions.
1,127 changes: 545 additions & 582 deletions Pipfile.lock

Large diffs are not rendered by default.

18 changes: 12 additions & 6 deletions README.md
@@ -10,19 +10,25 @@ $ pipenv sync
## Usage

``` Shell
$ pipenv run python comicbook.py --comic https://nhentai.net/g/{id}/
$ pipenv run python comicbook.py --comic http://g.e-hentai.org/g/{gid}/{token}/
$ pipenv run python comicbook.py --comic http://wnacg.com/photos-view-id-{aid}.html
$ pipenv run python comicbook.py --help

# manually download
$ pipenv run python comicbook.py --comic http://wnacg.com/photos-view-id-{aid}.html

# manually download to target dir
$ pipenv run python comicbook.py --comic http://g.e-hentai.org/g/{gid}/{token}/ --output /path/to/output

# manually download to target dir with cbz format
$ pipenv run python comicbook.py --comic https://nhentai.net/g/{id}/ --output /path/to/output --format cbz

# run bot.
$ pipenv run python telegrambot.py
```

## Webapp

```Shell
$ pipenv run uvicorn --host 127.0.0.1 --port 5000 webapp:app

# run server and bot.
$ pipenv run python comicbook.py --server --telegram-bot
```

## Celery task worker
72 changes: 27 additions & 45 deletions comicbook.py
@@ -1,55 +1,37 @@
# coding: UTF-8

import sys
import json
import getopt
import argparse
import logging

from crawler import Crawler

import config
from bot import ComicbookTelegramBot

logging.basicConfig(level=logging.INFO)

version = "1.1.0"

if __name__ == "__main__":
    help = """comicbook options:
    -h, --help              Show help.
    -v, --version           Show version and exit.
    -c, --comic             a comic link on > nhentai.net
                                            > e-hentai.org
                                            > wnacg.com
    -o, --output            Specify a output path.(temporarily disabled)
    -t, --telegram-bot      Run telegram bot.
    """

    link = ""
    output = ""

    if len(sys.argv) == 1:
        print(help)
        sys.exit()
    argv = sys.argv[1:]
    try:
        opts, args = getopt.getopt(
            argv, "hvc:o:ts", ["help", "version", "comic=", "output=", "telegram-bot"]
        )
    except getopt.GetoptError:
        print(help)
        sys.exit()
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(help)
        if opt in ("-c", "--comic"):
            link = arg
            result = Crawler.download(link)
            result.pop("item")
            print(json.dumps(result, indent=2))
        if opt in ("-o", "--output"):
            output = arg
        if opt in ("-v", "--version"):
            print(version)

    if ("-t", "") in opts or ("--telegram-bot", "") in opts:
        bot = ComicbookTelegramBot(config.TELEGRAM_BOT_TOKEN)
        bot.start()
    parser = argparse.ArgumentParser(description="comicbook")
    parser.add_argument(
        "-c",
        "--comic",
        required=True,
        help="a comic link on nhentai.net, e-hentai.org, wnacg.com",
    )
    parser.add_argument(
        "-f",
        "--format",
        default="epub",
        choices=["epub", "cbz"],
        help="Specify a format.",
    )
    parser.add_argument("-o", "--output", help="Specify an output path.")
    args = parser.parse_args()

    if args.output:
        result = Crawler.download_manually(args.comic, args.format, args.output)
        print(result)
    else:
        result = Crawler.download(args.comic)
        result.pop("item")
        print(json.dumps(result, indent=2))
9 changes: 8 additions & 1 deletion crawler/helper.py
@@ -1,6 +1,6 @@
from .item import item_from_url
from .utils.storage import Storage
from .tasks import crawl_comic, get_progress
from .tasks import crawl_comic, crawl_comic_manually, get_progress


class Crawler:
@@ -68,3 +68,10 @@ def download(cls, url):
            return result
        crawl_comic(url)
        return cls.check(url)

    @classmethod
    def download_manually(cls, url, ftype, output):
        """
        Download comic as epub/cbz file to output path.
        """
        return crawl_comic_manually(url, ftype, output)
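
For reference, a minimal sketch of calling the new classmethod from Python rather than through `comicbook.py`; the comic link and output path below are hypothetical, and the return value is the status string produced by `crawl_comic_manually` in `crawler/tasks.py`.

```python
from crawler import Crawler  # import path used by comicbook.py in this commit

# Hypothetical arguments: a supported comic link, a format of "epub" or "cbz",
# and a writable output directory.
status = Crawler.download_manually(
    "https://wnacg.com/photos-view-id-12345.html",
    "cbz",
    "/path/to/output",
)
print(status)  # "DONE: <domain> <id>" on success, an "ERR: ..." message otherwise
```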
44 changes: 44 additions & 0 deletions crawler/pipelines/comic_cbz.py
@@ -0,0 +1,44 @@
# coding: UTF-8
import os
import logging
from zipfile import ZipFile

import requests

from crawler.utils import ua
import config

logger = logging.getLogger("pipeline")
logger.setLevel(logging.INFO)


class ComicPipeline:
    def __init__(self, item):
        self.item = item
        self.cbz = None

    def generate(self, dir):
        self.cbz = ZipFile(dir, "w")
        slog = logger.getChild(f"{self.item.domain}-{self.item.id}")

        slog.info("start to download image resources")
        count = len(self.item.image_urls)

        session = requests.Session()
        session.headers.update({"User-Agent": ua.get_random_ua()})
        session.proxies.update(config.PROXY)

        for (index, url) in enumerate(self.item.image_urls):
            r = session.get(url)
            if r.ok:
                slog.info("[%d/%d] %s [OK]", index + 1, count, url)
                image_name = url.split("/")[-1]
                self.cbz.writestr(image_name, r.content)
            else:
                slog.info("[%d/%d] %s [FAIL]", index + 1, count, url)
                return False
        slog.info("download completed")

        slog.info("cbzify...")
        self.cbz.close()
        slog.info("work done")
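
Since a CBZ is simply a ZIP archive of page images, the file produced by the pipeline above can be inspected with the standard library alone. A minimal sketch, assuming a hypothetical output path that follows the `{domain}@{id}.{format}` naming used in `crawler/tasks.py`:

```python
from zipfile import ZipFile

# Hypothetical path; the real name depends on the comic's domain and id.
with ZipFile("/path/to/output/wnacg@12345.cbz") as cbz:
    for name in cbz.namelist():
        print(name)  # one entry per downloaded page image
```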
1 change: 1 addition & 0 deletions crawler/spiders/wnacg.py
@@ -45,6 +45,7 @@ def crawl(self, item):
            len(pages) > 1 and pages[0] != pages[len(pages) - 1]
        ):
            current_page = pages[len(pages) - 1]
            logger.info("crawl page [%d]: %s", len(pages), current_page)
            p = session.get(current_page)
            sel = etree.HTML(p.text)
            img_url = sel.xpath('//*[@id="picarea"]')[0].get("src")
29 changes: 27 additions & 2 deletions crawler/tasks.py
Expand Up @@ -6,7 +6,8 @@

from .celery import app
from .item import item_from_url
from .pipelines.comic_epub import ComicPipeline
from .pipelines.comic_epub import ComicPipeline as EPUBPipeline
from .pipelines.comic_cbz import ComicPipeline as CBZPipeline
from .spiders.ehentai import EhentaiSpider
from .spiders.nhentai import NhentaiSpider
from .spiders.wnacg import WnacgSpider
@@ -63,7 +64,7 @@ def crawl_comic(url):
        return f"ERR: already in progress: {_domain.value} {_id}"

    set_progress(_domain, _id, 0.01)
    pipeline = ComicPipeline(item)
    pipeline = EPUBPipeline(item)
    dir = storage.get_comic_file_downloading_path()

    def progress_callback(progress):
@@ -90,3 +91,27 @@ def done_callback():
        done_callback=done_callback,
    )
    return f"DONE: {_domain.value} {_id}"


def crawl_comic_manually(url, ftype, output):
    item = item_from_url(url)
    _domain = item.domain
    _id = item.id
    spider = SPIDERS[_domain](url)
    item = spider.crawl(item=item)
    if item is None:
        return f"ERR: crawl failed: {_domain.value} {_id}"
    if not item.titles:
        return f"ERR: no title: {_domain.value} {_id}"

    if ftype == "epub":
        pipeline = EPUBPipeline(item)
    elif ftype == "cbz":
        pipeline = CBZPipeline(item)
    else:
        return f"ERR: unsupported format: {ftype}"
    tmp_dir = os.path.join(output, f"{_domain.value}@{_id}.tmp")
    dir = os.path.join(output, f"{_domain.value}@{_id}.{ftype}")
    pipeline.generate(dir=tmp_dir)
    os.rename(tmp_dir, dir)
    return f"DONE: {_domain.value} {_id}"
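
The `.tmp`-then-`os.rename` step above keeps half-written archives out of the output directory: the file only appears under its final `.epub`/`.cbz` name once generation has finished. A standalone sketch of the same pattern, with hypothetical names:

```python
import os


def write_then_rename(final_path, generate):
    # `generate` is any callable that writes a complete file to the path it is
    # given; readers of the directory never see a partially written final_path.
    tmp_path = final_path + ".tmp"
    generate(tmp_path)
    os.rename(tmp_path, final_path)  # atomic on POSIX within one filesystem
```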
9 changes: 9 additions & 0 deletions telegrambot.py
@@ -0,0 +1,9 @@
# coding: UTF-8

import config
from bot import ComicbookTelegramBot


if __name__ == "__main__":
    bot = ComicbookTelegramBot(config.TELEGRAM_BOT_TOKEN)
    bot.start()
