Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
mikf committed Oct 12, 2014
1 parent f738f3a commit deef91e
Show file tree
Hide file tree
Showing 18 changed files with 922 additions and 414 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
__pycache__/
*.py[cod]
662 changes: 248 additions & 414 deletions LICENSE

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions gallery-dl
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

# Command-line launcher: all actual work is delegated to the
# gallery_dl package's main() entry point.

import gallery_dl

if __name__ == '__main__':
    gallery_dl.main()
44 changes: 44 additions & 0 deletions gallery_dl/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
__author__ = "Mike Fährmann"
__copyright__ = "Copyright 2014, Mike Fährmann"

__license__ = "GPLv3"
__version__ = "0.4"
__maintainer__ = "Mike Fährmann"
__email__ = "[email protected]"

import os
import sys
import argparse
import configparser

from . import extractor
from . import downloader

def parse_cmdline_options():
    """Build the command-line parser and parse sys.argv.

    Returns the argparse.Namespace with attributes:
    config (str), dest (str or None), urls (list of str).
    """
    parser = argparse.ArgumentParser(
        description='Download images from various sources')
    parser.add_argument(
        "-c", "--config",
        default="~/.config/gallery/config",
        metavar="CFG",
        help="alternate configuration file",
    )
    parser.add_argument(
        "-d", "--dest",
        metavar="DEST",
        help="destination directory",
    )
    parser.add_argument(
        "urls",
        nargs="+",
        metavar="URL",
        help="url to download images from",
    )
    return parser.parse_args()

def parse_config_file(path):
    """Read the INI configuration file at 'path' (with ~ expansion).

    Returns a ConfigParser whose option names keep their original case
    and whose values are taken literally (no '%' interpolation).
    """
    config = configparser.ConfigParser(interpolation=None)
    # identity transform: option names are case-sensitive in this config
    config.optionxform = lambda option: option
    config.read(os.path.expanduser(path))
    return config

def main():
    """Program entry point: parse options, then process every given URL."""
    opts = parse_cmdline_options()
    conf = parse_config_file(opts.config)
    finder = extractor.ExtractorFinder(conf)
    manager = downloader.DownloadManager(opts, conf)

    # hand each URL's extractor to the download manager
    for url in opts.urls:
        manager.add(finder.match(url))
54 changes: 54 additions & 0 deletions gallery_dl/downloader/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import os
import sys
import importlib

class DownloadManager():
    """Routes extractor output to protocol-specific downloader instances."""

    def __init__(self, opts, conf):
        # opts: parsed command-line namespace; conf: ConfigParser-like mapping
        self.opts = opts
        self.conf = conf
        # cache of Downloader classes, keyed by URL scheme
        self.downloaders = {}

    def add(self, extr):
        """Download every (url, filename) pair produced by the extractor."""
        # destination precedence: command line > per-category config > general
        if self.opts.dest:
            base = self.opts.dest
        elif extr.category in self.conf:
            base = self.conf[extr.category].get("destination", "/tmp/")
        else:
            base = self.conf["general"].get("destination", "/tmp/")
        directory = os.path.join(base, extr.category, extr.directory)
        os.makedirs(directory, exist_ok=True)

        for url, filename in extr:
            path = os.path.join(directory, filename)
            if os.path.exists(path):
                # already downloaded on an earlier run
                self.print_skip(path)
            else:
                downloader = self.get_downloader(extr, url)
                self.print_start(path)
                attempts = downloader.download(url, path)
                self.print_success(path, attempts)

    def get_downloader(self, extr, url):
        """Return a downloader instance for the scheme of 'url' (default: http)."""
        sep = url.find("://")
        scheme = url[:sep] if sep != -1 else "http"
        if scheme not in self.downloaders:
            # lazily import the sibling downloader module for this scheme
            module = importlib.import_module("."+scheme, __package__)
            self.downloaders[scheme] = module.Downloader
        return self.downloaders[scheme](extr)

    @staticmethod
    def print_start(path):
        # print without newline; the success/error line overwrites it later
        print(path, end="")
        sys.stdout.flush()

    @staticmethod
    def print_skip(path):
        # dim text: file already exists
        print("\033[2m", path, "\033[0m", sep="")

    @staticmethod
    def print_success(path, tries):
        if tries == 0:
            print("\r", end="")
        # bold green: download finished
        print("\r\033[1;32m", path, "\033[0m", sep="")
21 changes: 21 additions & 0 deletions gallery_dl/downloader/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import os

class BasicDownloader():
    """Base class for downloaders.

    Subclasses implement download_impl(url, file) and return the number
    of retries that were needed (0 == success on first attempt).
    """

    # maximum number of attempts before a download is abandoned
    max_tries = 5

    def download(self, url, path):
        """Download 'url' into the file at 'path'.

        Returns download_impl()'s retry count.  On any failure the
        partially-written file is removed and the exception re-raised.
        """
        # NOTE: the 'with' block closes the file in every case; the original
        # had an unreachable file.close() after the return, removed here.
        with open(path, "wb") as file:
            try:
                return self.download_impl(url, file)
            except:
                # bare except is deliberate: remove the partial file even on
                # KeyboardInterrupt, then re-raise
                os.unlink(path)
                raise

    @staticmethod
    def print_error(file, error, tries, max_tries=5):
        """Print a colored error line for a failed attempt."""
        if tries == 1 and hasattr(file, "name"):
            # first failure: show which file is affected (bold red)
            print("\r\033[1;31m", file.name, sep="")
        print("\033[0;31m[Error]\033[0m ", error, " (", tries, "/", max_tries, ")", sep="")
42 changes: 42 additions & 0 deletions gallery_dl/downloader/http.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from .common import BasicDownloader
import time
import requests

class Downloader(BasicDownloader):
    """Downloader for http:// (and https://) URLs with retry handling."""

    def __init__(self, extr):
        BasicDownloader.__init__(self)
        # reuse the extractor's session so cookies/headers carry over
        self.session = extr.session

    def download_impl(self, url, file):
        """Stream 'url' into 'file'; returns the number of failed attempts.

        Retries up to self.max_tries on connection errors and non-OK
        status codes; a 404 aborts immediately (returns max_tries).
        """
        tries = 0
        while True:
            # try to connect to remote source
            try:
                response = self.session.get(url, stream=True, verify=True)
            except requests.exceptions.ConnectionError as e:
                tries += 1
                self.print_error(file, e, tries, self.max_tries)
                time.sleep(1)
                if tries == self.max_tries:
                    raise
                continue

            # reject error-status-codes
            if response.status_code != requests.codes.ok:
                tries += 1
                self.print_error(file, 'HTTP status "{} {}"'.format(
                    response.status_code, response.reason), tries, self.max_tries)
                if response.status_code == 404:
                    # not found: no point in retrying
                    return self.max_tries
                time.sleep(1)
                # fix: compare against max_tries (was a hard-coded 5,
                # inconsistent with the rest of this method)
                if tries == self.max_tries:
                    response.raise_for_status()
                continue

            # everything ok -- proceed to download
            break

        for data in response.iter_content(16384):
            file.write(data)
        return tries
1 change: 1 addition & 0 deletions gallery_dl/downloader/https.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .http import Downloader
10 changes: 10 additions & 0 deletions gallery_dl/downloader/text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from .common import BasicDownloader

class Downloader(BasicDownloader):
    """Pseudo-downloader for text:// URLs: stores the URL's payload itself."""

    def __init__(self, extr):
        super().__init__()

    def download_impl(self, url, file):
        # drop the 7-character "text://" scheme prefix, write the rest as UTF-8
        payload = url[7:]
        file.write(payload.encode("utf-8"))
        return 0
21 changes: 21 additions & 0 deletions gallery_dl/extractor/8chan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from .common import BasicExtractor
from urllib.parse import unquote
import re

class Extractor(BasicExtractor):
    """Extractor for image threads on 8chan."""

    thread_url_fmt = "https://www.8chan.co/{0}/res/{1}.html"
    # captures: 1=file url, 2=post prefix, 4=full original name (optional), 5=display name
    regex = r'>File: <a href="([^"]+)">([^<]+)\.[^<]+<.*?<span class="postfilename"( title="([^"]+)")?>([^<]+)<'

    def __init__(self, match, config):
        super().__init__(config)
        # URL path looks like "<board>/res/<thread_id>"
        self.board, _, self.thread_id = match.group(1).split("/")
        self.category = "8chan"
        self.directory = self.board + "-" + self.thread_id

    def images(self):
        """Yield (url, filename) for every file posted in the thread."""
        page = self.request(
            self.thread_url_fmt.format(self.board, self.thread_id)).text
        for m in re.finditer(self.regex, page):
            file_url, prefix, fullname, name = m.group(1, 2, 4, 5)
            # prefer the full original filename when the span carries one
            filename = prefix + "-" + unquote(fullname or name)
            yield ("https://www.8chan.co" + file_url, filename)
47 changes: 47 additions & 0 deletions gallery_dl/extractor/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import os
import sys
import re
import sqlite3
import importlib

class ExtractorFinder():
    """Maps URLs to extractor modules via configured regular expressions."""

    def __init__(self, config):
        self.config = config
        # list of (category, compiled_regex) pairs, checked in order
        self.match_list = list()
        # optional sqlite database holding additional (regex, category) pairs
        if "database" in config["general"]:
            path = os.path.expanduser(config["general"]["database"])
            conn = sqlite3.connect(path)
            self.load_from_database(conn)
        self.load_from_config(config)

    def match(self, url):
        """Return an Extractor instance for 'url', or None if nothing matches."""
        for category, regex in self.match_list:
            match = regex.match(url)
            if match:
                # category doubles as the extractor module name
                module = importlib.import_module("."+category, __package__)
                return module.Extractor(match, self.config)
        return None

    def load_from_database(self, db):
        """Register every (regex, category) row found in the database."""
        query = (
            "SELECT regex.re, category.name "
            "FROM regex JOIN category "
            "ON regex.category_id = category.id"
        )
        for row in db.execute(query):
            self.add_match(row[1], row[0])

    def load_from_config(self, conf):
        """Register every option named regex* from every config section."""
        for category in conf:
            for key, value in conf[category].items():
                if key.startswith("regex"):
                    self.add_match(category, value)

    def add_match(self, category, regex):
        """Compile 'regex' and register it; warn instead of raising on a bad pattern."""
        try:
            self.match_list.append((category, re.compile(regex)))
        except re.error:
            # fix: was a bare except that also swallowed KeyboardInterrupt
            # and programming errors; only malformed patterns are expected here
            print("[Warning] [{0}] failed to compile regular expression '{1}'"
                  .format(category, regex))
22 changes: 22 additions & 0 deletions gallery_dl/extractor/batoto.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from .common import AsyncExtractor
from ..util import filename_from_url
from urllib.parse import unquote

class Extractor(AsyncExtractor):
    """Extractor for manga chapters on batoto (asynchronous page fetching)."""

    def __init__(self, match, config):
        super().__init__(config)
        self.url = "https://bato.to/read/_/" + match.group(1) + "/_/1"
        self.category = "batoto"
        self.directory = match.group(1)

    def images(self):
        """Yield (url, filename) for each page, following next-page links."""
        page_url = self.url
        while page_url:
            page = self.request(page_url).text
            pos = page.find('<div id="full_image"')

            # the anchor wrapping the image links to the next page;
            # page_url becomes None on the last page, ending the loop
            page_url, pos = self.extract(page, '<a href="', '"', pos)
            img_url, pos = self.extract(page, 'src="', '"', pos)
            yield img_url, unquote(filename_from_url(img_url))
67 changes: 67 additions & 0 deletions gallery_dl/extractor/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import queue
import threading
import requests
from ..util import safe_request

class BasicExtractor():
    """Base class for extractors.

    Subclasses implement images(), a generator of (url, filename) pairs,
    and set self.category / self.directory in __init__.
    """

    def __init__(self, config):
        self.session = requests.Session()
        self.category = ""
        self.directory = ""

    def __iter__(self):
        return self.images()

    def request(self, url, *args, **kwargs):
        """Perform a GET request through the retrying safe_request helper."""
        return safe_request(self.session, url, *args, **kwargs)

    def enable_useragent(self):
        """Give the session a browser-like User-Agent header."""
        self.session.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0"

    @staticmethod
    def extract(txt, begin, end, pos=0):
        """Return (text between 'begin' and 'end', position after 'end').

        Searching starts at 'pos'; returns (None, pos) if either marker
        is missing.
        """
        try:
            first = txt.index(begin, pos) + len(begin)
            last = txt.index(end, first)
            return txt[first:last], last+len(end)
        except ValueError:
            # fix: was a bare except; str.index raises ValueError when a
            # marker is missing, anything else should propagate
            return None, pos

    @staticmethod
    def extract_all(txt, begin, end, pos=0):
        """Like extract(), but the returned text includes both markers."""
        try:
            first = txt.index(begin, pos)
            last = txt.index(end, first + len(begin)) + len(end)
            return txt[first:last], last
        except ValueError:
            return None, pos

class AsyncExtractor(BasicExtractor):
    """Extractor whose images() runs on a background thread.

    Results flow through a bounded queue so the producer stays at most
    a few items ahead of the consumer.
    """

    def __init__(self, config):
        super().__init__(config)
        self.__queue = queue.Queue(maxsize=5)
        self.__thread = threading.Thread(target=self.async_images, daemon=True)

    def __iter__(self):
        fetch = self.__queue.get
        mark_done = self.__queue.task_done

        self.__thread.start()
        while True:
            item = fetch()
            # None is the sentinel signalling the producer has finished
            if item is None:
                return
            yield item
            mark_done()

    def async_images(self):
        """Producer side: run images() and feed results into the queue."""
        enqueue = self.__queue.put
        try:
            for item in self.images():
                enqueue(item)
        except:
            # catch-all on purpose: report the failure but still deliver
            # the sentinel so the consumer does not block forever
            import traceback
            print(traceback.format_exc())
        enqueue(None)
Loading

0 comments on commit deef91e

Please sign in to comment.