Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
mikf committed Oct 12, 2014
1 parent f738f3a commit deef91e
Show file tree
Hide file tree
Showing 18 changed files with 922 additions and 414 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
__pycache__/
*.py[cod]
662 changes: 248 additions & 414 deletions LICENSE

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions gallery-dl
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

# Command-line launcher: all actual work is delegated to the
# gallery_dl package's main() entry point.

import gallery_dl

if __name__ == '__main__':
    gallery_dl.main()
44 changes: 44 additions & 0 deletions gallery_dl/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
__author__ = "Mike Fährmann"
__copyright__ = "Copyright 2014, Mike Fährmann"

__license__ = "GPLv3"
__version__ = "0.4"
__maintainer__ = "Mike Fährmann"
__email__ = "[email protected]"

import os
import sys
import argparse
import configparser

from . import extractor
from . import downloader

def parse_cmdline_options():
    """Build the command-line parser and parse sys.argv.

    Returns the argparse.Namespace with attributes:
    config (str), dest (str or None), urls (list of str).
    """
    parser = argparse.ArgumentParser(
        description='Download images from various sources')
    parser.add_argument(
        "-c", "--config",
        default="~/.config/gallery/config",
        metavar="CFG",
        help="alternate configuration file",
    )
    parser.add_argument(
        "-d", "--dest",
        metavar="DEST",
        help="destination directory",
    )
    parser.add_argument(
        "urls",
        nargs="+",
        metavar="URL",
        help="url to download images from",
    )
    return parser.parse_args()

def parse_config_file(path):
    """Read the INI configuration file at 'path' (with ~ expansion).

    Returns a ConfigParser whose option names keep their original case
    and whose values are taken literally (no '%' interpolation).
    """
    config = configparser.ConfigParser(interpolation=None)
    # identity transform: option names are case-sensitive in this config
    config.optionxform = lambda option: option
    config.read(os.path.expanduser(path))
    return config

def main():
    """Program entry point: parse options, then process every given URL."""
    opts = parse_cmdline_options()
    conf = parse_config_file(opts.config)
    finder = extractor.ExtractorFinder(conf)
    manager = downloader.DownloadManager(opts, conf)

    # hand each URL's extractor to the download manager
    for url in opts.urls:
        manager.add(finder.match(url))
54 changes: 54 additions & 0 deletions gallery_dl/downloader/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import os
import sys
import importlib

class DownloadManager():
    """Routes extractor output to protocol-specific downloader instances."""

    def __init__(self, opts, conf):
        # opts: parsed command-line namespace; conf: ConfigParser-like mapping
        self.opts = opts
        self.conf = conf
        # cache of Downloader classes, keyed by URL scheme
        self.downloaders = {}

    def add(self, extr):
        """Download every (url, filename) pair produced by the extractor."""
        # destination precedence: command line > per-category config > general
        if self.opts.dest:
            base = self.opts.dest
        elif extr.category in self.conf:
            base = self.conf[extr.category].get("destination", "/tmp/")
        else:
            base = self.conf["general"].get("destination", "/tmp/")
        directory = os.path.join(base, extr.category, extr.directory)
        os.makedirs(directory, exist_ok=True)

        for url, filename in extr:
            path = os.path.join(directory, filename)
            if os.path.exists(path):
                # already downloaded on an earlier run
                self.print_skip(path)
            else:
                downloader = self.get_downloader(extr, url)
                self.print_start(path)
                attempts = downloader.download(url, path)
                self.print_success(path, attempts)

    def get_downloader(self, extr, url):
        """Return a downloader instance for the scheme of 'url' (default: http)."""
        sep = url.find("://")
        scheme = url[:sep] if sep != -1 else "http"
        if scheme not in self.downloaders:
            # lazily import the sibling downloader module for this scheme
            module = importlib.import_module("."+scheme, __package__)
            self.downloaders[scheme] = module.Downloader
        return self.downloaders[scheme](extr)

    @staticmethod
    def print_start(path):
        # print without newline; the success/error line overwrites it later
        print(path, end="")
        sys.stdout.flush()

    @staticmethod
    def print_skip(path):
        # dim text: file already exists
        print("\033[2m", path, "\033[0m", sep="")

    @staticmethod
    def print_success(path, tries):
        if tries == 0:
            print("\r", end="")
        # bold green: download finished
        print("\r\033[1;32m", path, "\033[0m", sep="")
21 changes: 21 additions & 0 deletions gallery_dl/downloader/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import os

class BasicDownloader():
    """Base class for downloaders.

    Subclasses implement download_impl(url, file) and return the number
    of retries that were needed (0 == success on first attempt).
    """

    # maximum number of attempts before a download is abandoned
    max_tries = 5

    def download(self, url, path):
        """Download 'url' into the file at 'path'.

        Returns download_impl()'s retry count.  On any failure the
        partially-written file is removed and the exception re-raised.
        """
        # NOTE: the 'with' block closes the file in every case; the original
        # had an unreachable file.close() after the return, removed here.
        with open(path, "wb") as file:
            try:
                return self.download_impl(url, file)
            except:
                # bare except is deliberate: remove the partial file even on
                # KeyboardInterrupt, then re-raise
                os.unlink(path)
                raise

    @staticmethod
    def print_error(file, error, tries, max_tries=5):
        """Print a colored error line for a failed attempt."""
        if tries == 1 and hasattr(file, "name"):
            # first failure: show which file is affected (bold red)
            print("\r\033[1;31m", file.name, sep="")
        print("\033[0;31m[Error]\033[0m ", error, " (", tries, "/", max_tries, ")", sep="")
42 changes: 42 additions & 0 deletions gallery_dl/downloader/http.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from .common import BasicDownloader
import time
import requests

class Downloader(BasicDownloader):
    """Downloader for http:// (and https://) URLs with retry handling."""

    def __init__(self, extr):
        BasicDownloader.__init__(self)
        # reuse the extractor's session so cookies/headers carry over
        self.session = extr.session

    def download_impl(self, url, file):
        """Stream 'url' into 'file'; returns the number of failed attempts.

        Retries up to self.max_tries on connection errors and non-OK
        status codes; a 404 aborts immediately (returns max_tries).
        """
        tries = 0
        while True:
            # try to connect to remote source
            try:
                response = self.session.get(url, stream=True, verify=True)
            except requests.exceptions.ConnectionError as e:
                tries += 1
                self.print_error(file, e, tries, self.max_tries)
                time.sleep(1)
                if tries == self.max_tries:
                    raise
                continue

            # reject error-status-codes
            if response.status_code != requests.codes.ok:
                tries += 1
                self.print_error(file, 'HTTP status "{} {}"'.format(
                    response.status_code, response.reason), tries, self.max_tries)
                if response.status_code == 404:
                    # not found: no point in retrying
                    return self.max_tries
                time.sleep(1)
                # fix: compare against max_tries (was a hard-coded 5,
                # inconsistent with the rest of this method)
                if tries == self.max_tries:
                    response.raise_for_status()
                continue

            # everything ok -- proceed to download
            break

        for data in response.iter_content(16384):
            file.write(data)
        return tries
1 change: 1 addition & 0 deletions gallery_dl/downloader/https.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .http import Downloader
10 changes: 10 additions & 0 deletions gallery_dl/downloader/text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from .common import BasicDownloader

class Downloader(BasicDownloader):
    """Pseudo-downloader for text:// URLs: stores the URL's payload itself."""

    def __init__(self, extr):
        super().__init__()

    def download_impl(self, url, file):
        # drop the 7-character "text://" scheme prefix, write the rest as UTF-8
        payload = url[7:]
        file.write(payload.encode("utf-8"))
        return 0
21 changes: 21 additions & 0 deletions gallery_dl/extractor/8chan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from .common import BasicExtractor
from urllib.parse import unquote
import re

class Extractor(BasicExtractor):
    """Extractor for image threads on 8chan."""

    thread_url_fmt = "https://www.8chan.co/{0}/res/{1}.html"
    # captures: 1=file url, 2=post prefix, 4=full original name (optional), 5=display name
    regex = r'>File: <a href="([^"]+)">([^<]+)\.[^<]+<.*?<span class="postfilename"( title="([^"]+)")?>([^<]+)<'

    def __init__(self, match, config):
        super().__init__(config)
        # URL path looks like "<board>/res/<thread_id>"
        self.board, _, self.thread_id = match.group(1).split("/")
        self.category = "8chan"
        self.directory = self.board + "-" + self.thread_id

    def images(self):
        """Yield (url, filename) for every file posted in the thread."""
        page = self.request(
            self.thread_url_fmt.format(self.board, self.thread_id)).text
        for m in re.finditer(self.regex, page):
            file_url, prefix, fullname, name = m.group(1, 2, 4, 5)
            # prefer the full original filename when the span carries one
            filename = prefix + "-" + unquote(fullname or name)
            yield ("https://www.8chan.co" + file_url, filename)
47 changes: 47 additions & 0 deletions gallery_dl/extractor/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import os
import sys
import re
import sqlite3
import importlib

class ExtractorFinder():
    """Maps URLs to extractor modules via configured regular expressions."""

    def __init__(self, config):
        self.config = config
        # list of (category, compiled_regex) pairs, checked in order
        self.match_list = list()
        # optional sqlite database holding additional (regex, category) pairs
        if "database" in config["general"]:
            path = os.path.expanduser(config["general"]["database"])
            conn = sqlite3.connect(path)
            self.load_from_database(conn)
        self.load_from_config(config)

    def match(self, url):
        """Return an Extractor instance for 'url', or None if nothing matches."""
        for category, regex in self.match_list:
            match = regex.match(url)
            if match:
                # category doubles as the extractor module name
                module = importlib.import_module("."+category, __package__)
                return module.Extractor(match, self.config)
        return None

    def load_from_database(self, db):
        """Register every (regex, category) row found in the database."""
        query = (
            "SELECT regex.re, category.name "
            "FROM regex JOIN category "
            "ON regex.category_id = category.id"
        )
        for row in db.execute(query):
            self.add_match(row[1], row[0])

    def load_from_config(self, conf):
        """Register every option named regex* from every config section."""
        for category in conf:
            for key, value in conf[category].items():
                if key.startswith("regex"):
                    self.add_match(category, value)

    def add_match(self, category, regex):
        """Compile 'regex' and register it; warn instead of raising on a bad pattern."""
        try:
            self.match_list.append((category, re.compile(regex)))
        except re.error:
            # fix: was a bare except that also swallowed KeyboardInterrupt
            # and programming errors; only malformed patterns are expected here
            print("[Warning] [{0}] failed to compile regular expression '{1}'"
                  .format(category, regex))
22 changes: 22 additions & 0 deletions gallery_dl/extractor/batoto.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from .common import AsyncExtractor
from ..util import filename_from_url
from urllib.parse import unquote

class Extractor(AsyncExtractor):
    """Extractor for manga chapters on batoto (asynchronous page fetching)."""

    def __init__(self, match, config):
        super().__init__(config)
        self.url = "https://bato.to/read/_/" + match.group(1) + "/_/1"
        self.category = "batoto"
        self.directory = match.group(1)

    def images(self):
        """Yield (url, filename) for each page, following next-page links."""
        page_url = self.url
        while page_url:
            page = self.request(page_url).text
            pos = page.find('<div id="full_image"')

            # the anchor wrapping the image links to the next page;
            # page_url becomes None on the last page, ending the loop
            page_url, pos = self.extract(page, '<a href="', '"', pos)
            img_url, pos = self.extract(page, 'src="', '"', pos)
            yield img_url, unquote(filename_from_url(img_url))
67 changes: 67 additions & 0 deletions gallery_dl/extractor/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import queue
import threading
import requests
from ..util import safe_request

class BasicExtractor():
    """Base class for extractors.

    Subclasses implement images(), a generator of (url, filename) pairs,
    and set self.category / self.directory in __init__.
    """

    def __init__(self, config):
        self.session = requests.Session()
        self.category = ""
        self.directory = ""

    def __iter__(self):
        return self.images()

    def request(self, url, *args, **kwargs):
        """Perform a GET request through the retrying safe_request helper."""
        return safe_request(self.session, url, *args, **kwargs)

    def enable_useragent(self):
        """Give the session a browser-like User-Agent header."""
        self.session.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0"

    @staticmethod
    def extract(txt, begin, end, pos=0):
        """Return (text between 'begin' and 'end', position after 'end').

        Searching starts at 'pos'; returns (None, pos) if either marker
        is missing.
        """
        try:
            first = txt.index(begin, pos) + len(begin)
            last = txt.index(end, first)
            return txt[first:last], last+len(end)
        except ValueError:
            # fix: was a bare except; str.index raises ValueError when a
            # marker is missing, anything else should propagate
            return None, pos

    @staticmethod
    def extract_all(txt, begin, end, pos=0):
        """Like extract(), but the returned text includes both markers."""
        try:
            first = txt.index(begin, pos)
            last = txt.index(end, first + len(begin)) + len(end)
            return txt[first:last], last
        except ValueError:
            return None, pos

class AsyncExtractor(BasicExtractor):
    """Extractor whose images() runs on a background thread.

    Results flow through a bounded queue so the producer stays at most
    a few items ahead of the consumer.
    """

    def __init__(self, config):
        super().__init__(config)
        self.__queue = queue.Queue(maxsize=5)
        self.__thread = threading.Thread(target=self.async_images, daemon=True)

    def __iter__(self):
        fetch = self.__queue.get
        mark_done = self.__queue.task_done

        self.__thread.start()
        while True:
            item = fetch()
            # None is the sentinel signalling the producer has finished
            if item is None:
                return
            yield item
            mark_done()

    def async_images(self):
        """Producer side: run images() and feed results into the queue."""
        enqueue = self.__queue.put
        try:
            for item in self.images():
                enqueue(item)
        except:
            # catch-all on purpose: report the failure but still deliver
            # the sentinel so the consumer does not block forever
            import traceback
            print(traceback.format_exc())
        enqueue(None)
Loading

0 comments on commit deef91e

Please sign in to comment.