Add import_file that imports files to working directory

genialis · Apr 9, 2019 · 74bb00f · 74bb00f
1 parent 4d0e14a
commit 74bb00f
Show file tree

Hide file tree

Showing 14 changed files with 428 additions and 2 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -2,6 +2,8 @@
 # explicitly overriden in the jobs.include matrix.
 
 language: python
+# We need sudo to install packages.
+sudo: required
 # NOTE: Apparently, Travis CI still hasn't strealined Python 3.7 support so
 # one has to resort to using the "unofficial" Ubuntu Xenial Xerus (16.04 LTS)
 # build environment and enable the "sudo" mode.
@@ -11,6 +13,9 @@ language: python
 dist: xenial
 python: "3.7"
 
+before_install:
+  - sudo apt-get install -y p7zip-full
+
 install: pip install tox
 
 script: tox -e $TOX_ENV
@@ -29,12 +34,15 @@ jobs:
       env: TOX_ENV=py37
     - stage: test
       env: TOX_ENV=docs
+      before_install: skip
       after_success: skip
     - stage: test
       env: TOX_ENV=linters
+      before_install: skip
       after_success: skip
     - stage: test
       env: TOX_ENV=packaging
+      before_install: skip
       after_success: skip
 
     # NOTE: We undo/change all the global Travis CI step definitions to ensure
@@ -44,6 +52,7 @@ jobs:
       # Project uses setuptools_scm to determine the version from a SCM tag.
       install: pip install setuptools_scm
       script: skip
+      before_install: skip
       # NOTE: Due to the way Travis CI currently works, setting
       # 'after_success: skip' would also result in skipping the deploy step.
       # A work-around is to execute a dummy echo command.

diff --git a/MANIFEST.in b/MANIFEST.in
@@ -7,3 +7,4 @@ include docs/conf.py
 include AUTHORS LICENSE
 # Include files needed by tests
 include pyproject.toml
+recursive-include test_files *
diff --git a/README.rst b/README.rst
@@ -27,6 +27,8 @@ Resolwe Runtime Utilities
 A project that provides convenience utilities for writing processes for the
 Resolwe_ dataflow engine.
 
+The ``import_file`` function requires `7z` in path.
+
 You can find more information in the documentation_.
 
 .. _Resolwe: https://github.com/genialis/resolwe

diff --git a/docs/CHANGELOG.rst b/docs/CHANGELOG.rst
@@ -6,6 +6,15 @@ All notable changes to the |project_name| project will be documented in this
 file.
 This project adheres to `Semantic Versioning <http://semver.org/>`_.
 
+==========
+Unreleased
+==========
+Added
+-----
+- Add ``import_file`` that imports compressed (or not) files of various formats
+  to working directory.
+
+
 ==================
 1.2.0 - 2017-08-08
 ==================

diff --git a/resolwe_runtime_utils.py b/resolwe_runtime_utils.py
@@ -15,9 +15,19 @@
 """
 Utility functions that make it easier to write a Resolwe process.
 """
-
+import glob
+import gzip
 import json
 import os
+import re
+import shlex
+import shutil
+import subprocess
+import tarfile
+import zlib
+
+import requests
+
 
 # Compat between Python 2.7/3.4 and Python 3.5
 if not hasattr(json, 'JSONDecodeError'):
@@ -301,6 +311,225 @@ def checkrc(rc, *args):
     return json.dumps(ret)
 
 
+CHUNK_SIZE = 10_000_000  # 10 Mbytes
+
+
+class ImportedFormat:
+    """Import destination file format."""
+
+    EXTRACTED = 'extracted'
+    COMPRESSED = 'compressed'
+    BOTH = 'both'
+
+
+def import_file(
+    src,
+    file_name,
+    imported_format=ImportedFormat.BOTH,
+    progress_from=0.0,
+    progress_to=None,
+):
+    """Import file to working directory.
+
+    :param src: Source file path or URL
+    :param file_name: Source file name
+    :param imported_format: Import file format (extracted, compressed or both)
+    :param progress_from: Initial progress value
+    :param progress_to: Final progress value
+    :return: Destination file path (if extracted and compressed, extracted path given)
+    """
+
+    if progress_to is not None:
+        if not isinstance(progress_from, float) or not isinstance(progress_to, float):
+            raise ValueError("Progress_from and progress_to must be float")
+
+        if progress_from < 0 or progress_from > 1:
+            raise ValueError("Progress_from must be between 0 and 1")
+
+        if progress_to < 0 or progress_to > 1:
+            raise ValueError("Progress_to must be between 0 and 1")
+
+        if progress_from >= progress_to:
+            raise ValueError("Progress_to must be higher than progress_from")
+
+    print("Importing and compressing {}...".format(file_name))
+
+    def importGz():
+        """Import gzipped file.
+
+        The file_name must have .gz extension.
+        """
+        if imported_format != ImportedFormat.COMPRESSED:  # Extracted file required
+            with open(file_name[:-3], 'wb') as f_out, gzip.open(src, 'rb') as f_in:
+                try:
+                    shutil.copyfileobj(f_in, f_out, CHUNK_SIZE)
+                except zlib.error:
+                    raise ValueError("Invalid gzip file format: {}".format(file_name))
+
+        else:  # Extracted file not-required
+            # Verify the compressed file.
+            with gzip.open(src, 'rb') as f:
+                try:
+                    while f.read(CHUNK_SIZE) != b'':
+                        pass
+                except zlib.error:
+                    raise ValueError("Invalid gzip file format: {}".format(file_name))
+
+        if imported_format != ImportedFormat.EXTRACTED:  # Compressed file required
+            try:
+                shutil.copyfile(src, file_name)
+            except shutil.SameFileError:
+                pass  # Skip copy of downloaded files
+
+        if imported_format == ImportedFormat.COMPRESSED:
+            return file_name
+        else:
+            return file_name[:-3]
+
+    def import7z():
+        """Import compressed file in various formats.
+
+        Supported extensions: .bz2, .zip, .rar, .7z, .tar.gz, and .tar.bz2.
+        """
+        extracted_name, _ = os.path.splitext(file_name)
+        destination_name = extracted_name
+        temp_dir = 'temp_{}'.format(extracted_name)
+
+        cmd = '7z x -y -o{} {}'.format(shlex.quote(temp_dir), shlex.quote(src))
+        try:
+            subprocess.check_call(cmd, shell=True)
+        except subprocess.CalledProcessError as err:
+            if err.returncode == 2:
+                raise ValueError("Failed to extract file: {}".format(file_name))
+            else:
+                raise
+
+        paths = os.listdir(temp_dir)
+        if len(paths) == 1 and os.path.isfile(os.path.join(temp_dir, paths[0])):
+            # Single file in archive.
+            temp_file = os.path.join(temp_dir, paths[0])
+
+            if imported_format != ImportedFormat.EXTRACTED:  # Compressed file required
+                with open(temp_file, 'rb') as f_in, gzip.open(
+                    extracted_name + '.gz', 'wb'
+                ) as f_out:
+                    shutil.copyfileobj(f_in, f_out, CHUNK_SIZE)
+
+            if imported_format != ImportedFormat.COMPRESSED:  # Extracted file required
+                shutil.move(temp_file, './{}'.format(extracted_name))
+
+                if extracted_name.endswith('.tar'):
+                    with tarfile.open(extracted_name) as tar:
+                        tar.extractall()
+
+                    os.remove(extracted_name)
+                    destination_name, _ = os.path.splitext(extracted_name)
+            else:
+                destination_name = extracted_name + '.gz'
+        else:
+            # Directory or several files in archive.
+            if imported_format != ImportedFormat.EXTRACTED:  # Compressed file required
+                with tarfile.open(extracted_name + '.tar.gz', 'w:gz') as tar:
+                    for fname in glob.glob(os.path.join(temp_dir, '*')):
+                        tar.add(fname, os.path.basename(fname))
+
+            if imported_format != ImportedFormat.COMPRESSED:  # Extracted file required
+                for path in os.listdir(temp_dir):
+                    shutil.move(os.path.join(temp_dir, path), './{}'.format(path))
+            else:
+                destination_name = extracted_name + '.tar.gz'
+
+        shutil.rmtree(temp_dir)
+        return destination_name
+
+    def importUncompressed():
+        """Import uncompressed file."""
+        if imported_format != ImportedFormat.EXTRACTED:  # Compressed file required
+            with open(src, 'rb') as f_in, gzip.open(file_name + '.gz', 'wb') as f_out:
+                shutil.copyfileobj(f_in, f_out, CHUNK_SIZE)
+
+        if imported_format != ImportedFormat.COMPRESSED:  # Extracted file required
+            try:
+                shutil.copyfile(src, file_name)
+            except shutil.SameFileError:
+                pass  # Skip copy of downloaded files
+
+        return (
+            file_name + '.gz'
+            if imported_format == ImportedFormat.COMPRESSED
+            else file_name
+        )
+
+    # Large file download from Google Drive requires cookie and token.
+    try:
+        response = None
+        if re.match(
+            r'^https://drive.google.com/[-A-Za-z0-9\+&@#/%?=~_|!:,.;]*[-A-Za-z0-9\+&@#/%=~_|]$',
+            src,
+        ):
+            session = requests.Session()
+            response = session.get(src, stream=True)
+
+            token = None
+            for key, value in response.cookies.items():
+                if key.startswith('download_warning'):
+                    token = value
+                    break
+
+            if token is not None:
+                params = {'confirm': token}
+                response = session.get(src, params=params, stream=True)
+
+        elif re.match(
+            r'^(https?|ftp)://[-A-Za-z0-9\+&@#/%?=~_|!:,.;]*[-A-Za-z0-9\+&@#/%=~_|]$',
+            src,
+        ):
+            response = requests.get(src, stream=True)
+    except requests.exceptions.ConnectionError:
+        raise requests.exceptions.ConnectionError("Could not connect to {}".format(src))
+
+    if response:
+        with open(file_name, 'wb') as f:
+            total = response.headers.get('content-length')
+            total = float(total) if total else None
+            downloaded = 0
+            current_progress = 0
+            for content in response.iter_content(chunk_size=CHUNK_SIZE):
+                f.write(content)
+
+                if total is not None and progress_to is not None:
+                    downloaded += len(content)
+                    progress_span = progress_to - progress_from
+                    next_progress = progress_from + progress_span * downloaded / total
+                    next_progress = round(next_progress, 2)
+
+                    if next_progress > current_progress:
+                        print(progress(next_progress))
+                        current_progress = next_progress
+
+        # Check if a temporary file exists.
+        if not os.path.isfile(file_name):
+            raise ValueError("Downloaded file not found {}".format(file_name))
+
+        src = file_name
+    else:
+        if not os.path.isfile(src):
+            raise ValueError("Source file not found {}".format(src))
+
+    # Decide which import should be used.
+    if re.search(r'\.(bz2|zip|rar|7z|tgz|tar\.gz|tar\.bz2)$', file_name):
+        destination_file_name = import7z()
+    elif file_name.endswith('.gz'):
+        destination_file_name = importGz()
+    else:
+        destination_file_name = importUncompressed()
+
+    if progress_to is not None:
+        print(progress(progress_to))
+
+    return destination_file_name
+
+
 ###############################################################################
 # Auxiliary functions for preparing multi-platform console scripts via        #
 # setuptools' 'console_scripts' entry points mechanism for automatic script   #

diff --git a/setup.py b/setup.py
@@ -56,11 +56,19 @@
     ],
     keywords='resolwe runtime utilities library',
     py_modules=['resolwe_runtime_utils'],
+    install_requires=['requests>=2.21'],
     extras_require={
         'dev': ['tox'],
         'docs': ['Sphinx', 'sphinx_rtd_theme'],
         'package': ['twine', 'wheel'],
-        'test': ['black', 'check-manifest', 'readme', 'pytest-cov', 'setuptools_scm'],
+        'test': [
+            'black',
+            'check-manifest',
+            'readme',
+            'pytest-cov',
+            'responses',
+            'setuptools_scm',
+        ],
     },
     entry_points={
         'console_scripts': [

diff --git a/test_files/corrupted.gz b/test_files/corrupted.gz
diff --git a/test_files/corrupted.zip b/test_files/corrupted.zip
diff --git a/test_files/some file.1.txt b/test_files/some file.1.txt
@@ -0,0 +1 @@
+some text
diff --git a/test_files/some file.1.txt.gz b/test_files/some file.1.txt.gz
diff --git a/test_files/some file.1.txt.zip b/test_files/some file.1.txt.zip
diff --git a/test_files/some folder 1.zip b/test_files/some folder 1.zip
diff --git a/test_files/some folder.tar.gz b/test_files/some folder.tar.gz