diff --git a/README.rst b/README.rst
index 0bcc87a5..ca4ac1d1 100644
--- a/README.rst
+++ b/README.rst
@@ -270,6 +270,21 @@ Config settings
         root /www/resource_cache;
       }

+6. Alternatively, you can upload archived resources to s3filestore. Make sure that
+   you have properly set up `ckanext-s3filestore `_.
+
+   Add the following values to the CKAN config file:
+
+   * ``ckanext.archiver.s3upload_enable`` = ``True`` to enable upload to cloud storage; defaults to ``False``.
+   * ``ckanext.s3filestore.aws_storage_path`` = ``my-site-name``, your filestore project path (for example ``ckan/storage_path/archived_resource_dir``); required for uploading the archived resources.
+
+   The resources are uploaded to s3filestore in the directory ``s3filestore.aws_bucket_name/s3filestore.aws_storage_path/archived_resources/resource_id/``.
+
+   A cron job must run at least once a week to update the archived resources and generate a presigned URL for downloading them from the ``s3filestore``. The presigned URL expires 7 days (604800s) after the ``archiver update`` command is run::
+
+     0 0 * * 0 paster --plugin=ckanext-archiver archiver update -c /srv/app/production.ini
+
+
 Legacy settings
 ~~~~~~~~~~~~~~~
diff --git a/ckanext/archiver/plugin.py b/ckanext/archiver/plugin.py
index ac279aa8..65b77892 100644
--- a/ckanext/archiver/plugin.py
+++ b/ckanext/archiver/plugin.py
@@ -149,8 +149,6 @@ def register_reports(self):

     def update_config(self, config):
         p.toolkit.add_template_directory(config, 'templates')
         archive_dir = config.get('ckanext.archiver.archive_dir')
-        if archive_dir:
-            p.toolkit.add_public_directory(config, archive_dir)

     # IActions
diff --git a/ckanext/archiver/tasks.py b/ckanext/archiver/tasks.py
index 8866df79..ddd3ce81 100644
--- a/ckanext/archiver/tasks.py
+++ b/ckanext/archiver/tasks.py
@@ -17,7 +17,7 @@

 from requests.packages import urllib3

-from ckan.common import _
+from ckan.common import _, config
 from ckan.lib.celery_app import celery
 from ckan.lib import uploader
 from ckan import plugins as p
@@ -548,11 +548,60 @@ def archive_resource(context, resource, log, result=None, url_timeout=30):
         log.warning('Not saved cache_url because no value for '
                     'ckanext.archiver.cache_url_root in config')
         raise ArchiveError(_('No value for ckanext.archiver.cache_url_root in config'))
-    cache_url = urlparse.urljoin(context['cache_url_root'],
-                                 '%s/%s' % (relative_archive_path, file_name))
+
+    archiver_s3upload_enable = p.toolkit.asbool(
+        config.get('ckanext.archiver.s3upload_enable', False))
+    resource_id_dir = relative_archive_path.split('/')[-1]
+
+    if archiver_s3upload_enable:
+        upload_obj, key_path = upload_archived_resource(resource_id_dir, file_name, saved_file)
+        cache_url = generate_cache_url(upload_obj, key_path)
+    else:
+        cache_url = urlparse.urljoin(context['cache_url_root'],
+                                     '%s/%s' % (relative_archive_path, file_name))
+
     return {'cache_filepath': saved_file, 'cache_url': cache_url}


+def upload_archived_resource(resource_id_dir, filename, saved_file):
+    '''
+    Uploads the resource to s3filestore in the directory
+    <aws_bucket_name>/<aws_storage_path>/archived_resources/<resource_id>/
+    '''
+    storage_path = config.get('ckanext.s3filestore.aws_storage_path')
+
+    if not storage_path:
+        log.warning('Not saved to filestore because no value for '
+                    'ckanext.s3filestore.aws_storage_path in config')
+        raise ArchiveError(_('No value for ckanext.s3filestore.aws_storage_path in config'))
+
+    with open(saved_file, 'rb') as save_file:
+        upload = uploader.get_uploader('archived_resources')
+        upload.upload_file = save_file
+        upload.filename = filename
+        upload.filepath = os.path.join(storage_path, 'archived_resources',
+                                       resource_id_dir, filename)
+        upload.id = filename
+        upload.clear = False
+        upload.upload(uploader.get_max_resource_size())
+
+    return upload, upload.filepath
+
+
+def generate_cache_url(upload_obj, key_path):
+    '''
+    Generates a presigned URL to download the resource from the s3filestore,
+    which expires after 7 days (604800s), and returns it as the cache_url.
+    '''
+    bucket_name = config.get('ckanext.s3filestore.aws_bucket_name')
+    region = config.get('ckanext.s3filestore.region_name')
+    host_name = config.get('ckanext.s3filestore.host_name')
+    bucket = upload_obj.get_s3_bucket(bucket_name)
+    s3 = upload_obj.get_s3_session()
+    client = s3.client(service_name='s3', endpoint_url=host_name)
+    cache_url = client.generate_presigned_url(ClientMethod='get_object',
+                                              Params={'Bucket': bucket.name, 'Key': key_path},
+                                              ExpiresIn=604800)
+
+    return cache_url
+
+
 def notify_resource(resource, queue, cache_filepath):
     '''
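For context, the README hunk above assumes ckanext-s3filestore is already configured. Pulled together, a CKAN config file using this feature might look roughly like this (a sketch; the bucket, path, region, and endpoint values are placeholders, not part of this patch)::

    ckanext.archiver.s3upload_enable = True
    # Existing ckanext-s3filestore settings (placeholder values):
    ckanext.s3filestore.aws_bucket_name = my-bucket
    ckanext.s3filestore.aws_storage_path = my-site-name
    ckanext.s3filestore.region_name = us-east-1
    ckanext.s3filestore.host_name = https://s3.amazonaws.com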
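Likewise, the presigned-URL call added in ``generate_cache_url`` is standard boto3, so the expiry behaviour can be checked outside CKAN. Below is a minimal sketch assuming placeholder bucket, key, and endpoint values, rather than the s3filestore uploader session the patch actually uses::

    import boto3

    # Placeholder values; in the patch these come from the
    # ckanext.s3filestore.* config and the uploader's S3 session.
    client = boto3.client('s3', endpoint_url='https://s3.amazonaws.com')

    # Same call the patch makes: a GET URL that stops working after
    # 7 days, hence the weekly `archiver update` cron job above.
    url = client.generate_presigned_url(
        ClientMethod='get_object',
        Params={'Bucket': 'my-bucket',
                'Key': 'my-site-name/archived_resources/RESOURCE_ID/data.csv'},
        ExpiresIn=604800,  # 7 days, in seconds
    )
    print(url)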