MIT-LCP · superryeti · Apr 10, 2023 · Apr 10, 2023 · Apr 10, 2023 · Apr 10, 2023
diff --git a/.env.example b/.env.example
@@ -187,7 +187,18 @@ MIN_WORDS_RESEARCH_SUMMARY_CREDENTIALING = 20
 # CITISOAPService API
 # This is the WebServices username and password to access the CITI SOAP Service to obtain users training report details
 # The account can be created at https://webservices.citiprogram.org/login/CreateAccount.aspx
-# The SOAP Service Access can be tested at https://webservices.citiprogram.org/Client/CITISOAPClient_Simple.aspx 
+# The SOAP Service Access can be tested at https://webservices.citiprogram.org/Client/CITISOAPClient_Simple.aspx
 CITI_USERNAME=
 CITI_PASSWORD=
 CITI_SOAP_URL="https://webservices.citiprogram.org/SOAP/CITISOAPService.asmx"
+
+
+# Data Source configurations
+# DEFAULT_PROJECT_DATA_LOCATION controls how data will be stored when a project is published (Direct (DI), Google BigQuery (GBQ), Google Cloud Storage (GCS), AWS Open Data (AOD), AWS S3 (AS3))
+# OPTIONS are: DI, GBQ, GCS, AOD, AS3
+DEFAULT_PROJECT_DATA_LOCATION = 'DI'
+
+# DEFAULT_PROJECT_ACCESS_MECHANISM controls how users can use the data (Google Group Email (google-group-email), S3 (s3), Research Environment (research-environment))
+# OPTIONS are: google-group-email, s3, research-environment, or "" (empty string) for no access mechanism (direct access only)
+# As of now, Research Environment is only available for the GCS data location
+DEFAULT_PROJECT_ACCESS_MECHANISM = ''
diff --git a/physionet-django/console/forms.py b/physionet-django/console/forms.py
@@ -21,6 +21,7 @@
     Contact,
     CopyeditLog,
     DataAccess,
+    DataSource,
     DUA,
     EditLog,
     License,
@@ -689,6 +690,26 @@ def save(self):
         return data_access
 
 
+class DataSourceForm(forms.ModelForm):
+    """
+    Console form for attaching a new DataSource to a published project.
+
+    The project is supplied by the view rather than by the user, so it is
+    not one of the form fields; it is assigned to the instance in save().
+    """
+    class Meta:
+        model = DataSource
+        fields = ('data_location', 'access_mechanism', 'files_available', 'email', 'uri')
+
+    def __init__(self, project, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.project = project
+
+        # Do not offer the research environment mechanism when cloud
+        # research environments are disabled in the settings.
+        if not settings.ENABLE_CLOUD_RESEARCH_ENVIRONMENTS:
+            hidden = DataSource.AccessMechanism.RESEARCH_ENVIRONMENT
+            self.fields['access_mechanism'].choices = [
+                choice for choice in self.fields['access_mechanism'].choices if choice[0] != hidden]
+
+    def clean(self):
+        """
+        Reject a data location that the project already has.
+
+        DataSource is unique on (project, data_location), but because
+        'project' is not a form field Django leaves that constraint out of
+        the form's automatic uniqueness validation. Without this check a
+        duplicate submission would only fail in save() with a database
+        IntegrityError instead of a form error.
+        """
+        cleaned_data = super().clean()
+        data_location = cleaned_data.get('data_location')
+        if data_location:
+            duplicates = DataSource.objects.filter(project=self.project, data_location=data_location)
+            # When editing an existing row, the row itself is not a duplicate.
+            if self.instance.pk is not None:
+                duplicates = duplicates.exclude(pk=self.instance.pk)
+            if duplicates.exists():
+                raise forms.ValidationError('This project already has a data source for this data location.')
+        return cleaned_data
+
+    def save(self):
+        """
+        Save the data source, attached to the project given at construction,
+        and return it.
+        """
+        data_source = super().save(commit=False)
+        data_source.project = self.project
+        data_source.save()
+        return data_source
+
+
 class PublishedProjectContactForm(forms.ModelForm):
     class Meta:
         model = Contact

diff --git a/physionet-django/console/templates/console/manage_published_project.html b/physionet-django/console/templates/console/manage_published_project.html
@@ -359,6 +359,46 @@ <h5 class="card-title mt-0 mb-1">Storage location</h5>
         </table>
         {% endif %}
       </li>
+      <li class="list-group-item">
+        <h5 class="card-title mt-0 mb-1">Data Source</h5>
+        <p>Add and remove Data Source options.</p>
+{#        <div class="alert alert-danger">#}
+{#          <p class='m-0'>Note: The remove button will remove the option for requesting cloud access that appears in the files section of a project. It will not (1) delete/deactivate the bucket or (2) remove access for users who are already using the bucket.</p>#}
+{#        </div>#}
+        {# Add form: the view treats a POST containing 'data_location' as a new data source. #}
+        <form action="" method="post">
+          {% csrf_token %}
+          {% include "project/content_inline_form_snippet.html" with form=data_source_form %}
+          <button class="btn btn-primary" type="submit">Submit</button>
+        </form>
+        {% if data_sources %}
+        <table class="table table-bordered">
+          <thead>
+            <tr>
+              <th>Location</th>
+              <th>Access Mechanism</th>
+              <th>Files Available</th>
+              <th>Email</th>
+              <th>Uri</th>
+              <th>Remove</th>
+            </tr>
+          </thead>
+          <tbody>
+            {% for item in data_sources %}
+            <tr>
+              <td>{{item.data_location}}</td>
+              <td>{{item.access_mechanism}}</td>
+              <td>{{item.files_available}}</td>
+              <td>{{item.email}}</td>
+              <td>{{item.uri}}</td>
+              {# The form lives inside the cell: a form element is not allowed as a direct child of a table row. #}
+              <td>
+                <form action="" method="post">
+                  {% csrf_token %}
+                  <button class='btn btn-danger' name='data_source_removal' value='{{item.id}}'>Remove</button>
+                </form>
+              </td>
+            </tr>
+            {% endfor %}
+          </tbody>
+        </table>
+        {% endif %}
+      </li>
       <li class="list-group-item">
         <h5 class="card-title mt-3 mb-1">Google Cloud</h5>
         {% if not has_credentials %}
@@ -388,6 +428,7 @@ <h5 class="card-title mt-3 mb-1">Google Cloud</h5>
     </ul>
 
   </div>
+
 </div>
 
 {% endblock %}

diff --git a/physionet-django/console/views.py b/physionet-django/console/views.py
@@ -43,6 +43,7 @@
     ActiveProject,
     ArchivedProject,
     DataAccess,
+    DataSource,
     DUA,
     DataAccessRequest,
     DUASignature,
@@ -829,6 +830,7 @@ def manage_published_project(request, project_slug, version):
     deprecate_form = None if project.deprecated_files else forms.DeprecateFilesForm()
     has_credentials = bool(settings.GOOGLE_APPLICATION_CREDENTIALS)
     data_access_form = forms.DataAccessForm(project=project)
+    data_source_form = forms.DataSourceForm(project=project)
     contact_form = forms.PublishedProjectContactForm(project=project,
                                                      instance=project.contact)
     legacy_author_form = forms.CreateLegacyAuthorForm(project=project)
@@ -895,6 +897,18 @@ def manage_published_project(request, project_slug, version):
             if data_access_form.is_valid():
                 data_access_form.save()
                 messages.success(request, "Stored method to access the files")
+        elif 'data_location' in request.POST:
+            data_source_form = forms.DataSourceForm(project=project, data=request.POST)
+            if data_source_form.is_valid():
+                data_source_form.save()
+                messages.success(request, "Stored method to access the files")
+        elif 'data_source_removal' in request.POST and request.POST['data_source_removal'].isdigit():
+            try:
+                data_source = DataSource.objects.get(project=project, id=request.POST['data_source_removal'])
+                data_source.delete()
+                # Deletes the object if it exists for that specific project.
+            except DataSource.DoesNotExist:
+                pass
         elif 'data_access_removal' in request.POST and request.POST['data_access_removal'].isdigit():
             try:
                 data_access = DataAccess.objects.get(project=project, id=request.POST['data_access_removal'])
@@ -921,6 +935,7 @@ def manage_published_project(request, project_slug, version):
                 legacy_author_form = forms.CreateLegacyAuthorForm(project=project)
 
     data_access = DataAccess.objects.filter(project=project)
+    data_sources = DataSource.objects.filter(project=project)
     authors, author_emails, storage_info, edit_logs, copyedit_logs, latest_version = project.info_card()
 
     tasks = list(get_associated_tasks(project))
@@ -946,7 +961,9 @@ def manage_published_project(request, project_slug, version):
             'deprecate_form': deprecate_form,
             'has_credentials': has_credentials,
             'data_access_form': data_access_form,
+            'data_source_form': data_source_form,
             'data_access': data_access,
+            'data_sources': data_sources,
             'rw_tasks': rw_tasks,
             'ro_tasks': ro_tasks,
             'anonymous_url': anonymous_url,

diff --git a/physionet-django/physionet/settings/base.py b/physionet-django/physionet/settings/base.py
@@ -17,6 +17,7 @@
 
 from decouple import config, UndefinedValueError
 
+
 # Build paths inside the project like this: os.path.join(BASE_DIR, ...)
 BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
@@ -617,3 +618,6 @@ class StorageTypes:
 
 # minimum number of word needed for research_summary field for Credentialing Model.
 MIN_WORDS_RESEARCH_SUMMARY_CREDENTIALING = config('MIN_WORDS_RESEARCH_SUMMARY_CREDENTIALING', cast=int, default=20)
+
+# Data location used for the data source created when a project is published.
+# .env.example lists the options as DI, GBQ, GCS, AOD and AS3; defaults to 'DI' (Direct).
+DEFAULT_PROJECT_DATA_LOCATION = config('DEFAULT_PROJECT_DATA_LOCATION', default='DI')
+# Access mechanism paired with the default data location. .env.example lists the options as
+# google-group-email, s3, research-environment or an empty string; None when the variable is unset.
+DEFAULT_PROJECT_ACCESS_MECHANISM = config('DEFAULT_PROJECT_ACCESS_MECHANISM', default=None)
diff --git a/physionet-django/project/migrations/0068_datasource.py b/physionet-django/project/migrations/0068_datasource.py
@@ -0,0 +1,30 @@
+# Generated by Django 4.1.7 on 2023-04-19 14:03
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+    """Create the DataSource table, which ties a published project to a data location."""
+
+    dependencies = [
+        ('project', '0067_alter_activeproject_core_project_and_more'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='DataSource',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('files_available', models.BooleanField(default=False)),
+                # No default here: the DataSource model declares data_location without one, and the
+                # migration state has to match the model or makemigrations reports a pending change.
+                ('data_location', models.CharField(choices=[('DI', 'Direct'), ('GBQ', 'Google BigQuery'), ('GCS', 'Google Cloud Storage'), ('AOD', 'AWS Open Data'), ('AS3', 'AWS S3')], max_length=3)),
+                ('access_mechanism', models.CharField(blank=True, choices=[('google-group-email', 'Google Group Email'), ('s3', 'S3'), ('research-environment', 'Research Environment')], max_length=20, null=True)),
+                ('email', models.CharField(blank=True, max_length=320, null=True)),
+                ('uri', models.CharField(blank=True, max_length=320, null=True)),
+                ('project', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='data_sources', to='project.publishedproject')),
+            ],
+            options={
+                'default_permissions': (),
+                'unique_together': {('project', 'data_location')},
+            },
+        ),
+    ]
diff --git a/physionet-django/project/modelcomponents/access.py b/physionet-django/project/modelcomponents/access.py
@@ -1,16 +1,20 @@
 from datetime import timedelta
 from enum import IntEnum
 
+from django.conf import settings
 from django.contrib.auth.hashers import check_password, make_password
 from django.contrib.contenttypes.fields import GenericForeignKey
 from django.contrib.contenttypes.models import ContentType
+from django.core.exceptions import ValidationError
 from django.db import models
 from django.utils import timezone
 from django.utils.crypto import get_random_string
+from django.utils.translation import gettext_lazy as _
 from project.modelcomponents.fields import SafeHTMLField
 from project.validators import validate_version
 
 from project.managers.access import DataAccessRequestQuerySet, DataAccessRequestManager
+from physionet.settings.base import StorageTypes
 
 
 class AccessPolicy(IntEnum):
@@ -167,6 +171,67 @@ class Meta:
         default_permissions = ()
 
 
+class DataSource(models.Model):
+    """
+    A location where the data of a published project is stored, together
+    with the mechanism users go through to reach it.
+
+    A project can have at most one data source per data location. The rules
+    tying each location to its access mechanism, email and URI live in
+    clean(); note that Django only runs clean() through full_clean() (for
+    example from a model form), not from objects.create().
+    """
+    class DataLocation(models.TextChoices):
+        DIRECT = 'DI', 'Direct'
+        GOOGLE_BIGQUERY = 'GBQ', 'Google BigQuery'
+        GOOGLE_CLOUD_STORAGE = 'GCS', 'Google Cloud Storage'
+        AWS_OPEN_DATA = 'AOD', 'AWS Open Data'
+        AWS_S3 = 'AS3', 'AWS S3'
+
+    class AccessMechanism(models.TextChoices):
+        GOOGLE_GROUP_EMAIL = 'google-group-email', 'Google Group Email'
+        S3 = 's3', 'S3'
+        RESEARCH_ENVIRONMENT = 'research-environment', 'Research Environment'
+
+    project = models.ForeignKey('project.PublishedProject',
+                                related_name='data_sources', db_index=True, on_delete=models.CASCADE)
+    files_available = models.BooleanField(default=False)
+    data_location = models.CharField(max_length=3, choices=DataLocation.choices)
+    access_mechanism = models.CharField(max_length=20, choices=AccessMechanism.choices, null=True, blank=True)
+    email = models.CharField(max_length=320, null=True, blank=True)
+    uri = models.CharField(max_length=320, null=True, blank=True)
+
+    class Meta:
+        default_permissions = ()
+        unique_together = ('project', 'data_location')
+
+    def clean(self):
+        """
+        Check that the access mechanism, email and URI fit the data location.
+
+        Raises ValidationError for the first rule that is violated, or when
+        the data location is not one of the DataLocation values.
+        """
+        super().clean()
+
+        if self.data_location == self.DataLocation.GOOGLE_BIGQUERY:
+            if self.access_mechanism != self.AccessMechanism.GOOGLE_GROUP_EMAIL:
+                raise ValidationError('Google BigQuery data sources must use the Google Group Email access mechanism.')
+            if not self.email:
+                raise ValidationError('Google BigQuery data sources must have an email address.')
+        elif self.data_location == self.DataLocation.GOOGLE_CLOUD_STORAGE:
+            # DataSourceCreator.create_default pairs this location with the
+            # Research Environment mechanism, so that combination has to be
+            # accepted here as well as the Google Group Email one.
+            gcs_mechanisms = (self.AccessMechanism.GOOGLE_GROUP_EMAIL, self.AccessMechanism.RESEARCH_ENVIRONMENT)
+            if self.access_mechanism not in gcs_mechanisms:
+                raise ValidationError('Google Cloud Storage data sources must use the Google Group Email or '
+                                      'Research Environment access mechanism.')
+            if not self.uri:
+                raise ValidationError('Google Cloud Storage data sources must have an uri address.')
+        elif self.data_location == self.DataLocation.AWS_OPEN_DATA:
+            if self.access_mechanism != self.AccessMechanism.S3:
+                raise ValidationError('AWS Open Data data sources must use the S3 access mechanism.')
+            if not self.uri:
+                raise ValidationError('AWS Open Data data sources must have a URI.')
+        elif self.data_location == self.DataLocation.AWS_S3:
+            if self.access_mechanism != self.AccessMechanism.S3:
+                raise ValidationError('AWS S3 data sources must use the S3 access mechanism.')
+            if not self.uri:
+                raise ValidationError('AWS S3 data sources must have a URI.')
+        elif self.data_location == self.DataLocation.DIRECT:
+            # Direct sources must leave both email and URI empty.
+            if self.email:
+                raise ValidationError('Direct data sources must not have an email address.')
+            if self.uri:
+                raise ValidationError('Direct data sources must not have a URI.')
+        else:
+            raise ValidationError('Invalid data location.')
+
+
 class AnonymousAccess(models.Model):
     """
     Makes it possible to grant anonymous access (without user auth)
@@ -274,3 +339,42 @@ class Meta:
 
     def __str__(self):
         return self.name
+
+
+class DataSourceCreator:
+    """
+    Helper that creates DataSource rows for a published project.
+
+    An instance holds the field values of one data source and create()
+    stores them for a given project. create_default() does not use instance
+    state; it derives the data source from the deployment settings.
+    """
+    def __init__(self, **kwargs):
+        """
+        Accept the keyword arguments data_location, files_available, email,
+        uri and access_mechanism. Any that are omitted are stored as None.
+        """
+        self.data_location = kwargs.get('data_location')
+        self.files_available = kwargs.get('files_available')
+        self.email = kwargs.get('email')
+        self.uri = kwargs.get('uri')
+        self.access_mechanism = kwargs.get('access_mechanism')
+
+    def create(self, project):
+        """
+        Create and return a DataSource for the project from the stored values.
+        """
+        return DataSource.objects.create(
+            project=project,
+            files_available=self.files_available,
+            data_location=self.data_location,
+            access_mechanism=self.access_mechanism,
+            email=self.email,
+            uri=self.uri,
+        )
+
+    @staticmethod
+    def create_default(project):
+        """
+        Create the data source implied by the deployment settings.
+
+        Two setting combinations are recognised: direct access with local
+        storage, and a research environment on Google Cloud Storage with GCP
+        storage. Returns the created DataSource, or None when the settings
+        match neither combination (nothing is created in that case).
+        """
+        if (settings.DEFAULT_PROJECT_DATA_LOCATION == DataSource.DataLocation.DIRECT
+                and settings.STORAGE_TYPE == StorageTypes.LOCAL):
+            return DataSource.objects.create(
+                project=project,
+                files_available=True,
+                data_location=DataSource.DataLocation.DIRECT,
+            )
+
+        # RESEARCH_ENVIRONMENT is a member of AccessMechanism; looking it up on
+        # DataLocation raises AttributeError as soon as this branch is evaluated.
+        if (settings.DEFAULT_PROJECT_ACCESS_MECHANISM == DataSource.AccessMechanism.RESEARCH_ENVIRONMENT
+                and settings.DEFAULT_PROJECT_DATA_LOCATION == DataSource.DataLocation.GOOGLE_CLOUD_STORAGE
+                and settings.STORAGE_TYPE == StorageTypes.GCP):
+            return DataSource.objects.create(
+                project=project,
+                files_available=False,
+                data_location=DataSource.DataLocation.GOOGLE_CLOUD_STORAGE,
+                uri=f'gs://{project.project_file_root()}/',
+                access_mechanism=DataSource.AccessMechanism.RESEARCH_ENVIRONMENT,
+            )
+
+        return None
diff --git a/physionet-django/project/modelcomponents/activeproject.py b/physionet-django/project/modelcomponents/activeproject.py
@@ -29,6 +29,7 @@
 from project.modelcomponents.submission import CopyeditLog, EditLog, SubmissionInfo
 from project.modelcomponents.unpublishedproject import UnpublishedProject
 from project.projectfiles import ProjectFiles
+from project.models import DataSourceCreator
 from project.validators import validate_subdir
 
 LOGGER = logging.getLogger(__name__)
@@ -614,6 +615,7 @@ def publish(self, slug=None, make_zip=True, title=None):
             raise
 
         ProjectFiles().publish_complete(self, published_project)
+        DataSourceCreator().create_default(published_project)
 
         return published_project