Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] refactor DataAccess to DataSource #1967

Draft
wants to merge 12 commits into
base: dev
Choose a base branch
from
13 changes: 12 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,18 @@ MIN_WORDS_RESEARCH_SUMMARY_CREDENTIALING = 20
# CITISOAPService API
# This is the WebServices username and password to access the CITI SOAP Service to obtain users training report details
# The account can be created at https://webservices.citiprogram.org/login/CreateAccount.aspx
# The SOAP Service Access can be tested at https://webservices.citiprogram.org/Client/CITISOAPClient_Simple.aspx
# The SOAP Service Access can be tested at https://webservices.citiprogram.org/Client/CITISOAPClient_Simple.aspx
CITI_USERNAME=
CITI_PASSWORD=
CITI_SOAP_URL="https://webservices.citiprogram.org/SOAP/CITISOAPService.asmx"


# Data Source configurations
# DEFAULT_PROJECT_DATA_LOCATION controls how data will be stored when a project is published (Direct (DI), Google BigQuery (GBQ), Google Cloud Storage (GCS), AWS Open Data (AOD), AWS S3 (AS3))
# OPTIONS are: DI, GBQ, GCS, AOD, AS3
DEFAULT_PROJECT_DATA_LOCATION = 'DI'

# DEFAULT_PROJECT_ACCESS_MECHANISM controls how users can use the data (Google Group Email (google-group-email), S3 (s3), Research Environment (research-environment))
# OPTIONS are: google-group-email, s3, research-environment or "" (empty string) for no access mechanism (only direct access)
# As of now, Research Environment is only available for the GCS data location
DEFAULT_PROJECT_ACCESS_MECHANISM = ''
21 changes: 21 additions & 0 deletions physionet-django/console/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
Contact,
CopyeditLog,
DataAccess,
DataSource,
DUA,
EditLog,
License,
Expand Down Expand Up @@ -689,6 +690,26 @@ def save(self):
return data_access


class DataSourceForm(forms.ModelForm):
class Meta:
    # Model fields the console form lets an editor set; the project is
    # not listed here because the form assigns it in save().
    model = DataSource
    fields = ('data_location', 'access_mechanism', 'files_available', 'email', 'uri')

def __init__(self, project, *args, **kwargs):
    """
    Build the form and remember the project the data source belongs to.

    When settings.ENABLE_CLOUD_RESEARCH_ENVIRONMENTS is falsy, the
    'research-environment' option is dropped from the access mechanism
    choices.
    """
    super().__init__(*args, **kwargs)
    self.project = project

    if settings.ENABLE_CLOUD_RESEARCH_ENVIRONMENTS:
        return

    mechanism_field = self.fields['access_mechanism']
    mechanism_field.choices = [
        entry for entry in mechanism_field.choices
        if entry[0] != 'research-environment'
    ]
Comment on lines +702 to +704
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm wondering whether we want to care about this being enabled or not... It makes sense and it would be my first instinct to do so, but right now it seems that research environments are singled out as something that can be present or not.

The HDN deployment is not integrated with any other data source other than local/research environment, so in that case it makes no sense showing anything other than that. Research environments are the only ones that have an explicit setting to disable them, but still.

I guess it doesn't hurt to have this, it just got me thinking!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can see what you mean. That makes sense: for an HDN-like deployment, we might just want to have the local/research environment and disable the other options.

From what I understand, we might want to allow deployments to have any combination of access mechanisms.
So what if we had a configurable environment variable like
ACCESS_MECHANISMS = 'research-environment,direct, awss3'
and used that?

Or maybe, since this is something only the Project Editor or console admin can do, we might be okay with showing them all the options and trusting them to use the right ones.


def save(self):
    """
    Persist the data source described by the form.

    The instance is first built without touching the database so that
    the project given to the form can be attached, then it is saved.

    Returns the saved DataSource instance.
    """
    instance = super().save(commit=False)
    instance.project = self.project
    instance.save()
    return instance


class PublishedProjectContactForm(forms.ModelForm):
class Meta:
model = Contact
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,46 @@ <h5 class="card-title mt-0 mb-1">Storage location</h5>
</table>
{% endif %}
</li>
<li class="list-group-item">
<h5 class="card-title mt-0 mb-1">Data Source</h5>
<p>Add and remove Data Source options.</p>
{# <div class="alert alert-danger">#}
{# <p class='m-0'>Note: The remove button will remove the option for requesting cloud access that appears in the files section of a project. It will not (1) delete/deactivate the bucket or (2) remove access for users who are already using the bucket.</p>#}
{# </div>#}
<form action="" method="post">
{% csrf_token %}
{% include "project/content_inline_form_snippet.html" with form=data_source_form %}
<button class="btn btn-primary" type="submit">Submit</button>
</form>
{% if data_sources %}
<table class="table table-bordered">
<thead>
<tr>
<th>Location</th>
<th>Access Mechanism</th>
<th>Files Available</th>
<th>Email</th>
<th>Uri</th>
<th>Remove</th>
</tr>
</thead>
<tbody>
{% for item in data_sources %}
<tr>
<td>{{item.data_location}}</td>
<td>{{item.access_mechanism}}</td>
<td>{{item.files_available}}</td>
<td>{{item.email}}</td>
<td>{{item.uri}}</td>
<form action="" method="post">
{% csrf_token %}
<td><button class='btn btn-danger' name='data_source_removal' value='{{item.id}}'>Remove</button></td>
</form>
</tr>
{% endfor %}
</tbody>
</table>
{% endif %}
</li>
<li class="list-group-item">
<h5 class="card-title mt-3 mb-1">Google Cloud</h5>
{% if not has_credentials %}
Expand Down Expand Up @@ -388,6 +428,7 @@ <h5 class="card-title mt-3 mb-1">Google Cloud</h5>
</ul>

</div>

</div>

{% endblock %}
Expand Down
17 changes: 17 additions & 0 deletions physionet-django/console/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
ActiveProject,
ArchivedProject,
DataAccess,
DataSource,
DUA,
DataAccessRequest,
DUASignature,
Expand Down Expand Up @@ -829,6 +830,7 @@ def manage_published_project(request, project_slug, version):
deprecate_form = None if project.deprecated_files else forms.DeprecateFilesForm()
has_credentials = bool(settings.GOOGLE_APPLICATION_CREDENTIALS)
data_access_form = forms.DataAccessForm(project=project)
data_source_form = forms.DataSourceForm(project=project)
contact_form = forms.PublishedProjectContactForm(project=project,
instance=project.contact)
legacy_author_form = forms.CreateLegacyAuthorForm(project=project)
Expand Down Expand Up @@ -895,6 +897,18 @@ def manage_published_project(request, project_slug, version):
if data_access_form.is_valid():
data_access_form.save()
messages.success(request, "Stored method to access the files")
elif 'data_location' in request.POST:
data_source_form = forms.DataSourceForm(project=project, data=request.POST)
if data_source_form.is_valid():
data_source_form.save()
messages.success(request, "Stored method to access the files")
elif 'data_source_removal' in request.POST and request.POST['data_source_removal'].isdigit():
try:
data_source = DataSource.objects.get(project=project, id=request.POST['data_source_removal'])
data_source.delete()
# Deletes the object if it exists for that specific project.
except DataSource.DoesNotExist:
pass
elif 'data_access_removal' in request.POST and request.POST['data_access_removal'].isdigit():
try:
data_access = DataAccess.objects.get(project=project, id=request.POST['data_access_removal'])
Expand All @@ -921,6 +935,7 @@ def manage_published_project(request, project_slug, version):
legacy_author_form = forms.CreateLegacyAuthorForm(project=project)

data_access = DataAccess.objects.filter(project=project)
data_sources = DataSource.objects.filter(project=project)
authors, author_emails, storage_info, edit_logs, copyedit_logs, latest_version = project.info_card()

tasks = list(get_associated_tasks(project))
Expand All @@ -946,7 +961,9 @@ def manage_published_project(request, project_slug, version):
'deprecate_form': deprecate_form,
'has_credentials': has_credentials,
'data_access_form': data_access_form,
'data_source_form': data_source_form,
'data_access': data_access,
'data_sources': data_sources,
'rw_tasks': rw_tasks,
'ro_tasks': ro_tasks,
'anonymous_url': anonymous_url,
Expand Down
4 changes: 4 additions & 0 deletions physionet-django/physionet/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

from decouple import config, UndefinedValueError


# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

Expand Down Expand Up @@ -617,3 +618,6 @@ class StorageTypes:

# minimum number of word needed for research_summary field for Credentialing Model.
MIN_WORDS_RESEARCH_SUMMARY_CREDENTIALING = config('MIN_WORDS_RESEARCH_SUMMARY_CREDENTIALING', cast=int, default=20)

# Data location code ('DI', 'GBQ', 'GCS', 'AOD' or 'AS3') that
# DataSourceCreator.create_default reads when a project is published.
DEFAULT_PROJECT_DATA_LOCATION = config('DEFAULT_PROJECT_DATA_LOCATION', default='DI')
# Access mechanism read alongside the data location above; None when the
# environment variable is not set.
DEFAULT_PROJECT_ACCESS_MECHANISM = config('DEFAULT_PROJECT_ACCESS_MECHANISM', default=None)
30 changes: 30 additions & 0 deletions physionet-django/project/migrations/0068_datasource.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Generated by Django 4.1.7 on 2023-04-19 14:03

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
# Creates the DataSource table, linked to a published project.

dependencies = [
('project', '0067_alter_activeproject_core_project_and_more'),
]

operations = [
migrations.CreateModel(
name='DataSource',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('files_available', models.BooleanField(default=False)),
# NOTE(review): default='DI' is set here, but the DataSource.data_location
# model field in this change declares no default - confirm that
# makemigrations reports no pending changes.
('data_location', models.CharField(choices=[('DI', 'Direct'), ('GBQ', 'Google BigQuery'), ('GCS', 'Google Cloud Storage'), ('AOD', 'AWS Open Data'), ('AS3', 'AWS S3')], default='DI', max_length=3)),
# 'research-environment' is exactly 20 characters, the max_length below.
('access_mechanism', models.CharField(blank=True, choices=[('google-group-email', 'Google Group Email'), ('s3', 'S3'), ('research-environment', 'Research Environment')], max_length=20, null=True)),
('email', models.CharField(blank=True, max_length=320, null=True)),
('uri', models.CharField(blank=True, max_length=320, null=True)),
# Deleting the published project deletes its data sources (CASCADE).
('project', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='data_sources', to='project.publishedproject')),
],
options={
'default_permissions': (),
# At most one data source per data location for a given project.
'unique_together': {('project', 'data_location')},
},
),
]
104 changes: 104 additions & 0 deletions physionet-django/project/modelcomponents/access.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
from datetime import timedelta
from enum import IntEnum

from django.conf import settings
from django.contrib.auth.hashers import check_password, make_password
from django.contrib.contenttypes.fields import GenericForeignKey
from django.contrib.contenttypes.models import ContentType
from django.core.exceptions import ValidationError
from django.db import models
from django.utils import timezone
from django.utils.crypto import get_random_string
from django.utils.translation import gettext_lazy as _
from project.modelcomponents.fields import SafeHTMLField
from project.validators import validate_version

from project.managers.access import DataAccessRequestQuerySet, DataAccessRequestManager
from physionet.settings.base import StorageTypes


class AccessPolicy(IntEnum):
Expand Down Expand Up @@ -167,6 +171,67 @@ class Meta:
default_permissions = ()


class DataSource(models.Model):
superryeti marked this conversation as resolved.
Show resolved Hide resolved
"""
Controls all access to project data.
"""
class DataLocation(models.TextChoices):
DIRECT = 'DI', 'Direct'
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any advantage to using abbreviations like DI or AS3 over DIRECT, AWS_S3, AWS_OPEN_DATA etc.? I guess it will save a teeny-tiny amount of space, but it makes things a bit incomprehensible when looking at it from the data's perspective.

GOOGLE_BIGQUERY = 'GBQ', 'Google BigQuery'
GOOGLE_CLOUD_STORAGE = 'GCS', 'Google Cloud Storage'
AWS_OPEN_DATA = 'AOD', 'AWS Open Data'
AWS_S3 = 'AS3', 'AWS S3'

class AccessMechanism(models.TextChoices):
# Ways a user can be given access to the data. The stored values must fit
# the access_mechanism field's max_length=20; 'research-environment' is
# exactly 20 characters long.
GOOGLE_GROUP_EMAIL = 'google-group-email', 'Google Group Email'
S3 = 's3', 'S3'
RESEARCH_ENVIRONMENT = 'research-environment', 'Research Environment'

project = models.ForeignKey('project.PublishedProject',
related_name='data_sources', db_index=True, on_delete=models.CASCADE)
files_available = models.BooleanField(default=False)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm going to go back to this to give it some more thought - the files_available boolean is dubious now, seeing it in practice. It's a bit confusing what it's supposed to achieve unless you were part of the previous conversations 🤔

I think it's going to be more clear once we see it actually used in practice, i.e. replace the current DataAccess with DataSource. Originally the idea of this boolean was to denote the source that gives us access to the file browser. I think at this point we could not care about this and assume only the direct source gives us direct file download access. We could easily go back to this and make it more generic in the future.

The point is that in theory the GCS source could also give us access to files, since we are using buckets in HDN to store the projects, with the exact same UX as PhysioNet has for direct upload. But right now the mechanisms are tightly coupled with their specific use cases, so maybe we could just not care about it for now. This would mean we wouldn't initially need the files_available bool.

data_location = models.CharField(max_length=3, choices=DataLocation.choices)
access_mechanism = models.CharField(max_length=20, choices=AccessMechanism.choices, null=True, blank=True)
email = models.CharField(max_length=320, null=True, blank=True)
uri = models.CharField(max_length=320, null=True, blank=True)

class Meta:
# Do not create the default add/change/delete/view permissions.
default_permissions = ()
# A project can have at most one data source per data location.
unique_together = ('project', 'data_location')

def clean(self):
    """
    Validate that the fields agree with the chosen data location.

    Every data location declares which access mechanisms it accepts,
    which fields must be filled in and which fields must stay empty; a
    single piece of logic then checks the instance against those rules.

    Google Cloud Storage accepts the Research Environment mechanism as
    well as Google Group Email, because DataSourceCreator.create_default
    stores exactly that combination.

    Raises:
        ValidationError: if the data location is unknown, the access
            mechanism is not accepted for it, a required field is empty
            or a forbidden field is set.
    """
    super().clean()

    locations = self.DataLocation
    mechanisms = self.AccessMechanism

    try:
        # Normalise the stored value to its enum member; unknown values
        # (including None) raise ValueError.
        location = locations(self.data_location)
    except ValueError:
        raise ValidationError('Invalid data location.') from None

    # None means the access mechanism is not checked for that location.
    allowed_mechanisms = {
        locations.DIRECT: None,
        locations.GOOGLE_BIGQUERY: (mechanisms.GOOGLE_GROUP_EMAIL,),
        locations.GOOGLE_CLOUD_STORAGE: (mechanisms.GOOGLE_GROUP_EMAIL, mechanisms.RESEARCH_ENVIRONMENT),
        locations.AWS_OPEN_DATA: (mechanisms.S3,),
        locations.AWS_S3: (mechanisms.S3,),
    }
    required_fields = {
        locations.GOOGLE_BIGQUERY: ('email',),
        locations.GOOGLE_CLOUD_STORAGE: ('uri',),
        locations.AWS_OPEN_DATA: ('uri',),
        locations.AWS_S3: ('uri',),
    }
    forbidden_fields = {
        locations.DIRECT: ('email', 'uri'),
    }
    # How each field is named in the error messages.
    field_labels = {'email': 'an email address', 'uri': 'a URI'}

    allowed = allowed_mechanisms[location]
    if allowed is not None and self.access_mechanism not in allowed:
        mechanism_names = ' or '.join(str(mechanism.label) for mechanism in allowed)
        raise ValidationError(
            f'{location.label} data sources must use the {mechanism_names} access mechanism.')

    for field_name in required_fields.get(location, ()):
        if not getattr(self, field_name):
            raise ValidationError(
                f'{location.label} data sources must have {field_labels[field_name]}.')

    for field_name in forbidden_fields.get(location, ()):
        if getattr(self, field_name):
            raise ValidationError(
                f'{location.label} data sources must not have {field_labels[field_name]}.')
Comment on lines +205 to +232
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is very hard to follow. You could approach it differently - define what exactly is required for each data source and have a single piece of logic that validates it.

required_fields = {
  self.DataLocation.DIRECT: [],
  self.DataLocation.AWS_S3: ["uri"],
  ...
}

forbidden_fields = {
  self.DataLocation.DIRECT: ["uri", "email"],
  ...
}

for required_field in required_fields[self.data_location]:
  if not getattr(self, required_field): # Or something like that
    raise ValidationError("...")

for forbidden_field in forbidden_fields[self.data_location]:
  if getattr(self, forbidden_field): # Or something like that
    raise ValidationError("...")

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same goes for data access:

data_location_access_mechanisms = {
  self.DataLocation.DIRECT: None,
  self.DataLocation.AWS_S3: self.AccessMechanism.S3,
  ...
}



class AnonymousAccess(models.Model):
"""
Makes it possible to grant anonymous access (without user auth)
Expand Down Expand Up @@ -274,3 +339,42 @@ class Meta:

def __str__(self):
return self.name


class DataSourceCreator:
    """
    Helper for creating DataSource rows for a published project.

    An instance holds the field values for one data source and writes it
    with create(); create_default() builds the data source implied by the
    deployment settings instead.
    """

    def __init__(self, **kwargs):
        """
        Store the field values for the data source to create.

        Accepted keyword arguments: data_location, files_available, email,
        uri and access_mechanism. Missing values default to None, except
        files_available which defaults to False.
        """
        self.data_location = kwargs.get('data_location')
        # DataSource.files_available is a non-null BooleanField, so a
        # missing or None value falls back to False (the model default).
        files_available = kwargs.get('files_available')
        self.files_available = False if files_available is None else files_available
        self.email = kwargs.get('email')
        self.uri = kwargs.get('uri')
        self.access_mechanism = kwargs.get('access_mechanism')

    def create(self, project):
        """
        Create a DataSource for the project from the stored values.

        Returns the created DataSource.
        """
        return DataSource.objects.create(
            project=project,
            files_available=self.files_available,
            data_location=self.data_location,
            access_mechanism=self.access_mechanism,
            email=self.email,
            uri=self.uri,
        )

    @staticmethod
    def create_default(project):
        """
        Create the data source implied by the deployment settings.

        - Direct data location with local storage: a Direct data source
          with files available.
        - Google Cloud Storage data location, Research Environment access
          mechanism and GCP storage: a Google Cloud Storage data source
          pointing at the project's bucket path.

        Returns the created DataSource, or None when the settings match
        neither combination.
        """
        data_location = settings.DEFAULT_PROJECT_DATA_LOCATION
        access_mechanism = settings.DEFAULT_PROJECT_ACCESS_MECHANISM

        if (data_location == DataSource.DataLocation.DIRECT
                and settings.STORAGE_TYPE == StorageTypes.LOCAL):
            return DataSource.objects.create(
                project=project,
                files_available=True,
                data_location=DataSource.DataLocation.DIRECT,
            )

        # RESEARCH_ENVIRONMENT is a member of AccessMechanism, not of
        # DataLocation; looking it up on DataLocation raises AttributeError.
        if (access_mechanism == DataSource.AccessMechanism.RESEARCH_ENVIRONMENT
                and data_location == DataSource.DataLocation.GOOGLE_CLOUD_STORAGE
                and settings.STORAGE_TYPE == StorageTypes.GCP):
            return DataSource.objects.create(
                project=project,
                files_available=False,
                data_location=DataSource.DataLocation.GOOGLE_CLOUD_STORAGE,
                uri=f'gs://{project.project_file_root()}/',
                access_mechanism=DataSource.AccessMechanism.RESEARCH_ENVIRONMENT,
            )

        return None
2 changes: 2 additions & 0 deletions physionet-django/project/modelcomponents/activeproject.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from project.modelcomponents.submission import CopyeditLog, EditLog, SubmissionInfo
from project.modelcomponents.unpublishedproject import UnpublishedProject
from project.projectfiles import ProjectFiles
from project.models import DataSourceCreator
from project.validators import validate_subdir

LOGGER = logging.getLogger(__name__)
Expand Down Expand Up @@ -614,6 +615,7 @@ def publish(self, slug=None, make_zip=True, title=None):
raise

ProjectFiles().publish_complete(self, published_project)
DataSourceCreator().create_default(published_project)

return published_project

Expand Down
Loading