[WIP] refactor DataAccess to DataSource #1967
base: dev
Changes from all commits: a1ee3ee, 58f65d1, 86dcdcf, 78c47a3, 10202d8, 5837f74, d312b8c, 6b686cb, 7a39834, 28e8354, d987295, 3a3752a
@@ -0,0 +1,30 @@ (new migration)
# Generated by Django 4.1.7 on 2023-04-19 14:03

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):

    dependencies = [
        ('project', '0067_alter_activeproject_core_project_and_more'),
    ]

    operations = [
        migrations.CreateModel(
            name='DataSource',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('files_available', models.BooleanField(default=False)),
                ('data_location', models.CharField(choices=[('DI', 'Direct'), ('GBQ', 'Google BigQuery'), ('GCS', 'Google Cloud Storage'), ('AOD', 'AWS Open Data'), ('AS3', 'AWS S3')], default='DI', max_length=3)),
                ('access_mechanism', models.CharField(blank=True, choices=[('google-group-email', 'Google Group Email'), ('s3', 'S3'), ('research-environment', 'Research Environment')], max_length=20, null=True)),
                ('email', models.CharField(blank=True, max_length=320, null=True)),
                ('uri', models.CharField(blank=True, max_length=320, null=True)),
                ('project', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='data_sources', to='project.publishedproject')),
            ],
            options={
                'default_permissions': (),
                'unique_together': {('project', 'data_location')},
            },
        ),
    ]
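Once applied, the new table behaves like any other Django model. A minimal usage sketch, assuming a Django shell with the models imported; the `published_project` variable and the bucket URI are illustrative, not part of this diff:

    # Illustrative only: attach an S3 data source to an existing published project.
    source = DataSource.objects.create(
        project=published_project,  # an existing PublishedProject instance
        files_available=False,
        data_location=DataSource.DataLocation.AWS_S3,
        access_mechanism=DataSource.AccessMechanism.S3,
        uri='s3://example-bucket/example-project/',
    )

    # related_name='data_sources' enables reverse lookups from the project:
    published_project.data_sources.filter(data_location=DataSource.DataLocation.AWS_S3)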
@@ -1,16 +1,20 @@
from datetime import timedelta
from enum import IntEnum

from django.conf import settings
from django.contrib.auth.hashers import check_password, make_password
from django.contrib.contenttypes.fields import GenericForeignKey
from django.contrib.contenttypes.models import ContentType
from django.core.exceptions import ValidationError
from django.db import models
from django.utils import timezone
from django.utils.crypto import get_random_string
from django.utils.translation import gettext_lazy as _
from project.modelcomponents.fields import SafeHTMLField
from project.validators import validate_version

from project.managers.access import DataAccessRequestQuerySet, DataAccessRequestManager
from physionet.settings.base import StorageTypes


class AccessPolicy(IntEnum):
@@ -167,6 +171,67 @@ class Meta:
        default_permissions = ()


class DataSource(models.Model):
""" | ||
Controls all access to project data. | ||
""" | ||
class DataLocation(models.TextChoices): | ||
DIRECT = 'DI', 'Direct' | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there any advantage to using abbreviations like |
||
GOOGLE_BIGQUERY = 'GBQ', 'Google BigQuery' | ||
GOOGLE_CLOUD_STORAGE = 'GCS', 'Google Cloud Storage' | ||
AWS_OPEN_DATA = 'AOD', 'AWS Open Data' | ||
AWS_S3 = 'AS3', 'AWS S3' | ||
|
||
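As a side note on the abbreviation question: with Django's TextChoices the stored value and the human-readable label are independent, so longer values would only change what is written to the database column, not what users see. For illustration:

    # Illustrative: value vs. label of a TextChoices member.
    DataSource.DataLocation.GOOGLE_BIGQUERY.value  # 'GBQ', stored in the 3-char CharField
    DataSource.DataLocation.GOOGLE_BIGQUERY.label  # 'Google BigQuery', shown by get_data_location_display()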
    class AccessMechanism(models.TextChoices):
        GOOGLE_GROUP_EMAIL = 'google-group-email', 'Google Group Email'
        S3 = 's3', 'S3'
        RESEARCH_ENVIRONMENT = 'research-environment', 'Research Environment'

    project = models.ForeignKey('project.PublishedProject',
                                related_name='data_sources', db_index=True, on_delete=models.CASCADE)
    files_available = models.BooleanField(default=False)

Review comment (on `files_available`): I'm going to go back to this to give it some more thought - the … I think it's going to be more clear once we see it actually used in practice, i.e. replace the current … The point is that in theory the GCS source could also give us access to files, since we are using buckets in HDN to store the projects, with the exact same UX as PhysioNet has for direct upload. But right now the mechanisms are tightly coupled with their specific use cases, so maybe we could just not care about it for now. This would mean we wouldn't initially need the …
    data_location = models.CharField(max_length=3, choices=DataLocation.choices)
    access_mechanism = models.CharField(max_length=20, choices=AccessMechanism.choices, null=True, blank=True)
    email = models.CharField(max_length=320, null=True, blank=True)
    uri = models.CharField(max_length=320, null=True, blank=True)

    class Meta:
        default_permissions = ()
        unique_together = ('project', 'data_location')
    def clean(self):
        super().clean()

        if self.data_location == self.DataLocation.GOOGLE_BIGQUERY:
            if self.access_mechanism != self.AccessMechanism.GOOGLE_GROUP_EMAIL:
                raise ValidationError('Google BigQuery data sources must use the Google Group Email access mechanism.')
            if not self.email:
                raise ValidationError('Google BigQuery data sources must have an email address.')
        elif self.data_location == self.DataLocation.GOOGLE_CLOUD_STORAGE:
            if self.access_mechanism != self.AccessMechanism.GOOGLE_GROUP_EMAIL:
                raise ValidationError('Google Cloud Storage data sources must use the Google Group Email access '
                                      'mechanism.')
            if not self.uri:
                raise ValidationError('Google Cloud Storage data sources must have a URI.')
        elif self.data_location == self.DataLocation.AWS_OPEN_DATA:
            if self.access_mechanism != self.AccessMechanism.S3:
                raise ValidationError('AWS Open Data data sources must use the S3 access mechanism.')
            if not self.uri:
                raise ValidationError('AWS Open Data data sources must have a URI.')
        elif self.data_location == self.DataLocation.AWS_S3:
            if self.access_mechanism != self.AccessMechanism.S3:
                raise ValidationError('AWS S3 data sources must use the S3 access mechanism.')
            if not self.uri:
                raise ValidationError('AWS S3 data sources must have a URI.')
        elif self.data_location == self.DataLocation.DIRECT:
            if self.email:
                raise ValidationError('Direct data sources must not have an email address.')
            if self.uri:
                raise ValidationError('Direct data sources must not have a URI.')
        else:
            raise ValidationError('Invalid data location.')
Review comment (on lines +205 to +232): This is very hard to follow. You could approach it differently: define what exactly is required for each data source and have a single piece of logic that validates it.

    required_fields = {
        self.DataLocation.DIRECT: [],
        self.DataLocation.AWS_S3: ["uri"],
        ...
    }
    forbidden_fields = {
        self.DataLocation.DIRECT: ["uri", "email"],
        ...
    }

    for required_field in required_fields[self.data_location]:
        if not getattr(self, required_field):  # Or something like that
            raise ValidationError("...")
    for forbidden_field in forbidden_fields[self.data_location]:
        if getattr(self, forbidden_field):  # Or something like that
            raise ValidationError("...")

Review comment: Same goes for data access:

    data_location_access_mechanisms = {
        self.DataLocation.DIRECT: None,
        self.DataLocation.AWS_S3: self.AccessMechanism.AWS_S3,
        ...
    }
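For illustration, a fuller sketch of that table-driven approach, using the fields and mechanisms defined in the model above; the table names (REQUIRED_FIELDS, FORBIDDEN_FIELDS, EXPECTED_MECHANISM) and the error messages are placeholders, not part of the PR:

    # Illustrative sketch of the reviewer's suggestion: module-level tables plus a clean() method.
    REQUIRED_FIELDS = {
        DataSource.DataLocation.DIRECT: [],
        DataSource.DataLocation.GOOGLE_BIGQUERY: ['email'],
        DataSource.DataLocation.GOOGLE_CLOUD_STORAGE: ['uri'],
        DataSource.DataLocation.AWS_OPEN_DATA: ['uri'],
        DataSource.DataLocation.AWS_S3: ['uri'],
    }
    FORBIDDEN_FIELDS = {
        DataSource.DataLocation.DIRECT: ['uri', 'email'],
    }
    EXPECTED_MECHANISM = {
        DataSource.DataLocation.DIRECT: None,
        DataSource.DataLocation.GOOGLE_BIGQUERY: DataSource.AccessMechanism.GOOGLE_GROUP_EMAIL,
        DataSource.DataLocation.GOOGLE_CLOUD_STORAGE: DataSource.AccessMechanism.GOOGLE_GROUP_EMAIL,
        DataSource.DataLocation.AWS_OPEN_DATA: DataSource.AccessMechanism.S3,
        DataSource.DataLocation.AWS_S3: DataSource.AccessMechanism.S3,
    }

    def clean(self):
        super().clean()
        if self.data_location not in REQUIRED_FIELDS:
            raise ValidationError('Invalid data location.')
        expected = EXPECTED_MECHANISM[self.data_location]
        if expected is not None and self.access_mechanism != expected:
            raise ValidationError(f'{self.get_data_location_display()} data sources must use '
                                  f'the {expected.label} access mechanism.')
        for field in REQUIRED_FIELDS[self.data_location]:
            if not getattr(self, field):
                raise ValidationError(f'{self.get_data_location_display()} data sources must have a {field}.')
        for field in FORBIDDEN_FIELDS.get(self.data_location, []):
            if getattr(self, field):
                raise ValidationError(f'{self.get_data_location_display()} data sources must not have a {field}.')

As with the original version, this validation only runs when full_clean() is called (for example by a ModelForm), not automatically on save().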


class AnonymousAccess(models.Model):
    """
    Makes it possible to grant anonymous access (without user auth)
@@ -274,3 +339,42 @@ class Meta:

    def __str__(self):
        return self.name


class DataSourceCreator:
    def __init__(self, **kwargs):
        self.data_location = kwargs.get('data_location', None)
        self.files_available = kwargs.get('files_available', None)
        self.email = kwargs.get('email', None)
        self.uri = kwargs.get('uri', None)
        self.access_mechanism = kwargs.get('access_mechanism', None)

    def create(self, project):
        DataSource.objects.create(
            project=project,
            files_available=self.files_available,
            data_location=self.data_location,
            access_mechanism=self.access_mechanism,
            email=self.email,
            uri=self.uri,
        )

    @staticmethod
    def create_default(project):
        if (settings.DEFAULT_PROJECT_DATA_LOCATION == DataSource.DataLocation.DIRECT
                and settings.STORAGE_TYPE == StorageTypes.LOCAL):
            DataSource.objects.create(
                project=project,
                files_available=True,
                data_location=DataSource.DataLocation.DIRECT,
            )
        elif (settings.DEFAULT_PROJECT_ACCESS_MECHANISM == DataSource.AccessMechanism.RESEARCH_ENVIRONMENT
                and settings.DEFAULT_PROJECT_DATA_LOCATION == DataSource.DataLocation.GOOGLE_CLOUD_STORAGE
                and settings.STORAGE_TYPE == StorageTypes.GCP):
            DataSource.objects.create(
                project=project,
                files_available=False,
                data_location=DataSource.DataLocation.GOOGLE_CLOUD_STORAGE,
                uri=f'gs://{project.project_file_root()}/',
                access_mechanism=DataSource.AccessMechanism.RESEARCH_ENVIRONMENT,
            )
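A brief usage sketch of the helper above; the view/form wiring and the `published_project` variable are assumed for illustration, not part of this diff:

    # Illustrative: values would normally come from a publication form or view.
    DataSourceCreator(
        data_location=DataSource.DataLocation.AWS_OPEN_DATA,
        access_mechanism=DataSource.AccessMechanism.S3,
        uri='s3://example-open-data-bucket/',
        files_available=False,
    ).create(published_project)

    # Or fall back to the deployment's settings-driven default:
    DataSourceCreator.create_default(published_project)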
Review comment: I'm wondering whether we want to care about this being enabled or not... It makes sense and it would be my first instinct to do so, but right now it seems that research environments are singled out as something that can be present or not. The HDN deployment is not integrated with any data source other than local/research environment, so in that case it makes no sense to show anything other than that. Research environments are the only ones that have an explicit setting to disable them, but still. I guess it doesn't hurt to have this, it just got me thinking!
Review comment: I can see what you mean. It makes sense for an HDN-like deployment: we might just want to have the local/research environment and disable everything else. From what I understand, we might want to allow deployments to have any combination of access mechanisms, so what if we had a configurable env setting like ACCESS_MECHANISMS = 'research-environment,direct,awss3' and used that? Or maybe, since this is something only the Project Editor or console admin can do, we might be okay with showing them all the options and trusting them to use the right ones.
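If that route were taken, one minimal way to wire it up might look like the following sketch; ACCESS_MECHANISMS is the reviewer's proposed setting, not something that exists in the codebase, and all names here are illustrative:

    import os

    # e.g. ACCESS_MECHANISMS='research-environment,s3' set in the environment or settings.
    ENABLED_ACCESS_MECHANISMS = [
        m.strip() for m in os.environ.get('ACCESS_MECHANISMS', '').split(',') if m.strip()
    ]

    def enabled_mechanism_choices():
        """Choices offered to editors/admins, filtered by the deployment's setting (empty = allow all)."""
        return [
            (value, label)
            for value, label in DataSource.AccessMechanism.choices
            if not ENABLED_ACCESS_MECHANISMS or value in ENABLED_ACCESS_MECHANISMS
        ]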