Skip to content

Commit

Permalink
✨(documents) add content field as an S3 object
Browse files Browse the repository at this point in the history
The content field is a writable property on the model which is persisted
in object storage. We take advantage of the versioning, robustness and
scalability of S3.
  • Loading branch information
sampaccoud committed Apr 6, 2024
1 parent e1b3aae commit 8311a71
Show file tree
Hide file tree
Showing 12 changed files with 159 additions and 44 deletions.
4 changes: 0 additions & 4 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ services:
- "8071:8000"
volumes:
- ./src/backend:/app
- ./data/media:/data/media
- ./data/static:/data/static
depends_on:
- postgresql
Expand All @@ -70,7 +69,6 @@ services:
- env.d/development/postgresql
volumes:
- ./src/backend:/app
- ./data/media:/data/media
- ./data/static:/data/static
depends_on:
- app-dev
Expand All @@ -88,8 +86,6 @@ services:
env_file:
- env.d/development/common
- env.d/development/postgresql
volumes:
- ./data/media:/data/media
depends_on:
- postgresql
- redis
Expand Down
6 changes: 6 additions & 0 deletions env.d/development/common.dist
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ DJANGO_EMAIL_PORT=1025
# Backend url
IMPRESS_BASE_URL="http://localhost:8072"

# Media
STORAGES_STATICFILES_BACKEND=django.contrib.staticfiles.storage.StaticFilesStorage
AWS_S3_ENDPOINT_URL=http://localhost:9000
AWS_S3_ACCESS_KEY_ID=impress
AWS_S3_SECRET_ACCESS_KEY=password

# OIDC
OIDC_OP_JWKS_ENDPOINT=http://nginx:8083/realms/impress/protocol/openid-connect/certs
OIDC_OP_AUTHORIZATION_ENDPOINT=http://localhost:8083/realms/impress/protocol/openid-connect/auth
Expand Down
1 change: 1 addition & 0 deletions env.d/development/minio.dist
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# Minio configuration
MINIO_ROOT_USER=impress
MINIO_ROOT_PASSWORD=password

24 changes: 24 additions & 0 deletions src/backend/core/api/fields.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""A JSONField for DRF to handle serialization/deserialization."""
import json

from rest_framework import serializers


class JSONField(serializers.Field):
"""
A custom field for handling JSON data.
"""

def to_representation(self, value):
"""
Convert the JSON string to a Python dictionary for serialization.
"""
return value

def to_internal_value(self, data):
"""
Convert the Python dictionary to a JSON string for deserialization.
"""
if data is None:
return None
return json.dumps(data)
6 changes: 5 additions & 1 deletion src/backend/core/api/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

from core import models

from .fields import JSONField


class UserSerializer(serializers.ModelSerializer):
"""Serialize users."""
Expand Down Expand Up @@ -131,9 +133,11 @@ def get_abilities(self, document) -> dict:
class DocumentSerializer(BaseResourceSerializer):
"""Serialize documents."""

content = JSONField(required=False)

class Meta:
model = models.Document
fields = ["id", "title", "accesses", "abilities"]
fields = ["id", "content", "title", "accesses", "abilities"]
read_only_fields = ["id", "accesses", "abilities"]


Expand Down
1 change: 1 addition & 0 deletions src/backend/core/factories.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ class Meta:

title = factory.Sequence(lambda n: f"document{n}")
is_public = factory.Faker("boolean")
content = factory.LazyFunction(lambda: {"foo": fake.word()})

@factory.post_generation
def users(self, create, extracted, **kwargs):
Expand Down
49 changes: 49 additions & 0 deletions src/backend/core/models.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
"""
Declare and configure the models for the impress core application
"""
import hashlib
import json
import textwrap
import uuid

from django.conf import settings
from django.contrib.auth import models as auth_models
from django.contrib.auth.base_user import AbstractBaseUser
from django.core import mail, validators
from django.core.files.base import ContentFile
from django.core.files.storage import default_storage
from django.db import models
from django.template.base import Template as DjangoTemplate
from django.template.context import Context
Expand Down Expand Up @@ -250,6 +254,8 @@ class Document(BaseModel):
help_text=_("Whether this document is public for anyone to use."),
)

_content = None

class Meta:
db_table = "impress_document"
ordering = ("title",)
Expand All @@ -259,6 +265,49 @@ class Meta:
def __str__(self):
return self.title

@property
def content(self):
"""Return the json content from object storage if available"""
if self._content is None and self.id:
try:
# Load content from object storage
with default_storage.open(str(self.id)) as f:
self._content = json.load(f)
except FileNotFoundError:
pass
return self._content

@content.setter
def content(self, content):
"""Cache the content, don't write to object storage yet"""
if isinstance(content, str):
content = json.loads(content)
if not isinstance(content, dict):
raise ValueError("content should be a json object.")
self._content = content

def save(self, *args, **kwargs):
"""Write content to object storage only if _content has changed."""
super().save(*args, **kwargs)

if self._content:
file_key = str(self.pk)
bytes_content = json.dumps(self._content).encode("utf-8")

if default_storage.exists(file_key):
response = default_storage.connection.meta.client.head_object(
Bucket=default_storage.bucket_name, Key=file_key
)
has_changed = (
response["ETag"].strip('"')
!= hashlib.md5(bytes_content).hexdigest() # noqa
)
else:
has_changed = True
if has_changed:
content_file = ContentFile(bytes_content)
default_storage.save(file_key, content_file)

def get_abilities(self, user):
"""
Compute and return abilities for a given user on the document.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def test_api_documents_retrieve_anonymous_public():
},
"accesses": [],
"title": document.title,
"content": {"foo": document.content["foo"]},
}


Expand Down Expand Up @@ -65,6 +66,7 @@ def test_api_documents_retrieve_authenticated_unrelated_public():
},
"accesses": [],
"title": document.title,
"content": {"foo": document.content["foo"]},
}


Expand Down Expand Up @@ -128,6 +130,7 @@ def test_api_documents_retrieve_authenticated_related_direct():
assert response.json() == {
"id": str(document.id),
"title": document.title,
"content": {"foo": document.content["foo"]},
"abilities": document.get_abilities(user),
}

Expand Down Expand Up @@ -241,6 +244,7 @@ def test_api_documents_retrieve_authenticated_related_team_members(
assert response.json() == {
"id": str(document.id),
"title": document.title,
"content": {"foo": document.content["foo"]},
"abilities": document.get_abilities(user),
}

Expand Down Expand Up @@ -337,6 +341,7 @@ def test_api_documents_retrieve_authenticated_related_team_administrators(
assert response.json() == {
"id": str(document.id),
"title": document.title,
"content": {"foo": document.content["foo"]},
"abilities": document.get_abilities(user),
}

Expand Down Expand Up @@ -437,5 +442,6 @@ def test_api_documents_retrieve_authenticated_related_team_owners(
assert response.json() == {
"id": str(document.id),
"title": document.title,
"content": {"foo": document.content["foo"]},
"abilities": document.get_abilities(user),
}
6 changes: 3 additions & 3 deletions src/backend/core/tests/documents/test_api_documents_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import pytest
from rest_framework.test import APIClient

from core import factories
from core import factories, models
from core.api import serializers
from core.tests.conftest import TEAM, USER, VIA

Expand Down Expand Up @@ -138,7 +138,7 @@ def test_api_documents_update_authenticated_administrator_or_owner(
)
assert response.status_code == 200

document.refresh_from_db()
document = models.Document.objects.get(pk=document.pk)
document_values = serializers.DocumentSerializer(instance=document).data
for key, value in document_values.items():
if key in ["id", "accesses"]:
Expand Down Expand Up @@ -175,7 +175,7 @@ def test_api_documents_update_authenticated_owners(via, mock_user_get_teams):
)

assert response.status_code == 200
document.refresh_from_db()
document = models.Document.objects.get(pk=document.pk)
document_values = serializers.DocumentSerializer(instance=document).data
for key, value in document_values.items():
if key in ["id", "accesses"]:
Expand Down
46 changes: 46 additions & 0 deletions src/backend/core/tests/test_models_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
"""
from django.contrib.auth.models import AnonymousUser
from django.core.exceptions import ValidationError
from django.core.files.storage import default_storage

import pytest
import requests

from core import factories, models

Expand Down Expand Up @@ -151,3 +153,47 @@ def test_models_documents_get_abilities_preset_role(django_assert_num_queries):
"update": False,
"manage_accesses": False,
}


def test_models_documents_file_upload_to_minio():
"""Validate read/write from/to minio"""
document = factories.DocumentFactory()
document.content = {"foé": "çar"}
document.save()

# Check that the file exists in MinIO:
file_key = str(document.pk)
# - through the storage backend
assert default_storage.exists(file_key) is True
# - directly from minio
signed_url = default_storage.url(file_key)
response = requests.get(signed_url, timeout=1)
assert response.json() == {"foé": "çar"}


def test_models_documents_version_duplicate():
"""A new version should be created in object storage only if the content has changed."""
document = factories.DocumentFactory()

file_key = str(document.pk)
response = default_storage.connection.meta.client.list_object_versions(
Bucket=default_storage.bucket_name, Prefix=file_key
)
assert len(response["Versions"]) == 1

# Save again with the same content
document.save()

response = default_storage.connection.meta.client.list_object_versions(
Bucket=default_storage.bucket_name, Prefix=file_key
)
assert len(response["Versions"]) == 1

# Save modified content
document.content = {"foo": "spam"}
document.save()

response = default_storage.connection.meta.client.list_object_versions(
Bucket=default_storage.bucket_name, Prefix=file_key
)
assert len(response["Versions"]) == 2
52 changes: 17 additions & 35 deletions src/backend/impress/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,13 +108,25 @@ class Base(Configuration):

STORAGES = {
"default": {
"BACKEND": "django.core.files.storage.FileSystemStorage",
"BACKEND": "storages.backends.s3.S3Storage",
},
"staticfiles": {
"BACKEND": "django.contrib.staticfiles.storage.StaticFilesStorage",
"BACKEND": values.Value(
"whitenoise.storage.CompressedManifestStaticFilesStorage",
environ_name="STORAGES_STATICFILES_BACKEND",
),
},
}

# Media
AWS_S3_ENDPOINT_URL = values.Value(environ_name="AWS_S3_ENDPOINT_URL")
AWS_S3_ACCESS_KEY_ID = values.Value(environ_name="AWS_S3_ACCESS_KEY_ID")
AWS_S3_SECRET_ACCESS_KEY = values.Value(environ_name="AWS_S3_SECRET_ACCESS_KEY")
AWS_S3_REGION_NAME = values.Value(environ_name="AWS_S3_REGION_NAME")
AWS_STORAGE_BUCKET_NAME = values.Value(
"impress-media-storage", environ_name="AWS_STORAGE_BUCKET_NAME"
)

# Internationalization
# https://docs.djangoproject.com/en/3.1/topics/i18n/

Expand Down Expand Up @@ -449,14 +461,9 @@ class Test(Base):
]
USE_SWAGGER = True

STORAGES = {
"default": {
"BACKEND": "django.core.files.storage.FileSystemStorage",
},
"staticfiles": {
"BACKEND": "django.contrib.staticfiles.storage.StaticFilesStorage",
},
}
AWS_S3_ENDPOINT_URL = "http://minio:9000"
AWS_S3_ACCESS_KEY_ID = "impress"
AWS_S3_SECRET_ACCESS_KEY = "password" # noqa

CELERY_TASK_ALWAYS_EAGER = values.BooleanValue(True)

Expand Down Expand Up @@ -503,34 +510,9 @@ class Production(Base):
CSRF_COOKIE_SECURE = True
SESSION_COOKIE_SECURE = True

# For static files in production, we want to use a backend that includes a hash in
# the filename, that is calculated from the file content, so that browsers always
# get the updated version of each file.
STORAGES = {
"default": {
"BACKEND": "storages.backends.s3.S3Storage",
},
"staticfiles": {
# For static files in production, we want to use a backend that includes a hash in
# the filename, that is calculated from the file content, so that browsers always
# get the updated version of each file.
"BACKEND": values.Value(
"whitenoise.storage.CompressedManifestStaticFilesStorage",
environ_name="STORAGES_STATICFILES_BACKEND",
)
},
}

# Privacy
SECURE_REFERRER_POLICY = "same-origin"

# Media
AWS_S3_ENDPOINT_URL = values.Value()
AWS_S3_ACCESS_KEY_ID = values.Value()
AWS_S3_SECRET_ACCESS_KEY = values.Value()
AWS_STORAGE_BUCKET_NAME = values.Value("tf-default-impress-media-storage")
AWS_S3_REGION_NAME = values.Value()


class Feature(Production):
"""
Expand Down
2 changes: 1 addition & 1 deletion src/backend/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ dependencies = [
"django-cors-headers==4.3.1",
"django-countries==7.5.1",
"django-parler==2.3",
"django-storages==1.14.2",
"django-storages[s3]==1.14.2",
"django-timezone-field>=5.1",
"django==5.0.3",
"djangorestframework==3.14.0",
Expand Down

0 comments on commit 8311a71

Please sign in to comment.