Skip to content

Commit

Permalink
⚗️(backend) function to extract text from base64 yjs document
Browse files Browse the repository at this point in the history
Function to extract text from base64 yjs document.
Can be useful if we need to index the content
of the documents.
  • Loading branch information
AntoLC committed Sep 19, 2024
1 parent ac86a4e commit 3552d66
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 1 deletion.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ and this project adheres to

## [Unreleased]

## Added

- ⚗️(backend) Extract text from base64 yjs document #270


## [1.4.0] - 2024-09-17

Expand Down
23 changes: 22 additions & 1 deletion src/backend/core/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

import pytest

from core.utils import email_invitation
from core.utils import email_invitation, extract_text_from_saved_yjs_document

pytestmark = pytest.mark.django_db

Expand Down Expand Up @@ -85,3 +85,24 @@ def test_utils__email_invitation_failed(mock_logger, _mock_send_mail):

assert email == "[email protected]"
assert isinstance(exception, smtplib.SMTPException)


def test_extract_text_from_saved_yjs_document():
    """
    Check that extract_text_from_saved_yjs_document returns the text
    of a saved document.
    The fixture below is a base64 string like the ones saved in the
    database. It was generated from the blocknote editor and contains
    the text "Hello world".
    """
    expected_text = "Hello world"
    saved_document = (
        "ARCymr/3DgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcAspq/9w4AAw5ibG9j"
        "a0NvbnRhaW5lcgcAspq/9w4BAwlwYXJhZ3JhcGgHALKav/cOAgYEALKav/cOAwFIKACy"
        "mr/3DgINdGV4dEFsaWdubWVudAF3BGxlZnQoALKav/cOAQJpZAF3DmluaXRpYWxCbG9j"
        "a0lkKACymr/3DgEJdGV4dENvbG9yAXcHZGVmYXVsdCgAspq/9w4BD2JhY2tncm91bmRD"
        "b2xvcgF3B2RlZmF1bHSHspq/9w4BAw5ibG9ja0NvbnRhaW5lcgcAspq/9w4JAwlwYXJh"
        "Z3JhcGgoALKav/cOCg10ZXh0QWxpZ25tZW50AXcEbGVmdCgAspq/9w4JAmlkAXckMTFj"
        "YTgzYmEtZGM3OS00N2Q3LTllNzYtNmM4OTQwNzc1ZjE3KACymr/3DgkJdGV4dENvbG9y"
        "AXcHZGVmYXVsdCgAspq/9w4JD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSEspq/9w4E"
        "C2VsbG8gd29ybGQgAA=="
    )

    extracted_text = extract_text_from_saved_yjs_document(saved_document)

    assert extracted_text == expected_text
18 changes: 18 additions & 0 deletions src/backend/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Utilities for the core app.
"""

import base64
import smtplib
from logging import getLogger

Expand All @@ -12,6 +13,9 @@
from django.utils.translation import gettext_lazy as _
from django.utils.translation import override

import y_py as Y
from bs4 import BeautifulSoup

logger = getLogger(__name__)


Expand All @@ -38,3 +42,17 @@ def email_invitation(language, email, document_id):

except smtplib.SMTPException as exception:
logger.error("invitation to %s was not sent: %s", email, exception)


def extract_text_from_saved_yjs_document(base64_string):
    """
    Return the plain text held in a saved, base64-encoded yjs document.

    The string is base64-decoded and applied as an update to a fresh
    YDoc. The "document-store" XML element of that document is then
    converted to a string and parsed so that only its text is kept:
    text fragments are joined with a single space and the surrounding
    whitespace is stripped.
    """
    # Decoding happens before the document is created.
    update = bytearray(base64.b64decode(base64_string))

    ydoc = Y.YDoc()  # pylint: disable=E1101
    Y.apply_update(ydoc, update)  # pylint: disable=E1101

    # The element's string form is handed to an HTML parser, which
    # separates the text from the surrounding structure.
    document_store = ydoc.get_xml_element("document-store")
    markup = BeautifulSoup(str(document_store), "html.parser")
    text = markup.get_text(separator=" ")
    return text.strip()
2 changes: 2 additions & 0 deletions src/backend/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ license = { file = "LICENSE" }
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"beautifulsoup4==4.12.3",
"boto3==1.35.10",
"Brotli==1.1.0",
"celery[redis]==5.4.0",
Expand Down Expand Up @@ -57,6 +58,7 @@ dependencies = [
"WeasyPrint>=60.2",
"whitenoise==6.7.0",
"mozilla-django-oidc==4.0.1",
"y-py==0.5.5",
]

[project.urls]
Expand Down

0 comments on commit 3552d66

Please sign in to comment.