Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add file redaction policies #11

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 38 additions & 1 deletion tonic_textual/classes/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import requests

from tonic_textual.classes.common_api_responses.label_custom_list import LabelCustomList
from tonic_textual.classes.enums.file_redaction_policies import docx_image_policy, docx_comment_policy, pdf_signature_policy
from tonic_textual.classes.tonic_exception import (
DatasetFileMatchesExistingFile,
DatasetFileNotFound,
Expand Down Expand Up @@ -50,7 +51,10 @@ def __init__(
generator_config: Optional[Dict[str, PiiState]] = None,
label_block_lists: Optional[Dict[str, List[str]]] = None,
label_allow_lists: Optional[Dict[str, List[str]]] = None,
):
docx_image_policy_name: Optional[docx_image_policy] = docx_image_policy.redact,
docx_comment_policy_name: Optional[docx_comment_policy] = docx_comment_policy.remove,
pdf_signature_policy_name: Optional[pdf_signature_policy] = pdf_signature_policy.redact
):
self.__initialize(
client,
id,
Expand All @@ -59,6 +63,9 @@ def __init__(
generator_config,
label_block_lists,
label_allow_lists,
docx_image_policy_name,
docx_comment_policy_name,
pdf_signature_policy_name
)

def __initialize(
Expand All @@ -70,6 +77,9 @@ def __initialize(
generator_config: Optional[Dict[str, PiiState]] = None,
label_block_lists: Optional[Dict[str, List[str]]] = None,
label_allow_lists: Optional[Dict[str, List[str]]] = None,
docx_image_policy_name: Optional[docx_image_policy] = docx_image_policy.redact,
docx_comment_policy_name: Optional[docx_comment_policy] = docx_comment_policy.remove,
pdf_signature_policy_name: Optional[pdf_signature_policy] = pdf_signature_policy.redact
):
self.id = id
self.name = name
Expand All @@ -78,6 +88,9 @@ def __initialize(
self.generator_config = generator_config
self.label_block_lists = label_block_lists
self.label_allow_lists = label_allow_lists
# Store the redaction policies on the dataset. These must be assignments:
# `self.attr: value` is only an annotation and leaves the attribute unset.
self.docx_image_policy = docx_image_policy_name
self.docx_comment_policy = docx_comment_policy_name
self.pdf_signature_policy = pdf_signature_policy_name
self.files = [
DatasetFile(
self.client,
Expand All @@ -89,6 +102,9 @@ def __initialize(
f["processingStatus"],
f.get("processingError"),
f.get("labelAllowLists"),
f.get("docxImagePolicy"),
f.get("docxCommentPolicy"),
f.get("pdfSignaturePolicy")
)
for f in files
]
Expand All @@ -104,6 +120,9 @@ def edit(
generator_config: Optional[Dict[str, PiiState]] = None,
label_block_lists: Optional[Dict[str, List[str]]] = None,
label_allow_lists: Optional[Dict[str, List[str]]] = None,
docx_image_policy_name: Optional[docx_image_policy] = None,
docx_comment_policy_name: Optional[docx_comment_policy] = None,
pdf_signature_policy_name: Optional[pdf_signature_policy] = None,
should_rescan=True,
):
"""
Expand All @@ -122,6 +141,12 @@ def edit(
label_allow_lists: Optional[Dict[str, List[str]]]
A dictionary of (entity type, included entities). When a piece of text matches a regular expression in the list,
the text is marked as the entity type and is included in the redaction or synthesis.
docx_image_policy_name: Optional[docx_image_policy] = None
The policy for handling images in DOCX files. Options are 'redact', 'ignore', and 'remove'.
docx_comment_policy_name: Optional[docx_comment_policy] = None
The policy for handling comments in DOCX files. Options are 'remove' and 'ignore'.
pdf_signature_policy_name: Optional[pdf_signature_policy] = None
The policy for handling signatures in PDF files. Options are 'redact' and 'ignore'.

Raises
------
Expand All @@ -148,6 +173,12 @@ def edit(
k: LabelCustomList(regexes=v).to_dict()
for k, v in label_allow_lists.items()
}
# Only send a policy when the caller supplied one. The guards test the
# *_policy_name arguments; testing the imported enum classes themselves
# is always true and would put None into the request payload.
# NOTE(review): the values are Enum members - confirm that http_put
# serializes them in the form the API expects.
if docx_image_policy_name is not None:
    data["docxImagePolicy"] = docx_image_policy_name
if docx_comment_policy_name is not None:
    data["docxCommentPolicy"] = docx_comment_policy_name
if pdf_signature_policy_name is not None:
    data["pdfSignaturePolicy"] = pdf_signature_policy_name

try:
new_dataset = self.client.http_put(
Expand All @@ -161,6 +192,9 @@ def edit(
new_dataset["generatorSetup"],
new_dataset["labelBlockLists"],
new_dataset["labelAllowLists"],
new_dataset.get("docxImagePolicy"),
new_dataset.get("docxCommentPolicy"),
new_dataset.get("pdfSignaturePolicy")
)
except requests.exceptions.HTTPError as e:
if e.response.status_code == 409:
Expand Down Expand Up @@ -268,6 +302,9 @@ def add_file(
f["processingStatus"],
f.get("processingError"),
f.get("labelAllowLists"),
f.get("docxImagePolicy"),
f.get("docxCommentPolicy"),
f.get("pdfSignaturePolicy")
)
for f in updated_dataset["files"]
]
Expand Down
8 changes: 8 additions & 0 deletions tonic_textual/classes/datasetfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from typing import Optional, Dict

from tonic_textual.classes.common_api_responses.label_custom_list import LabelCustomList
from tonic_textual.classes.enums.file_redaction_policies import docx_image_policy, docx_comment_policy, \
pdf_signature_policy
from tonic_textual.classes.httpclient import HttpClient
from tonic_textual.classes.tonic_exception import FileNotReadyForDownload

Expand Down Expand Up @@ -49,6 +51,9 @@ def __init__(
processing_status: str,
processing_error: Optional[str],
label_allow_lists: Optional[Dict[str, LabelCustomList]] = None,
docx_image_policy_name: Optional[docx_image_policy] = docx_image_policy.redact,
docx_comment_policy_name: Optional[docx_comment_policy] = docx_comment_policy.remove,
pdf_signature_policy_name: Optional[pdf_signature_policy] = pdf_signature_policy.redact
):
self.client = client
self.id = id
Expand All @@ -59,6 +64,9 @@ def __init__(
self.processing_status = processing_status
self.processing_error = processing_error
self.label_allow_lists = label_allow_lists
# Store the per-file redaction policies. These must be assignments:
# `self.attr: value` is only an annotation and leaves the attribute unset.
self.docx_image_policy = docx_image_policy_name
self.docx_comment_policy = docx_comment_policy_name
self.pdf_signature_policy = pdf_signature_policy_name

def describe(self) -> str:
"""Returns the dataset file metadata as string. Includes the identifier, file
Expand Down
14 changes: 14 additions & 0 deletions tonic_textual/classes/enums/file_redaction_policies.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from enum import Enum

class docx_image_policy(Enum):
    """Policy for handling images in DOCX files.

    Options are ``redact``, ``ignore``, and ``remove``.
    """

    redact = 1
    ignore = 2
    remove = 3

class docx_comment_policy(Enum):
    """Policy for handling comments in DOCX files.

    Options are ``remove`` and ``ignore``.
    """

    remove = 1
    ignore = 2

class pdf_signature_policy(Enum):
    """Policy for handling signatures in PDF files.

    Options are ``redact`` and ``ignore``.
    """

    redact = 1
    ignore = 2
Loading