From a37d3ae6af13029eb19f4231aa26d0f642856cc0 Mon Sep 17 00:00:00 2001 From: Adam Kamor Date: Thu, 21 Nov 2024 12:39:33 -0500 Subject: [PATCH 1/5] docs + missing file --- docs/source/index.rst | 2 + docs/source/parse/index.rst | 1 - docs/source/redact/index.rst | 46 ++-- docs/source/redact/redacting_text.rst | 200 ++++++++++++------ .../common_api_responses/replacement.py | 7 + .../bulk_redaction_response.py | 48 +++++ tonic_textual/redact_api.py | 128 +++++++++++ 7 files changed, 354 insertions(+), 78 deletions(-) create mode 100644 tonic_textual/classes/redact_api_responses/bulk_redaction_response.py diff --git a/docs/source/index.rst b/docs/source/index.rst index 753e226..4d88d61 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -25,3 +25,5 @@ The quickstart provides information on how to install the SDK and set up an API quickstart/getting_started parse/index redact/index + redact/api + parse/api diff --git a/docs/source/parse/index.rst b/docs/source/parse/index.rst index c3b9bc0..e899c7c 100644 --- a/docs/source/parse/index.rst +++ b/docs/source/parse/index.rst @@ -15,4 +15,3 @@ To learn more about how to use Textual to redact entities within text and files parsing_files pipelines working_with_parsed_output - api diff --git a/docs/source/redact/index.rst b/docs/source/redact/index.rst index c76c48e..5a29104 100644 --- a/docs/source/redact/index.rst +++ b/docs/source/redact/index.rst @@ -1,28 +1,39 @@ Redact ============= -The Textual redact functionality allows you to identify entities in files, and then optionally redact/synthesize these entities to create a safe version of your unstructured text. This functionality works on both raw strings and files, including PDF, DOCX, XLSX, and other formats. +The Textual redact functionality allows you to identify entities in files, and then optionally tokenize/synthesize these entities to create a safe version of your unstructured text. 
This functionality works on both raw strings and files, including PDF, DOCX, XLSX, and other formats. Before you can use these functions, read the :doc:`Getting started <../quickstart/getting_started>` guide and create an API key. -Redacting strings +Redacting Text ----------------- -To identify entities in a raw string, call the **redact** function. +You can redact text directly in a variety of formats such as plain text, JSON, XML, and HTML. All redaction requests return a response which includes the original text, redacted text, a list of found entities and their locations. Additionally all redact functions allow you to specify which entities are tokenized and which are synthesized. -.. code-block:: python - - from tonic_textual.redact_api import TextualNer +The common set of inputs to all redact functions is: - textual = TonicTextual("https://textual.tonic.ai") - - raw_redaction = textual.redact("My name is John, and today I am demo-ing Textual, a software product created by Tonic") +* **generator_default** + The default operation performed on an entity. The options are 'Redaction', 'Synthesis', and 'Off'. +* **generator_config** + A dictionary whose keys are entity labels and values are how to redact the entity. The options are 'Redaction', 'Synthesis', and 'Off'. + + Example: {'NAME_GIVEN': 'Synthesis'} +* **label_allow_lists** + A dictionary whose keys are entity labels and values are lists of regexes. If a piece of text matches a regex it is flagged as that entity type. + + Example: {'HEALTHCARE_ID': [r'[a-zA-Z]{3}\\d{6,}']} +* **label_block_lists** + A dictionary whose keys are entity labels and values are lists of regexes. If a piece of text matches a regex it is ignored for that entity type. + + Example: {'NUMERIC_VALUE': [r'\\d{3}']} -The response provides a list of identified entities, with information about each entity. +The JSON and XML redact functions also have additional inputs which you can read about in their respective sections. 
-It also returns a redacted string that replaces the found entities with tokens. You can configure how to handle each type of entities - whether to redact or synthesize them. +.. toctree:: + :hidden: + :maxdepth: 2 -To learn more about to redact raw strings, go to :doc:`Redacting text `. + redacting_text Redacting files --------------- @@ -51,6 +62,11 @@ To generated redacted/synthesized files: To learn more about how to generate redacted and synthesized files, go to :doc:`Redacting files `. +.. toctree:: + :maxdepth: 2 + + redacting_files + Working with datasets --------------------- @@ -60,8 +76,6 @@ To help automate workflows, you can work with datasets directly from the SDK. To .. toctree:: - - redacting_text - redacting_files + :maxdepth: 2 + datasets - api diff --git a/docs/source/redact/redacting_text.rst b/docs/source/redact/redacting_text.rst index a856d75..d267dcd 100644 --- a/docs/source/redact/redacting_text.rst +++ b/docs/source/redact/redacting_text.rst @@ -1,6 +1,3 @@ -🅰 Text -========================= - Redact raw text --------------- To redact sensitive information from a text string, pass the string to the `redact` method: @@ -18,7 +15,7 @@ This produces the following output: .. 
code-block:: console - My name is Alfonzo, and today I am demoing Textual, a software product created by New Ignition Worldwide + My name is [NAME_GIVEN_HI1h7], and [DATE_TIME_4hKfrH] I am demoing Textual, a software product created by [ORGANIZATION_P5XLAH] { "start": 11, "end": 15, @@ -28,98 +25,99 @@ This produces the following output: "text": "John", "score": 0.9, "language": "en", - "new_text": "[NAME_GIVEN_dySb5]" + "new_text": "[NAME_GIVEN_HI1h7]" + } + { + "start": 21, + "end": 26, + "new_start": 35, + "new_end": 53, + "label": "DATE_TIME", + "text": "today", + "score": 0.9, + "language": "en", + "new_text": "[DATE_TIME_4hKfrH]" } { "start": 79, "end": 84, - "new_start": 93, - "new_end": 114, + "new_start": 106, + "new_end": 127, "label": "ORGANIZATION", "text": "Tonic", "score": 0.9, "language": "en", - "new_text": "[ORGANIZATION_5Ve7OH]" + "new_text": "[ORGANIZATION_P5XLAH]" } -Synthesize raw text -------------------- -The following example passes the same string to the `redact` method, but sets some categories to `Synthesis`, which indicates to use realistic replacement values: +Bulk redact raw text +--------------------- +In the same way that our `redact` method can be used to redact strings our `redact_bulk` method allows you to redact many strings at once. Each string is individually redacted, meaning individual strings are fed into our model independently and cannot affect each other. To redact sensitive information from a list of text strings, pass the list to the `redact_bulk` method: .. 
code-block:: python from tonic_textual.redact_api import TextualNer textual = TextualNer() - generator_config = {"NAME_GIVEN":"Synthesis", "ORGANIZATION":"Synthesis"} - raw_synthesis = textual.redact( - "My name is John, and today I am demoing Textual, a software product created by Tonic", - generator_config=generator_config) - print(raw_synthesis.describe()) + + raw_redaction = textual.redact_bulk(["Tonic was founded in 2018", "John Smith is a person"]) + print(raw_redaction.describe()) This produces the following output: .. code-block:: console - My name is Alfonzo, and today I am demoing Textual, a software product created by New Ignition Worldwide + [ORGANIZATION_P5XLAH] was founded in [DATE_TIME_0FW53] + [NAME_GIVEN_HI1h7] [NAME_FAMILY_5oMl28] is a person { - "start": 11, - "end": 15, - "new_start": 11, - "new_end": 18, - "label": "NAME_GIVEN", - "text": "John", + "start": 0, + "end": 5, + "new_start": 0, + "new_end": 21, + "label": "ORGANIZATION", + "text": "Tonic", "score": 0.9, "language": "en", - "new_text": "Alfonzo" + "new_text": "[ORGANIZATION_P5XLAH]", + "idx": 0 } { - "start": 79, - "end": 84, - "new_start": 82, - "new_end": 104, - "label": "ORGANIZATION", - "text": "Tonic", + "start": 21, + "end": 25, + "new_start": 37, + "new_end": 54, + "label": "DATE_TIME", + "text": "2018", "score": 0.9, "language": "en", - "new_text": "New Ignition Worldwide" - } - -Using LLM synthesis -------------------- -The following example passes the same string to the `llm_synthesis` method: - -.. code-block:: python - - from tonic_textual.redact_api import TextualNer - - textual = TextualNer() - - raw_synthesis = textual.llm_synthesis("My name is John, and today I am demoing Textual, a software product created by Tonic") - print(raw_synthesis.describe()) - -This produces the following output: - -.. code-block:: console - - My name is Matthew, and today I am demoing Textual, a software product created by Google. 
+ "new_text": "[DATE_TIME_0FW53]", + "idx": 0 + } { - "start": 11, - "end": 15, + "start": 0, + "end": 4, + "new_start": 0, + "new_end": 18, "label": "NAME_GIVEN", "text": "John", - "score": 0.9 + "score": 0.9, + "language": "en", + "new_text": "[NAME_GIVEN_HI1h7]", + "idx": 1 } { - "start": 79, - "end": 84, - "label": "ORGANIZATION", - "text": "Tonic", - "score": 0.9 + "start": 5, + "end": 10, + "new_start": 19, + "new_end": 39, + "label": "NAME_FAMILY", + "text": "Smith", + "score": 0.9, + "language": "en", + "new_text": "[NAME_FAMILY_5oMl28]", + "idx": 1 } -Note that LLM Synthesis is non-deterministic — you will likely get different results each time you run. - Redact JSON data ---------------- To redact sensitive information from a JSON string or Python dict, pass the object to the `redact_json` method: @@ -220,3 +218,83 @@ To redact sensitive information from HTML, pass the HTML document string to the xml_redaction = textual.redact_html(html_content) The response includes entity level information, including the XPATH at which the sensitive entity is found. The start and end positions are relative to the beginning of thhe XPATH location where the entity is found. + +Choosing tokenization or synthesis raw text +---------------------------------------------- +You can choose whether a given entity is synthesized or tokenized. By default all entities are tokenized. You can specify which entities you wish to synthesize/tokenize by using the `generator_config` parameter. This works the same for all of our `redact` functions. + +The following example passes the same string to the `redact` method, but sets some entities to `Synthesis`, which indicates to use realistic replacement values: + +.. 
code-block:: python + + from tonic_textual.redact_api import TextualNer + + textual = TextualNer() + generator_config = {"NAME_GIVEN":"Synthesis", "ORGANIZATION":"Synthesis"} + raw_synthesis = textual.redact( + "My name is John, and today I am demoing Textual, a software product created by Tonic", + generator_config=generator_config) + print(raw_synthesis.describe()) + +This produces the following output: + +.. code-block:: console + + My name is Alfonzo, and today I am demoing Textual, a software product created by New Ignition Worldwide + { + "start": 11, + "end": 15, + "new_start": 11, + "new_end": 18, + "label": "NAME_GIVEN", + "text": "John", + "score": 0.9, + "language": "en", + "new_text": "Alfonzo" + } + { + "start": 79, + "end": 84, + "new_start": 82, + "new_end": 104, + "label": "ORGANIZATION", + "text": "Tonic", + "score": 0.9, + "language": "en", + "new_text": "New Ignition Worldwide" + } + +Using LLM synthesis +------------------- +The following example passes the same string to the `llm_synthesis` method: + +.. code-block:: python + + from tonic_textual.redact_api import TextualNer + + textual = TextualNer() + + raw_synthesis = textual.llm_synthesis("My name is John, and today I am demoing Textual, a software product created by Tonic") + print(raw_synthesis.describe()) + +This produces the following output: + +.. code-block:: console + + My name is Matthew, and today I am demoing Textual, a software product created by Google. + { + "start": 11, + "end": 15, + "label": "NAME_GIVEN", + "text": "John", + "score": 0.9 + } + { + "start": 79, + "end": 84, + "label": "ORGANIZATION", + "text": "Tonic", + "score": 0.9 + } + +Note that LLM Synthesis is non-deterministic — you will likely get different results each time you run. 
diff --git a/tonic_textual/classes/common_api_responses/replacement.py b/tonic_textual/classes/common_api_responses/replacement.py index 2835842..67c9b64 100644 --- a/tonic_textual/classes/common_api_responses/replacement.py +++ b/tonic_textual/classes/common_api_responses/replacement.py @@ -37,6 +37,8 @@ class Replacement(dict): xml_path : Optional[str] The xpath of the entity in the original XML document. This is only present if the input text was an XML document. NOTE: Arrays in xpath are 1-based + idx : Optional[int] + The index in the original bulk text array to which the NER result corresponds. NOTE: This is only used when calling our bulk redaction methods """ def __init__( @@ -53,6 +55,7 @@ def __init__( example_redaction: Optional[str] = None, json_path: Optional[str] = None, xml_path: Optional[str] = None, + idx: Optional[int] = None, ): self.start = start self.end = end @@ -66,6 +69,7 @@ def __init__( self.example_redaction = example_redaction self.json_path = json_path self.xml_path = xml_path + self.idx = idx dict.__init__( self, @@ -85,6 +89,7 @@ def __init__( ), **({} if json_path is None else {"json_path": json_path}), **({} if xml_path is None else {"xml_path": xml_path}), + **({} if idx is None else {"idx": idx}), ) def describe(self) -> str: @@ -109,4 +114,6 @@ def to_dict(self) -> Dict: out["json_path"] = self.json_path if self.xml_path is not None: out["xml_path"] = self.xml_path + if self.idx is not None: + out["idx"] = self.idx return out diff --git a/tonic_textual/classes/redact_api_responses/bulk_redaction_response.py b/tonic_textual/classes/redact_api_responses/bulk_redaction_response.py new file mode 100644 index 0000000..1cd0e26 --- /dev/null +++ b/tonic_textual/classes/redact_api_responses/bulk_redaction_response.py @@ -0,0 +1,48 @@ +from typing import List + +from tonic_textual.classes.common_api_responses.replacement import Replacement + + +class BulkRedactionResponse(dict): + """Bulk Redaction response object + + Attributes + 
---------- + bulk_text : str + The original text + bulk_redacted_text : str + The redacted/synthesized text + usage : int + The number of words used + de_identify_results : List[Replacement] + The list of named entities found in bulk_text + """ + + def __init__( + self, + bulk_text: str, + bulk_redacted_text: str, + usage: int, + de_identify_results: List[Replacement], + ): + self.bulk_text = bulk_text + self.bulk_redacted_text = bulk_redacted_text + self.usage = usage + self.de_identify_results = de_identify_results + dict.__init__( + self, + bulk_text=bulk_text, + bulk_redacted_text=bulk_redacted_text, + usage=usage, + de_identify_results=de_identify_results, + ) + + def describe(self) -> str: + + result = '\n'.join(self.bulk_redacted_text) + '\n' + for x in self.de_identify_results: + result += f"{x.describe()}\n" + return result + + def get_usage(self): + return self.usage diff --git a/tonic_textual/redact_api.py b/tonic_textual/redact_api.py index e3fbe85..a0c4c0f 100644 --- a/tonic_textual/redact_api.py +++ b/tonic_textual/redact_api.py @@ -13,6 +13,7 @@ SingleDetectionResult, ) from tonic_textual.classes.httpclient import HttpClient +from tonic_textual.classes.redact_api_responses.bulk_redaction_response import BulkRedactionResponse from tonic_textual.classes.redact_api_responses.redaction_response import ( RedactionResponse, ) @@ -291,6 +292,87 @@ def redact( } return self.send_redact_request("/api/redact", payload, random_seed) + + def redact_bulk( + self, + strings: List[str], + generator_config: Dict[str, PiiState] = dict(), + generator_default: PiiState = PiiState.Redaction, + random_seed: Optional[int] = None, + label_block_lists: Optional[Dict[str, List[str]]] = None, + label_allow_lists: Optional[Dict[str, List[str]]] = None, + ) -> RedactionResponse: + """Redacts a string. Depending on the configured handling for each sensitive + data type, values can be either redacted, synthesized, or ignored. 
+ + Parameters + ---------- + strings : List[str] + The array of strings to redact. + + generator_config: Dict[str, PiiState] + A dictionary of sensitive data entities. For each entity, indicates whether + to redact, synthesize, or ignore it. + Values must be one of "Redaction", "Synthesis", or "Off". + + generator_default: PiiState = PiiState.Redaction + The default redaction used for all types not specified in generator_config. + Values must be one of "Redaction", "Synthesis", or "Off". + + random_seed: Optional[int] = None + An optional value to use to override Textual's default random number + seeding. Can be used to ensure that different API calls use the same or + different random seeds. + + label_block_lists: Optional[Dict[str, List[str]]] + A dictionary of (entity type, ignored values). When a value for an entity type matches a listed regular expression, + the value is ignored and is not redacted or synthesized. + + label_allow_lists: Optional[Dict[str, List[str]]] + A dictionary of (entity type, additional values). When a piece of text matches a listed regular expression, + the text is marked as the entity type and is included in the redaction or synthesis. + + + Returns + ------- + RedactionResponse + The redacted string along with ancillary information. 
+ + Examples + -------- + >>> textual.redact_bulk( + >>> ["John Smith is a person", "I live in Atlanta"], + >>> # only redacts NAME_GIVEN + >>> generator_config={"NAME_GIVEN": "Redaction"}, + >>> generator_default="Off", + >>> # Occurrences of "There" are treated as NAME_GIVEN entities + >>> label_allow_lists={"NAME_GIVEN": ["There"]}, + >>> # Text matching the regex ` ([a-z]{2}) ` is not treated as an occurrence of NAME_FAMILY + >>> label_block_lists={"NAME_FAMILY": [" ([a-z]{2}) "]}, + >>> ) + + + """ + + validate_generator_options(generator_default, generator_config) + payload = { + "bulkText": strings, + "generatorDefault": generator_default, + "generatorConfig": generator_config, + } + + if label_block_lists is not None: + payload["labelBlockLists"] = { + k: LabelCustomList(regexes=v).to_dict() + for k, v in label_block_lists.items() + } + if label_allow_lists is not None: + payload["labelAllowLists"] = { + k: LabelCustomList(regexes=v).to_dict() + for k, v in label_allow_lists.items() + } + + return self.send_redact_bulk_request("/api/redact/bulk", payload, random_seed) def llm_synthesis( self, @@ -598,6 +680,52 @@ def send_redact_request( de_id_results, ) + def send_redact_bulk_request( + self, + endpoint: str, + payload: Dict, + random_seed: Optional[int] = None, + ) -> BulkRedactionResponse: + """Helper function to send redact requests, handle responses, and catch errors.""" + + if random_seed is not None: + additional_headers = {"textual-random-seed": str(random_seed)} + else: + additional_headers = {} + + try: + response = self.client.http_post( + endpoint, data=payload, additional_headers=additional_headers + ) + except requests.exceptions.HTTPError as e: + if e.response.status_code == 400: + raise InvalidJsonForRedactionRequest(e.response.text) + raise e + + de_id_results = [ + Replacement( + start=x["start"], + end=x["end"], + new_start=x.get("newStart"), + new_end=x.get("newEnd"), + label=x["label"], + text=x["text"], + new_text=x.get("newText"), + 
score=x["score"], + language=x.get("language"), + example_redaction=x.get("exampleRedaction"), + idx=x.get("idx") + ) + for x in response["deIdentifyResults"] + ] + + return BulkRedactionResponse( + response["bulkText"], + response["bulkRedactedText"], + response["usage"], + de_id_results, + ) + def start_file_redaction(self, file: io.IOBase, file_name: str) -> str: """ Redact a provided file From 0dd1918d4d0c146209c2c393b41480d69963fd6b Mon Sep 17 00:00:00 2001 From: Adam Kamor Date: Thu, 21 Nov 2024 12:47:33 -0500 Subject: [PATCH 2/5] hide toctree --- docs/source/redact/index.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/redact/index.rst b/docs/source/redact/index.rst index 5a29104..72c9971 100644 --- a/docs/source/redact/index.rst +++ b/docs/source/redact/index.rst @@ -64,6 +64,7 @@ To learn more about how to generate redacted and synthesized files, go to :doc:` .. toctree:: :maxdepth: 2 + :hidden: redacting_files @@ -77,5 +78,6 @@ To help automate workflows, you can work with datasets directly from the SDK. To .. toctree:: :maxdepth: 2 + :hidden: datasets From 8207e0bcf2d777741a851ab90699e1983362c924 Mon Sep 17 00:00:00 2001 From: Adam Kamor Date: Thu, 21 Nov 2024 12:54:21 -0500 Subject: [PATCH 3/5] adding clarifying note --- docs/source/redact/redacting_text.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/redact/redacting_text.rst b/docs/source/redact/redacting_text.rst index d267dcd..b981d9b 100644 --- a/docs/source/redact/redacting_text.rst +++ b/docs/source/redact/redacting_text.rst @@ -63,7 +63,7 @@ In the same way that our `redact` method can be used to redact strings our `reda raw_redaction = textual.redact_bulk(["Tonic was founded in 2018", "John Smith is a person"]) print(raw_redaction.describe()) -This produces the following output: +This produces the following output. Note that the 'idx' property denotes the position in the original string list to which the result pertains. .. 
code-block:: console From 0d0318fbc8cae96af0290a91f9104dacff885737 Mon Sep 17 00:00:00 2001 From: Joe Ferrara Date: Thu, 21 Nov 2024 12:18:13 -0700 Subject: [PATCH 4/5] make de_identify_results List[List[Replacement]] --- docs/source/redact/redacting_text.rst | 96 +++++++++---------- .../common_api_responses/replacement.py | 7 -- .../bulk_redaction_response.py | 10 +- tonic_textual/redact_api.py | 32 +++---- 4 files changed, 69 insertions(+), 76 deletions(-) diff --git a/docs/source/redact/redacting_text.rst b/docs/source/redact/redacting_text.rst index b981d9b..f3d650e 100644 --- a/docs/source/redact/redacting_text.rst +++ b/docs/source/redact/redacting_text.rst @@ -67,56 +67,52 @@ This produces the following output. Note that the 'idx' property denotes the po .. code-block:: console - [ORGANIZATION_P5XLAH] was founded in [DATE_TIME_0FW53] - [NAME_GIVEN_HI1h7] [NAME_FAMILY_5oMl28] is a person - { - "start": 0, - "end": 5, - "new_start": 0, - "new_end": 21, - "label": "ORGANIZATION", - "text": "Tonic", - "score": 0.9, - "language": "en", - "new_text": "[ORGANIZATION_P5XLAH]", - "idx": 0 - } - { - "start": 21, - "end": 25, - "new_start": 37, - "new_end": 54, - "label": "DATE_TIME", - "text": "2018", - "score": 0.9, - "language": "en", - "new_text": "[DATE_TIME_0FW53]", - "idx": 0 - } - { - "start": 0, - "end": 4, - "new_start": 0, - "new_end": 18, - "label": "NAME_GIVEN", - "text": "John", - "score": 0.9, - "language": "en", - "new_text": "[NAME_GIVEN_HI1h7]", - "idx": 1 - } - { - "start": 5, - "end": 10, - "new_start": 19, - "new_end": 39, - "label": "NAME_FAMILY", - "text": "Smith", - "score": 0.9, - "language": "en", - "new_text": "[NAME_FAMILY_5oMl28]", - "idx": 1 - } +[ORGANIZATION_5Ve7OH] was founded in [DATE_TIME_DnuC1] +{ + "start": 0, + "end": 5, + "new_start": 0, + "new_end": 21, + "label": "ORGANIZATION", + "text": "Tonic", + "score": 0.9, + "language": "en", + "new_text": "[ORGANIZATION_5Ve7OH]" +} +{ + "start": 21, + "end": 25, + "new_start": 37, + 
"new_end": 54, + "label": "DATE_TIME", + "text": "2018", + "score": 0.9, + "language": "en", + "new_text": "[DATE_TIME_DnuC1]" +} +[NAME_GIVEN_dySb5] [NAME_FAMILY_7w4Db3] is a person +{ + "start": 0, + "end": 4, + "new_start": 0, + "new_end": 18, + "label": "NAME_GIVEN", + "text": "John", + "score": 0.9, + "language": "en", + "new_text": "[NAME_GIVEN_dySb5]" +} +{ + "start": 5, + "end": 10, + "new_start": 19, + "new_end": 39, + "label": "NAME_FAMILY", + "text": "Smith", + "score": 0.9, + "language": "en", + "new_text": "[NAME_FAMILY_7w4Db3]" +} Redact JSON data ---------------- diff --git a/tonic_textual/classes/common_api_responses/replacement.py b/tonic_textual/classes/common_api_responses/replacement.py index 67c9b64..2835842 100644 --- a/tonic_textual/classes/common_api_responses/replacement.py +++ b/tonic_textual/classes/common_api_responses/replacement.py @@ -37,8 +37,6 @@ class Replacement(dict): xml_path : Optional[str] The xpath of the entity in the original XML document. This is only present if the input text was an XML document. NOTE: Arrays in xpath are 1-based - idx : Optional[int] - The index in the original bulk text array to which the NER result corresponds. 
NOTE: This is only used when calling our bulk redaction methods """ def __init__( @@ -55,7 +53,6 @@ def __init__( example_redaction: Optional[str] = None, json_path: Optional[str] = None, xml_path: Optional[str] = None, - idx: Optional[int] = None, ): self.start = start self.end = end @@ -69,7 +66,6 @@ def __init__( self.example_redaction = example_redaction self.json_path = json_path self.xml_path = xml_path - self.idx = idx dict.__init__( self, @@ -89,7 +85,6 @@ def __init__( ), **({} if json_path is None else {"json_path": json_path}), **({} if xml_path is None else {"xml_path": xml_path}), - **({} if idx is None else {"idx": idx}), ) def describe(self) -> str: @@ -114,6 +109,4 @@ def to_dict(self) -> Dict: out["json_path"] = self.json_path if self.xml_path is not None: out["xml_path"] = self.xml_path - if self.idx is not None: - out["idx"] = self.idx return out diff --git a/tonic_textual/classes/redact_api_responses/bulk_redaction_response.py b/tonic_textual/classes/redact_api_responses/bulk_redaction_response.py index 1cd0e26..f4fcc98 100644 --- a/tonic_textual/classes/redact_api_responses/bulk_redaction_response.py +++ b/tonic_textual/classes/redact_api_responses/bulk_redaction_response.py @@ -39,9 +39,13 @@ def __init__( def describe(self) -> str: - result = '\n'.join(self.bulk_redacted_text) + '\n' - for x in self.de_identify_results: - result += f"{x.describe()}\n" + result = "" + for redacted_text, de_id_res in zip( + self.bulk_redacted_text, self.de_identify_results + ): + result += f"{redacted_text}\n" + for replacement in de_id_res: + result += f"{replacement.describe()}\n" return result def get_usage(self): diff --git a/tonic_textual/redact_api.py b/tonic_textual/redact_api.py index ae2414d..2333de2 100644 --- a/tonic_textual/redact_api.py +++ b/tonic_textual/redact_api.py @@ -703,23 +703,23 @@ def send_redact_bulk_request( if e.response.status_code == 400: raise InvalidJsonForRedactionRequest(e.response.text) raise e - - de_id_results = [ - 
Replacement( - start=x["start"], - end=x["end"], - new_start=x.get("newStart"), - new_end=x.get("newEnd"), - label=x["label"], - text=x["text"], - new_text=x.get("newText"), - score=x["score"], - language=x.get("language"), - example_redaction=x.get("exampleRedaction"), - idx=x.get("idx") + + de_id_results = [[] for i in range(len(response["bulkText"]))] + for x in response["deIdentifyResults"]: + de_id_results[x["idx"]].append( + Replacement( + start=x["start"], + end=x["end"], + new_start=x.get("newStart"), + new_end=x.get("newEnd"), + label=x["label"], + text=x["text"], + new_text=x.get("newText"), + score=x["score"], + language=x.get("language"), + example_redaction=x.get("exampleRedaction"), + ) ) - for x in response["deIdentifyResults"] - ] return BulkRedactionResponse( response["bulkText"], From f8c054a3d12165edca1427c640baa56003dc96ab Mon Sep 17 00:00:00 2001 From: Adam Kamor <9391841+akamor@users.noreply.github.com> Date: Thu, 21 Nov 2024 14:33:04 -0500 Subject: [PATCH 5/5] Update docs/source/redact/redacting_text.rst Co-authored-by: ander steele --- docs/source/redact/redacting_text.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/redact/redacting_text.rst b/docs/source/redact/redacting_text.rst index f3d650e..21b9d35 100644 --- a/docs/source/redact/redacting_text.rst +++ b/docs/source/redact/redacting_text.rst @@ -63,7 +63,7 @@ In the same way that our `redact` method can be used to redact strings our `reda raw_redaction = textual.redact_bulk(["Tonic was founded in 2018", "John Smith is a person"]) print(raw_redaction.describe()) -This produces the following output. Note that the 'idx' property denotes the position in the original string list to which the result pertains. +This produces the following output: .. code-block:: console