Skip to content

Commit

Permalink
Merge pull request #114 from microsoft/omri/data_generation_bug_fixes
Browse files Browse the repository at this point in the history
minor changes to data generator
  • Loading branch information
omri374 authored Jan 8, 2025
2 parents 9aba87c + 8e7a47c commit 0a2b787
Show file tree
Hide file tree
Showing 7 changed files with 188 additions and 209 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.2.0
0.2.2
282 changes: 142 additions & 140 deletions notebooks/1_Generate_data.ipynb

Large diffs are not rendered by default.

21 changes: 21 additions & 0 deletions presidio_evaluator/data_generator/faker_extensions/data_objects.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import warnings
from dataclasses import dataclass
import dataclasses
import json
Expand All @@ -16,6 +17,16 @@ class FakerSpan:
end: int
type: str

def __new__(cls, *args, **kwargs):
warnings.warn(
"FakerSpan is deprecated and will be removed in future versions."
"Use Span instead",
category=DeprecationWarning,
stacklevel=2,
)

return super().__new__(cls)

    def __repr__(self):
        """Return the span's fields serialized as a JSON object string."""
        return json.dumps(dataclasses.asdict(self))

Expand All @@ -31,6 +42,16 @@ class FakerSpansResult:
template_id: Optional[int] = None
sample_id: Optional[int] = None

def __new__(cls, *args, **kwargs):
warnings.warn(
"FakerSpansResult is deprecated and will be removed in future versions."
"Use InputSample instead",
category=DeprecationWarning,
stacklevel=2,
)

return super().__new__(cls)

    def __str__(self):
        """Return the generated fake sentence text (the ``fake`` field)."""
        return self.fake

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def parse(
# Update span indices
delta = new_len - old_len
span.end_position = span.end_position + delta
span.type = formatter.strip()
span.entity_type = formatter.strip()

# Update previously inserted spans since indices shifted
for j in range(0, i):
Expand Down
74 changes: 13 additions & 61 deletions presidio_evaluator/data_generator/presidio_data_generator.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import dataclasses
import json
import random
import re
from pathlib import Path
import warnings
from typing import List, Optional, Union, Generator

import numpy as np
Expand All @@ -13,16 +11,11 @@
from tqdm import tqdm

from presidio_evaluator.data_generator.faker_extensions import (
FakerSpansResult,
NationalityProvider,
OrganizationProvider,
UsDriverLicenseProvider,
IpAddressProvider,
AddressProviderNew,
SpanGenerator,
RecordsFaker,
PhoneNumberProviderNew,
AgeProvider,
)

from presidio_evaluator.data_generator.faker_extensions.data_objects import (
FakerSpansResult,
)


Expand All @@ -43,8 +36,6 @@ def __init__(
:example:
>>>from presidio_evaluator.data_generator import PresidioDataGenerator
>>>sentence_templates = [
>>> "My name is {{name}}",
>>> "Please send it to {{address}}",
Expand All @@ -67,6 +58,14 @@ def __init__(
[{"value": "Ukraine", "start": 31, "end": 38, "type": "country"}, {"value": "North Kim", "start": 16, "end": 25, "type": "city"}]
"""

warnings.warn(
"PresidioDataGenerator is deprecated and will be removed in future versions."
"Use PresidioSentenceFaker instead",
category=DeprecationWarning,
stacklevel=2,
)

if custom_faker and locale:
raise ValueError(
"If a custom faker is passed, it's expected to have its locales loaded"
Expand Down Expand Up @@ -280,50 +279,3 @@ def name_gendered(row):

fake_data = pd.concat([fake_data, genderized], axis="columns")
return fake_data


if __name__ == "__main__":
    # Demo entry point: generate 10,000 fake sentences from the bundled
    # templates + FakeNameGenerator records and dump them to JSON.
    PresidioDataGenerator.seed(42)

    template_file_path = Path(Path(__file__).parent, "raw_data", "templates.txt")

    # Read FakeNameGenerator data
    fake_data_df = pd.read_csv(
        Path(Path(__file__).parent, "raw_data", "FakeNameGenerator.com_3000.csv")
    )
    # Convert column names to lowercase to match patterns
    fake_data_df = PresidioDataGenerator.update_fake_name_generator_df(fake_data_df)

    # Create a RecordsFaker (Faker object which prefers samples multiple objects from one record)
    # NOTE(review): original passed the misspelled keyword `local="en_US"`;
    # Faker's constructor parameter is `locale`, so the intended locale was
    # silently ignored. Fixed to `locale` — confirm RecordsFaker forwards
    # kwargs to faker.Faker.
    faker = RecordsFaker(records=fake_data_df, locale="en_US")
    faker.add_provider(IpAddressProvider)
    faker.add_provider(NationalityProvider)
    faker.add_provider(OrganizationProvider)
    faker.add_provider(UsDriverLicenseProvider)
    faker.add_provider(AgeProvider)
    faker.add_provider(AddressProviderNew)  # More address formats than Faker
    faker.add_provider(PhoneNumberProviderNew)  # More phone number formats than Faker

    # Create Presidio Data Generator
    data_generator = PresidioDataGenerator(custom_faker=faker, lower_case_ratio=0.05)
    # Map Faker provider names onto the entity names used in the templates.
    data_generator.add_provider_alias(provider_name="name", new_name="person")
    data_generator.add_provider_alias(
        provider_name="credit_card_number", new_name="credit_card"
    )
    data_generator.add_provider_alias(
        provider_name="date_of_birth", new_name="birthday"
    )

    sentence_templates = PresidioDataGenerator.read_template_file(template_file_path)
    fake_patterns = data_generator.generate_fake_data(
        templates=sentence_templates, n_samples=10000
    )

    # save to json
    output_file = Path(
        Path(__file__).parent.parent.parent, "data", "presidio_data_generator_data.json"
    )

    to_json = [dataclasses.asdict(pattern) for pattern in fake_patterns]
    # open() accepts a Path directly — the original "{}".format(output_file)
    # wrapper was a redundant str conversion.
    with open(output_file, "w+", encoding="utf-8") as f:
        json.dump(to_json, f, ensure_ascii=False, indent=2)
12 changes: 7 additions & 5 deletions presidio_evaluator/data_generator/presidio_sentence_faker.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,18 +170,20 @@ def generate_new_fake_sentences(self, num_samples: int) -> List[InputSample]:
template = self._preprocess_template(template)
fake_sentence_result = self._sentence_faker.parse(template, template_id)
for span in fake_sentence_result.spans:
if span.type in self._entity_type_mapping.keys():
if span.entity_type in self._entity_type_mapping.keys():
# Use the mapped entity type if exists
span.type = self._entity_type_mapping[span.type]
span.entity_type = self._entity_type_mapping[span.entity_type]
else:
# Otherwise, capitalize the entity type and add to the mapping
print(
f"Warning: Non-mapped entity type found: {span.type}. "
f"Non-mapped entities will be mapped to {span.type.upper()} "
f"Warning: Non-mapped entity type found: {span.entity_type}. "
f"Non-mapped entities will be mapped to {span.entity_type.upper()} "
f"in the output dataset. If you prefer a different mapping, "
f"pass the `entity_type_mapping` argument with a mapping for this entity type."
)
self._entity_type_mapping[span.type] = span.type.upper()
self._entity_type_mapping[span.entity_type] = (
span.entity_type.upper()
)
for key, value in self._entity_type_mapping.items():
fake_sentence_result.masked = fake_sentence_result.masked.replace(
"{{%s}}" % key, "{{%s}}" % value
Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "presidio_evaluator"
version = "0.2.1"
version = "0.2.2"
description = ""
authors = ["Microsoft"]
readme = "README.md"
Expand Down Expand Up @@ -47,6 +47,8 @@ build-backend = "poetry.core.masonry.api"
[tool.ruff]
line-length = 88
exclude = [".git", "__pycache__", "build", "dist", "tests"]

[tool.ruff.lint]
ignore = ["E203", "D100", "D202"]

[tool.pytest.ini_options]
Expand Down

0 comments on commit 0a2b787

Please sign in to comment.