Skip to content

Commit

Permalink
Change the Mondrian class
Browse files Browse the repository at this point in the history
  • Loading branch information
glassonion1 committed Oct 21, 2021
1 parent 9ac8f70 commit f2081ab
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 58 deletions.
41 changes: 2 additions & 39 deletions anonypy/anonypy.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,11 @@
def is_k_anonymous(df, partition, k=3):
    """Return True when the partition contains at least k records.

    :param df: unused; kept for a uniform validator signature
    :param partition: index collection describing the partition
    :param k: minimum partition size required for k-anonymity
    """
    return len(partition) >= k

def diversity(df, partition, column):
    """Count the distinct values of *column* within the partition."""
    values_in_partition = df[column][partition]
    return values_in_partition.unique().size

def is_l_diverse(df, partition, sensitive_column, l=2):
    """
    :param df: The dataframe for which to check l-diversity
    :param partition: The partition of the dataframe on which to check l-diversity
    :param sensitive_column: The name of the sensitive column
    :param l: The minimum required diversity of sensitive attribute values in the partition
    """
    # Distinct sensitive values in this partition (diversity helper inlined).
    distinct = df[sensitive_column][partition].unique()
    return len(distinct) >= l

def t_closeness(df, partition, column, global_freqs):
    """Maximum gap between the partition's value frequencies and the
    global frequencies of *column*.

    Returns None when the partition yields no groups at all.
    """
    n = float(len(partition))
    counts = df.loc[partition].groupby(column)[column].agg('count').to_dict()
    worst = None
    for value, count in counts.items():
        delta = abs(count / n - global_freqs[value])
        if worst is None or delta > worst:
            worst = delta
    return worst

def is_t_close(df, partition, sensitive_column, global_freqs, p=0.2):
    """
    Check whether a partition satisfies t-closeness.

    The original docstring was copy-pasted from is_l_diverse and wrongly
    described this as an l-diversity check; the logic is t-closeness.

    :param df: The dataframe for which to check t-closeness
    :param partition: The partition of the dataframe on which to check t-closeness
    :param sensitive_column: The name of the sensitive column
    :param global_freqs: The global frequencies of the sensitive attribute values
    :param p: The maximum allowed Kolmogorov-Smirnov distance
    """
    return t_closeness(df, partition, sensitive_column, global_freqs) <= p

def agg_categorical_column(series):
    """Aggregate a categorical series into one combined label.

    Joins the distinct values with commas. The values are sorted so the
    label is deterministic — ``','.join(set(series))`` depended on set
    iteration order and could vary between runs.

    Note: the original also called ``series.astype('category')`` and
    discarded the result (a no-op); that dead statement is removed.

    :param series: pandas Series of string/category values
    :return: single-element list holding the joined label
    """
    return [','.join(sorted(set(series)))]

def agg_numerical_column(series):
    """Aggregate a numerical series to its mean, wrapped in a list."""
    mean_value = series.mean()
    return [mean_value]


def build_anonymized_dataset(df, partitions, feature_columns, sensitive_column, max_partitions=None):
aggregations = {}
for column in feature_columns:
Expand Down
68 changes: 56 additions & 12 deletions anonypy/mondrian.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,54 @@
class Mondrian:
df = []
def __init__(self, df, feature_columns):
self.df = df[feature_columns]
import types

def is_k_anonymous(partition, k):
    """Return True when the partition holds at least k records.

    :param partition: index collection describing the partition
    :param k: minimum required partition size
    """
    return len(partition) >= k

def is_l_diverse(sensitive_series, partition, l):
    """Return True when the partition contains at least l distinct
    sensitive values.

    :param sensitive_series: Series of sensitive-attribute values
    :param partition: index collection selecting rows of the series
    :param l: minimum required number of distinct values
    """
    return sensitive_series[partition].unique().size >= l

def t_closeness(df, partition, column, global_freqs):
    """Largest absolute difference between the partition's value
    frequencies of *column* and the supplied global frequencies.

    Returns None when no groups are present.
    """
    total = float(len(partition))
    group_counts = df.loc[partition].groupby(column)[column].agg('count')
    distances = [
        abs(count / total - global_freqs[value])
        for value, count in group_counts.to_dict().items()
    ]
    return max(distances) if distances else None

def _get_spans(self, partition, scale=None):
def is_t_close(df, partition, sensitive_column, global_freqs, p):
    """Return True when the partition's maximum frequency distance from
    the global distribution is at most p.

    :param df: dataframe holding the sensitive column
    :param partition: index collection describing the partition
    :param sensitive_column: name of the sensitive column
    :param global_freqs: mapping of sensitive value -> global frequency
    :param p: maximum allowed distance
    """
    distance = t_closeness(df, partition, sensitive_column, global_freqs)
    return distance <= p

class Mondrian:
def __init__(self, df, feature_columns, sensitive_column=None):
self.df = df
self.feature_columns = feature_columns
self.sensitive_column = sensitive_column

def is_valid(self, partition, k=2, l=0, global_freqs=None, p=0.2):
    """Check the partition against every enabled privacy criterion.

    k-anonymity is always enforced; l-diversity only when l > 0 and a
    sensitive column was configured; t-closeness only when global_freqs
    is supplied and a sensitive column was configured.

    :return: True when all enabled criteria pass, else False
    """
    if not is_k_anonymous(partition, k):
        return False

    has_sensitive = self.sensitive_column is not None

    # l-diversity
    if l > 0 and has_sensitive:
        if not is_l_diverse(self.df[self.sensitive_column], partition, l):
            return False

    # t-closeness
    if global_freqs is not None and has_sensitive:
        if not is_t_close(self.df, partition, self.sensitive_column, global_freqs, p):
            return False

    return True

def get_spans(self, partition, scale=None):
spans = {}
for column in self.df.columns:
for column in self.feature_columns:
if self.df[column].dtype.name == 'category':
span = len(self.df[column][partition].unique())
else:
Expand All @@ -15,7 +58,7 @@ def _get_spans(self, partition, scale=None):
spans[column] = span
return spans

def _split(self, column, partition):
def split(self, column, partition):
dfp = self.df[column][partition]
if dfp.dtype.name == 'category':
values = dfp.unique()
Expand All @@ -28,17 +71,18 @@ def _split(self, column, partition):
dfr = dfp.index[dfp >= median]
return (dfl, dfr)

def partition(self, is_valid):
scale = self._get_spans(self.df.index)
def partition(self, k=3, l=0, global_freqs=None, p=0.2):
scale = self.get_spans(self.df.index)

finished_partitions = []
partitions = [self.df.index]
while partitions:
partition = partitions.pop(0)
spans = self._get_spans(partition, scale)
spans = self.get_spans(partition, scale)
for column, span in sorted(spans.items(), key=lambda x:-x[1]):
lp, rp = self._split(column, partition)
if not is_valid(self.df, lp) or not is_valid(self.df, rp):
lp, rp = self.split(column, partition)
if not self.is_valid(lp, k, l, global_freqs, p) \
or not self.is_valid(rp, k, l, global_freqs, p):
continue
partitions.extend((lp, rp))
break
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

setup(
name='anonypy',
version='0.0.1',
version='0.0.2',
packages=find_packages(),
author='glassonion1',
author_email='[email protected]',
Expand Down
37 changes: 31 additions & 6 deletions tests/anonypy_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,11 @@ def test_build_anonymized_dataset():
for name in categorical:
df[name] = df[name].astype('category')

feature_columns = ['age', 'education-num']
m = anonypy.Mondrian(df, feature_columns)
finished_partitions = m.partition(anonypy.is_k_anonymous)
feature_columns = ['age', 'education']
sensitive_column = 'income'

m = anonypy.Mondrian(df, feature_columns, sensitive_column)
finished_partitions = m.partition(k=3, l=2)

print(len(finished_partitions))
print(finished_partitions[0])
Expand All @@ -52,11 +54,34 @@ def test_build_anonymized_dataset():

print(rects[:10])

sensitive_column = 'income'
rows = anonypy.build_anonymized_dataset(df, finished_partitions, feature_columns, sensitive_column)
dfn = pd.DataFrame(rows)
print(dfn.sort_values(feature_columns+[sensitive_column]))

def test_build_anonymized_dataset2():
    """Exercise Mondrian partitioning with a t-closeness constraint on
    the adult dataset, then build and print the anonymized rows."""
    path = 'data/adult.test.txt'
    df = pd.read_csv(path, sep=', ', names=names, engine='python')

    for name in categorical:
        df[name] = df[name].astype('category')

    feature_columns = ['age', 'education-num']
    sensitive_column = 'income'

    # Global frequency of each sensitive value across the whole dataset.
    total_count = float(len(df))
    group_counts = df.groupby(sensitive_column)[sensitive_column].agg('count')
    global_freqs = {
        value: count / total_count
        for value, count in group_counts.to_dict().items()
    }

    m = anonypy.Mondrian(df, feature_columns, sensitive_column)
    finished_partitions = m.partition(k=3, global_freqs=global_freqs)

    rows = anonypy.build_anonymized_dataset(df, finished_partitions, feature_columns, sensitive_column)
    dfn = pd.DataFrame(rows)
    print(dfn.sort_values(feature_columns + [sensitive_column]))

def test_get_spans():
path = 'data/adult.test.txt'
df = pd.read_csv(path, sep=', ', names=names, engine='python')
Expand All @@ -66,13 +91,13 @@ def test_get_spans():

feature_columns = ['age', 'education-num']
m = anonypy.Mondrian(df, feature_columns)
spans = m._get_spans(df.index)
spans = m.get_spans(df.index)

assert {'age': 73, 'education-num': 15} == spans

feature_columns = ['sex', 'income', 'native-country', 'race']
m = anonypy.Mondrian(df, feature_columns)
spans = m._get_spans(df.index)
spans = m.get_spans(df.index)

assert {'income': 2, 'sex': 2, 'native-country': 41, 'race': 5} == spans

0 comments on commit f2081ab

Please sign in to comment.