Move the partition-validity checks into anonypy/mondrian.py.

What this patch does (all statements below are read off the hunks):
  * anonypy/anonypy.py: removes is_k_anonymous, diversity, is_l_diverse,
    t_closeness and is_t_close; adds a comment and a `series.astype` call
    to agg_categorical_column.
  * anonypy/mondrian.py: adds module-level is_k_anonymous, is_l_diverse,
    t_closeness and is_t_close; Mondrian now keeps the whole dataframe plus
    feature_columns and an optional sensitive_column; adds
    Mondrian.is_valid; renames _get_spans/_split to get_spans/split;
    Mondrian.partition takes k, l, global_freqs and p instead of an
    is_valid callback.
  * setup.py: version 0.0.1 -> 0.0.2.
  * tests/anonypy_test.py: updates callers to the new API and adds
    test_build_anonymized_dataset2, which passes global_freqs.

NOTE(review): this patch was recovered from a copy whose line breaks had
been lost. Blank context lines were re-inserted so that every hunk matches
the line counts in its `@@` header; their exact positions are inferred, so
check the hunks against the target tree before applying.

NOTE(review): in agg_categorical_column the result of
`series.astype('category')` is discarded, so as written the call does not
change `series` -- confirm whether `series = series.astype(...)` was meant.
NOTE(review): `import types` is not used by any line visible in this patch.
NOTE(review): Mondrian.is_valid defaults to k=2 while Mondrian.partition
defaults to k=3 -- confirm the difference is intentional.

diff --git a/anonypy/anonypy.py b/anonypy/anonypy.py
index cac0f3d..379e177 100644
--- a/anonypy/anonypy.py
+++ b/anonypy/anonypy.py
@@ -1,48 +1,11 @@
-def is_k_anonymous(df, partition, k=3):
-    if len(partition) < k:
-        return False
-    return True
-
-def diversity(df, partition, column):
-    return len(df[column][partition].unique())
-
-def is_l_diverse(df, partition, sensitive_column, l=2):
-    """
-    :param df: The dataframe for which to check l-diversity
-    :param partition: The partition of the dataframe on which to check l-diversity
-    :param sensitive_column: The name of the sensitive column
-    :param l: The minimum required diversity of sensitive attribute values in the partition
-    """
-    return diversity(df, partition, sensitive_column) >= l
-
-def t_closeness(df, partition, column, global_freqs):
-    total_count = float(len(partition))
-    d_max = None
-    group_counts = df.loc[partition].groupby(column)[column].agg('count')
-    for value, count in group_counts.to_dict().items():
-        p = count/total_count
-        d = abs(p-global_freqs[value])
-        if d_max is None or d > d_max:
-            d_max = d
-    return d_max
-
-def is_t_close(df, partition, sensitive_column, global_freqs, p=0.2):
-    """
-    :param df: The dataframe for which to check l-diversity
-    :param partition: The partition of the dataframe on which to check l-diversity
-    :param sensitive_column: The name of the sensitive column
-    :param global_freqs: The global frequencies of the sensitive attribute values
-    :param p: The maximum allowed Kolmogorov-Smirnov distance
-    """
-    return t_closeness(df, partition, sensitive_column, global_freqs) <= p
-
 def agg_categorical_column(series):
+    # this is workaround for dtype bug of series
+    series.astype('category')
     return [','.join(set(series))]
 
 def agg_numerical_column(series):
     return [series.mean()]
 
-
 def build_anonymized_dataset(df, partitions, feature_columns, sensitive_column, max_partitions=None):
     aggregations = {}
     for column in feature_columns:
diff --git a/anonypy/mondrian.py b/anonypy/mondrian.py
index 6bf1104..fb970d9 100644
--- a/anonypy/mondrian.py
+++ b/anonypy/mondrian.py
@@ -1,11 +1,54 @@
-class Mondrian:
-    df = []
-    def __init__(self, df, feature_columns):
-        self.df = df[feature_columns]
+import types
+
+def is_k_anonymous(partition, k):
+    if len(partition) < k:
+        return False
+    return True
+
+def is_l_diverse(sensitive_series, partition, l):
+    diversity = len(sensitive_series[partition].unique())
+    return diversity >= l
+
+def t_closeness(df, partition, column, global_freqs):
+    total_count = float(len(partition))
+    d_max = None
+    group_counts = df.loc[partition].groupby(column)[column].agg('count')
+    for value, count in group_counts.to_dict().items():
+        p = count/total_count
+        d = abs(p-global_freqs[value])
+        if d_max is None or d > d_max:
+            d_max = d
+    return d_max
 
-    def _get_spans(self, partition, scale=None):
+def is_t_close(df, partition, sensitive_column, global_freqs, p):
+    return t_closeness(df, partition, sensitive_column, global_freqs) <= p
+
+class Mondrian:
+    def __init__(self, df, feature_columns, sensitive_column=None):
+        self.df = df
+        self.feature_columns = feature_columns
+        self.sensitive_column = sensitive_column
+
+    def is_valid(self, partition, k=2, l=0, global_freqs=None, p=0.2):
+        # k-anonymous
+        if not is_k_anonymous(partition, k):
+            return False
+        # l-diverse
+        if l > 0 and self.sensitive_column is not None:
+            diverse = is_l_diverse(self.df[self.sensitive_column], partition, l)
+            if not diverse:
+                return False
+        # t-close
+        if global_freqs is not None and self.sensitive_column is not None:
+            close = is_t_close(self.df, partition, self.sensitive_column, global_freqs, p)
+            if not close:
+                return False
+
+        return True
+
+    def get_spans(self, partition, scale=None):
         spans = {}
-        for column in self.df.columns:
+        for column in self.feature_columns:
             if self.df[column].dtype.name == 'category':
                 span = len(self.df[column][partition].unique())
             else:
@@ -15,7 +58,7 @@ def _get_spans(self, partition, scale=None):
             spans[column] = span
         return spans
 
-    def _split(self, column, partition):
+    def split(self, column, partition):
         dfp = self.df[column][partition]
         if dfp.dtype.name == 'category':
             values = dfp.unique()
@@ -28,17 +71,18 @@ def _split(self, column, partition):
             dfr = dfp.index[dfp >= median]
             return (dfl, dfr)
 
-    def partition(self, is_valid):
-        scale = self._get_spans(self.df.index)
+    def partition(self, k=3, l=0, global_freqs=None, p=0.2):
+        scale = self.get_spans(self.df.index)
 
         finished_partitions = []
         partitions = [self.df.index]
         while partitions:
             partition = partitions.pop(0)
-            spans = self._get_spans(partition, scale)
+            spans = self.get_spans(partition, scale)
             for column, span in sorted(spans.items(), key=lambda x:-x[1]):
-                lp, rp = self._split(column, partition)
-                if not is_valid(self.df, lp) or not is_valid(self.df, rp):
+                lp, rp = self.split(column, partition)
+                if not self.is_valid(lp, k, l, global_freqs, p) \
+                   or not self.is_valid(rp, k, l, global_freqs, p):
                     continue
                 partitions.extend((lp, rp))
                 break
diff --git a/setup.py b/setup.py
index a5e2947..12a379a 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,7 @@
 
 setup(
     name='anonypy',
-    version='0.0.1',
+    version='0.0.2',
     packages=find_packages(),
     author='glassonion1',
     author_email='glassonion999@gmail.com',
diff --git a/tests/anonypy_test.py b/tests/anonypy_test.py
index 2374a5c..681c215 100644
--- a/tests/anonypy_test.py
+++ b/tests/anonypy_test.py
@@ -39,9 +39,11 @@ def test_build_anonymized_dataset():
     for name in categorical:
         df[name] = df[name].astype('category')
 
-    feature_columns = ['age', 'education-num']
-    m = anonypy.Mondrian(df, feature_columns)
-    finished_partitions = m.partition(anonypy.is_k_anonymous)
+    feature_columns = ['age', 'education']
+    sensitive_column = 'income'
+
+    m = anonypy.Mondrian(df, feature_columns, sensitive_column)
+    finished_partitions = m.partition(k=3, l=2)
 
     print(len(finished_partitions))
     print(finished_partitions[0])
@@ -52,11 +54,34 @@ def test_build_anonymized_dataset():
 
     print(rects[:10])
 
-    sensitive_column = 'income'
     rows = anonypy.build_anonymized_dataset(df, finished_partitions, feature_columns, sensitive_column)
     dfn = pd.DataFrame(rows)
     print(dfn.sort_values(feature_columns+[sensitive_column]))
 
+def test_build_anonymized_dataset2():
+    path = 'data/adult.test.txt'
+    df = pd.read_csv(path, sep=', ', names=names, engine='python')
+
+    for name in categorical:
+        df[name] = df[name].astype('category')
+
+    feature_columns = ['age', 'education-num']
+    sensitive_column = 'income'
+
+    global_freqs = {}
+    total_count = float(len(df))
+    group_counts = df.groupby(sensitive_column)[sensitive_column].agg('count')
+    for value, count in group_counts.to_dict().items():
+        p = count/total_count
+        global_freqs[value] = p
+
+    m = anonypy.Mondrian(df, feature_columns, sensitive_column)
+    finished_partitions = m.partition(k=3, global_freqs=global_freqs)
+
+    rows = anonypy.build_anonymized_dataset(df, finished_partitions, feature_columns, sensitive_column)
+    dfn = pd.DataFrame(rows)
+    print(dfn.sort_values(feature_columns+[sensitive_column]))
+
 def test_get_spans():
     path = 'data/adult.test.txt'
     df = pd.read_csv(path, sep=', ', names=names, engine='python')
@@ -66,13 +91,13 @@ def test_get_spans():
 
     feature_columns = ['age', 'education-num']
     m = anonypy.Mondrian(df, feature_columns)
-    spans = m._get_spans(df.index)
+    spans = m.get_spans(df.index)
 
     assert {'age': 73, 'education-num': 15} == spans
 
     feature_columns = ['sex', 'income', 'native-country', 'race']
     m = anonypy.Mondrian(df, feature_columns)
-    spans = m._get_spans(df.index)
+    spans = m.get_spans(df.index)
 
     assert {'income': 2, 'sex': 2, 'native-country': 41, 'race': 5} == spans
 