Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Modules
  • Loading branch information
ETHenzlere committed Jun 30, 2024
commit 1b4fb15fd2ae213ec9d69cfe2646268ad9388fcf
24 changes: 24 additions & 0 deletions config/postgres/sample_templated_config.xml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,30 @@

<query_templates_file>data/templated/example.xml</query_templates_file>

<anonymization>
<table name="item">
<differential_privacy epsilon="1.0" pre_epsilon="0.0" algorithm="mst">
<!-- Column categorization -->
<ignore>
<column name="i_id"/>
<column name="i_data" />
<column name="i_im_id" />
</ignore>
<categorical>
<column name="i_name" />
</categorical>
<!-- Continuous column fine-tuning -->
<continuous>
<column name="i_price" bins="1000" lower="2.0" upper="100.0" />
</continuous>
</differential_privacy>
<!-- Sensitive value handling -->
<value_faking>
<column name="i_name" method="name" locales="en_US" seed="0"/>
</value_faking>
</table>
</anonymization>

<!-- The workload -->
<terminals>1</terminals>
<works>
Expand Down
21 changes: 10 additions & 11 deletions scripts/anonymization/src/anonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
from modules.jdbc_handler import JDBCHandler
from configuration.config_parser import XMLParser
from configuration.configurations import DPConfig, SensitiveConfig, ContinuousConfig
#from modules.dp_anonymizer import DifferentialPrivacyAnonymizer
#from modules.sensitive_anonymizer import SensitiveAnonymizer
from modules.dp_anonymizer import DifferentialPrivacyAnonymizer
from modules.sensitive_anonymizer import SensitiveAnonymizer


def anonymize(
Expand All @@ -18,7 +18,7 @@ def anonymize(
sens_config: SensitiveConfig,
templates_path: str,
):
'''

dp_data = dataset
if anon_config:
dp_anonymizer = DifferentialPrivacyAnonymizer(dataset, anon_config, cont_config)
Expand All @@ -29,8 +29,7 @@ def anonymize(
dp_data = sens_anonymizer.run_anonymization()

return dp_data
'''
return



def anonymize_db(
Expand All @@ -40,16 +39,16 @@ def anonymize_db(
cont_config: ContinuousConfig,
templates_path: str,
):
'''
jdbc_handler.start_JVM()

jdbc_handler.start_jvm()

conn = jdbc_handler.get_connection()

table = anon_config.table_name
dataset, timestamps = jdbc_handler.data_from_table(conn, table)

datasetAnon = anonymize(
dataset, anon_config, contConfig, sensConfig, templates_path
dataset_anon = anonymize(
dataset, anon_config, cont_config, sens_config, templates_path
)

## TODO: Throw in Sensitive Anonymization
Expand All @@ -59,11 +58,11 @@ def anonymize_db(

# Populate new table
jdbc_handler.populate_anonymized_table(
conn, datasetAnon, anon_table_name, timestamps
conn, dataset_anon, anon_table_name, timestamps
)

conn.close()
'''

return


Expand Down
86 changes: 86 additions & 0 deletions scripts/anonymization/src/modules/dp_anonymizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import time
from snsynth import Synthesizer
from configuration.configurations import DPConfig,ContinuousConfig
import pandas as pd
from .preprocessor import Preprocessor

class DifferentialPrivacyAnonymizer:
    """Anonymizes a pandas DataFrame with a differentially private synthesizer.

    Columns marked as ignorable in the configuration are removed before
    synthesis and re-inserted (with their original data) at their original
    positions afterwards.
    """

    def __init__(self, dataset: pd.DataFrame, anon_config: "DPConfig", cont_config: "ContinuousConfig"):
        self.dataset = dataset
        self.anon_config = anon_config
        self.cont_config = cont_config

    def run_anonymization(self):
        """Run DP synthesis and return the anonymized DataFrame.

        Returns:
            DataFrame: synthesized data with ignorable columns restored.
            If epsilon == 0, the original data is returned unchanged.
        """
        alg = self.anon_config.algorithm
        eps = float(self.anon_config.epsilon)
        pre_eps = float(self.anon_config.preproc_eps)
        cat = self.anon_config.categorical
        cont = self.anon_config.continuous
        ordi = self.anon_config.ordinal

        saved_columns, saved_indexes = self.__remove_ignorable()

        nullable_flag = self.dataset.isnull().values.any()

        if eps > 0:
            # For epsilon > 0 we run the anonymization.
            synth = Synthesizer.create(alg, epsilon=eps, verbose=True)
            start_time = time.perf_counter()

            fit_kwargs = dict(
                preprocessor_eps=pre_eps,
                categorical_columns=cat,
                continuous_columns=cont,
                ordinal_columns=ordi,
                nullable=nullable_flag,
            )
            # A continuous-column configuration requires a custom transformer;
            # otherwise the synthesizer picks its own defaults.
            if self.cont_config:
                fit_kwargs["transformer"] = Preprocessor(self.anon_config).getTransformer(
                    self.dataset, self.cont_config
                )

            sample = synth.fit_sample(self.dataset, **fit_kwargs)
            anon_data = pd.DataFrame(sample)

            end_time = time.perf_counter()
            print(f"Process took: {(end_time-start_time):0.2f} seconds")
        else:
            print("Epsilon = 0. Anonymization will return the original data")
            anon_data = self.dataset

        return self.__add_ignorable(anon_data, saved_indexes, saved_columns)

    def __remove_ignorable(self):
        """Drop ignorable columns from self.dataset.

        Returns:
            tuple: (saved column data, original integer positions).
        """
        saved_columns = []
        saved_indexes = []
        ignore_columns = self.anon_config.hidden
        if ignore_columns:
            saved_columns = self.dataset[ignore_columns]
            for col in ignore_columns:
                saved_indexes.append(self.dataset.columns.get_loc(col))
            # Drop only when there is something to drop; drop(None, axis=1)
            # raises in pandas when the config has no hidden columns.
            self.dataset = self.dataset.drop(ignore_columns, axis=1)
        return saved_columns, saved_indexes

    def __add_ignorable(self, dataset, saved_indexes, saved_columns):
        """Re-insert previously removed columns at their original positions."""
        ignore_columns = self.anon_config.hidden
        # Guard: hidden may be None/empty, in which case nothing was removed.
        if ignore_columns:
            for ind, col in enumerate(ignore_columns):
                dataset.insert(saved_indexes[ind], col, saved_columns[col])
        return dataset
79 changes: 79 additions & 0 deletions scripts/anonymization/src/modules/preprocessor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
from snsynth.transform import BinTransformer, TableTransformer

from configuration.configurations import DPConfig, ContinuousConfig


class Preprocessor:
    """Builds snsynth TableTransformer objects from a DP configuration."""

    def __init__(self, config: "DPConfig"):
        self.config = config

    def getTransformer(self, dataset, cont_config):
        """Method that returns a TableTransformer object which can be used by DP mechanisms

        Args:
            dataset (DataFrame): Pandas DataFrame to be synthesized
            cont_config (ContinuousConfig): Configuration of continuous columns

        Returns:
            TableTransformer: A transformer object
        """
        # GAN-based synthesizers need "gan"-style encoding; everything else
        # uses the default "cube" style.
        style = "gan" if "gan" in self.config.algorithm else "cube"

        tt = TableTransformer.create(
            dataset,
            nullable=dataset.isnull().values.any(),
            categorical_columns=self.config.categorical,
            continuous_columns=self.config.continuous,
            ordinal_columns=self.config.ordinal,
            style=style,
            constraints=self.getConstraints(cont_config, dataset),
        )
        print("Instantiated Transformer")
        return tt

    @staticmethod
    def _parse_bound(raw):
        """Parse a numeric bound given as a string.

        Returns None for empty/missing values, an int when the string is a
        valid integer literal, otherwise a float. Unlike checking for a "."
        in the string, this also handles scientific notation such as "1e3".
        """
        if not raw:
            return None
        try:
            return int(raw)
        except ValueError:
            return float(raw)

    def getConstraints(self, cont_config: "ContinuousConfig", dataset):
        """Helper method that forms constraints from a list of continuous columns

        Args:
            cont_config (ContinuousConfig): The continuous column configuration
            dataset (DataFrame): Pandas DataFrame

        Returns:
            dict: A dictionary of constraints that will be applied to each specified column
        """
        constraints = {}

        for cont_entry in cont_config.columns:
            col_name = cont_entry.name
            bins = int(cont_entry.bins)

            min_bound = self._parse_bound(cont_entry.lower)
            max_bound = self._parse_bound(cont_entry.upper)

            # The transformer must know whether the column can contain nulls.
            nullFlag = dataset[col_name].isnull().values.any()
            constraints[col_name] = BinTransformer(
                bins=bins, lower=min_bound, upper=max_bound, nullable=nullFlag
            )

        return constraints
67 changes: 67 additions & 0 deletions scripts/anonymization/src/modules/sensitive_anonymizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import pandas as pd

from configuration.configurations import SensitiveConfig
from faker import Faker


class SensitiveAnonymizer:
    """Replaces values of sensitive columns with Faker-generated substitutes."""

    def __init__(
        self, dataset: pd.DataFrame, sens_config: SensitiveConfig, templates_path: str
    ):
        self.dataset = dataset
        self.sens_config = sens_config
        self.templates_path = templates_path

    def run_anonymization(self):
        """Return a copy of the dataset with every configured column faked."""
        result = self.dataset.copy()
        mappings = []
        if self.sens_config:
            for col in self.sens_config.columns:
                result, mapping = self.__fake_column(
                    result, col.name, col.method, col.locales, col.seed
                )
                mappings.append(mapping)
            # TODO: Use list of mappings to change templates file
        return result

    def __fake_column(self,
        dataset: pd.DataFrame, col_name: str, method: str, locales: list, seed=0
    ):
        """Map each distinct value of *col_name* to a deterministic fake value.

        Falls back to random strings (length-matched to the column's values)
        when *method* is not a Faker provider. Returns the mutated dataset
        and the original->fake mapping.
        """
        faker = Faker(locales) if len(locales) > 0 else Faker()
        faker.seed_instance(seed)

        value_map = {}
        shortest = 0
        longest = 1

        # Compare/replace on string representations throughout.
        dataset[col_name] = dataset[col_name].astype(str)

        try:
            generator = getattr(faker.unique, method)
        except AttributeError:
            generator = None
            shortest = len(min(dataset[col_name].tolist(), key=len))
            longest = len(max(dataset[col_name].tolist(), key=len))
            print("Faker method '" + method + "' not found. Resorting to random String")

        for original in dataset[col_name].unique():
            if generator is not None:
                value_map[original] = generator()
            else:
                value_map[original] = faker.pystr(min_chars=shortest, max_chars=longest)

        dataset[col_name] = dataset[col_name].map(value_map)

        faker.unique.clear()

        return dataset, value_map