Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Pylint + Github workflow
  • Loading branch information
ETHenzlere committed Aug 25, 2024
commit 051fc74804be5eea2b7a1df30277cdcd4c9f226b
3 changes: 2 additions & 1 deletion .github/workflows/maven.yml
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,8 @@ jobs:
elif [[ ${{matrix.benchmark}} == anonymization ]]; then
java -jar benchbase.jar -b tpcc -c config/mysql/sample_tpcc_config.xml --create=true --load=true --execute=false --json-histograms results/histograms.json
java -jar benchbase.jar -b tpcc -c config/mysql/sample_${{matrix.benchmark}}_config.xml --anonymize=true
exit 0
mysql -h127.0.0.1 -P$MYSQL_PORT -uadmin -ppassword -e "RENAME TABLE item TO item_original,item_anonymized TO item;"
java -jar benchbase.jar -b tpcc -c config/mysql/sample_tpcc_config.xml --execute=true --json-histograms results/histograms.json
elif [[ ${{matrix.benchmark}} == tpcc-with-reconnects ]]; then
# See Also: WITH_SERVICE_INTERRUPTIONS=true docker/build-run-benchmark-with-docker.sh
java -jar benchbase.jar -b tpcc -c config/mysql/sample_tpcc_config.xml --create=true --load=true
Expand Down
24 changes: 22 additions & 2 deletions scripts/anonymization/src/anonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,31 @@
import sys
import xml.etree.ElementTree as ET
import pandas as pd
from modules.jdbc_handler import JDBCHandler

from configuration.config_parser import XMLParser
from configuration.configurations import DPConfig, SensitiveConfig, ContinuousConfig

from modules.dp_anonymizer import DifferentialPrivacyAnonymizer
from modules.sensitive_anonymizer import SensitiveAnonymizer

from modules.jdbc_handler import JDBCHandler

def anonymize(
dataset: pd.DataFrame,
anon_config: DPConfig,
cont_config: ContinuousConfig,
sens_config: SensitiveConfig,
):
"""Function that runs differential privacy and sensitive value anonymization

Args:
dataset (pd.DataFrame): The original dataset
anon_config (DPConfig): The Differential Privacy Configuration
cont_config (ContinuousConfig): The Configuration for Continuous Values
sens_config (SensitiveConfig): The Configuration for Sensitive Values

Returns:
pd.DataFrame: The Anonymized Dataset
"""

dp_data = dataset
if anon_config:
Expand All @@ -36,6 +48,14 @@ def anonymize_db(
sens_config: SensitiveConfig,
cont_config: ContinuousConfig,
):
"""Entry function that handles data pulling, anonymization and data pushing back to the DB

Args:
jdbc_handler (JDBCHandler): The JDBC Information
anon_config (DPConfig): The Differential Privacy Configuration
cont_config (ContinuousConfig): The Configuration for Continuous Values
sens_config (SensitiveConfig): The Configuration for Sensitive Values
"""

jdbc_handler.start_jvm()

Expand Down
3 changes: 2 additions & 1 deletion scripts/anonymization/src/configuration/config_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ def get_config(self):
# Exit the program if not enough basic information (name of a table) is available
if table_name is None:
sys.exit(
"There was no name provided for the table that should be anonymized. Program is exiting now!"
"There was no name provided for the table that should be anonymized.\n"
+ "Program is exiting now!"
)

print(f"Parsing config for table: {table_name}")
Expand Down
29 changes: 25 additions & 4 deletions scripts/anonymization/src/modules/dp_anonymizer.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,37 @@
"""Module that handles differential privacy anonymization
"""

import time
import pandas as pd

from snsynth import Synthesizer
from configuration.configurations import DPConfig,ContinuousConfig
import pandas as pd
from .preprocessor import Preprocessor

class DifferentialPrivacyAnonymizer():
"""A class that holds all necessary information for differential privacy anonymization

Attributes
----------
dataset : pd.DataFrame
The full dataset
anon_config : DPConfig
The configuration of all columns of the dataset
cont_config : ContinuousConfig
The specific configuration of all continuous columns of the dataset
"""

def __init__(self, dataset: pd.DataFrame, anon_config: DPConfig, cont_config: ContinuousConfig):
self.dataset = dataset
self.anon_config = anon_config
self.cont_config = cont_config

def run_anonymization(self):
"""Function that runs differential privacy algorithms

Returns:
pd.DataFrame: The resulting differentially private dataset
"""

alg = self.anon_config.algorithm
eps = float(self.anon_config.epsilon)
Expand Down Expand Up @@ -38,7 +59,7 @@ def run_anonymization(self):
categorical_columns=cat,
continuous_columns=cont,
ordinal_columns=ordi,
transformer=Preprocessor(self.anon_config).getTransformer(
transformer=Preprocessor(self.anon_config).get_transformer(
self.dataset, self.cont_config
),
nullable=nullable_flag,
Expand All @@ -64,7 +85,6 @@ def run_anonymization(self):

anon_data = self.__add_ignorable(anon_data,saved_indexes,saved_columns)


return anon_data

def __remove_ignorable(self):
Expand All @@ -83,4 +103,5 @@ def __add_ignorable(self,dataset,saved_indexes,saved_columns):
ignore_columns = self.anon_config.column_classification.hidden
for ind, col in enumerate(ignore_columns):
dataset.insert(saved_indexes[ind], col, saved_columns[col])
return dataset
return dataset

22 changes: 17 additions & 5 deletions scripts/anonymization/src/modules/preprocessor.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,25 @@
"""Module for data transformation
"""

from snsynth.transform import BinTransformer, TableTransformer

from configuration.configurations import DPConfig, ContinuousConfig


class Preprocessor:
"""A class that transforms the dataset in order to
allow differential privacy algorithms to work with the data

Attributes
----------
config : DPConfig
The differential privacy parameters
"""

def __init__(self, config: DPConfig):
self.config = config

def getTransformer(self, dataset, cont_config):
def get_transformer(self, dataset, cont_config):
"""Method that returns a TableTransformer object which can be used by DP mechansisms

Args:
Expand All @@ -33,12 +45,12 @@ def getTransformer(self, dataset, cont_config):
continuous_columns=self.config.column_classification.continuous,
ordinal_columns=self.config.column_classification.ordinal,
style=style,
constraints=self.getConstraints(cont_config, dataset),
constraints=self.get_constraints(cont_config, dataset),
)
print("Instantiated Transformer")
return tt

def getConstraints(self,cont_config: ContinuousConfig, dataset):
def get_constraints(self,cont_config: ContinuousConfig, dataset):
"""Helper method that forms constraints from a list of continuous columns

Args:
Expand Down Expand Up @@ -71,9 +83,9 @@ def getConstraints(self,cont_config: ContinuousConfig, dataset):
else:
max_bound = int(upper)

nullFlag = dataset[col_name].isnull().values.any()
null_flag = dataset[col_name].isnull().values.any()
constraints[col_name] = BinTransformer(
bins=bins, lower=min_bound, upper=max_bound, nullable=nullFlag
bins=bins, lower=min_bound, upper=max_bound, nullable=null_flag
)

return constraints
23 changes: 20 additions & 3 deletions scripts/anonymization/src/modules/sensitive_anonymizer.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,34 @@
"""Module for handling Sensitive Values
"""

import pandas as pd
import xml.etree.ElementTree as ET

from configuration.configurations import SensitiveConfig
from faker import Faker


class SensitiveAnonymizer:
"""A class that holds all information for sensitive anonymization.

Attributes
----------
dataset : pd.DataFrame
The full dataset
sens_config : SensitiveConfig
The configuration of the columns that are sensitive
"""

def __init__(
self, dataset: pd.DataFrame, sens_config: SensitiveConfig
):
self.dataset = dataset
self.sens_config = sens_config

def run_anonymization(self):
"""Function that runs the actual anonymization of sensitive values

Returns:
pd.DataFrame: A dataset with anonymized sensitive values
"""
anon_data = self.dataset.copy()

if self.sens_config:
Expand Down Expand Up @@ -63,4 +79,5 @@ def __fake_column(self,

fake.unique.clear()

return dataset
return dataset

7 changes: 4 additions & 3 deletions scripts/anonymization/src/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@


def test_full_config():
"""Test method for a full config with dp-anonymization, continuous and sensitive values"""
"""
Test method for a full config with dp-anonymization,continuous and sensitive values
"""

parameters = ET.fromstring(FULL_CONFIG)

Expand All @@ -54,7 +56,7 @@ def test_full_config():
assert anon_config is not None
assert sens_config is not None
assert cont_config is not None

dataset = pd.read_csv('test_table.csv')

# Templates Path = None
Expand All @@ -79,4 +81,3 @@ def test_minimal_config():
assert anon_config is not None
assert sens_config is None
assert cont_config is None