1+ {
2+ "cells" : [
3+ {
4+ "metadata" : {
5+ "trusted" : true ,
6+ "_uuid" : " 1713f4306d45835efceefad592e4e6b046302f23" ,
7+ "collapsed" : true
8+ },
9+ "cell_type" : " code" ,
10+ "source" : " import pandas as pd\n import numpy as np\n np.random.seed(42)\n\n import matplotlib.pyplot as plt\n %matplotlib inline\n\n\n from sklearn.model_selection import train_test_split\n from sklearn.preprocessing import StandardScaler\n from sklearn.metrics import accuracy_score, roc_auc_score\n from sklearn.utils.class_weight import compute_class_weight\n\n\n from keras.layers import Input, Dense, Dropout\n from keras.models import Model" ,
11+ "execution_count" : null ,
12+ "outputs" : []
13+ },
14+ {
15+ "metadata" : {
16+ "trusted" : true ,
17+ "_uuid" : " e1b433fb1c8346a04cff7d90f162a90afc6a97e9" ,
18+ "collapsed" : true
19+ },
20+ "cell_type" : " code" ,
21+ "source" : " !ls ../input" ,
22+ "execution_count" : null ,
23+ "outputs" : []
24+ },
25+ {
26+ "metadata" : {
27+ "trusted" : true ,
28+ "collapsed" : true ,
29+ "_uuid" : " 67ca0a2af21dfcd70a0d6ea024ae1417c0db0097"
30+ },
31+ "cell_type" : " code" ,
32+ "source" : "# Location of the census CSV inside the mounted input directory\npath = '../input/adult.csv'\n# The file marks missing entries with a question mark; accept the marker with or\n# without a leading space so it is recognised however the fields are padded\ninput_data = pd.read_csv(path, na_values=['?', ' ?'])" ,
33+ "execution_count" : null ,
34+ "outputs" : []
35+ },
36+ {
37+ "metadata" : {
38+ "trusted" : true ,
39+ "_uuid" : " c2cdce6626547f1f755782cd426c777173f564ac" ,
40+ "collapsed" : true
41+ },
42+ "cell_type" : " code" ,
43+ "source" : " input_data.head()" ,
44+ "execution_count" : null ,
45+ "outputs" : []
46+ },
47+ {
48+ "metadata" : {
49+ "trusted" : true ,
50+ "collapsed" : true ,
51+ "_uuid" : " 6d81a92b7651715cabfed67491e587863c1d8698"
52+ },
53+ "cell_type" : " code" ,
54+ "source" : "# Restrict the analysis to rows whose race is 'White' or 'Black'\ninput_data = input_data.loc[input_data['race'].isin(['White', 'Black'])]" ,
55+ "execution_count" : null ,
56+ "outputs" : []
57+ },
58+ {
59+ "metadata" : {
60+ "trusted" : true ,
61+ "_uuid" : " 90028dc30c91810dfb793280dbc3b89930303e7c" ,
62+ "collapsed" : true
63+ },
64+ "cell_type" : " code" ,
65+ "source" : " input_data.head()" ,
66+ "execution_count" : null ,
67+ "outputs" : []
68+ },
69+ {
70+ "metadata" : {
71+ "trusted" : true ,
72+ "_uuid" : " 5092dbcf31d13e4b76411bbece6c42e164edd348" ,
73+ "collapsed" : true
74+ },
75+ "cell_type" : " code" ,
76+ "source" : "# Sensitive attributes: 'race' and 'gender' are treated as the protected columns\nsensitive_attribs = ['race', 'gender']\n# Dummy-encode them; drop_first leaves a single indicator column for a two-valued attribute\nA = pd.get_dummies(input_data[sensitive_attribs], drop_first=True)\n# Rename the indicator columns back to the plain attribute names\nA.columns = sensitive_attribs" ,
77+ "execution_count" : null ,
78+ "outputs" : []
79+ },
80+ {
81+ "metadata" : {
82+ "trusted" : true ,
83+ "_uuid" : " 18e6ae5dda563f17ff3ac647f55f0d990e42e4c7" ,
84+ "collapsed" : true
85+ },
86+ "cell_type" : " code" ,
87+ "source" : " A.head()" ,
88+ "execution_count" : null ,
89+ "outputs" : []
90+ },
91+ {
92+ "metadata" : {
93+ "trusted" : true ,
94+ "_uuid" : " 2f3877d05de984830b75cc4e0006feec6ca00f8e" ,
95+ "collapsed" : true
96+ },
97+ "cell_type" : " code" ,
98+ "source" : "# Binary target: 1 when income equals '>50K', otherwise 0\ny = input_data['income'].eq('>50K').astype(int)" ,
99+ "execution_count" : null ,
100+ "outputs" : []
101+ },
102+ {
103+ "metadata" : {
104+ "trusted" : true ,
105+ "collapsed" : true ,
106+ "_uuid" : " 916460c4b276af0d12a496d89ef2ff61f5ad96ed"
107+ },
108+ "cell_type" : " code" ,
109+ "source" : "# Feature matrix: every column except the target and the sensitive attributes,\n# with missing values labelled 'Unknown' and categorical columns dummy-encoded\nX = (\n    input_data\n    .drop(labels=['income', 'race', 'gender'], axis=1)\n    .fillna('Unknown')\n    .pipe(pd.get_dummies, drop_first=True)\n)" ,
110+ "execution_count" : null ,
111+ "outputs" : []
112+ },
113+ {
114+ "metadata" : {
115+ "trusted" : true ,
116+ "collapsed" : true ,
117+ "_uuid" : " 1c7a4316a7f23f885620a7ca456443c3cf4fcae0"
118+ },
119+ "cell_type" : " code" ,
120+ "source" : "# Hold out half of the rows for evaluation; stratify on y so both halves share the target ratio\n(X_train, X_test,\n y_train, y_test,\n A_train, A_test) = train_test_split(X, y, A, test_size=0.5, stratify=y, random_state=7)\n\n# Standardize the features using statistics fitted on the training half only\nscaler = StandardScaler()\nscaler.fit(X_train)\nX_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)\nX_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)" ,
121+ "execution_count" : null ,
122+ "outputs" : []
123+ },
124+ {
125+ "metadata" : {
126+ "trusted" : true ,
127+ "collapsed" : true ,
128+ "_uuid" : " 33d6dd864dd78f81025e9e836cbb57b2c2b02643"
129+ },
130+ "cell_type" : " code" ,
131+ "source" : "def p_rule(y_pred, a_values, threshold=0.5):\n    \"\"\"Compute the p%-rule score of predictions with respect to a binary attribute.\n\n    The score is the smaller of the two ratios between the groups' mean\n    (optionally thresholded) predictions, expressed as a percentage.\n\n    Args:\n        y_pred: Predicted scores, indexable by a boolean mask built from a_values.\n        a_values: Group membership per sample, coded as 0 or 1.\n        threshold: Cut-off turning scores into 0/1 decisions. Pass None to compare\n            the raw scores instead; 0 is a valid cut-off.\n\n    Returns:\n        For non-negative scores a value between 0 and 100, where 100 means equal\n        rates. 0.0 when exactly one group has a zero rate; NaN when both rates are\n        zero or a group is empty.\n    \"\"\"\n    y_a_1 = y_pred[a_values == 1]\n    y_a_0 = y_pred[a_values == 0]\n    # Compare against None explicitly: a truthiness check would silently ignore threshold=0\n    if threshold is not None:\n        y_a_1 = y_a_1 > threshold\n        y_a_0 = y_a_0 > threshold\n    rate_1, rate_0 = y_a_1.mean(), y_a_0.mean()\n    if rate_1 == 0 or rate_0 == 0:\n        # Avoid dividing by zero: a single zero rate is maximal disparity,\n        # two zero rates leave the ratio undefined\n        return np.nan if rate_1 == rate_0 else 0.0\n    odds = rate_1 / rate_0\n    return np.min([odds, 1 / odds]) * 100" ,
132+ "execution_count" : null ,
133+ "outputs" : []
134+ },
135+ {
136+ "metadata" : {
137+ "trusted" : true ,
138+ "collapsed" : true ,
139+ "_uuid" : " 552af7ac22b3f3f52424bc183d15d8d89cfdae18"
140+ },
141+ "cell_type" : " code" ,
142+ "source" : "def make_trainable_fn(net):\n    \"\"\"Return a switch that sets the trainable flag of `net` and of all its layers.\"\"\"\n    def make_trainable(flag):\n        # Apply the flag to the network object first, then to each of its layers\n        for target in [net, *net.layers]:\n            target.trainable = flag\n    return make_trainable" ,
143+ "execution_count" : null ,
144+ "outputs" : []
145+ },
146+ {
147+ "metadata" : {
148+ "trusted" : true ,
149+ "collapsed" : true ,
150+ "_uuid" : " da96fef499154bed2d2454b12e8350362e5bf65d"
151+ },
152+ "cell_type" : " code" ,
153+ "source" : "def compute_class_weights(data_set):\n    \"\"\"Compute balanced class weights for one or several binary label columns.\n\n    Args:\n        data_set: Labels coded as 0/1, either a 1-D vector or a 2-D table with\n            one column per attribute.\n\n    Returns:\n        A list with one dict per label column, mapping each class value (0, 1)\n        to its balanced weight.\n    \"\"\"\n    class_values = [0, 1]\n    labels = np.asarray(data_set)\n    # Treat a 1-D label vector as a single column so both layouts share one code path\n    if labels.ndim == 1:\n        labels = labels.reshape(-1, 1)\n    class_weights = []\n    for attr_idx in range(labels.shape[1]):\n        # classes and y are passed by keyword: recent scikit-learn releases no longer\n        # accept them positionally\n        balanced_weights = compute_class_weight('balanced',\n                                                classes=np.array(class_values),\n                                                y=labels[:, attr_idx])\n        class_weights.append(dict(zip(class_values, balanced_weights)))\n    return class_weights" ,
154+ "execution_count" : null ,
155+ "outputs" : []
156+ },
157+ {
158+ "metadata" : {
159+ "trusted" : true ,
160+ "collapsed" : true ,
161+ "_uuid" : " 1deddf7765ea1ccb094ad147832c35e8977cfcb1"
162+ },
163+ "cell_type" : " code" ,
164+ "source" : "def compute_target_class_weights(y):\n    \"\"\"Compute balanced class weights for the binary target.\n\n    Args:\n        y: Target labels coded as 0/1.\n\n    Returns:\n        A dict with the single key 'y' whose value maps each class value (0, 1)\n        to its balanced weight.\n    \"\"\"\n    class_values = [0, 1]\n    # classes and y are passed by keyword: recent scikit-learn releases no longer\n    # accept them positionally\n    balanced_weights = compute_class_weight('balanced',\n                                            classes=np.array(class_values),\n                                            y=y)\n    return {'y': dict(zip(class_values, balanced_weights))}" ,
165+ "execution_count" : null ,
166+ "outputs" : []
167+ },
168+ {
169+ "metadata" : {
170+ "trusted" : true ,
171+ "collapsed" : true ,
172+ "_uuid" : " 98e90f1c8f2b380bd8ef2fcae2434fb33c22e1b8"
173+ },
174+ "cell_type" : " code" ,
175+ "source" : "# Number of feature columns fed to the classifier\nn_features = len(X_train.columns)\n# Number of sensitive attribute columns\nn_sensitive = len(A_train.columns)\n# One adversarial loss weight per sensitive attribute\nlambdas = [130., 30.]" ,
176+ "execution_count" : null ,
177+ "outputs" : []
178+ },
179+ {
180+ "metadata" : {
181+ "trusted" : true ,
182+ "collapsed" : true ,
183+ "_uuid" : " 14d4bfe6a04f952fad4a3f0b00f0caf105181d4b"
184+ },
185+ "cell_type" : " code" ,
186+ "source" : "# Classifier input: all feature columns\nclf_inputs = Input(shape=(n_features,))\n\n# Classifier network: three Dense(32, relu) + Dropout(0.2) stages, then a sigmoid output named 'y'\nx = clf_inputs\nfor block_idx in range(3):\n    x = Dense(32, activation='relu')(x)\n    x = Dropout(0.2)(x)\noutputs = Dense(1, activation='sigmoid', name='y')(x)\nclf_net = Model(inputs=[clf_inputs], outputs=[outputs])" ,
187+ "execution_count" : null ,
188+ "outputs" : []
189+ },
190+ {
191+ "metadata" : {
192+ "trusted" : true ,
193+ "_uuid" : " 2e85c13eac2c339276e14770b17caba341bca582" ,
194+ "collapsed" : true
195+ },
196+ "cell_type" : " code" ,
197+ "source" : "# Adversary input: the classifier output (a single number per sample)\nadv_inputs = Input(shape=(1,))\n\n# Adversary network: three Dense(32, relu) layers, then one sigmoid output per sensitive attribute\nx = adv_inputs\nfor hidden_idx in range(3):\n    x = Dense(32, activation='relu')(x)\noutputs = [Dense(1, activation='sigmoid')(x) for _ in range(n_sensitive)]\nadv_net = Model(inputs=[adv_inputs], outputs=outputs)" ,
198+ "execution_count" : null ,
199+ "outputs" : []
200+ },
201+ {
202+ "metadata" : {
203+ "trusted" : true ,
204+ "collapsed" : true ,
205+ "_uuid" : " ffc19e462ce48456c9e698c4a564f8473489093c"
206+ },
207+ "cell_type" : " code" ,
208+ "source" : "# One trainability switch per network: first for the classifier, second for the adversary\ntrainable_clf_net, trainable_adv_net = (make_trainable_fn(net) for net in (clf_net, adv_net))" ,
209+ "execution_count" : null ,
210+ "outputs" : []
211+ },
212+ {
213+ "metadata" : {
214+ "trusted" : true ,
215+ "collapsed" : true ,
216+ "_uuid" : " 0ba4fdfbfb8233f90affbd4495156a4b4aeca0fc"
217+ },
218+ "cell_type" : " code" ,
219+ "source" : "# Stand-alone classifier: the same network object, compiled while marked trainable\ntrainable_clf_net(True)\nclf = clf_net\nclf.compile(optimizer='adam', loss='binary_crossentropy')" ,
220+ "execution_count" : null ,
221+ "outputs" : []
222+ },
223+ {
224+ "metadata" : {
225+ "trusted" : true ,
226+ "collapsed" : true ,
227+ "_uuid" : " 397117080a03743e5eba52b928a5b7db8585d301"
228+ },
229+ "cell_type" : " code" ,
230+ "source" : "# Combined classifier + adversary model. The classifier is applied once and its output\n# tensor is reused, so the adversary judges exactly the prediction that the model returns\n# instead of a second, separate pass through the classifier\nclf_out = clf_net(clf_inputs)\nadv_out = adv_net(clf_out)\nclf_w_adv = Model(inputs=[clf_inputs], outputs=[clf_out] + adv_out)\n\n# In this model the classifier is trainable and the adversary is not\ntrainable_clf_net(True)\ntrainable_adv_net(False)\n# The target loss is weighted 1; each adversary loss enters with the negated lambda\nloss_weights = [1.] + [-lambda_param for lambda_param in lambdas]\nclf_w_adv.compile(loss='binary_crossentropy',\n                  loss_weights=loss_weights,\n                  optimizer='adam')" ,
231+ "execution_count" : null ,
232+ "outputs" : []
233+ },
234+ {
235+ "metadata" : {
236+ "trusted" : true ,
237+ "collapsed" : true ,
238+ "_uuid" : " 568e536f4b0d54022b8022f6579180def3d598d7"
239+ },
240+ "cell_type" : " code" ,
241+ "source" : "# Adversary model fed through the classifier: features -> classifier -> adversary\nadv = Model(\n    inputs=[clf_inputs],\n    outputs=adv_net(clf_net(clf_inputs)),\n)\n# Mark the classifier as not trainable and the adversary as trainable, then compile\ntrainable_clf_net(False)\ntrainable_adv_net(True)\nadv.compile(optimizer='adam', loss='binary_crossentropy')" ,
242+ "execution_count" : null ,
243+ "outputs" : []
244+ },
245+ {
246+ "metadata" : {
247+ "trusted" : true ,
248+ "_uuid" : " 37bd746cdb7ddedc52948517c091b2a6ab378e27" ,
249+ "collapsed" : true
250+ },
251+ "cell_type" : " code" ,
252+ "source" : "# Pre-train the classifier on its own for 10 epochs\ntrainable_clf_net(True)\nclf.fit(x=X_train.values, y=y_train.values, epochs=10)" ,
253+ "execution_count" : null ,
254+ "outputs" : []
255+ },
256+ {
257+ "metadata" : {
258+ "trusted" : true ,
259+ "_uuid" : " 6b9d0368986990063acd2b4b36d239ce9583bc55" ,
260+ "collapsed" : true
261+ },
262+ "cell_type" : " code" ,
263+ "source" : "# Pre-train the adversary for 10 epochs with the classifier switched to non-trainable\ntrainable_clf_net(False)\ntrainable_adv_net(True)\n# Balanced class weights, one dict per sensitive attribute column\nclass_weight_adv = compute_class_weights(A_train)\nadv.fit(x=X_train.values,\n        y=np.hsplit(A_train.values, A_train.shape[1]),  # one target array per adversary output\n        class_weight=class_weight_adv,\n        epochs=10)" ,
264+ "execution_count" : null ,
265+ "outputs" : []
266+ },
267+ {
268+ "metadata" : {
269+ "trusted" : true ,
270+ "collapsed" : true ,
271+ "_uuid" : " 8297c4ceb5334aac278aed6260c4f508581a7f61"
272+ },
273+ "cell_type" : " code" ,
274+ "source" : " y_pred = clf.predict(X_test)" ,
275+ "execution_count" : null ,
276+ "outputs" : []
277+ },
278+ {
279+ "metadata" : {
280+ "trusted" : true ,
281+ "_uuid" : " 91c96a5c1bd240a153270ab162c0df02d86192c3" ,
282+ "collapsed" : true
283+ },
284+ "cell_type" : " code" ,
285+ "source" : "# Print the p%-rule of the current predictions for every sensitive attribute\nfor sens, group in A_test.items():\n    pr = p_rule(y_pred, group)\n    print(sens, pr)" ,
286+ "execution_count" : null ,
287+ "outputs" : []
288+ },
289+ {
290+ "metadata" : {
291+ "trusted" : true ,
292+ "_uuid" : " d9c86e63a56fe33fa5759239782d1802cd7cd26e" ,
293+ "collapsed" : true
294+ },
295+ "cell_type" : " code" ,
296+ "source" : "# Accuracy in percent, using 0.5 as the decision threshold\nacc = 100 * accuracy_score(y_test, y_pred > 0.5)\nprint('Clf acc: {:.2f}'.format(acc))" ,
297+ "execution_count" : null ,
298+ "outputs" : []
299+ },
300+ {
301+ "metadata" : {
302+ "trusted" : true ,
303+ "_uuid" : " 488a8dc18376fa0ff02fa6af5a837611811875a0" ,
304+ "collapsed" : true
305+ },
306+ "cell_type" : " code" ,
307+ "source": "n_iter = 250\nbatch_size = 128\nn_sensitive = A_train.shape[1]\n\n# The target output keeps uniform class weights; the adversary outputs reuse class_weight_adv\nclass_weight_clf_w_adv = [{0: 1., 1: 1.}] + class_weight_adv\n\n# Collect per-iteration metrics as plain records and build the DataFrames once,\n# rather than enlarging a DataFrame cell by cell inside the loop\nval_records = []\nfairness_records = []\n\ntry:\n    for idx in range(n_iter):  # each iteration: one adversary epoch, then one classifier batch\n\n        # Train the adversary with the classifier switched to non-trainable\n        trainable_clf_net(False)\n        trainable_adv_net(True)\n        adv.fit(X_train.values,\n                np.hsplit(A_train.values, A_train.shape[1]),\n                batch_size=batch_size,\n                class_weight=class_weight_adv,\n                epochs=1, verbose=0)\n\n        # Train the classifier on one random batch with the adversary switched to non-trainable\n        trainable_clf_net(True)\n        trainable_adv_net(False)\n        indices = np.random.permutation(len(X_train))[:batch_size]\n        clf_w_adv.train_on_batch(X_train.values[indices],\n                                 [y_train.values[indices]] + np.hsplit(A_train.values[indices], n_sensitive),\n                                 class_weight=class_weight_clf_w_adv)\n\n        # Score the classifier on the test split\n        y_pred = pd.Series(clf.predict(X_test).ravel(), index=y_test.index)\n        roc_auc = roc_auc_score(y_test, y_pred)\n        acc = accuracy_score(y_test, (y_pred > 0.5)) * 100\n        val_records.append({'ROC AUC': roc_auc, 'Accuracy': acc})\n\n        # p%-rule of the current predictions for every sensitive attribute\n        p_rules = {sensitive_attr: p_rule(y_pred, A_test[sensitive_attr])\n                   for sensitive_attr in A_test.columns}\n        fairness_records.append(p_rules)\n\n        print('Epoch: {}, Accuracy: {:.2f}, Race P: {:.2f}, Gender P: {:.2f}'.format(idx,\n                                                                                    acc,\n                                                                                    p_rules['race'],\n                                                                                    p_rules['gender']))\nfinally:\n    # Build the frames even when training is interrupted, so partial results can still be plotted\n    val_metrics = pd.DataFrame(val_records, columns=['ROC AUC', 'Accuracy'])\n    fairness_metrics = pd.DataFrame(fairness_records, columns=list(A_test.columns))",
308+ "execution_count" : null ,
309+ "outputs" : []
310+ },
311+ {
312+ "metadata" : {
313+ "trusted" : true ,
314+ "_uuid" : " 17e86024acf92095929a36df1f02f84e88c5132d" ,
315+ "collapsed" : true
316+ },
317+ "cell_type" : " code" ,
318+ "source" : " # adversarial training on the train set, validated on the test set\n #vm, fm = fit(X_train, y_train, A_train,validation_data=(X_test, y_test, A_test),n_iter=200)" ,
319+ "execution_count" : null ,
320+ "outputs" : []
321+ },
322+ {
323+ "metadata" : {
324+ "trusted" : true ,
325+ "_uuid" : " d43f2519b2c87f6f12669cc5c6005811836a5724" ,
326+ "collapsed" : true
327+ },
328+ "cell_type" : " code" ,
329+ "source" : "# Plot accuracy, ROC AUC (scaled to percent) and both p%-rule curves on one set of axes\nfig, ax = plt.subplots(figsize=(10, 7))\nax.set_xlabel('Epochs')\nax.plot(val_metrics['Accuracy'], label='Accuracy')\nax.plot(val_metrics['ROC AUC'] * 100, label='ROC AUC')\nax.plot(fairness_metrics['race'], label='Race')\nax.plot(fairness_metrics['gender'], label='Gender')\nax.legend()" ,
330+ "execution_count" : null ,
331+ "outputs" : []
332+ },
333+ {
334+ "metadata" : {
335+ "trusted" : true ,
336+ "collapsed" : true ,
337+ "_uuid" : " ac795a99159aa2fb93c229a165012b718b45b857"
338+ },
339+ "cell_type" : " code" ,
340+ "source" : " " ,
341+ "execution_count" : null ,
342+ "outputs" : []
343+ }
344+ ],
345+ "metadata" : {
346+ "anaconda-cloud" : {},
347+ "kernelspec" : {
348+ "display_name" : " Python 3" ,
349+ "language" : " python" ,
350+ "name" : " python3"
351+ },
352+ "language_info" : {
353+ "name" : " python" ,
354+ "version" : " 3.6.5" ,
355+ "mimetype" : " text/x-python" ,
356+ "codemirror_mode" : {
357+ "name" : " ipython" ,
358+ "version" : 3
359+ },
360+ "pygments_lexer" : " ipython3" ,
361+ "nbconvert_exporter" : " python" ,
362+ "file_extension" : " .py"
363+ }
364+ },
365+ "nbformat" : 4 ,
366+ "nbformat_minor" : 1
367+ }
0 commit comments