Commit aeae218

Chapter 9 Done
1 parent 82b650c commit aeae218

File tree

3 files changed: +645, -0 lines

File renamed without changes.

9.2_Learning_to_be_fair.ipynb

Lines changed: 367 additions & 0 deletions
@@ -0,0 +1,367 @@
{
  "cells": [
    {
      "metadata": {
        "trusted": true,
        "_uuid": "1713f4306d45835efceefad592e4e6b046302f23",
        "collapsed": true
      },
      "cell_type": "code",
      "source": "import pandas as pd\nimport numpy as np\nnp.random.seed(42)\n\nimport matplotlib.pyplot as plt\n%matplotlib inline\n\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.metrics import accuracy_score, roc_auc_score\nfrom sklearn.utils.class_weight import compute_class_weight\n\nfrom keras.layers import Input, Dense, Dropout\nfrom keras.models import Model",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "e1b433fb1c8346a04cff7d90f162a90afc6a97e9",
        "collapsed": true
      },
      "cell_type": "code",
      "source": "!ls ../input",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "collapsed": true,
        "_uuid": "67ca0a2af21dfcd70a0d6ea024ae1417c0db0097"
      },
      "cell_type": "code",
      "source": "path = '../input/adult.csv'\ninput_data = pd.read_csv(path, na_values=\"?\")",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "c2cdce6626547f1f755782cd426c777173f564ac",
        "collapsed": true
      },
      "cell_type": "code",
      "source": "input_data.head()",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "collapsed": true,
        "_uuid": "6d81a92b7651715cabfed67491e587863c1d8698"
      },
      "cell_type": "code",
      "source": "# keep only the two largest race groups so the sensitive attribute becomes binary\ninput_data = input_data[input_data['race'].isin(['White', 'Black'])]",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "90028dc30c91810dfb793280dbc3b89930303e7c",
        "collapsed": true
      },
      "cell_type": "code",
      "source": "input_data.head()",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "5092dbcf31d13e4b76411bbece6c42e164edd348",
        "collapsed": true
      },
      "cell_type": "code",
      "source": "# sensitive attributes; we identify 'race' and 'gender' as sensitive attributes\nsensitive_attribs = ['race', 'gender']\nA = input_data[sensitive_attribs]\nA = pd.get_dummies(A, drop_first=True)\nA.columns = sensitive_attribs",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "18e6ae5dda563f17ff3ac647f55f0d990e42e4c7",
        "collapsed": true
      },
      "cell_type": "code",
      "source": "A.head()",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "2f3877d05de984830b75cc4e0006feec6ca00f8e",
        "collapsed": true
      },
      "cell_type": "code",
      "source": "# binary target: 1 if income is above 50K\ny = (input_data['income'] == '>50K').astype(int)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "collapsed": true,
        "_uuid": "916460c4b276af0d12a496d89ef2ff61f5ad96ed"
      },
      "cell_type": "code",
      "source": "# features: drop the target and the sensitive attributes, then one-hot encode\nX = input_data.drop(labels=['income', 'race', 'gender'], axis=1)\n\nX = X.fillna('Unknown')\n\nX = pd.get_dummies(X, drop_first=True)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "collapsed": true,
        "_uuid": "1c7a4316a7f23f885620a7ca456443c3cf4fcae0"
      },
      "cell_type": "code",
      "source": "# split into train/test set\nX_train, X_test, y_train, y_test, A_train, A_test = train_test_split(X, y, A, test_size=0.5,\n                                                                     stratify=y, random_state=7)\n\n# standardize the data\nscaler = StandardScaler().fit(X_train)\n#scale_df = lambda df, scaler: pd.DataFrame(scaler.transform(df), columns=df.columns, index=df.index)\nX_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)\nX_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "collapsed": true,
        "_uuid": "33d6dd864dd78f81025e9e836cbb57b2c2b02643"
      },
      "cell_type": "code",
      "source": "def p_rule(y_pred, a_values, threshold=0.5):\n    # p%-rule: ratio of positive-prediction rates between the two groups (100 = parity)\n    y_a_1 = y_pred[a_values == 1] > threshold if threshold else y_pred[a_values == 1]\n    y_a_0 = y_pred[a_values == 0] > threshold if threshold else y_pred[a_values == 0]\n    odds = y_a_1.mean() / y_a_0.mean()\n    return np.min([odds, 1/odds]) * 100",
      "execution_count": null,
      "outputs": []
    },
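    {
      "metadata": {
        "trusted": true,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "# Quick illustrative check of p_rule on synthetic scores (example values, not from the dataset):\n# group a=1 is predicted positive half as often as group a=0, so the p%-rule should be ~50.\ndemo_scores = np.concatenate([np.full(100, 0.8), np.full(100, 0.2), np.full(100, 0.8)])\ndemo_groups = np.concatenate([np.ones(200), np.zeros(100)])\np_rule(demo_scores, demo_groups)  # ~50.0",
      "execution_count": null,
      "outputs": []
    },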
    {
      "metadata": {
        "trusted": true,
        "collapsed": true,
        "_uuid": "552af7ac22b3f3f52424bc183d15d8d89cfdae18"
      },
      "cell_type": "code",
      "source": "def make_trainable_fn(net):  # Produces a function that makes a network trainable or not\n    def make_trainable(flag):  # Loop over layers and set their trainability\n        net.trainable = flag\n        for layer in net.layers:\n            layer.trainable = flag\n    return make_trainable",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "collapsed": true,
        "_uuid": "da96fef499154bed2d2454b12e8350362e5bf65d"
      },
      "cell_type": "code",
      "source": "def compute_class_weights(data_set):\n    # one 'balanced' class-weight dict per column (or a single dict for a 1-D target)\n    class_values = [0, 1]\n    class_weights = []\n    if len(data_set.shape) == 1:\n        balanced_weights = compute_class_weight('balanced', class_values, data_set)\n        class_weights.append(dict(zip(class_values, balanced_weights)))\n    else:\n        n_attr = data_set.shape[1]\n        for attr_idx in range(n_attr):\n            balanced_weights = compute_class_weight('balanced', class_values,\n                                                    np.array(data_set)[:, attr_idx])\n            class_weights.append(dict(zip(class_values, balanced_weights)))\n    return class_weights",
      "execution_count": null,
      "outputs": []
    },
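    {
      "metadata": {
        "trusted": true,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "# Illustrative example of compute_class_weights on toy data (not the real A_train):\n# a column with 75% zeros and 25% ones gets 'balanced' weights of roughly {0: 0.67, 1: 2.0},\n# i.e. weights inversely proportional to class frequency, one dict per column.\ntoy_attrs = pd.DataFrame({'a1': [0, 0, 0, 1], 'a2': [0, 1, 0, 1]})\ncompute_class_weights(toy_attrs)",
      "execution_count": null,
      "outputs": []
    },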
    {
      "metadata": {
        "trusted": true,
        "collapsed": true,
        "_uuid": "1deddf7765ea1ccb094ad147832c35e8977cfcb1"
      },
      "cell_type": "code",
      "source": "def compute_target_class_weights(y):\n    class_values = [0, 1]\n    balanced_weights = compute_class_weight('balanced', class_values, y)\n    class_weights = {'y': dict(zip(class_values, balanced_weights))}\n    return class_weights",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "collapsed": true,
        "_uuid": "98e90f1c8f2b380bd8ef2fcae2434fb33c22e1b8"
      },
      "cell_type": "code",
      "source": "n_features = X_train.shape[1]\nn_sensitive = A_train.shape[1]\nlambdas = [130., 30.]  # adversary loss weights for 'race' and 'gender'",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "collapsed": true,
        "_uuid": "14d4bfe6a04f952fad4a3f0b00f0caf105181d4b"
      },
      "cell_type": "code",
      "source": "clf_inputs = Input(shape=(n_features,))  # Classifier input = All features\n\n############### Create CLF net ########################\nx = Dense(32, activation='relu')(clf_inputs)\nx = Dropout(0.2)(x)\nx = Dense(32, activation='relu')(x)\nx = Dropout(0.2)(x)\nx = Dense(32, activation='relu')(x)\nx = Dropout(0.2)(x)\noutputs = Dense(1, activation='sigmoid', name='y')(x)\nclf_net = Model(inputs=[clf_inputs], outputs=[outputs])\n#######################################################",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "2e85c13eac2c339276e14770b17caba341bca582",
        "collapsed": true
      },
      "cell_type": "code",
      "source": "adv_inputs = Input(shape=(1,))  # Adversary input = Classifier output (one number)\n\n############## Create ADV net #########################\nx = Dense(32, activation='relu')(adv_inputs)\nx = Dense(32, activation='relu')(x)\nx = Dense(32, activation='relu')(x)\noutputs = [Dense(1, activation='sigmoid')(x) for _ in range(n_sensitive)]\nadv_net = Model(inputs=[adv_inputs], outputs=outputs)\n#######################################################",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "collapsed": true,
        "_uuid": "ffc19e462ce48456c9e698c4a564f8473489093c"
      },
      "cell_type": "code",
      "source": "############## Create train switches #################\ntrainable_clf_net = make_trainable_fn(clf_net)  # Get function to make classifier trainable\n\ntrainable_adv_net = make_trainable_fn(adv_net)  # Function to make adversary trainable\n\n######################################################",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "collapsed": true,
        "_uuid": "0ba4fdfbfb8233f90affbd4495156a4b4aeca0fc"
      },
      "cell_type": "code",
      "source": "#clf = compile_clf(clf_net) # Compile classifier\n\nclf = clf_net\ntrainable_clf_net(True)\nclf.compile(loss='binary_crossentropy', optimizer='adam')",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "collapsed": true,
        "_uuid": "397117080a03743e5eba52b928a5b7db8585d301"
      },
      "cell_type": "code",
      "source": "# Create a classifier-adversary super net\nadv_out = adv_net(clf_net(clf_inputs))\nclf_out = clf_net(clf_inputs)\nclf_w_adv = Model(inputs=[clf_inputs], outputs=[clf_out]+adv_out)\n\n# The adversary is not trainable, the classifier is\ntrainable_clf_net(True)\ntrainable_adv_net(False)\n# Create a weighted loss for all sensitive variables\nloss_weights = [1.]+[-lambda_param for lambda_param in lambdas]\n# Compile super net\nclf_w_adv.compile(loss='binary_crossentropy',\n                  loss_weights=loss_weights,\n                  optimizer='adam')",
      "execution_count": null,
      "outputs": []
    },
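    {
      "metadata": {
        "trusted": true,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "# Intuition for the negative loss weights, using made-up loss values:\n# combined loss = 1.0 * clf_loss - 130 * race_adv_loss - 30 * gender_adv_loss,\n# so minimising it rewards the classifier when the adversary's losses are high,\n# i.e. when race and gender cannot be recovered from its predictions.\nclf_loss, race_adv_loss, gender_adv_loss = 0.40, 0.69, 0.67  # hypothetical values\n1.0 * clf_loss - 130. * race_adv_loss - 30. * gender_adv_loss  # about -109.4",
      "execution_count": null,
      "outputs": []
    },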
    {
      "metadata": {
        "trusted": true,
        "collapsed": true,
        "_uuid": "568e536f4b0d54022b8022f6579180def3d598d7"
      },
      "cell_type": "code",
      "source": "# Adversary stacked on the classifier, taking the raw features as input\nadv = Model(inputs=[clf_inputs], outputs=adv_net(clf_net(clf_inputs)))\n# Classifier is not trainable, adversary is\ntrainable_clf_net(False)\ntrainable_adv_net(True)\nadv.compile(loss='binary_crossentropy', optimizer='adam')",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "37bd746cdb7ddedc52948517c091b2a6ab378e27",
        "collapsed": true
      },
      "cell_type": "code",
      "source": "# Pre-train the classifier on its own\ntrainable_clf_net(True)\nclf.fit(X_train.values, y_train.values, epochs=10)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "6b9d0368986990063acd2b4b36d239ce9583bc55",
        "collapsed": true
      },
      "cell_type": "code",
      "source": "# Pre-train the adversary on the frozen classifier's predictions\ntrainable_clf_net(False)\ntrainable_adv_net(True)\nclass_weight_adv = compute_class_weights(A_train)\nadv.fit(X_train.values, np.hsplit(A_train.values, A_train.shape[1]), class_weight=class_weight_adv, epochs=10)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "collapsed": true,
        "_uuid": "8297c4ceb5334aac278aed6260c4f508581a7f61"
      },
      "cell_type": "code",
      "source": "y_pred = clf.predict(X_test)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "91c96a5c1bd240a153270ab162c0df02d86192c3",
        "collapsed": true
      },
      "cell_type": "code",
      "source": "for sens in A_test.columns:\n    pr = p_rule(y_pred, A_test[sens])\n    print(sens, pr)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "d9c86e63a56fe33fa5759239782d1802cd7cd26e",
        "collapsed": true
      },
      "cell_type": "code",
      "source": "acc = accuracy_score(y_test, (y_pred > 0.5)) * 100\nprint('Clf acc: {:.2f}'.format(acc))",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "488a8dc18376fa0ff02fa6af5a837611811875a0",
        "collapsed": true
      },
      "cell_type": "code",
      "source": "n_iter = 250\nbatch_size = 128\nn_sensitive = A_train.shape[1]\n\nclass_weight_clf_w_adv = [{0: 1., 1: 1.}] + class_weight_adv\n\nval_metrics = pd.DataFrame()\nfairness_metrics = pd.DataFrame()\n\nfor idx in range(n_iter):  # Alternate adversary and classifier updates for n_iter steps\n\n    # Train the adversary (classifier frozen)\n    trainable_clf_net(False)\n    trainable_adv_net(True)\n    adv.fit(X_train.values,\n            np.hsplit(A_train.values, A_train.shape[1]),\n            batch_size=batch_size,\n            class_weight=class_weight_adv,\n            epochs=1, verbose=0)\n\n    # Train the classifier\n    # Make classifier trainable and adversary untrainable\n    trainable_clf_net(True)\n    trainable_adv_net(False)\n    # Sample batch\n    indices = np.random.permutation(len(X_train))[:batch_size]\n    # Train on batch\n    clf_w_adv.train_on_batch(X_train.values[indices],\n                             [y_train.values[indices]] + np.hsplit(A_train.values[indices], n_sensitive),\n                             class_weight=class_weight_clf_w_adv)\n\n    # Make validation data predictions\n    y_pred = pd.Series(clf.predict(X_test).ravel(), index=y_test.index)\n\n    # Calculate ROC AUC and accuracy\n    roc_auc = roc_auc_score(y_test, y_pred)\n    acc = accuracy_score(y_test, (y_pred > 0.5)) * 100\n    val_metrics.loc[idx, 'ROC AUC'] = roc_auc\n    val_metrics.loc[idx, 'Accuracy'] = acc\n\n    # Calculate p%-rule for each sensitive attribute\n    for sensitive_attr in A_test.columns:\n        fairness_metrics.loc[idx, sensitive_attr] = p_rule(y_pred, A_test[sensitive_attr])\n\n    print('Epoch: {}, Accuracy: {:.2f}, Race P: {:.2f}, Gender P: {:.2f}'.format(idx,\n                                                                                 acc,\n                                                                                 fairness_metrics.loc[idx, 'race'],\n                                                                                 fairness_metrics.loc[idx, 'gender']))",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "17e86024acf92095929a36df1f02f84e88c5132d",
        "collapsed": true
      },
      "cell_type": "code",
      "source": "# adversarial training on the train set, validating on the test set\n#vm, fm = fit(X_train, y_train, A_train, validation_data=(X_test, y_test, A_test), n_iter=200)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "d43f2519b2c87f6f12669cc5c6005811836a5724",
        "collapsed": true
      },
      "cell_type": "code",
      "source": "plt.figure(figsize=(10, 7))\nplt.xlabel('Epochs')\nplt.plot(val_metrics['Accuracy'], label='Accuracy')\nplt.plot(val_metrics['ROC AUC']*100, label='ROC AUC')\nplt.plot(fairness_metrics['race'], label='Race')\nplt.plot(fairness_metrics['gender'], label='Gender')\nplt.legend()",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "collapsed": true,
        "_uuid": "ac795a99159aa2fb93c229a165012b718b45b857"
      },
      "cell_type": "code",
      "source": "",
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "anaconda-cloud": {},
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "name": "python",
      "version": "3.6.5",
      "mimetype": "text/x-python",
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "pygments_lexer": "ipython3",
      "nbconvert_exporter": "python",
      "file_extension": ".py"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 1
}
