Skip to content

Commit 16b0ce5

Browse files
author
Daniel McDuff
committed
Preprocessing updates
1 parent 8ec41b0 commit 16b0ce5

File tree

3 files changed

+107
-56
lines changed

3 files changed

+107
-56
lines changed

code/load_faces.py

Lines changed: 94 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import theano
1616
import theano.tensor as T
1717
from theano.tensor.shared_randomstreams import RandomStreams
18+
import pandas
1819

1920
def logistic_transform(A, mu, sigma):
2021
A[numpy.where(A == 0)] = 0.1
@@ -24,49 +25,97 @@ def logistic_transform(A, mu, sigma):
2425
Z /= 255.
2526
return Z
2627

27-
def import_data(label, data_dir):
28+
def import_data(label, data_dir, image_dim):
2829

29-
image_paths=[]
3030
inputs=[]
3131
target=[]
3232

3333
for (dirpath,dirnames,filenames) in os.walk(data_dir):
3434
print "Loading "+dirpath
3535
paths = [os.path.join(dirpath, f) for f in filenames if ((f[-4:] == '.png') and ('normalized' in f))]
36-
image_paths += paths
37-
for f in image_paths:
38-
test_target = int(os.path.basename(f).split("_")[2].split(".")[0])
39-
if test_target>=0:
40-
test_image = numpy.array(scipy.misc.imread(f))
41-
if (len(test_image.flatten())!=2304):
42-
continue
43-
if test_target > 50:
44-
test_image2 = test_image.astype(float)
45-
temp = logistic_transform(test_image2.flatten(), 120, 0.1)
46-
if numpy.isnan(temp).any():
47-
continue
48-
else:
49-
inputs = numpy.append(inputs, temp, axis=0)
50-
target = numpy.append(target, [1], axis=0)
51-
elif test_target == 0:
52-
test_image2 = test_image.astype(float)
53-
temp = logistic_transform(test_image2.flatten(), 120, 0.1)
54-
if numpy.isnan(temp).any():
55-
continue
56-
else:
57-
inputs = numpy.append(inputs, temp, axis=0)
58-
target = numpy.append(target, [0], axis=0)
59-
inputs = inputs.reshape(-1,2304)
60-
data_set = (inputs, target)
36+
#image_paths += paths
37+
if len(paths)==0:
38+
continue
39+
40+
# Tab indent this to do each directory separately. This is useful when adding labels.
41+
# Load label file:
42+
try:
43+
print os.path.join(dirpath,'labels.csv')
44+
label_df = pandas.read_csv(os.path.join(dirpath,'labels.csv'))
45+
except:
46+
print "No labels file in this folder."
47+
48+
# Loop through paths and store images and labels:
49+
for f in paths:
50+
print(f)
51+
base=os.path.basename(f)
52+
filename = base
53+
#test_target = int(os.path.basename(f).split("_")[2].split(".")[0])
54+
test_targets = label_df.loc[label_df['image_name']==filename]
55+
test_target = test_targets.iloc[0]['Smile']
56+
57+
58+
#temp = test_targets==0
59+
#neutral = temp.any(1)
60+
#print neutral.iloc[0]
61+
62+
63+
#if neutral.iloc[0]:
64+
if test_target>=-1:
65+
test_image = numpy.array(scipy.misc.imread(f))
66+
if (len(test_image.flatten())!=(image_dim*image_dim)):
67+
continue
68+
#for i, row in enumerate(test_targets.iloc[0].values):
69+
# print i + str(test_targets.iloc[0][i])
70+
71+
if test_target > 50:
72+
test_image2 = test_image.astype(float)
73+
temp = logistic_transform(test_image2.flatten(), 120, 0.1)
74+
if numpy.isnan(temp).any():
75+
print "NaN found :("
76+
continue
77+
else:
78+
inputs = numpy.append(inputs, temp, axis=0)
79+
target = numpy.append(target, [1], axis=0)
80+
elif test_target == 0:
81+
test_image2 = test_image.astype(float)
82+
temp = logistic_transform(test_image2.flatten(), 120, 0.1)
83+
if numpy.isnan(temp).any():
84+
print "NaN found :("
85+
continue
86+
else:
87+
inputs = numpy.append(inputs, temp, axis=0)
88+
target = numpy.append(target, [0], axis=0)
89+
elif test_target == -1:
90+
test_image2 = test_image.astype(float)
91+
temp = logistic_transform(test_image2.flatten(), 120, 0.1)
92+
if numpy.isnan(temp).any():
93+
print "NaN found :("
94+
continue
95+
else:
96+
inputs = numpy.append(inputs, temp, axis=0)
97+
target = numpy.append(target, [-1], axis=0)
98+
99+
inputs = inputs.reshape(-1,(image_dim*image_dim))
100+
perm = numpy.random.permutation(len(inputs[:,1]))
101+
inputs = inputs[perm,:]
102+
target = target[perm]
103+
104+
## Create label mask:
105+
data_mask = [target >= 0]
106+
107+
data_set = (inputs, target, data_mask)
61108
print label + " Data:"
62-
print "Images: "+str(len(inputs[:,1]))
63-
print "Features: "+str(len(inputs[1,:]))
64-
print "Labels: "+str(len(target))
65-
print "Positive Labels: "+str(sum(target))
109+
print "Images: "+str(len(inputs[:,1]))
110+
print "Features: "+str(len(inputs[1,:]))
111+
print "Labels: "+str(len(target))
112+
print " Positive Labels: "+str(sum(target[data_mask]))
113+
print " Negative Labels: "+str(len(target) - sum(target[data_mask]))
114+
print " Unlabeled Examples: "+str(len(target) - numpy.sum(data_mask))
66115

67116
return data_set
68117

69-
def load_faces(dataset):
118+
def load_faces(dataset, image_dim):
70119
''' Loads the dataset
71120
:type dataset: string
72121
:param dataset: the path to the dataset
@@ -80,19 +129,23 @@ def load_faces(dataset):
80129

81130
pickle_dir = os.path.join(dataset,'imported_data_bin.p')
82131
if (os.path.isfile(pickle_dir)):
83-
rval, train_set = pickle.load(open(pickle_dir,"rb"))
84-
return rval, train_set
132+
rval, test_set = pickle.load(open(pickle_dir,"rb"))
133+
return rval, test_set
85134
else:
86135
print "No imported data found. Loading data from images now."
87136

88137
data_dir_test = os.path.join(dataset,'train')
89-
train_set = import_data('Training', data_dir_test)
138+
train_set = import_data('Training', data_dir_test, image_dim)
139+
pre_train_set = (train_set[0], train_set[1])
140+
train_set = (train_set[0][train_set[2]], train_set[1][train_set[2]])
90141

91142
data_dir_test = os.path.join(dataset,'valid')
92-
valid_set = import_data('Validation', data_dir_test)
93-
143+
valid_set = import_data('Validation', data_dir_test, image_dim)
144+
valid_set = (valid_set[0][valid_set[2]], valid_set[1][valid_set[2]])
145+
94146
data_dir_test = os.path.join(dataset,'test')
95-
test_set = import_data('Testing', data_dir_test)
147+
test_set = import_data('Testing', data_dir_test, image_dim)
148+
test_set = (test_set[0][test_set[2]], test_set[1][test_set[2]])
96149

97150
# Make each image a row of the matrix.
98151
# Make the targets a column vector.
@@ -125,13 +178,14 @@ def shared_dataset(data_xy, borrow=True):
125178
test_set_x, test_set_y = shared_dataset(test_set)
126179
valid_set_x, valid_set_y = shared_dataset(valid_set)
127180
train_set_x, train_set_y = shared_dataset(train_set)
181+
pre_train_set_x, pre_train_set_y = shared_dataset(pre_train_set)
128182

129183
rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y),
130-
(test_set_x, test_set_y)]
184+
(test_set_x, test_set_y), (pre_train_set_x, pre_train_set_y)]
131185

132186
save_dir = os.path.join(dataset,'imported_data_bin.p')
133187
pickle.dump( [rval, train_set], open(save_dir,"wb"))
134-
return rval, train_set
188+
return rval, test_set
135189

136190
if __name__ == '__main__':
137191
test_DBN()

code/rbm.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ def sample_h_given_v(self, v0_sample):
156156

157157
rank_1 = ((h1_mean.argsort(axis=1)).argsort(axis=1).astype(theano.config.floatX) + 1.)/T.shape(h1_mean)[1].astype(theano.config.floatX)
158158

159-
h1_mean = (1.-0.5)*(rank_0**((1./0.03)-1.))+0.5*(rank_1**((1./0.03)-1.))
159+
h1_mean = (1.-0.5)*(rank_0**((1./0.05)-1.))+0.5*(rank_1**((1./0.05)-1.))
160160

161161
#pre_sigmoid_h1_bin = T.log(h1_mean) - T.log(1. - h1_mean)
162162
#pre_sigmoid_h1 = pre_sigmoid_h1_bin
@@ -279,14 +279,14 @@ def get_cost_updates(self, lr=0.1, persistent=None, k=1):
279279
gparams = T.grad(cost, self.params, consider_constant=[chain_end])
280280

281281
## DAN ADDED:#########################
282-
#pre_sigmoid_h1_bin = T.log(ph_mean) - T.log(1. - ph_mean)
282+
pre_sigmoid_h1_bin = T.log(ph_mean) - T.log(1. - ph_mean)
283283
# get_gweights_up
284-
#phi = 0.2
285-
#gparams_lat_bias = theano.clone(gparams[0],replace={pre_sigmoid_ph:pre_sigmoid_h1_bin})
286-
#gparams[0] = (1 - phi) * gparams[0] + phi * gparams_lat_bias
284+
phi = 0.2
285+
gparams_lat_bias = theano.clone(gparams[0],replace={pre_sigmoid_ph:pre_sigmoid_h1_bin})
286+
gparams[0] = (1 - phi) * gparams[0] + phi * gparams_lat_bias
287287
## DAN ADDED:
288-
#hparams_lat_bias = theano.clone(gparams[1],replace={pre_sigmoid_ph:pre_sigmoid_h1_bin})
289-
#gparams[1] = (1 - phi) * gparams[1] + phi * hparams_lat_bias
288+
hparams_lat_bias = theano.clone(gparams[1],replace={pre_sigmoid_ph:pre_sigmoid_h1_bin})
289+
gparams[1] = (1 - phi) * gparams[1] + phi * hparams_lat_bias
290290
#######################################
291291

292292
# end-snippet-3 start-snippet-4
@@ -381,7 +381,7 @@ def get_reconstruction_cost(self, updates, pre_sigmoid_nv):
381381

382382

383383
def test_rbm(learning_rate=0.1, training_epochs=15,
384-
dataset='mnist.pkl.gz', batch_size=10,
384+
dataset='mnist.pkl.gz', batch_size=50,
385385
n_chains=20, n_samples=10, output_folder='rbm_plots',
386386
n_hidden=500):
387387
"""

code/rbm_face.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -152,15 +152,12 @@ def sample_h_given_v(self, v0_sample):
152152
# the visibles
153153
pre_sigmoid_h1, h1_mean = self.propup(v0_sample)
154154

155-
156-
##################################################################
157-
## Sparsity: #####################################################
158-
##################################################################
155+
##################################################################
156+
## Sparsity: #####################################################
157+
##################################################################
159158
rank_0 = ((h1_mean.argsort(axis=0)).argsort(axis=0).astype(theano.config.floatX) + 1.)/T.shape(h1_mean)[0].astype(theano.config.floatX)
160-
161-
rank_1 = ((h1_mean.argsort(axis=1)).argsort(axis=1).astype(theano.config.floatX) + 1.)/T.shape(h1_mean)[1].astype(theano.config.floatX)
162-
163-
h1_mean = (1.-0.9)*(rank_0**((1./0.99)-1.))+0.9*(rank_1**((1./0.99)-1.))
159+
rank_1 = ((h1_mean.argsort(axis=1)).argsort(axis=1).astype(theano.config.floatX) + 1.)/T.shape(h1_mean)[1].astype(theano.config.floatX)
160+
h1_mean = (1.-0.9)*(rank_0**((1./0.99)-1.))+0.9*(rank_1**((1./0.99)-1.))
164161

165162

166163
# get a sample of the hiddens given their activation

0 commit comments

Comments (0)