
Commit a074b32

I have changed all tutorials (code and documentation) to use shared variables to store the dataset, as James suggested. I also changed the validation_frequency following Yoshua's suggestion. One still needs to run the mlp and convolutional mlp on a GPU to see how fast they run.
1 parent 10db616 commit a074b32

8 files changed: 158 additions & 100 deletions


code/convolutional_mlp.py

Lines changed: 28 additions & 24 deletions
@@ -197,7 +197,7 @@ def shared_dataset(data_xy):
     valid_set_x, valid_set_y = shared_dataset(valid_set)
     train_set_x, train_set_y = shared_dataset(train_set)
 
-    batch_size = 500    # sized of the minibatch
+    batch_size = 500    # size of the minibatch
 
     # compute number of minibatches for training, validation and testing
     n_train_batches = train_set_x.value.shape[0] / batch_size
@@ -253,7 +253,15 @@ def shared_dataset(data_xy):
     cost = layer3.negative_log_likelihood(y)
 
     # create a function to compute the mistakes that are made by the model
-    test_model = theano.function([x,y], layer3.errors(y))
+    test_model = theano.function([minibatch_offset], layer3.errors(y),
+            givens = {
+                x: test_set_x[minibatch_offset:minibatch_offset+batch_size],
+                y: test_set_y[minibatch_offset:minibatch_offset+batch_size]})
+
+    validate_model = theano.function([minibatch_offset], layer3.errors(y),
+            givens = {
+                x: valid_set_x[minibatch_offset:minibatch_offset+batch_size],
+                y: valid_set_y[minibatch_offset:minibatch_offset+batch_size]})
 
     # create a list of all model parameters to be fit by gradient descent
     params = layer3.params+ layer2.params+ layer1.params + layer0.params
@@ -268,22 +276,25 @@ def shared_dataset(data_xy):
     updates = {}
     for param_i, grad_i in zip(params, grads):
         updates[param_i] = param_i - learning_rate * grad_i
-    train_model = theano.function([x, y], cost, updates=updates)
+
+    train_model = theano.function([minibatch_offset], cost, updates=updates,
+            givens = {
+                x: train_set_x[minibatch_offset:minibatch_offset+batch_size],
+                y: train_set_y[minibatch_offset:minibatch_offset+batch_size]})
 
 
     ###############
     # TRAIN MODEL #
     ###############
 
-    n_minibatches = len(train_batches)
-
     # early-stopping parameters
     patience = 10000                 # look as this many examples regardless
     patience_increase = 2            # wait this much longer when a new best is
                                      # found
     improvement_threshold = 0.995    # a relative improvement of this much is
                                      # considered significant
-    validation_frequency = n_minibatches  # go through this many
+    validation_frequency = min(n_train_batches, patience/2)
+                                     # go through this many
                                      # minibatche before checking the network
                                      # on the validation set; in this case we
                                      # check every epoch
@@ -295,30 +306,25 @@ def shared_dataset(data_xy):
     start_time = time.clock()
 
     # have a maximum of `n_iter` iterations through the entire dataset
-    for iter in xrange(n_iter * n_minibatches):
+    for iter in xrange(n_iter * n_train_batches):
 
         # get epoch and minibatch index
-        epoch = iter / n_minibatches
-        minibatch_index = iter % n_minibatches
-
+        epoch = iter / n_train_batches
+        minibatch_index = iter % n_train_batches
+        minibatch_offset = minibatch_index * batch_size
+
         # get the minibatches corresponding to `iter` modulo
         # `len(train_batches)`
-        x,y = train_batches[ minibatch_index ]
 
         if iter %100 == 0:
            print 'training @ iter = ', iter
-        cost_ij = train_model(x,y)
+        cost_ij = train_model(minibatch_offset)
 
        if (iter+1) % validation_frequency == 0:
 
-            # compute zero-one loss on validation set
-            this_validation_loss = 0.
-            for x,y in valid_batches:
-                # sum up the errors for each minibatch
-                this_validation_loss += test_model(x,y)
-
-            # get the average by dividing with the number of minibatches
-            this_validation_loss /= len(valid_batches)
+            # compute zero-one loss on validation set
+            validation_losses = [validate_model(i*batch_size) for i in xrange(n_valid_batches)]
+            this_validation_loss = numpy.mean(validation_losses)
             print('epoch %i, minibatch %i/%i, validation error %f %%' % \
                   (epoch, minibatch_index+1, n_minibatches, \
                    this_validation_loss*100.))
@@ -337,10 +343,8 @@ def shared_dataset(data_xy):
             best_iter = iter
 
             # test it on the test set
-            test_score = 0.
-            for x,y in test_batches:
-                test_score += test_model(x,y)
-            test_score /= len(test_batches)
+            test_losses = [test_model(i*batch_size) for i in xrange(n_test_batches)]
+            test_score = numpy.mean(test_losses)
             print((' epoch %i, minibatch %i/%i, test error of best '
                    'model %f %%') %
                   (epoch, minibatch_index+1, n_minibatches,
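
The pattern this change introduces is worth seeing in isolation. Below is a minimal, runnable sketch of compiling a Theano function with ``givens`` keyed on a minibatch offset; the single-parameter model and the random data are stand-ins for the tutorial's network and MNIST, not code from this commit.

    import numpy
    import theano
    import theano.tensor as T

    rng = numpy.random.RandomState(0)
    batch_size = 500

    # toy dataset stored once in a shared variable (stand-in for train_set_x);
    # on a GPU this lives in device memory after this single transfer
    data = theano.shared(numpy.asarray(rng.rand(1000, 784),
                                       dtype=theano.config.floatX))

    x = T.matrix('x')
    minibatch_offset = T.lscalar('minibatch_offset')
    b = theano.shared(numpy.asarray(0.0))        # a single "parameter"
    cost = T.mean((x.sum(axis=1) - b) ** 2)      # stand-in for the real cost

    # compiled once; each call substitutes a slice of the shared data for x,
    # so no minibatch is copied from host memory at call time
    train_model = theano.function([minibatch_offset], cost,
            updates={b: b - 0.01 * T.grad(cost, b)},
            givens={x: data[minibatch_offset:minibatch_offset + batch_size]})

    for i in xrange(2):                          # first two minibatches
        print train_model(i * batch_size)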

code/logistic_cg.py

Lines changed: 2 additions & 4 deletions
@@ -153,8 +153,6 @@ def cg_optimization_mnist( n_iter=50, mnist_pkl_gz='mnist.pkl.gz' ):
     train_set, valid_set, test_set = cPickle.load(f)
     f.close()
 
-    # make minibatches of size 20
-    batch_size = 500    # sized of the minibatch
 
     def shared_dataset(data_xy):
         data_x, data_y = data_xy
@@ -166,7 +164,7 @@ def shared_dataset(data_xy):
     valid_set_x, valid_set_y = shared_dataset(valid_set)
     train_set_x, train_set_y = shared_dataset(train_set)
 
-    batch_size = 500    # sized of the minibatch
+    batch_size = 600    # size of the minibatch
 
     n_train_batches = train_set_x.value.shape[0] / batch_size
     n_valid_batches = valid_set_x.value.shape[0] / batch_size
@@ -252,7 +250,7 @@ def callback(theta_value):
         # if so, replace the old one, and compute the score on the
         # testing dataset
         validation_scores[0] = this_validation_loss
-        test_loses = [test_model(i*batch_size) for i in xrange(n_train_batches)]
+        test_loses = [test_model(i*batch_size) for i in xrange(n_test_batches)]
         validation_scores[1] = numpy.mean(test_loses)
 
     # using scipy conjugate gradient optimizer
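
The one-character fix above (``n_train_batches`` to ``n_test_batches``) lives inside the callback that scipy's conjugate gradient optimizer invokes after every iteration. As a self-contained sketch of that callback mechanism, with a toy quadratic standing in for the tutorial's ``train_fn``/``train_fn_grad``:

    import numpy
    import scipy.optimize

    # toy stand-ins for the tutorial's cost and gradient functions
    target = numpy.arange(5, dtype=numpy.float64)
    def train_fn(theta):
        return ((theta - target) ** 2).sum()
    def train_fn_grad(theta):
        return 2.0 * (theta - target)

    def callback(theta_value):
        # in logistic_cg.py this is where the validation/test losses are
        # averaged over n_valid_batches / n_test_batches minibatches
        print 'cost at callback:', train_fn(theta_value)

    best_theta = scipy.optimize.fmin_cg(f=train_fn, x0=numpy.zeros(5),
            fprime=train_fn_grad, callback=callback, disp=0, maxiter=10)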

code/logistic_sgd.py

Lines changed: 4 additions & 3 deletions
@@ -165,7 +165,7 @@ def shared_dataset(data_xy):
     valid_set_x, valid_set_y = shared_dataset(valid_set)
     train_set_x, train_set_y = shared_dataset(train_set)
 
-    batch_size = 500    # sized of the minibatch
+    batch_size = 600    # size of the minibatch
 
     # compute number of minibatches for training, validation and testing
     n_train_batches = train_set_x.value.shape[0] / batch_size
@@ -193,7 +193,7 @@ def shared_dataset(data_xy):
                 x:test_set_x[minibatch_offset:minibatch_offset+batch_size],
                 y:test_set_y[minibatch_offset:minibatch_offset+batch_size]})
 
-    validate_model = theano.function([minibatch_offset], classifier.errors(y),
+    validate_model =theano.function([minibatch_offset], classifier.errors(y),
             givens={
                 x:valid_set_x[minibatch_offset:minibatch_offset+batch_size],
                 y:valid_set_y[minibatch_offset:minibatch_offset+batch_size]})
@@ -220,7 +220,8 @@ def shared_dataset(data_xy):
                              # found
     improvement_threshold = 0.995  # a relative improvement of this much is
                                    # considered significant
-    validation_frequency = n_train_batches  # go through this many
+    validation_frequency = min(n_train_batches, patience/2)
+                                  # go through this many
                                   # minibatche before checking the network
                                   # on the validation set; in this case we
                                   # check every epoch
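
To make the effect of ``validation_frequency = min(n_train_batches, patience/2)`` concrete, here is a small, self-contained simulation of the tutorial's early-stopping logic; the loss sequence and the small numbers are illustrative only, not measurements:

    import numpy

    patience = 10                 # minimum number of updates to look at
    patience_increase = 2         # multiplier applied on a new best
    improvement_threshold = 0.995
    n_train_batches = 5
    validation_frequency = min(n_train_batches, patience / 2)

    best_validation_loss = numpy.inf
    losses = [0.9, 0.5, 0.3, 0.29, 0.29, 0.29]   # fake per-check losses

    for iter in xrange(100):
        if (iter + 1) % validation_frequency == 0:
            check = min(iter // validation_frequency, len(losses) - 1)
            this_validation_loss = losses[check]
            if this_validation_loss < best_validation_loss * improvement_threshold:
                # significant improvement: be willing to run ~2x longer
                patience = max(patience, iter * patience_increase)
                best_validation_loss = this_validation_loss
        if patience <= iter:
            break
    print 'stopped at iter', iter, 'with best loss', best_validation_loss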

code/mlp.py

Lines changed: 3 additions & 2 deletions
@@ -188,7 +188,7 @@ def shared_dataset(data_xy):
     valid_set_x, valid_set_y = shared_dataset(valid_set)
     train_set_x, train_set_y = shared_dataset(train_set)
 
-    batch_size = 500    # sized of the minibatch
+    batch_size = 20    # size of the minibatch
 
     # compute number of minibatches for training, validation and testing
     n_train_batches = train_set_x.value.shape[0] / batch_size
@@ -252,7 +252,8 @@ def shared_dataset(data_xy):
                              # found
     improvement_threshold = 0.995  # a relative improvement of this much is
                                    # considered significant
-    validation_frequency = n_train_batches # go through this many
+    validation_frequency = min(n_train_batches,patience/2)
+                                  # go through this many
                                   # minibatche before checking the network
                                   # on the validation set; in this case we
                                   # check every epoch
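
A quick sanity check on why the ``min(..., patience/2)`` guard matters once the batch size shrinks, assuming the standard 50000-example MNIST training set:

    patience = 10000
    batch_size = 1                          # extreme case for illustration
    n_train_batches = 50000 / batch_size    # 50000 updates per epoch
    # unguarded, validation_frequency = n_train_batches would put the first
    # check at iter 49999, after training already stopped at iter 10000
    validation_frequency = min(n_train_batches, patience / 2)   # = 5000
    # guarded: checks happen at iter 4999 and 9999, i.e. at least two
    # looks at the validation set before patience can run out
    print validation_frequency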

doc/gettingstarted.txt

Lines changed: 65 additions & 42 deletions
@@ -1,4 +1,4 @@
-.. _gettingstarted:
+.. _gettingstarted:
 
 
 ===============
@@ -55,9 +55,7 @@ MNIST Dataset
 images. An image is represented as numpy 1-dimensional array of 784 (28
 x 28) float values between 0 and 1 (0 stands for black, 1 for white).
 The labels are numbers between 0 and 9 indicating which digit the image
-represents. When using the dataset, we usually divide it in minibatches
-(see :ref:`opt_SGD`). The code block below shows how to load the
-dataset and how to divide it in minibatches of a given size :
+represents. The code block below shows how to load the dataset.
 
 
 .. code-block:: python
@@ -69,43 +67,59 @@ MNIST Dataset
     train_set, valid_set, test_set = cPickle.load(f)
     f.close()
 
-    # make minibatches of size 20
-    batch_size = 20    # sized of the minibatch
-
-    # Dealing with the training set
-    # get the list of training images (x) and their labels (y)
-    (train_set_x, train_set_y) = train_set
-    # initialize the list of training minibatches with empty list
-    train_batches = []
-    for i in xrange(0, len(train_set_x), batch_size):
-        # add to the list of minibatches the minibatch starting at
-        # position i, ending at position i+batch_size
-        # a minibatch is a pair ; the first element of the pair is a list
-        # of datapoints, the second element is the list of corresponding
-        # labels
-        train_batches = train_batches + \
-               [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]
-
-    # Dealing with the validation set
-    (valid_set_x, valid_set_y) = valid_set
-    # initialize the list of validation minibatches
-    valid_batches = []
-    for i in xrange(0, len(valid_set_x), batch_size):
-        valid_batches = valid_batches + \
-               [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])]
-
-    # Dealing with the testing set
-    (test_set_x, test_set_y) = test_set
-    # initialize the list of testing minibatches
-    test_batches = []
-    for i in xrange(0, len(test_set_x), batch_size):
-        test_batches = test_batches + \
-               [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]
-
-
-    # accessing training example i of minibatch j
-    image = training_set[j][0][i]
-    label = training_set[j][1][i]
+
+When using the dataset, we usually divide it in minibatches (see
+:ref:`opt_SGD`). We encourage you to store the dataset in shared
+variables and to access it based on the minibatch offset, given a fixed
+and known batch size. The reason for shared variables is related to
+using the GPU. There is a large overhead when copying data into GPU
+memory. If you copied the data on request (each minibatch individually,
+when needed), as the code would do without shared variables, the GPU
+code would not be much faster than the CPU code because of this
+overhead (it might even be slower). If the data is in Theano shared
+variables, however, Theano can copy the entire data onto the GPU in a
+single call when the shared variables are constructed. Afterwards the
+GPU can access any minibatch by taking a slice of these shared
+variables, without copying any information from CPU memory, thereby
+bypassing the overhead.
+Because the datapoints and their labels are usually of a different
+nature (labels are usually integers while datapoints are real numbers)
+we suggest using separate variables for labels and data. We also
+recommend using separate variables for the training, validation and
+testing sets, to make the code more readable (resulting in 6 different
+shared variables).
+
+Since the data is now in one variable, and a minibatch is defined as a
+slice of that variable, it is more natural to define a minibatch by
+indicating where the slice starts (the offset) and how large it is (the
+batch size). Note that since the batch size stays constant throughout
+the execution of the code, a function requires only the offset as input
+to identify on which minibatch to work. The code below shows how to
+store your data and how to access a minibatch:
+
+
+.. code-block:: python
+
+    def shared_dataset(data_xy):
+        data_x, data_y = data_xy
+        shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX))
+        shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX))
+        return shared_x, T.cast(shared_y, 'int32')
+
+    test_set_x, test_set_y = shared_dataset(test_set)
+    valid_set_x, valid_set_y = shared_dataset(valid_set)
+    train_set_x, train_set_y = shared_dataset(train_set)
+
+    batch_size = 500    # size of the minibatch
+
+    # accessing the third minibatch of the training set
+
+    data  = train_set_x[2*500:3*500]
+    label = train_set_y[2*500:3*500]
+
 
 
 .. index:: Notation
@@ -503,7 +517,8 @@ of a strategy based on a geometrically increasing amount of patience.
                                   # validation error is found
     improvement_threshold = 0.995 # a relative improvement of this much is
                                   # considered significant
-    validation_frequency = 2500   # make this many SGD updates between validations
+    validation_frequency = min(2500, patience/2.)
+                                  # make this many SGD updates between validations
 
     # initialize cross-validation variables
     best_params = None
@@ -547,6 +562,14 @@ of a strategy based on a geometrically increasing amount of patience.
 If we run out of batches of training data before running out of patience, then
 we just go back to the beginning of the training set and repeat.
 
+
+.. note::
+
+    ``validation_frequency`` should always be smaller than
+    ``patience``. The code should check performance at least twice
+    before running out of patience. This is the reason we used the
+    formulation ``validation_frequency = min(2500, patience/2.)``.
+
 .. note::
 
     This algorithm could possibly be improved by using a test of statistical significance
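
One subtlety in the new gettingstarted.txt example: ``train_set_x[2*500:3*500]`` is a symbolic slice, not a numpy array. Below is a small sketch of reading an actual minibatch back as numpy data; the tiny shared variable is a stand-in for the tutorial's ``train_set_x``:

    import numpy
    import theano
    import theano.tensor as T

    # stand-in for train_set_x: 10 "examples" of 2 features each
    train_set_x = theano.shared(numpy.arange(20,
            dtype=theano.config.floatX).reshape(10, 2))
    batch_size = 2

    index = T.lscalar('index')    # minibatch index, not a raw offset
    # compiled once; the slice is taken on whichever device (CPU or GPU)
    # holds the shared variable, and only the result is returned
    get_batch = theano.function([index],
            train_set_x[index * batch_size:(index + 1) * batch_size])

    print get_batch(2)            # numpy array holding the third minibatch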

doc/lenet.txt

Lines changed: 5 additions & 1 deletion
@@ -504,7 +504,11 @@ instantiate the network as follows.
     updates = {}
     for param_i, grad_i in zip(params, grads):
         updates[param_i] = param_i - learning_rate * grad_i
-    train_model = theano.function([x, y], cost, updates=updates)
+    train_model = theano.function([minibatch_offset], cost, updates = updates,
+            givens={
+                x:train_set_x[minibatch_offset:minibatch_offset+batch_size],
+                y:train_set_y[minibatch_offset:minibatch_offset+batch_size]})
+
 
 
 We leave out the code, which performs the actual training and early-stopping,
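
The loop left out there shrinks to very little once ``train_model`` is indexed by offset. A sketch, assuming ``n_epochs``, ``n_train_batches`` and ``batch_size`` as defined earlier in the tutorial:

    for epoch in xrange(n_epochs):
        for minibatch_index in xrange(n_train_batches):
            minibatch_offset = minibatch_index * batch_size
            cost_ij = train_model(minibatch_offset)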
