
Commit e56647b

Merge branch 'master' of git@github.com:lisa-lab/DeepLearningTutorials

2 parents 6583b95 + e972e03

7 files changed

Lines changed: 96 additions & 48 deletions


code/SdA.py

Lines changed: 2 additions & 2 deletions

@@ -279,7 +279,7 @@ def test_score():
 
 
 
-def test_SdA( finetune_lr = 0.1, pretraining_epochs = 2, \
+def test_SdA( finetune_lr = 0.1, pretraining_epochs = 15, \
              pretrain_lr = 0.1, training_epochs = 1000, \
              dataset='mnist.pkl.gz'):
     """
@@ -322,7 +322,7 @@ def test_SdA( finetune_lr = 0.1, pretraining_epochs = 2, \
     print '... building the model'
     # construct the stacked denoising autoencoder class
     sda = SdA( numpy_rng = numpy_rng, n_ins = 28*28,
-               hidden_layers_sizes = [100,100,100],
+               hidden_layers_sizes = [1000,1000,1000],
                n_outs = 10)
 
 
code/dA.py

Lines changed: 11 additions & 2 deletions

@@ -170,8 +170,17 @@ def get_corrupted_input(self, input, corruption_level):
 
         this will produce an array of 0s and 1s where 1 has a probability of
         1 - ``corruption_level`` and 0 with ``corruption_level``
+
+        The binomial function returns the int64 data type by default.
+        int64 multiplied by the input type (floatX) always returns float64.
+        To keep all data in floatX when floatX is float32, we set the dtype
+        of the binomial to floatX. As the value of the binomial is always
+        0 or 1 in our case, this doesn't change the result. It is needed to
+        allow the GPU to work correctly, as it only supports float32 for now.
         """
-        return self.theano_rng.binomial( size = input.shape, n = 1, prob = 1 - corruption_level) * input
+        if corruption_level == 0:
+            return input
+        return self.theano_rng.binomial( size = input.shape, n = 1, prob = 1 - corruption_level, dtype = theano.config.floatX) * input
 
 
     def get_hidden_values(self, input):
@@ -254,7 +263,7 @@ def test_dA( learning_rate = 0.1, training_epochs = 15, dataset ='mnist.pkl.gz'
 
     train_da = theano.function([index], cost, updates = updates,
          givens = {x:train_set_x[index*batch_size:(index+1)*batch_size]})
-
+
     start_time = time.clock()
 
     ############
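
The upcast rule described in the new docstring lines can be checked outside Theano. A minimal numpy sketch (illustrative only, not part of this commit) shows the same behavior: an integer mask multiplied by float32 data silently produces float64, while casting the mask first keeps the result in float32:

    import numpy as np

    rng = np.random.RandomState(1234)
    data = rng.rand(4).astype(np.float32)           # stands in for a floatX input
    mask = rng.binomial(n=1, p=0.9, size=4)         # numpy gives an integer dtype here

    print((mask * data).dtype)                      # float64: int * float32 upcasts
    print((mask.astype(np.float32) * data).dtype)   # float32: matches floatX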

code/rbm.py

Lines changed: 15 additions & 9 deletions

@@ -105,15 +105,17 @@ def sample_h_given_v(self, v0_sample):
         # compute the activation of the hidden units given a sample of the visibles
         h1_mean = T.nnet.sigmoid(T.dot(v0_sample, self.W) + self.hbias)
         # get a sample of the hiddens given their activation
-        h1_sample = self.theano_rng.binomial(size = h1_mean.shape, n = 1, prob = h1_mean)
+        h1_sample = self.theano_rng.binomial(size = h1_mean.shape, n = 1, prob = h1_mean,
+                                             dtype = theano.config.floatX)
         return [h1_mean, h1_sample]
 
     def sample_v_given_h(self, h0_sample):
         ''' This function infers state of visible units given hidden units '''
         # compute the activation of the visible given the hidden sample
         v1_mean = T.nnet.sigmoid(T.dot(h0_sample, self.W.T) + self.vbias)
         # get a sample of the visible given their activation
-        v1_sample = self.theano_rng.binomial(size = v1_mean.shape, n = 1, prob = v1_mean)
+        v1_sample = self.theano_rng.binomial(size = v1_mean.shape, n = 1, prob = v1_mean,
+                                             dtype = theano.config.floatX)
         return [v1_mean, v1_sample]
 
     def gibbs_hvh(self, h0_sample):
@@ -159,10 +161,14 @@ def cd(self, lr = 0.1, persistent=None):
         [nv_mean, nv_sample, nh_mean, nh_sample] = self.gibbs_hvh(chain_start)
 
         # determine gradients on RBM parameters
-        g_vbias = T.sum( self.input - nv_mean, axis = 0)/self.batch_size
-        g_hbias = T.sum( ph_mean - nh_mean, axis = 0)/self.batch_size
-        g_W = T.dot(ph_mean.T, self.input )/ self.batch_size - \
-              T.dot(nh_mean.T, nv_mean )/ self.batch_size
+        # cast batch_size to floatX, because its type is int64,
+        # and otherwise the gradients are upcast to float64,
+        # even when floatX == float32
+        batch_size = T.cast(self.batch_size, dtype=theano.config.floatX)
+        g_vbias = T.sum( self.input - nv_mean, axis = 0)/batch_size
+        g_hbias = T.sum( ph_mean - nh_mean, axis = 0)/batch_size
+        g_W = T.dot(ph_mean.T, self.input )/ batch_size - \
+              T.dot(nh_mean.T, nv_mean )/ batch_size
 
         gparams = [g_W.T, g_hbias, g_vbias]
 
@@ -324,8 +330,8 @@ def test_rbm(learning_rate=0.1, training_epochs = 15,
     # define one step of Gibbs sampling (mf = mean-field)
     [hid_mf, hid_sample, vis_mf, vis_sample] = rbm.gibbs_vhv(persistent_vis_chain)
 
-    # the sample at the end of the channel is returned by ``gibbs_1`` as
-    # its second output; note that this is computed as a binomial draw,
+    # the sample at the end of the chain is returned by ``gibbs_vhv`` as
+    # its last output; note that this is computed as a binomial draw,
     # therefore it is formed of ints (0 and 1) and therefore needs to
     # be converted to the same dtype as ``persistent_vis_chain``
     vis_sample = T.cast(vis_sample, dtype=theano.config.floatX)
@@ -343,7 +349,7 @@ def test_rbm(learning_rate=0.1, training_epochs = 15,
 
     for idx in xrange(n_samples):
 
-        # do `plot_every` intermediate samplings of which we do not care
+        # generate `plot_every` intermediate samples that we discard, because successive samples in the chain are too correlated
         for jdx in xrange(plot_every):
            vis_mf, vis_sample = sample_fn()
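
The batch_size cast in the second hunk addresses the same rule on the division side. A small numpy analogue (again illustrative, not commit code) shows a float32 sum divided by an int64 array upcasting to float64 unless the divisor is cast first:

    import numpy as np

    acts = np.ones((20, 4), dtype=np.float32)      # float32 activations, batch of 20
    counts = np.full(4, 20, dtype=np.int64)        # int64 divisor, as batch_size would be

    print((acts.sum(axis=0) / counts).dtype)                     # float64: silent upcast
    print((acts.sum(axis=0) / counts.astype(np.float32)).dtype)  # float32: stays in floatX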

code/test.py

Lines changed: 16 additions & 1 deletion

@@ -1,18 +1,33 @@
 #import convolutional_mlp, dbn, logistic_cg, logistic_sgd, mlp, rbm, SdA_loops, SdA
-import convolutional_mlp, logistic_cg, logistic_sgd, mlp, SdA
+import convolutional_mlp, logistic_cg, logistic_sgd, mlp, SdA, dA
 from nose.plugins.skip import SkipTest
+import time,sys
 #TODO: dbn, rbm, SdA, SdA_loops, convolutional_mlp
 def test_logistic_sgd():
+    t0=time.time()
     logistic_sgd.sgd_optimization_mnist(n_epochs=10)
+    print >> sys.stderr, "test_logistic_sgd took %.3fs expected 15.2s in our buildbot"%(time.time()-t0)
 def test_logistic_cg():
+    t0=time.time()
     logistic_cg.cg_optimization_mnist(n_epochs=10)
+    print >> sys.stderr, "test_logistic_cg took %.3fs expected 14s in our buildbot"%(time.time()-t0)
 def test_mlp():
+    t0=time.time()
     mlp.test_mlp(n_epochs=5)
+    print >> sys.stderr, "test_mlp took %.3fs expected 118s in our buildbot"%(time.time()-t0)
 def test_convolutional_mlp():
+    t0=time.time()
     convolutional_mlp.evaluate_lenet5(n_epochs=5,nkerns=[5,5])
+    print >> sys.stderr, "test_convolutional_mlp took %.3fs expected 168s in our buildbot"%(time.time()-t0)
 def test_dbn():
     raise SkipTest('Implementation not finished')
 def test_rbm():
     raise SkipTest('Implementation not finished')
+def test_dA():
+    t0=time.time()
+    dA.test_dA(training_epochs = 3)
+    print >> sys.stderr, "test_dA took %.3fs expected Xs in our buildbot"%(time.time()-t0)
 def test_SdA():
+    t0=time.time()
     SdA.test_SdA(pretraining_epochs = 2, training_epochs = 3)
+    print >> sys.stderr, "test_SdA took %.3fs expected 971s in our buildbot"%(time.time()-t0)

doc/SdA.txt

Lines changed: 4 additions & 4 deletions

@@ -427,9 +427,9 @@ The user can run the code by calling:
   python code/SdA.py
 
 By default the code runs 15 pre-training epochs for each layer, with
-a corruption level of 0.1 and a learning rate of 0.1. Pre-training takes
-78.88 minutes. Fine-tuning is completed after 32 epochs in 65.89
-minutes and results in a validation score of 1.7 %, with a test
-performace of 1.65 %.
+a corruption level of 0.2 and a learning rate of 0.1. Pre-training takes
+80.63 minutes. Fine-tuning is completed after 48 epochs in 97.18
+minutes and results in a validation score of 1.63 %, with a test
+performance of 1.68 %.
 

doc/rbm.txt

Lines changed: 45 additions & 27 deletions

@@ -39,7 +39,7 @@ descent on the empirical log-likelihood of the training data:
     \mathcal{L}(\theta, \mathcal{D}) = \frac{1}{N} \sum_{x^{(i)} \in
     \mathcal{D}} \log\ p(x^{(i)}).
 
-using the stochastic gradient :math:`\frac{\partial p(x^{(i)})}{\partial
+using the stochastic gradient :math:`\frac{\partial \log p(x^{(i)})}{\partial
 \theta}`, where :math:`\theta` are the parameters of the model.
 
 
@@ -102,9 +102,11 @@ denoted as :math:`\mathcal{N}`. The gradient can then be written as:
     \frac{\partial \log p(x)}{\partial \theta}
     &\approx
     - \frac{\partial \mathcal{F}(x)}{\partial \theta} +
-       \sum_{\tilde{x} \in \mathcal{N}} p(\tilde{x}) \
+       \frac{1}{|\mathcal{N}|}\sum_{\tilde{x} \in \mathcal{N}} \
        \frac{\partial \mathcal{F}(\tilde{x})}{\partial \theta}.
 
+where we would ideally like the elements :math:`\tilde{x}` of :math:`\mathcal{N}` to be
+sampled according to :math:`p` (i.e., we are doing Monte-Carlo).
 With the above formula, we almost have a practical, stochastic algorithm for
 learning an EBM. The only missing ingredient is how to extract these negative
 particles :math:`\mathcal{N}`. While the statistical literature abounds with
@@ -116,8 +118,14 @@ EBM.
 Restricted Boltzmann Machines (RBM)
 +++++++++++++++++++++++++++++++++++
 
-Boltzmann Machines (BMs) are a particular form of energy-based model which
-contain hidden variables. Restricted Boltzmann Machines further restrict BMs to
+Boltzmann Machines (BMs) are a particular form of log-linear Markov Random Field (MRF),
+i.e., one whose energy function is linear in its free parameters. To make them
+powerful enough to represent complicated distributions (i.e., to go from the
+limited parametric setting to a non-parametric one), we consider that some of
+the variables are never observed (they are called hidden). By having more hidden
+variables (also called hidden units), we can increase the modeling capacity
+of the Boltzmann Machine (BM).
+Restricted Boltzmann Machines further restrict BMs to
 those without visible-visible and hidden-hidden connections. A graphical
 depiction of an RBM is shown below.
@@ -151,8 +159,8 @@ write:
 
 **RBMs with binary units**
 
-In the commonly studied case of using binary units (where :math:`h_i \in
-\{0,1\}`, we obtain from Eq. :eq:`rbm_energy` and :eq:`energy2`, a stochastic
+In the commonly studied case of using binary units (where :math:`x_j` and :math:`h_i \in
+\{0,1\}`), we obtain from Eq. :eq:`rbm_energy` and :eq:`energy2` a probabilistic
 version of the usual neuron activation function:
 
 .. math::
@@ -181,15 +189,16 @@ following log-likelihood gradients for an RBM with binary units:
     :label: rbm_grad
 
     \frac {\partial{\log p(v)}} {\partial W_{ij}} &=
-        - x^{(i)}_j \cdot sigm(W_i \cdot x^{(i)} + c_i)
-        + E_v[p(h_i|v) \cdot v_j] \\
+        x^{(i)}_j \cdot sigm(W_i \cdot x^{(i)} + c_i)
+        - E_v[p(h_i|v) \cdot v_j] \\
     \frac {\partial{\log p(v)}} {\partial c_i} &=
-        - sigm(W_i \cdot x^{(i)}) + E_v[p(h_i|v)] \\
+        sigm(W_i \cdot x^{(i)}) - E_v[p(h_i|v)] \\
     \frac {\partial{\log p(v)}} {\partial b_j} &=
-        - x^{(i)}_j + E_v[p(v_j|h)]
+        x^{(i)}_j - E_v[p(v_j|h)]
 
 For a more detailed derivation of these equations, we refer the reader to the
-following `page <http://www.iro.umontreal.ca/~lisa/twiki/bin/view.cgi/Public/DBNEquations>`_.
+following `page <http://www.iro.umontreal.ca/~lisa/twiki/bin/view.cgi/Public/DBNEquations>`_,
+or to section 5 of `Learning Deep Architectures for AI <http://www.iro.umontreal.ca/%7Elisa/publications2/index.php/publications/show/239>`_.
 
 .. note::
     We will be updating the tutorial shortly, such that the gradients are
@@ -219,7 +228,11 @@ follows:
     x^{(n+1)} &\sim sigm(W h^{(n+1)} + b),
 
 where :math:`h^{(n)}` refers to the set of all hidden units at the n-th step of
-the Markov chain.
+the Markov chain. What this means is that, for example, :math:`h^{(n+1)}_i` is
+randomly chosen to be 1 (versus 0) with probability
+:math:`sigm(W_i' x^{(n)} + c_i)`, and similarly,
+:math:`x^{(n+1)}_j` is randomly chosen to be 1 (versus 0) with probability
+:math:`sigm(W_{.j} h^{(n+1)} + b_j)`.
 
 This can be illustrated graphically:
 
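
The sampling rule spelled out in the added sentences can be made concrete with one Gibbs step for a binary RBM in plain numpy (a toy sketch with assumed shapes and random parameters, not code from the repository):

    import numpy as np

    rng = np.random.RandomState(123)

    def sigm(a):
        return 1.0 / (1.0 + np.exp(-a))

    n_visible, n_hidden = 6, 4
    W = 0.1 * rng.randn(n_visible, n_hidden)    # weights, one column per hidden unit
    b = np.zeros(n_visible)                     # visible biases
    c = np.zeros(n_hidden)                      # hidden biases

    x = rng.binomial(1, 0.5, n_visible)         # current visible state x^(n)

    # h^(n+1)_i is set to 1 with probability sigm(W_i' x^(n) + c_i)
    h = rng.binomial(1, sigm(x.dot(W) + c))

    # x^(n+1)_j is set to 1 with probability sigm(W_{.j} h^(n+1) + b_j)
    x_next = rng.binomial(1, sigm(h.dot(W.T) + b))

    print(x, h, x_next)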

@@ -241,9 +254,10 @@ Contrastive Divergence (CD-k)
 
 Contrastive Divergence uses two tricks to speed up the sampling process:
 
-* since we eventually want :math:`p(x) \approx p_T(x)` (the true, underlying
+* since we eventually want :math:`p(x) \approx p_{train}(x)` (the true, underlying
   distribution of the data), we initialize the Markov chain with a training
-  example.
+  example (i.e., from a distribution that is expected to be close to :math:`p`,
+  so that the chain will already be close to having converged to its final distribution :math:`p`).
 
 * CD does not wait for the chain to converge. Samples are obtained after only
   k-steps of Gibbs sampling. In practice, :math:`k=1` has been shown to work
@@ -255,8 +269,9 @@ Persistent CD
 
 Persistent CD [Tieleman08]_ uses another approximation for sampling from
 :math:`p(x,h)`. It relies on a single Markov chain, which has a persistent
-state. For each parameter update, we extract new samples by simply running the
-chain for k-steps. The state of the chain is then preserved for subsequent updates.
+state (i.e., it does not restart a chain for each observed example). For each
+parameter update, we extract new samples by simply running the chain for
+k-steps. The state of the chain is then preserved for subsequent updates.
 
 The general intuition is that if parameter updates are small enough compared
 to the mixing rate of the chain, the Markov chain should be able to "catch up"
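
As a sketch of the persistent-chain idea (plain numpy with toy data; the shapes, hyperparameters, and update rule here are assumptions for illustration, not the repository's rbm.py), the chain is initialized once, advanced k Gibbs steps per parameter update, and never reset to a training example:

    import numpy as np

    rng = np.random.RandomState(0)

    def sigm(a):
        return 1.0 / (1.0 + np.exp(-a))

    n_vis, n_hid, k, lr = 6, 4, 1, 0.1
    W = 0.01 * rng.randn(n_vis, n_hid)
    b, c = np.zeros(n_vis), np.zeros(n_hid)

    data = rng.binomial(1, 0.5, (100, n_vis)).astype(float)   # toy stand-in for a training set
    chain = rng.binomial(1, 0.5, n_vis).astype(float)         # persistent chain state

    for x in data:                              # one parameter update per example
        ph = sigm(x.dot(W) + c)                 # positive statistics from the data
        for _ in range(k):                      # advance the persistent chain k steps
            h = rng.binomial(1, sigm(chain.dot(W) + c)).astype(float)
            chain = rng.binomial(1, sigm(h.dot(W.T) + b)).astype(float)
        nh = sigm(chain.dot(W) + c)             # negative statistics from the chain
        W += lr * (np.outer(x, ph) - np.outer(chain, nh))
        b += lr * (x - chain)
        c += lr * (ph - nh)
        # the chain state is kept, not reset, for the next update
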
@@ -447,7 +462,7 @@ compute the gradients of Eq. :eq:`rbm_grad`.
     gparams = [g_W.T, g_hbias, g_vbias]
 
 Finally, we construct the updates dictionary containing the parameter
-updates. In case of PCD, these should also update the shared variable
+updates. In the case of PCD, these should also update the shared variable
 containing the state of the Gibbs chain.
 
 .. code-block:: python
@@ -536,8 +551,8 @@ samples at every 1000 steps.
     # define one step of Gibbs sampling (mf = mean-field)
     [hid_mf, hid_sample, vis_mf, vis_sample] = rbm.gibbs_vhv(persistent_vis_chain)
 
-    # the sample at the end of the channel is returned by ``gibbs_1`` as
-    # its second output; note that this is computed as a binomial draw,
+    # the sample at the end of the chain is returned by ``gibbs_vhv`` as
+    # its last output; note that this is computed as a binomial draw,
     # therefore it is formed of ints (0 and 1) and therefore needs to
     # be converted to the same dtype as ``persistent_vis_chain``
     vis_sample = T.cast(vis_sample, dtype=theano.config.floatX)
@@ -554,13 +569,15 @@ samples at every 1000 steps.
     plot_every = 1000
 
     for idx in xrange(n_samples):
-        # do `plot_every` intermediate samplings of which we do not care
+        # generate `plot_every` intermediate samples that we discard, because successive samples in the chain are too correlated
         for jdx in xrange(plot_every):
             vis_mf, vis_sample = sample_fn()
 
         # construct image
         image = PIL.Image.fromarray(tile_raster_images(
-                    X = vis_mf, img_shape = (28,28), tile_shape = (10,10),
+                    X = vis_mf,
+                    img_shape = (28,28),
+                    tile_shape = (10,10),
                     tile_spacing = (1,1) ) )
         print ' ... plotting sample ', idx
         image.save('sample_%i_step_%i.png'%(idx,idx*jdx))
@@ -580,7 +597,7 @@ Several options are available to the user.
 
 Negative samples obtained during training can be visualized. As training
 progresses, we know that the model defined by the RBM becomes closer to the
-true underlying distribution, :math:`p_T(x)`. Negative samples should thus
+true underlying distribution, :math:`p_{train}(x)`. Negative samples should thus
 look like samples from the training set. Obviously bad hyperparameters can be
 discarded in this fashion.
@@ -605,17 +622,18 @@ all bits are independent. Therefore,
     PL(x) = \prod_i P(x_i | x_{-i}) \text{ and }\\
     \log PL(x) = \sum_i \log P(x_i | x_{-i})
 
-Here :math:`x_{-i}` denotes the set of all bits of :math:`x` minus bit
+Here :math:`x_{-i}` denotes the set of all bits of :math:`x` except bit
 :math:`i`. The log-PL is therefore the sum of the log-probabilities of each
 bit :math:`x_i`, conditioned on the state of all other bits. For MNIST, this
 would involve summing over the 784 input dimensions, which remains rather
 expensive. For this reason, we use the following stochastic approximation to
 log-PL:
 
 .. math::
-    \log PL(x) &\approx N \cdot \log P(x_i | x_{-i}) \text{, where }
-    i \sim U(0,N),
-
+    g = N \cdot \log P(x_i | x_{-i}) \text{, where } i \sim U(0,N) \text{, and}\\
+    E[g] = \log PL(x)
+
+where the expectation is taken over the uniform random choice of index :math:`i`,
 and :math:`N` is the number of visible units. In order to work with binary
 units, we further introduce the notation :math:`\tilde{x}_i` to refer to
 :math:`x` with bit i being flipped (1->0, 0->1). The log-PL for an RBM with binary units is
@@ -649,7 +667,7 @@ values :math:`\{0,1,...,N\}`, from one update to another.
     # calculate free energy for the given bit configuration
     fe_xi = self.free_energy(xi)
 
-    # flip bit x_i of matrix xi and preserve all other bits x_{\i}
+    # flip bit x_i of matrix xi and preserve all other bits x_{-i}
     # Equivalent to xi[:,bit_i_idx] = 1-xi[:, bit_i_idx]
     # NB: slice(start,stop,step) is the python object used for
     # slicing, e.g. to index matrix x as follows: x[start:stop:step]
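
The stochastic estimator g, together with the bit-flip trick in this last hunk, can be written out for binary units via the standard identity P(x_i | x_{-i}) = sigm(FE(\tilde{x}_i) - FE(x)). The following numpy sketch (toy random parameters, not repository code) computes one sample of g:

    import numpy as np

    rng = np.random.RandomState(0)

    def sigm(a):
        return 1.0 / (1.0 + np.exp(-a))

    N, n_hid = 8, 3                              # N visible units
    W = 0.1 * rng.randn(N, n_hid)                # toy RBM parameters
    b, c = np.zeros(N), np.zeros(n_hid)

    def free_energy(x):
        return -x.dot(b) - np.sum(np.log1p(np.exp(x.dot(W) + c)))

    x = rng.binomial(1, 0.5, N).astype(float)

    i = rng.randint(N)                           # i ~ U(0, N)
    x_flip = x.copy()
    x_flip[i] = 1.0 - x_flip[i]                  # flip bit i, keeping x_{-i}

    # for binary units, P(x_i | x_{-i}) = sigm(FE(x_flip) - FE(x))
    g = N * np.log(sigm(free_energy(x_flip) - free_energy(x)))
    print(g)                                     # one-sample estimate of log PL(x)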

doc/utilities.txt

Lines changed: 3 additions & 3 deletions

@@ -15,7 +15,7 @@ Plotting Samples and Filters
 
 To plot a sample, what we need to do is to take the visible units, which
 are a flattened image (there is no 2D structure to the visible units,
-just a 1D string of nodes) and reshape it into a 2D image. The order in
+just a 1D string of unit activations) and reshape it into a 2D image. The order in
 which the points from the 1D array go into the 2D image is given by the
 order in which the initial MNIST images were converted into a 1D array.
 Luckily for us, this is just a call to the ``numpy.reshape`` function.
@@ -30,12 +30,12 @@ the input image.
 
 We need a utility function that takes a minibatch, or the weight matrix,
 and converts each row (for the weight matrix we take a transpose) into a
-2D image and then tile this images together. Once we converted the
+2D image and then tiles these images together. Once we have converted the
 minibatch or the weights into this image of tiles, we can use PIL to plot
 and save them. `PIL <http://www.pythonware.com/products/pil/>`_ is a standard
 python library for dealing with images.
 
-Tiling minibatches together is done for us by
+Tiling minibatches together is done for us by the
 ``tile_raster_image`` function which we provide here.
 
 .. code-block:: python
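
To make the reshape step above concrete, here is a tiny numpy illustration (an editor's sketch, not from the repository; the tiling itself is what ``tile_raster_image`` handles):

    import numpy as np

    flat = np.random.rand(784).astype(np.float32)   # one flattened MNIST-sized sample
    img = flat.reshape(28, 28)                      # row-major reshape recovers the 2D image

    batch = np.random.rand(20, 784).astype(np.float32)
    imgs = batch.reshape(-1, 28, 28)                # a minibatch reshapes one image per row
    print(img.shape, imgs.shape)                    # (28, 28) (20, 28, 28)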
