 Build a tweet sentiment analyzer
 '''
 from collections import OrderedDict
-import copy
 import cPickle as pkl
 import random
 import sys
 import time
 
 import numpy
 import theano
+from theano import config
 import theano.tensor as tensor
 from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
 
 import imdb
 
 datasets = {'imdb': (imdb.load_data, imdb.prepare_data)}
 
 
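+# Cast to Theano's configured float type: float32 when running on GPU,
+# float64 by default on CPU (see theano.config.floatX).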
+def numpy_floatX(data):
+    return numpy.asarray(data, dtype=config.floatX)
+
+
 def get_minibatches_idx(n, minibatch_size, shuffle=False):
     """
     Used to shuffle the dataset at each iteration.
@@ -86,14 +90,14 @@ def init_params(options):
     # embedding
     randn = numpy.random.rand(options['n_words'],
                               options['dim_proj'])
-    params['Wemb'] = (0.01 * randn).astype('float32')
+    params['Wemb'] = (0.01 * randn).astype(config.floatX)
     params = get_layer(options['encoder'])[0](options,
                                               params,
                                               prefix=options['encoder'])
     # classifier
     params['U'] = 0.01 * numpy.random.randn(options['dim_proj'],
-                                            options['ydim']).astype('float32')
-    params['b'] = numpy.zeros((options['ydim'],)).astype('float32')
+                                            options['ydim']).astype(config.floatX)
+    params['b'] = numpy.zeros((options['ydim'],)).astype(config.floatX)
 
     return params
 
@@ -123,7 +127,7 @@ def get_layer(name):
 def ortho_weight(ndim):
     W = numpy.random.randn(ndim, ndim)
     u, s, v = numpy.linalg.svd(W)
-    return u.astype('float32')
+    return u.astype(config.floatX)
 
 
 def param_init_lstm(options, params, prefix='lstm'):
@@ -143,7 +147,7 @@ def param_init_lstm(options, params, prefix='lstm'):
                            ortho_weight(options['dim_proj'])], axis=1)
     params[_p(prefix, 'U')] = U
     b = numpy.zeros((4 * options['dim_proj'],))
-    params[_p(prefix, 'b')] = b.astype('float32')
+    params[_p(prefix, 'b')] = b.astype(config.floatX)
 
     return params
 
@@ -159,8 +163,8 @@ def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
 
     def _slice(_x, n, dim):
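+        # The LSTM parameters store the four gate blocks concatenated along
+        # the last axis; _slice extracts block n of width dim.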
         if _x.ndim == 3:
-            return _x[:, :, n*dim:(n+1)*dim]
-        return _x[:, n*dim:(n+1)*dim]
+            return _x[:, :, n * dim:(n + 1) * dim]
+        return _x[:, n * dim:(n + 1) * dim]
 
     def _step(m_, x_, h_, c_):
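+        # One LSTM step: m_ is the per-sample mask at this timestep, x_ the
+        # precomputed input projection, h_ and c_ the previous hidden state
+        # and memory cell.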
         preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
@@ -186,9 +190,11 @@ def _step(m_, x_, h_, c_):
     dim_proj = options['dim_proj']
     rval, updates = theano.scan(_step,
                                 sequences=[mask, state_below],
-                                outputs_info=[tensor.alloc(0., n_samples,
+                                outputs_info=[tensor.alloc(numpy_floatX(0.),
+                                                           n_samples,
                                                            dim_proj),
-                                              tensor.alloc(0., n_samples,
+                                              tensor.alloc(numpy_floatX(0.),
+                                                           n_samples,
                                                            dim_proj)],
                                 name=_p(prefix, '_layers'),
                                 n_steps=nsteps)
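+    # scan iterates _step over the time axis; rval[0] is the sequence of
+    # hidden states h, rval[1] the sequence of memory cells c.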
@@ -229,21 +235,21 @@ def sgd(lr, tparams, grads, x, mask, y, cost):
 
 
 def adadelta(lr, tparams, grads, x, mask, y, cost):
-    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
+    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                   name='%s_grad' % k)
                     for k, p in tparams.iteritems()]
-    running_up2 = [theano.shared(p.get_value() * numpy.float32(0.),
+    running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_rup2' % k)
                    for k, p in tparams.iteritems()]
-    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
+    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                     name='%s_rgrad2' % k)
                       for k, p in tparams.iteritems()]
 
     zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
     rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
              for rg2, g in zip(running_grads2, grads)]
 
-    f_grad_shared = theano.function([x, mask, y], cost, updates=zgup+rg2up,
+    f_grad_shared = theano.function([x, mask, y], cost, updates=zgup + rg2up,
                                     name='adadelta_f_grad_shared')
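+    # First half of the update: compute the cost and store the raw gradients
+    # and the running average of squared gradients in shared variables.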
 
     updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
@@ -254,21 +260,21 @@ def adadelta(lr, tparams, grads, x, mask, y, cost):
              for ru2, ud in zip(running_up2, updir)]
     param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]
 
-    f_update = theano.function([lr], [], updates=ru2up+param_up,
+    f_update = theano.function([lr], [], updates=ru2up + param_up,
                                on_unused_input='ignore',
                                name='adadelta_f_update')
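+    # Second half: apply the ADADELTA step. lr is accepted but unused (the
+    # method scales its own updates), hence on_unused_input='ignore'.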
 
     return f_grad_shared, f_update
 
 
 def rmsprop(lr, tparams, grads, x, mask, y, cost):
-    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
+    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                   name='%s_grad' % k)
                     for k, p in tparams.iteritems()]
-    running_grads = [theano.shared(p.get_value() * numpy.float32(0.),
+    running_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad' % k)
                      for k, p in tparams.iteritems()]
-    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
+    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                     name='%s_rgrad2' % k)
                       for k, p in tparams.iteritems()]
 
@@ -281,15 +287,15 @@ def rmsprop(lr, tparams, grads, x, mask, y, cost):
                                     updates=zgup + rgup + rg2up,
                                     name='rmsprop_f_grad_shared')
 
-    updir = [theano.shared(p.get_value() * numpy.float32(0.),
+    updir = [theano.shared(p.get_value() * numpy_floatX(0.),
                            name='%s_updir' % k)
              for k, p in tparams.iteritems()]
     updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
                  for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
                                             running_grads2)]
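+    # updir is a momentum term on the gradient, rescaled by a running
+    # estimate of its standard deviation, sqrt(rg2 - rg ** 2).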
     param_up = [(p, p + udn[1])
                 for p, udn in zip(tparams.values(), updir_new)]
-    f_update = theano.function([lr], [], updates=updir_new+param_up,
+    f_update = theano.function([lr], [], updates=updir_new + param_up,
                                on_unused_input='ignore',
                                name='rmsprop_f_update')
 
@@ -300,10 +306,10 @@ def build_model(tparams, options):
     trng = RandomStreams(1234)
 
     # Used for dropout.
-    use_noise = theano.shared(numpy.float32(0.))
+    use_noise = theano.shared(numpy_floatX(0.))
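+    # use_noise is set to 1. during training and reset to 0. for evaluation,
+    # so dropout can be switched on and off without recompiling.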
 
     x = tensor.matrix('x', dtype='int64')
-    mask = tensor.matrix('mask', dtype='float32')
+    mask = tensor.matrix('mask', dtype=config.floatX)
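+    # mask is 1. where a timestep carries real data and 0. where it is
+    # padding, so sequences of different lengths can share a minibatch.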
     y = tensor.vector('y', dtype='int64')
 
     n_timesteps = x.shape[0]
@@ -321,7 +327,7 @@ def build_model(tparams, options):
     if options['use_dropout']:
         proj = dropout_layer(proj, use_noise, trng)
 
-    pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U'])+tparams['b'])
+    pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U']) + tparams['b'])
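+    # A softmax layer on top of the sentence representation proj gives the
+    # per-class probabilities.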
 
     f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob')
     f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred')
@@ -336,7 +342,7 @@ def pred_probs(f_pred_prob, prepare_data, data, iterator, verbose=False):
     the probabilities of new examples.
     """
     n_samples = len(data[0])
-    probs = numpy.zeros((n_samples, 2)).astype('float32')
+    probs = numpy.zeros((n_samples, 2)).astype(config.floatX)
 
     n_done = 0
 
@@ -368,7 +374,7 @@ def pred_error(f_pred, prepare_data, data, iterator, verbose=False):
         preds = f_pred(x, mask)
         targets = numpy.array(data[1])[valid_index]
         valid_err += (preds == targets).sum()
-    valid_err = 1. - numpy.float32(valid_err) / len(data[0])
+    valid_err = 1. - numpy_floatX(valid_err) / len(data[0])
 
     return valid_err
 
@@ -396,6 +402,7 @@ def train_lstm(
     use_dropout=True,  # if False slightly faster, but worse test error
                        # This frequently needs a bigger model.
     reload_model="",  # Path to a saved model we want to start from.
+    test_size=-1,  # If >0, we keep only this number of test examples.
 ):
 
     # Model options
@@ -407,8 +414,16 @@ def train_lstm(
     print 'Loading data'
     train, valid, test = load_data(n_words=n_words, valid_portion=0.05,
                                    maxlen=maxlen)
+    if test_size > 0:
+        # The test set is sorted by sequence length, but we want to keep
+        # examples of random lengths, so we take a random subset of the
+        # examples.
+        idx = numpy.arange(len(test[0]))
+        random.shuffle(idx)
+        idx = idx[:test_size]
+        test = ([test[0][n] for n in idx], [test[1][n] for n in idx])
 
-    ydim = numpy.max(train[1])+1
+    ydim = numpy.max(train[1]) + 1
 
     model_options['ydim'] = ydim
 
@@ -430,9 +445,9 @@ def train_lstm(
      y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options)
 
     if decay_c > 0.:
-        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
+        decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
         weight_decay = 0.
-        weight_decay += (tparams['U']**2).sum()
+        weight_decay += (tparams['U'] ** 2).sum()
         weight_decay *= decay_c
         cost += weight_decay
 
@@ -447,10 +462,8 @@ def train_lstm(
 
     print 'Optimization'
 
-    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size,
-                                   shuffle=True)
-    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size,
-                                  shuffle=True)
+    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
+    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)
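+    # No shuffle needed: validation/test error does not depend on the order
+    # in which minibatches are visited.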
 
     print "%d train examples" % len(train[0])
     print "%d valid examples" % len(valid[0])
@@ -460,9 +473,9 @@ def train_lstm(
     bad_count = 0
 
     if validFreq == -1:
-        validFreq = len(train[0])/batch_size
+        validFreq = len(train[0]) / batch_size
     if saveFreq == -1:
-        saveFreq = len(train[0])/batch_size
+        saveFreq = len(train[0]) / batch_size
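+    # A value of -1 means validate (and save) once per epoch, i.e. every
+    # len(train) / batch_size updates.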
 
     uidx = 0  # the number of updates done
     estop = False  # early stop
@@ -482,12 +495,10 @@ def train_lstm(
             y = [train[1][t] for t in train_index]
             x = [train[0][t] for t in train_index]
 
-            # Get the data in numpy.ndarray formet.
-            # It return something of the shape (minibatch maxlen, n samples)
-            x, mask, y = prepare_data(x, y, maxlen=maxlen)
-            if x is None:
-                print 'Minibatch with zero sample under length ', maxlen
-                continue
+            # Get the data in numpy.ndarray format.
+            # This swaps the axes!
+            # Returns arrays of shape (minibatch maxlen, n samples).
+            x, mask, y = prepare_data(x, y)
             n_samples += x.shape[1]
 
             cost = f_grad_shared(x, mask, y)
@@ -514,7 +525,8 @@ def train_lstm(
             if numpy.mod(uidx, validFreq) == 0:
                 use_noise.set_value(0.)
                 train_err = pred_error(f_pred, prepare_data, train, kf)
-                valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
+                valid_err = pred_error(f_pred, prepare_data, valid,
+                                       kf_valid)
                 test_err = pred_error(f_pred, prepare_data, test, kf_test)
 
                 history_errs.append([valid_err, test_err])
@@ -553,7 +565,8 @@ def train_lstm(
         best_p = unzip(tparams)
 
     use_noise.set_value(0.)
-    train_err = pred_error(f_pred, prepare_data, train, kf)
+    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
+    train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted)
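+    # Recompute the training minibatch index without shuffling so the final
+    # train error is measured over the whole set in a deterministic order.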
     valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
     test_err = pred_error(f_pred, prepare_data, test, kf_test)
 
@@ -570,14 +583,9 @@ def train_lstm(
 
 
 if __name__ == '__main__':
-
-    # We must have floatX=float32 for this tutorial to work correctly.
-    theano.config.floatX = "float32"
-    # The next line is the new Theano default. This is a speed up.
-    theano.config.scan.allow_gc = False
-
     # See function train_lstm for all possible parameters and their definitions.
     train_lstm(
         #reload_model="lstm_model.npz",
         max_epochs=100,
+        test_size=500,
     )