
/code/mlp.py

https://github.com/delallea/DeepLearningTutorials
  1. """
  2. This tutorial introduces the multilayer perceptron using Theano.
  3. A multilayer perceptron is a logistic regressor where
  4. instead of feeding the input to the logistic regression you insert a
  5. intermediate layer, called the hidden layer, that has a nonlinear
  6. activation function (usually tanh or sigmoid) . One can use many such
  7. hidden layers making the architecture deep. The tutorial will also tackle
  8. the problem of MNIST digit classification.
  9. .. math::
  10. f(x) = G( b^{(2)} + W^{(2)}( s( b^{(1)} + W^{(1)} x))),
  11. References:
  12. - textbooks: "Pattern Recognition and Machine Learning" -
  13. Christopher M. Bishop, section 5
  14. """
  15. __docformat__ = 'restructedtext en'
  16. import numpy, time, cPickle, gzip, sys, os
  17. import theano
  18. import theano.tensor as T
  19. from logistic_sgd import LogisticRegression, load_data

class HiddenLayer(object):
    def __init__(self, rng, input, n_in, n_out, activation=T.tanh):
        """
        Typical hidden layer of an MLP: units are fully-connected and have a
        sigmoidal activation function. The weight matrix W is of shape
        (n_in, n_out) and the bias vector b is of shape (n_out,).

        NOTE : The nonlinearity used here is tanh

        Hidden unit activation is given by: tanh(dot(input, W) + b)

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.dmatrix
        :param input: a symbolic tensor of shape (n_examples, n_in)

        :type n_in: int
        :param n_in: dimensionality of input

        :type n_out: int
        :param n_out: number of hidden units

        :type activation: theano.Op or function
        :param activation: non-linearity to be applied in the hidden layer
        """
        self.input = input

        # `W` is initialized with `W_values`, which is uniformly sampled
        # from -sqrt(6./(n_in+n_out)) to sqrt(6./(n_in+n_out)) for the tanh
        # activation function.
        # The output of uniform is converted using asarray to dtype
        # theano.config.floatX so that the code is runnable on GPU.
        # Note : optimal initialization of weights is dependent on the
        #        activation function used (among other things).
        #        For example, results presented in [Xavier10] suggest that you
        #        should use 4 times larger initial weights for sigmoid
        #        compared to tanh.
        #        We have no info for other functions, so we use the same as
        #        for tanh.
        W_values = numpy.asarray(rng.uniform(
                low=-numpy.sqrt(6. / (n_in + n_out)),
                high=numpy.sqrt(6. / (n_in + n_out)),
                size=(n_in, n_out)), dtype=theano.config.floatX)
        if activation == theano.tensor.nnet.sigmoid:
            W_values *= 4

        self.W = theano.shared(value=W_values, name='W')

        b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
        self.b = theano.shared(value=b_values, name='b')

        self.output = activation(T.dot(input, self.W) + self.b)
        # parameters of the model
        self.params = [self.W, self.b]
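
# Illustrative usage sketch (comments only, not executed): a HiddenLayer is
# built from a numpy RandomState and a symbolic minibatch, e.g.
#     rng = numpy.random.RandomState(1234)
#     x = T.matrix('x')
#     layer = HiddenLayer(rng=rng, input=x, n_in=28 * 28, n_out=500)
# `layer.output` is then a symbolic expression that can be fed into the next
# layer, which is exactly how the MLP class below wires its two layers.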

class MLP(object):
    """Multi-Layer Perceptron Class

    A multilayer perceptron is a feedforward artificial neural network model
    that has one or more layers of hidden units and nonlinear activations.
    Intermediate layers usually use tanh or the sigmoid function as their
    activation (defined here by the ``HiddenLayer`` class) while the top
    layer is a softmax layer (defined here by the ``LogisticRegression``
    class).
    """

    def __init__(self, rng, input, n_in, n_hidden, n_out):
        """Initialize the parameters for the multilayer perceptron

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_hidden: int
        :param n_hidden: number of hidden units

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie
        """
        # Since we are dealing with a one-hidden-layer MLP, this translates
        # into a HiddenLayer with a tanh activation connected to the
        # LogisticRegression layer; the activation can be replaced by the
        # sigmoid or any other nonlinearity
        self.hiddenLayer = HiddenLayer(rng=rng, input=input,
                                       n_in=n_in, n_out=n_hidden,
                                       activation=T.tanh)

        # The logistic regression layer gets as input the hidden units
        # of the hidden layer
        self.logRegressionLayer = LogisticRegression(
            input=self.hiddenLayer.output,
            n_in=n_hidden,
            n_out=n_out)

        # L1 norm ; one regularization option is to enforce the L1 norm to
        # be small
        self.L1 = abs(self.hiddenLayer.W).sum() \
                + abs(self.logRegressionLayer.W).sum()

        # square of L2 norm ; one regularization option is to enforce the
        # square of the L2 norm to be small
        self.L2_sqr = (self.hiddenLayer.W ** 2).sum() \
                    + (self.logRegressionLayer.W ** 2).sum()

        # negative log likelihood of the MLP is given by the negative
        # log likelihood of the output of the model, computed in the
        # logistic regression layer
        self.negative_log_likelihood = \
            self.logRegressionLayer.negative_log_likelihood
        # same holds for the function computing the number of errors
        self.errors = self.logRegressionLayer.errors

        # the parameters of the model are the parameters of the two layers
        # it is made out of
        self.params = self.hiddenLayer.params + self.logRegressionLayer.params
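        # In test_mlp() below these attributes are combined into the
        # regularized training cost:
        #     cost = self.negative_log_likelihood(y) \
        #          + L1_reg * self.L1 + L2_reg * self.L2_sqr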

def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
             dataset='../data/mnist.pkl.gz', batch_size=20):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron.

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
                   regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
                   regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                    http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size
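    # Note: these batch counts use Python 2 integer division, so any examples
    # beyond the last full minibatch of each set are simply never visited.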

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()    # index to a [mini]batch
    x = T.matrix('x')      # the data is presented as rasterized images
    y = T.ivector('y')     # the labels are presented as 1D vector of
                           # [int] labels

    rng = numpy.random.RandomState(1234)

    # construct the MLP class
    classifier = MLP(rng=rng, input=x, n_in=28 * 28, n_hidden=500, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = classifier.negative_log_likelihood(y) \
         + L1_reg * classifier.L1 \
         + L2_reg * classifier.L2_sqr

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(inputs=[index],
            outputs=classifier.errors(y),
            givens={
                x: test_set_x[index * batch_size:(index + 1) * batch_size],
                y: test_set_y[index * batch_size:(index + 1) * batch_size]})

    validate_model = theano.function(inputs=[index],
            outputs=classifier.errors(y),
            givens={
                x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                y: valid_set_y[index * batch_size:(index + 1) * batch_size]})

    # compute the gradient of the cost with respect to theta (stored in
    # params); the resulting gradients will be stored in the list gparams
    gparams = []
    for param in classifier.params:
        gparam = T.grad(cost, param)
        gparams.append(gparam)

    # specify how to update the parameters of the model as a dictionary
    updates = {}
    # given two lists of the same length, A = [a1, a2, a3, a4] and
    # B = [b1, b2, b3, b4], zip generates a list C of the same size, where
    # each element is a pair formed from the two lists:
    # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
    for param, gparam in zip(classifier.params, gparams):
        updates[param] = param - learning_rate * gparam
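    # each entry implements one step of plain stochastic gradient descent:
    #     param <- param - learning_rate * d(cost)/d(param)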

    # compiling a Theano function `train_model` that returns the cost and, at
    # the same time, updates the parameters of the model based on the rules
    # defined in `updates`
    train_model = theano.function(inputs=[index], outputs=cost,
            updates=updates,
            givens={
                x: train_set_x[index * batch_size:(index + 1) * batch_size],
                y: train_set_y[index * batch_size:(index + 1) * batch_size]})

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    # early-stopping parameters
    patience = 10000              # look at this many examples regardless
    patience_increase = 2         # wait this much longer when a new best is
                                  # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch
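    # The loop below stops as soon as `iter` exceeds `patience`; whenever the
    # validation loss drops below `improvement_threshold` times the previous
    # best, `patience` is pushed out to `iter * patience_increase`, so training
    # keeps going as long as significant progress is being made.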

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = epoch * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on the validation set
                validation_losses = [validate_model(i)
                                     for i in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if the loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter  # remember when that best was reached

                    # test it on the test set
                    test_losses = [test_model(i)
                                   for i in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))


if __name__ == '__main__':
    test_mlp()
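
# Running this script directly (`python mlp.py`) assumes a Python 2 / Theano
# environment with `logistic_sgd.py` importable and the MNIST pickle available
# at the default path '../data/mnist.pkl.gz' used above.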