diff --git a/tmva/tmva/inc/TMVA/MethodDNN.h b/tmva/tmva/inc/TMVA/MethodDNN.h index 31a3f11d27775..af7dc3cb2e964 100644 --- a/tmva/tmva/inc/TMVA/MethodDNN.h +++ b/tmva/tmva/inc/TMVA/MethodDNN.h @@ -1,5 +1,5 @@ // @(#)root/tmva $Id$ -// Author: Peter Speckmayer +// Authors: Peter Speckmayer, Aditya Sharma /********************************************************************************** * Project: TMVA - a Root-integrated toolkit for multivariate data analysis * @@ -10,7 +10,8 @@ * Description: * * NeuralNetwork * * * - * Authors (alphabetical): * + * Authors (alphabetical): + * Aditya Sharma - CERN, Switzerland * Peter Speckmayer - CERN, Switzerland * * * * Copyright (c) 2005-2015: * @@ -70,12 +71,10 @@ namespace TMVA { MethodDNN ( const TString& jobName, const TString& methodTitle, DataSetInfo& theData, - const TString& theOption, - TDirectory* theTargetDir = 0 ); + const TString& theOption); MethodDNN ( DataSetInfo& theData, - const TString& theWeightFile, - TDirectory* theTargetDir = 0 ); + const TString& theWeightFile ); virtual ~MethodDNN(); @@ -131,7 +130,8 @@ namespace TMVA { private: TMVA::DNN::Net fNet; - std::vector fWeights; + std::vector fWeightBucket; + int fBucketSize; TString fLayoutString; std::vector> fLayout; diff --git a/tmva/tmva/inc/TMVA/NeuralNet.h b/tmva/tmva/inc/TMVA/NeuralNet.h index 3ffaca7df35bb..aac47572fabca 100644 --- a/tmva/tmva/inc/TMVA/NeuralNet.h +++ b/tmva/tmva/inc/TMVA/NeuralNet.h @@ -1,6 +1,6 @@ /** * @file NeuralNet - * @author Peter Speckmayer + * @author Peter Speckmayer, Aditya Sharma * @version 1.0 * * @section LICENSE @@ -52,6 +52,7 @@ #include // turn on or off exceptions for NaN and other numeric exceptions +#include namespace TMVA { @@ -61,6 +62,17 @@ namespace TMVA // double gaussDoubl (edouble mean, double sigma); + // const int BUCKET_SIZE = 8; // ------------------------------- Declare Bucket Size -------------------------------------------- + /*! 
\brief Hash initialization + * + * + */ + // std::hash hasherFunction; + + + int hasherFunction(int a); + + // --------------------------------------------------------------------------------- double gaussDouble (double mean, double sigma); @@ -74,63 +86,80 @@ namespace TMVA { public: MeanVariance() - : m_n(0) - , m_sumWeights(0) - , m_sumWeightsSquared(0) - , m_mean(0) - , m_squared(0) - {} - - inline void clear() - { - m_n = 0; - m_sumWeights = 0; - m_sumWeightsSquared = 0; - } - - template - inline void add(T value, double weight = 1.0) - { - m_n++; // a value has been added - - double dValue = (double)value; - if (m_n == 1) // initialization - { - m_mean = dValue; - m_squared = 0.0; - m_sumWeightsSquared = weight*weight; - m_sumWeights = weight; - return; - } - - double tmpWeight = m_sumWeights+weight; - double diff = dValue - m_mean; - - double tmp = diff*weight/tmpWeight; - m_mean = m_mean + tmp; - m_squared = m_squared + tmpWeight*diff*tmp; - - m_sumWeights = tmpWeight; - m_sumWeightsSquared += weight*weight; - } - - - - inline int count() const { return m_n; } - inline double weights() const { if(m_n==0) return 0; return m_sumWeights; } - inline double mean() const { if(m_n==0) return 0; return m_mean; } - inline double var_N() const { if(m_n==0) return 0; return (m_squared/m_sumWeights); } - // inline double var () const { return (Variance_N()*m_n/(m_n-1)); } // unbiased for small sample sizes - inline double var () const { if(m_n==0) return 0; if(m_squared<=0) return 0.0; return (m_squared*m_sumWeights/(m_sumWeights*m_sumWeights-m_sumWeightsSquared)); } // unbiased for small sample sizes - inline double stdDev_N () const { return sqrt( var_N() ); } - inline double stdDev () const { return sqrt( var() ); } // unbiased for small sample sizes + : m_n(0) + , m_sumWeights(0) + , m_mean(0) + , m_squared(0) + {} + + inline void clear() + { + m_n = 0; + m_sumWeights = 0; + m_mean = 0; + m_squared = 0; + } + + template + inline void add(T value, double weight = 1.0) + { + ++m_n; // a value has been added + + if (m_n == 1) // initialization + { + m_mean = value; + m_squared = 0.0; + m_sumWeights = weight; + return; + } + + double tmpWeight = m_sumWeights+weight; + double Q = value - m_mean; + + double R = Q*weight/tmpWeight; + m_mean += R; + m_squared += m_sumWeights*R*Q; + + m_sumWeights = tmpWeight; + } + + template + inline void add (ITERATOR itBegin, ITERATOR itEnd) + { + for (ITERATOR it = itBegin; it != itEnd; ++it) + add (*it); + } + + + + inline int count() const { return m_n; } + inline double weights() const { if(m_n==0) return 0; return m_sumWeights; } + inline double mean() const { if(m_n==0) return 0; return m_mean; } + inline double var() const + { + if(m_n==0) + return 0; + if (m_squared <= 0) + return 0; + return (m_squared/m_sumWeights); + } + + inline double var_corr () const + { + if (m_n <= 1) + return var (); + + return (var()*m_n/(m_n-1)); // unbiased for small sample sizes + } + + inline double stdDev_corr () const { return sqrt( var_corr() ); } + inline double stdDev () const { return sqrt( var() ); } // unbiased for small sample sizes private: - size_t m_n; - double m_sumWeights; - double m_sumWeightsSquared; - double m_mean; - double m_squared; + size_t m_n; + double m_sumWeights; + double m_mean; + double m_squared; }; @@ -238,41 +267,41 @@ namespace TMVA - template - void applyWeights (ItSource itSourceBegin, ItSource itSourceEnd, ItWeight itWeight, ItTarget itTargetBegin, ItTarget itTargetEnd); + template + void applyWeights (ItSource itSourceBegin, ItSource 
itSourceEnd, int itWeight, std::vector& weightBucket, size_t layerNumber, int BUCKET_SIZE, ItTarget itTargetBegin, ItTarget itTargetEnd); - template - void applyWeightsBackwards (ItSource itCurrBegin, ItSource itCurrEnd, ItWeight itWeight, ItPrev itPrevBegin, ItPrev itPrevEnd); + template + void applyWeightsBackwards (ItSource itCurrBegin, ItSource itCurrEnd, int itWeight, std::vector& weightBucket, size_t layerNumber, int BUCKET_SIZE, ItPrev itPrevBegin, ItPrev itPrevEnd, ItDrop itDrop); - template - void applyFunctions (ItValue itValue, ItValue itValueEnd, ItFunction itFunction); + template + void applyFunctions (ItValue itValue, ItValue itValueEnd, Fnc fnc); - template - void applyFunctions (ItValue itValue, ItValue itValueEnd, ItFunction itFunction, ItInverseFunction itInverseFunction, ItGradient itGradient); + template + void applyFunctions (ItValue itValue, ItValue itValueEnd, Fnc fnc, ItInverseFunction invFnc, ItGradient itGradient); - template + template void update (ItSource itSource, ItSource itSourceEnd, ItDelta itTargetDeltaBegin, ItDelta itTargetDeltaEnd, ItTargetGradient itTargetGradientBegin, - ItGradient itGradient); + int itGradient, std::vector& gradientBucket, size_t layerNumber, int BUCKET_SIZE); - template + template void update (ItSource itSource, ItSource itSourceEnd, ItDelta itTargetDeltaBegin, ItDelta itTargetDeltaEnd, ItTargetGradient itTargetGradientBegin, - ItGradient itGradient, - ItWeight itWeight, double weightDecay); + int itGradient, std::vector& gradientBucket, + int itWeight, std::vector& weightBucket, double& factorWeightDecay, size_t layerNumber, int BUCKET_SIZE); @@ -346,13 +375,13 @@ namespace TMVA * is not touched by the minimizer; This object is provided to the fitness function when * called */ - template - double operator() (Function& fitnessFunction, Weights& weights, PassThrough& passThrough); + template + double operator() (Function& fitnessFunction, std::vector& weightBucket, PassThrough& passThrough, const size_t& numWeights, std::vector& layerWeightNumber, const int& BUCKET_SIZE); double m_alpha; ///< internal parameter (learningRate) double m_beta; ///< internal parameter (momentum) - std::vector m_prevGradients; ///< vector remembers the gradients of the previous step + std::vector m_prevGradientBucket; ///< vector remembers the gradients of the previous step }; @@ -372,26 +401,26 @@ namespace TMVA - template - double sumOfSquares (ItOutput itOutputBegin, ItOutput itOutputEnd, ItTruth itTruthBegin, ItTruth itTruthEnd, ItDelta itDelta, ItDelta itDeltaEnd, ItInvActFnc itInvActFnc, double patternWeight); + template + double sumOfSquares (ItOutput itOutputBegin, ItOutput itOutputEnd, ItTruth itTruthBegin, ItTruth /*itTruthEnd*/, ItDelta itDelta, ItDelta itDeltaEnd, InvFnc invFnc, double patternWeight); template - double crossEntropy (ItProbability itProbabilityBegin, ItProbability itProbabilityEnd, ItTruth itTruthBegin, ItTruth itTruthEnd, ItDelta itDelta, ItDelta itDeltaEnd, ItInvActFnc itInvActFnc, double patternWeight); + double crossEntropy (ItProbability itProbabilityBegin, ItProbability itProbabilityEnd, ItTruth itTruthBegin, ItTruth /*itTruthEnd*/, ItDelta itDelta, ItDelta itDeltaEnd, ItInvActFnc /*itInvActFnc*/, double patternWeight); template - double softMaxCrossEntropy (ItOutput itProbabilityBegin, ItOutput itProbabilityEnd, ItTruth itTruthBegin, ItTruth itTruthEnd, ItDelta itDelta, ItDelta itDeltaEnd, ItInvActFnc itInvActFnc, double patternWeight); + double softMaxCrossEntropy (ItOutput itProbabilityBegin, ItOutput itProbabilityEnd, 
ItTruth itTruthBegin, ItTruth /*itTruthEnd*/, ItDelta itDelta, ItDelta itDeltaEnd, ItInvActFnc /*itInvActFnc*/, double patternWeight); - template - double weightDecay (double error, ItWeight itWeight, ItWeight itWeightEnd, double factorWeightDecay, EnumRegularization eRegularization); + template + double weightDecay (double error, int currLayerWeightIndex, int nextLayerWeightIndex, std::vector& weightBucket, double factorWeightDecay, EnumRegularization eRegularization, size_t layerNumber, int BUCKET_SIZE); @@ -467,8 +496,8 @@ namespace TMVA * output values (mutually exclusive probability) */ LayerData (size_t size, - const_iterator_type itWeightBegin, - iterator_type itGradientBegin, + int itWeightBegin, + int itGradientBegin, std::shared_ptr> activationFunction, std::shared_ptr> inverseActivationFunction, ModeOutputValues eModeOutput = ModeOutputValues::DIRECT); @@ -486,7 +515,7 @@ namespace TMVA * output value (to create a probability); SOFTMAX applies a softmax transformation to all * output values (mutually exclusive probability) */ - LayerData (size_t size, const_iterator_type itWeightBegin, + LayerData (size_t size, int itWeightBegin, std::shared_ptr> activationFunction, ModeOutputValues eModeOutput = ModeOutputValues::DIRECT); @@ -501,7 +530,8 @@ namespace TMVA , m_deltas (other.m_deltas) , m_valueGradients (other.m_valueGradients) , m_values (other.m_values) - , m_hasDropOut (false) + , m_itDropOut (other.m_itDropOut) + , m_hasDropOut (other.m_hasDropOut) , m_itConstWeightBegin (other.m_itConstWeightBegin) , m_itGradientBegin (other.m_itGradientBegin) , m_activationFunction (other.m_activationFunction) @@ -520,14 +550,15 @@ namespace TMVA : m_size (other.m_size) , m_itInputBegin (other.m_itInputBegin) , m_itInputEnd (other.m_itInputEnd) - , m_deltas (other.m_deltas) - , m_valueGradients (other.m_valueGradients) - , m_values (other.m_values) - , m_hasDropOut (false) + , m_deltas (std::move(other.m_deltas)) + , m_valueGradients (std::move(other.m_valueGradients)) + , m_values (std::move(other.m_values)) + , m_itDropOut (other.m_itDropOut) + , m_hasDropOut (other.m_hasDropOut) , m_itConstWeightBegin (other.m_itConstWeightBegin) , m_itGradientBegin (other.m_itGradientBegin) - , m_activationFunction (other.m_activationFunction) - , m_inverseActivationFunction (other.m_inverseActivationFunction) + , m_activationFunction (std::move(other.m_activationFunction)) + , m_inverseActivationFunction (std::move(other.m_inverseActivationFunction)) , m_isInputLayer (other.m_isInputLayer) , m_hasWeights (other.m_hasWeights) , m_hasGradients (other.m_hasGradients) @@ -566,7 +597,7 @@ namespace TMVA iterator_type valuesEnd () { assert (!m_isInputLayer); return end (m_values); } ///< returns iterator to the end of the (node) values ModeOutputValues outputMode () const { return m_eModeOutput; } ///< returns the output mode - container_type probabilities () { return computeProbabilities (); } ///< computes the probabilities from the current node values and returns them + container_type probabilities () const { return computeProbabilities (); } ///< computes the probabilities from the current node values and returns them iterator_type deltasBegin () { return begin (m_deltas); } ///< returns iterator to the begin of the deltas (back-propagation) iterator_type deltasEnd () { return end (m_deltas); } ///< returns iterator to the end of the deltas (back-propagation) @@ -580,9 +611,10 @@ namespace TMVA const_iterator_type valueGradientsBegin () const { return begin (m_valueGradients); } ///< returns const 
iterator to the begin of the gradients const_iterator_type valueGradientsEnd () const { return end (m_valueGradients); } ///< returns const iterator to the end of the gradients - iterator_type gradientsBegin () { assert (m_hasGradients); return m_itGradientBegin; } ///< returns iterator to the begin of the gradients - const_iterator_type gradientsBegin () const { assert (m_hasGradients); return m_itGradientBegin; } ///< returns const iterator to the begin of the gradients - const_iterator_type weightsBegin () const { assert (m_hasWeights); return m_itConstWeightBegin; } ///< returns const iterator to the begin of the weights for this layer + int gradientsBegin () { assert (m_hasGradients); return m_itGradientBegin; } ///< returns iterator to the begin of the gradients + int gradientsBegin () const { assert (m_hasGradients); return m_itGradientBegin; } ///< returns const iterator to the begin of the gradients + int weightsBegin () const { assert (m_hasWeights); return m_itConstWeightBegin; } ///< returns const iterator to the begin of the weights for this layer + std::shared_ptr> activationFunction () const { return m_activationFunction; } std::shared_ptr> inverseActivationFunction () const { return m_inverseActivationFunction; } @@ -600,7 +632,7 @@ namespace TMVA void clearDropOut () { m_hasDropOut = false; } bool hasDropOut () const { return m_hasDropOut; } ///< has this layer drop-out turned on? - const_dropout_iterator dropOut () const { return m_itDropOut; } ///< return the begin of the drop-out information + const_dropout_iterator dropOut () const { assert (m_hasDropOut); return m_itDropOut; } ///< return the begin of the drop-out information size_t size () const { return m_size; } ///< return the size of the layer @@ -610,7 +642,7 @@ namespace TMVA * * */ - container_type computeProbabilities (); + container_type computeProbabilities () const; private: @@ -620,13 +652,14 @@ namespace TMVA const_iterator_type m_itInputEnd; ///< iterator to the end of the nodes in the input node vector std::vector m_deltas; ///< stores the deltas for the DNN training - std::vector m_valueGradients; ///< stores the gradients of the values (nodes) + std::vector m_valueGradients; ///< stores the gradients of the values (nodes) + std::vector m_values; ///< stores the values of the nodes in this layer const_dropout_iterator m_itDropOut; ///< iterator to a container indicating if the corresponding node is to be dropped bool m_hasDropOut; ///< dropOut is turned on? 
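// --- Reviewer note (illustrative sketch, not part of the patch): weightsBegin()/gradientsBegin()
// now return plain integer offsets into the *logical* weight space instead of iterators.
// Wherever a weight is actually read or written (applyWeights, applyWeightsBackwards, update, ...)
// that logical index is folded into one of BUCKET_SIZE shared slots owned by the layer:
//
//    weightBucket[ (hasherFunction(logicalIndex) % BUCKET_SIZE) + layerNumber * BUCKET_SIZE ]
//
//    // hypothetical helper equivalent to the expression inlined throughout the patch:
//    inline size_t bucketIndex (int logicalIndex, size_t layerNumber, int BUCKET_SIZE)
//    { return (hasherFunction (logicalIndex) % BUCKET_SIZE) + layerNumber * BUCKET_SIZE; }
//
// so a layer with numNodesPrev*numNodes logical connections stores and trains only BUCKET_SIZE
// distinct values (hashing-trick style weight sharing). "logicalIndex" and "bucketIndex" are
// descriptive names used only here; the code itself works with the running itWeight/itGradient counters.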
- const_iterator_type m_itConstWeightBegin; ///< const iterator to the first weight of this layer in the weight vector - iterator_type m_itGradientBegin; ///< const iterator to the first gradient of this layer in the gradient vector + int m_itConstWeightBegin; ///< const iterator to the first weight of this layer in the weight vector + int m_itGradientBegin; ///< iterator to the first gradient of this layer in the gradient vector std::shared_ptr> m_activationFunction; ///< activation function for this layer std::shared_ptr> m_inverseActivationFunction; ///< inverse activation function for this layer @@ -693,18 +726,15 @@ namespace TMVA template - void forward (const LAYERDATA& prevLayerData, LAYERDATA& currLayerData); - - template - void forward_training (const LAYERDATA& prevLayerData, LAYERDATA& currLayerData); + void forward (const LAYERDATA& prevLayerData, LAYERDATA& currLayerData, std::vector& weightBucket, size_t layerNumber, int BUCKET_SIZE); template - void backward (LAYERDATA& prevLayerData, LAYERDATA& currLayerData); + void backward (LAYERDATA& prevLayerData, LAYERDATA& currLayerData, std::vector& weightBucket, size_t layerNumber, int BUCKET_SIZE); template - void update (const LAYERDATA& prevLayerData, LAYERDATA& currLayerData, double weightDecay, EnumRegularization regularization); + void update (const LAYERDATA& prevLayerData, LAYERDATA& currLayerData, double factorWeightDecay, EnumRegularization regularization, std::vector& weightBucket, std::vector& gradientBucket, size_t layerNumber, int BUCKET_SIZE); @@ -722,12 +752,11 @@ namespace TMVA */ Settings (TString name, size_t _convergenceSteps = 15, size_t _batchSize = 10, size_t _testRepetitions = 7, - double _factorWeightDecay = 1e-5, TMVA::DNN::EnumRegularization _regularization = TMVA::DNN::EnumRegularization::NONE, + double _factorWeightDecay = 1e-5, int _bucketSize = 8, TMVA::DNN::EnumRegularization _regularization = TMVA::DNN::EnumRegularization::NONE, MinimizerType _eMinimizerType = MinimizerType::fSteepest, double _learningRate = 1e-5, double _momentum = 0.3, int _repetitions = 3, - bool _multithreading = true, - bool _doBatchNormalization = true); + bool _multithreading = true); /*! \brief d'tor * @@ -759,6 +788,7 @@ namespace TMVA double momentum () const { return fMomentum; } ///< get the momentum (e.g. for SGD) int repetitions () const { return fRepetitions; } ///< how many steps have to be gone until the batch is changed MinimizerType minimizerType () const { return fMinimizerType; } ///< which minimizer shall be used (e.g. SGD) + int bucketSize () const { return fBucketSize; } ///< Number of Weight Buckets per Layer @@ -800,7 +830,6 @@ namespace TMVA EnumRegularization regularization () const { return m_regularization; } ///< some regularization of the DNN is turned on? bool useMultithreading () const { return m_useMultithreading; } ///< is multithreading turned on? 
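// --- Reviewer note (illustrative sketch, not part of the patch): the Settings constructor gains an
// int _bucketSize = 8 argument directly after _factorWeightDecay and drops the trailing
// _doBatchNormalization flag, so positional call sites need updating, e.g.
//
//    TMVA::DNN::Settings settings ("DNN",
//                                  /*convergenceSteps*/ 15, /*batchSize*/ 10, /*testRepetitions*/ 7,
//                                  /*factorWeightDecay*/ 1e-5,
//                                  /*bucketSize*/        8,   // shared weight slots per layer
//                                  TMVA::DNN::EnumRegularization::NONE);
//
// (values shown are just the declared defaults; Net::train later picks the value up via
// settings.bucketSize() and stores it in m_bucketSize).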
- bool doBatchNormalization () const { return m_doBatchNormalization; } void pads (int numPads) { if (fMonitoring) fMonitoring->pads (numPads); } ///< preparation for monitoring @@ -841,6 +870,7 @@ namespace TMVA double fMomentum; int fRepetitions; MinimizerType fMinimizerType; + int fBucketSize; size_t m_convergenceCount; size_t m_maxConvergenceCount; @@ -849,7 +879,6 @@ namespace TMVA protected: bool m_useMultithreading; - bool m_doBatchNormalization; std::shared_ptr fMonitoring; }; @@ -889,13 +918,12 @@ namespace TMVA */ ClassificationSettings (TString name, size_t _convergenceSteps = 15, size_t _batchSize = 10, size_t _testRepetitions = 7, - double _factorWeightDecay = 1e-5, EnumRegularization _regularization = EnumRegularization::NONE, + double _factorWeightDecay = 1e-5, int _bucketSize = 8, EnumRegularization _regularization = EnumRegularization::NONE, size_t _scaleToNumEvents = 0, MinimizerType _eMinimizerType = MinimizerType::fSteepest, double _learningRate = 1e-5, double _momentum = 0.3, int _repetitions = 3, - bool _useMultithreading = true, - bool _useBatchNormalization = true) - : Settings (name, _convergenceSteps, _batchSize, _testRepetitions, _factorWeightDecay, - _regularization, _eMinimizerType, _learningRate, _momentum, _repetitions, _useMultithreading, _useBatchNormalization) + bool _useMultithreading = true) + : Settings (name, _convergenceSteps, _batchSize, _testRepetitions, _factorWeightDecay, _bucketSize, + _regularization, _eMinimizerType, _learningRate, _momentum, _repetitions, _useMultithreading) , m_ams () , m_sumOfSigWeights (0) , m_sumOfBkgWeights (0) @@ -1065,6 +1093,7 @@ namespace TMVA : m_eErrorFunction (ModeErrorFunction::SUMOFSQUARES) , m_sizeInput (0) , m_layers () + , m_bucketSize (8) { } @@ -1076,6 +1105,7 @@ namespace TMVA : m_eErrorFunction (other.m_eErrorFunction) , m_sizeInput (other.m_sizeInput) , m_layers (other.m_layers) + , m_bucketSize (other.m_bucketSize) { } @@ -1092,8 +1122,8 @@ namespace TMVA * * */ - template - void dropOutWeightFactor (WeightsType& weights, + template + void dropOutWeightFactor (std::vector& weightBucket, const DropProbabilities& drops, bool inverse = false); @@ -1106,10 +1136,11 @@ namespace TMVA * \param settings settings used for this training run */ template - double train (std::vector& weights, + double train (std::vector& weightBucket, std::vector& layerWeightNumber, std::vector& trainPattern, const std::vector& testPattern, - Minimizer& minimizer, Settings& settings); + Minimizer& minimizer, + Settings& settings); /*! 
\brief pre-training for future use * @@ -1132,39 +1163,84 @@ namespace TMVA * \param dropContainer the configuration for DNN drop-out */ template - inline double trainCycle (Minimizer& minimizer, std::vector& weights, - Iterator itPatternBegin, Iterator itPatternEnd, Settings& settings, DropContainer& dropContainer); + double trainCycle (Minimizer& minimizer, std::vector& weightBucket, std::vector& layerWeightNumber, + Iterator itPatternBegin, Iterator itPatternEnd, + Settings& settings, + DropContainer& dropContainer); + + template + void forwardPattern (const LayerContainer& _layers, + std::vector& layerData, std::vector& weightBucket, int BUCKET_SIZE) const; size_t numWeights (size_t trainingStartLayer = 0) const; ///< returns the number of weights in this net + size_t numNodes (size_t trainingStartLayer = 0) const; ///< returns the number of nodes in this net + + template + std::vector compute (const std::vector& input, Weights& weightBucket, int BUCKET_SIZE) const; ///< compute the net with the given input and the given weights - template - std::vector compute (const std::vector& input, const Weights& weights) const; ///< compute the net with the given input and the given weights + template + double operator() (PassThrough& settingsAndBatch, std::vector& weightBucket) const; ///< execute computation of the DNN for one mini-batch (used by the minimizer); no computation of gradients + + template + double operator() (PassThrough& settingsAndBatch, std::vector& weightBucket, ModeOutput /*eFetch*/, OutContainer& outputContainer) const; ///< execute computation of the DNN for one mini-batch; helper function + + template + double operator() (PassThrough& settingsAndBatch, std::vector& weightBucket, std::vector& gradientBucket) const; ///< execute computation of the DNN for one mini-batch (used by the minimizer); returns gradients as well + + template + double operator() (PassThrough& settingsAndBatch, std::vector& weightBucket, std::vector& gradientBucket, ModeOutput eFetch, OutContainer& outputContainer) const; + + + template + std::vector> prepareLayerData (LayerContainer& layers, + Batch& batch, + const DropContainer& dropContainer, + int itWeightBegin, + int itWeightEnd, + int itGradientBegin, + int itGradientEnd, + size_t& totalNumWeights) const; - template - double operator() (PassThrough& settingsAndBatch, const Weights& weights) const; ///< execute computation of the DNN for one mini-batch (used by the minimizer); no computation of gradients - template - double operator() (PassThrough& settingsAndBatch, const Weights& weights, ModeOutput eFetch, OutContainer& outputContainer) const; ///< execute computation of the DNN for one mini-batch; helper function + + + template + void forwardBatch (const LayerContainer& _layers, + LayerPatternContainer& layerPatternData, + std::vector& valuesMean, + std::vector& valuesStdDev, + size_t trainFromLayer, std::vector& weightBucket) const; - template - double operator() (PassThrough& settingsAndBatch, const Weights& weights, Gradients& gradients) const; ///< execute computation of the DNN for one mini-batch (used by the minimizer); returns gradients as well + template + void fetchOutput (const LayerData& lastLayerData, OutputContainer& outputContainer) const; - template - double operator() (PassThrough& settingsAndBatch, const Weights& weights, Gradients& gradients, ModeOutput eFetch, OutContainer& outputContainer) const; + template + void fetchOutput (const std::vector& layerPatternData, OutputContainer& outputContainer) const; + template + 
std::tuple computeError (const Settings& settings, + std::vector& lastLayerData, + Batch& batch, Weights& weightBucket) const; + template + void backPropagate (std::vector>& layerPatternData, std::vector& weightBucket, std::vector& gradientBucket, + const Settings& settings, + size_t trainFromLayer, + size_t totalNumWeights) const; - /*! \brief main DNN computation function + + + /*! \brief main NN computation function * * */ - template + template double forward_backward (LayerContainer& layers, PassThrough& settingsAndBatch, - ItWeight itWeightBegin, - ItGradient itGradientBegin, ItGradient itGradientEnd, + int itWeightBegin, int itWeightEnd, + int itGradientBegin, int itGradientEnd, size_t trainFromLayer, - OutContainer& outputContainer, bool fetchOutput) const; + OutContainer& outputContainer, bool fetchOutput, std::vector& weightBucket, std::vector& gradientBucket) const; @@ -1176,14 +1252,14 @@ namespace TMVA * * */ - template + template double errorFunction (LayerData& layerData, + LayerData& nextLayerData, Container truth, - ItWeight itWeight, - ItWeight itWeightEnd, double patternWeight, + std::vector& weightBucket, double factorWeightDecay, - EnumRegularization eRegularization) const; + EnumRegularization eRegularization, size_t layerNumber) const; const std::vector& layers () const { return m_layers; } ///< returns the layers (structure) @@ -1201,7 +1277,8 @@ namespace TMVA template void initializeWeights (WeightInitializationStrategy eInitStrategy, - OutIterator itWeight); ///< initialize the weights with the given strategy + OutIterator itWeight, std::vector& layerWeightNumber, int BUCKET_SIZE); ///< initialize the weights with the given strategy + protected: @@ -1214,11 +1291,13 @@ namespace TMVA size_t m_sizeInput; ///< input size of this DNN size_t m_sizeOutput; ///< outut size of this DNN std::vector m_layers; ///< layer-structure-data + int m_bucketSize; }; +typedef std::tuple pass_through_type; diff --git a/tmva/tmva/inc/TMVA/NeuralNet.icc b/tmva/tmva/inc/TMVA/NeuralNet.icc index 3ef36f1cf6eb5..ae0e7c884b493 100644 --- a/tmva/tmva/inc/TMVA/NeuralNet.icc +++ b/tmva/tmva/inc/TMVA/NeuralNet.icc @@ -8,6 +8,8 @@ #include "Math/Util.h" +#include + namespace TMVA { @@ -17,10 +19,6 @@ namespace TMVA - - - - template T uniformFromTo (T from, T to) { @@ -75,13 +73,13 @@ namespace TMVA -/*! \brief apply weights using drop-out +/*! \brief apply weights using drop-out; for no drop out, provide (&bool = true) to itDrop such that *itDrop becomes "true" * * itDrop correlates with itSourceBegin */ - template +template void applyWeights (ItSource itSourceBegin, ItSource itSourceEnd, - ItWeight itWeight, + int itWeight, std::vector& weightBucket, size_t layerNumber, int BUCKET_SIZE, ItTarget itTargetBegin, ItTarget itTargetEnd, ItDrop itDrop) { @@ -89,81 +87,40 @@ namespace TMVA { for (auto itTarget = itTargetBegin; itTarget != itTargetEnd; ++itTarget) { - if (*itDrop) - (*itTarget) += (*itSource) * (*itWeight); + if (!HasDropOut || *itDrop) + (*itTarget) += (*itSource) * (weightBucket[(hasherFunction(itWeight) % BUCKET_SIZE) + (layerNumber * BUCKET_SIZE)]); ++itWeight; } - ++itDrop; + if (HasDropOut) ++itDrop; } } -/*! 
\brief apply weights without drop-out - * - * - */ - template - void applyWeights (ItSource itSourceBegin, ItSource itSourceEnd, - ItWeight itWeight, - ItTarget itTargetBegin, ItTarget itTargetEnd) - { - for (auto itSource = itSourceBegin; itSource != itSourceEnd; ++itSource) - { - for (auto itTarget = itTargetBegin; itTarget != itTargetEnd; ++itTarget) - { - (*itTarget) += (*itSource) * (*itWeight); - ++itWeight; - } - } - } - -/*! \brief apply weights backwards (for backprop) +/*! \brief apply weights backwards (for backprop); for no drop out, provide (&bool = true) to itDrop such that *itDrop becomes "true" * - * + * itDrop correlates with itPrev (to be in agreement with "applyWeights" where it correlates with itSources (same node as itTarget here in applyBackwards) */ - template - void applyWeightsBackwards (ItSource itCurrBegin, ItSource itCurrEnd, - ItWeight itWeight, - ItPrev itPrevBegin, ItPrev itPrevEnd) +template + void applyWeightsBackwards (ItSource itCurrBegin, ItSource itCurrEnd, int itWeight, std::vector& weightBucket, size_t layerNumber, int BUCKET_SIZE, ItPrev itPrevBegin, ItPrev itPrevEnd, ItDrop itDrop) { for (auto itPrev = itPrevBegin; itPrev != itPrevEnd; ++itPrev) { for (auto itCurr = itCurrBegin; itCurr != itCurrEnd; ++itCurr) { - (*itPrev) += (*itCurr) * (*itWeight); + if (!HasDropOut || *itDrop) + (*itPrev) += (*itCurr) * (weightBucket[(hasherFunction(itWeight) % BUCKET_SIZE) + (layerNumber * BUCKET_SIZE)]); ++itWeight; } + if (HasDropOut) ++itDrop; } } -/*! \brief apply weights backwards (for backprop) - * - * itDrop correlates with itPrev (to be in agreement with "applyWeights" where it correlates with itSources (same node as itTarget here in applyBackwards) - */ - template - void applyWeightsBackwards (ItSource itCurrBegin, ItSource itCurrEnd, - ItWeight itWeight, - ItPrev itPrevBegin, ItPrev itPrevEnd, - ItDrop itDrop) - { - for (auto itPrev = itPrevBegin; itPrev != itPrevEnd; ++itPrev) - { - for (auto itCurr = itCurrBegin; itCurr != itCurrEnd; ++itCurr) - { - if (*itDrop) - (*itPrev) += (*itCurr) * (*itWeight); - ++itWeight; - } - ++itDrop; - } - } - @@ -190,8 +147,8 @@ namespace TMVA * * */ - template - void applyFunctions (ItValue itValue, ItValue itValueEnd, Fnc fnc, InvFnc invFnc, ItGradient itGradient) + template + void applyFunctions (ItValue itValue, ItValue itValueEnd, Fnc fnc, ItInverseFunction invFnc, ItGradient itGradient) { while (itValue != itValueEnd) { @@ -209,11 +166,11 @@ namespace TMVA * * */ - template + template void update (ItSource itSource, ItSource itSourceEnd, ItDelta itTargetDeltaBegin, ItDelta itTargetDeltaEnd, ItTargetGradient itTargetGradientBegin, - ItGradient itGradient) + int itGradient, std::vector& gradientBucket, size_t layerNumber, int BUCKET_SIZE) { while (itSource != itSourceEnd) { @@ -221,7 +178,7 @@ namespace TMVA auto itTargetGradient = itTargetGradientBegin; while (itTargetDelta != itTargetDeltaEnd) { - (*itGradient) += - (*itTargetDelta) * (*itSource) * (*itTargetGradient); + (gradientBucket[(hasherFunction(itGradient) % BUCKET_SIZE) + (layerNumber * BUCKET_SIZE)]) -= (*itTargetDelta) * (*itSource) * (*itTargetGradient); ++itTargetDelta; ++itTargetGradient; ++itGradient; } ++itSource; @@ -263,12 +220,12 @@ namespace TMVA * * */ - template - void update (ItSource itSource, ItSource itSourceEnd, - ItDelta itTargetDeltaBegin, ItDelta itTargetDeltaEnd, - ItTargetGradient itTargetGradientBegin, - ItGradient itGradient, - ItWeight itWeight, double weightDecay) + template + void update (ItSource itSource, ItSource 
itSourceEnd, + ItDelta itTargetDeltaBegin, ItDelta itTargetDeltaEnd, + ItTargetGradient itTargetGradientBegin, + int itGradient, std::vector& gradientBucket, + int itWeight, std::vector& weightBucket, double& factorWeightDecay, size_t layerNumber, int BUCKET_SIZE) { // ! the factor weightDecay has to be already scaled by 1/n where n is the number of weights while (itSource != itSourceEnd) @@ -277,7 +234,7 @@ namespace TMVA auto itTargetGradient = itTargetGradientBegin; while (itTargetDelta != itTargetDeltaEnd) { - (*itGradient) -= + (*itTargetDelta) * (*itSource) * (*itTargetGradient) + computeRegularization(*itWeight,weightDecay); + (gradientBucket[(hasherFunction(itGradient) % BUCKET_SIZE) + (layerNumber * BUCKET_SIZE)]) -= + (*itTargetDelta) * (*itSource) * (*itTargetGradient) + computeRegularization(weightBucket[(hasherFunction(itWeight) % BUCKET_SIZE) + (layerNumber * BUCKET_SIZE)],factorWeightDecay); ++itTargetDelta; ++itTargetGradient; ++itGradient; ++itWeight; } ++itSource; @@ -288,7 +245,6 @@ namespace TMVA - #define USELOCALWEIGHTS 1 @@ -297,18 +253,18 @@ namespace TMVA * * Can be used with multithreading (i.e. "HogWild!" style); see call in trainCycle */ - template - double Steepest::operator() (Function& fitnessFunction, Weights& weights, PassThrough& passThrough) + template + double Steepest::operator() (Function& fitnessFunction, std::vector& weightBucket, PassThrough& passThrough, const size_t& numWeights, std::vector& layerWeightNumber, const int& BUCKET_SIZE) { - size_t numWeights = weights.size (); - std::vector gradients (numWeights, 0.0); - std::vector localWeights (begin (weights), end (weights)); + // std::vector gradients (numWeights, 0.0); + std::vector gradientBucket (weightBucket.size (), 0.0); + std::vector localWeightBucket (begin (weightBucket), end (weightBucket)); double E = 1e10; - if (m_prevGradients.size () != numWeights) + if (m_prevGradientBucket.size () != weightBucket.size ()) { - m_prevGradients.clear (); - m_prevGradients.assign (weights.size (), 0); + m_prevGradientBucket.clear (); + m_prevGradientBucket.assign (weightBucket.size (), 0); } bool success = true; @@ -318,58 +274,71 @@ namespace TMVA if (currentRepetition >= m_repetitions) break; - gradients.assign (numWeights, 0.0); + gradientBucket.assign (weightBucket.size (), 0.0); // --- nesterov momentum --- // apply momentum before computing the new gradient - auto itPrevG = begin (m_prevGradients); - auto itPrevGEnd = end (m_prevGradients); - auto itLocWeight = begin (localWeights); - for (; itPrevG != itPrevGEnd; ++itPrevG) + int itPrevG = 0; + int itPrevGEnd = numWeights; + int itLocWeight = 0; + int itLWN, layerNumber = 0; + + for (auto itLayerWeightNumber = layerWeightNumber.begin(); itLayerWeightNumber != layerWeightNumber.end(); ++itLayerWeightNumber, ++layerNumber) { - (*itPrevG) *= m_beta; - (*itLocWeight) += (*itPrevG); + for(itLWN = 0; itLWN < *itLayerWeightNumber; ++itLWN) + { + (m_prevGradientBucket[(hasherFunction(itPrevG) % BUCKET_SIZE) + (layerNumber * BUCKET_SIZE)]) *= m_beta; + (localWeightBucket[(hasherFunction(itLocWeight) % BUCKET_SIZE) + (layerNumber * BUCKET_SIZE)]) += (m_prevGradientBucket[(hasherFunction(itPrevG) % BUCKET_SIZE) + (layerNumber * BUCKET_SIZE)]); + ++itPrevG; ++itLocWeight; + } } - E = fitnessFunction (passThrough, localWeights, gradients); + E = fitnessFunction (passThrough, localWeightBucket, gradientBucket); // ************************** // plotGradients (gradients); +// plotWeights (localWeightBucket); double alpha = gaussDouble (m_alpha, m_alpha/2.0); 
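// --- Reviewer note (illustrative sketch, not part of the patch): the loops above/below walk the
// *logical* weights of every layer (counts taken from layerWeightNumber) but read and write the
// shared buckets, so with b = (hasherFunction(i) % BUCKET_SIZE) + layerNumber * BUCKET_SIZE the
// Nesterov-style step per logical weight i is
//
//    p[b] *= beta;   w_local[b] += p[b];           // look-ahead weights fed to fitnessFunction (above)
//    g[b]  = alpha * g[b] + p[b];   p[b] = g[b];   // momentum-accumulated gradient (loop below)
//    w[b] += g[b];                                 // applied at the end (or w rescaled if maxGrad > 1)
//
// where g = gradientBucket, p = m_prevGradientBucket, w = weightBucket, w_local = localWeightBucket.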
// double alpha = m_alpha; - auto itG = begin (gradients); - auto itGEnd = end (gradients); - itPrevG = begin (m_prevGradients); + int itG = 0; + int itGEnd = numWeights; + itPrevG = 0; double maxGrad = 0.0; - for (; itG != itGEnd; ++itG, ++itPrevG) + layerNumber = 0; + + for (auto itLayerWeightNumber = layerWeightNumber.begin(); itLayerWeightNumber != layerWeightNumber.end(); ++itLayerWeightNumber, ++layerNumber) { - double currGrad = (*itG); - double prevGrad = (*itPrevG); - currGrad *= alpha; - - //(*itPrevG) = m_beta * (prevGrad + currGrad); - currGrad += prevGrad; - (*itG) = currGrad; - (*itPrevG) = currGrad; + for(itLWN = 0; itLWN < *itLayerWeightNumber; ++itLWN) + { + double currGrad = (gradientBucket[(hasherFunction(itG) % BUCKET_SIZE) + (layerNumber * BUCKET_SIZE)]); + double prevGrad = (m_prevGradientBucket[(hasherFunction(itPrevG) % BUCKET_SIZE) + (layerNumber * BUCKET_SIZE)]); + currGrad *= alpha; - if (std::fabs (currGrad) > maxGrad) - maxGrad = currGrad; + //(*itPrevG) = m_beta * (prevGrad + currGrad); + currGrad += prevGrad; + (gradientBucket[(hasherFunction(itG) % BUCKET_SIZE) + (layerNumber * BUCKET_SIZE)]) = currGrad; + (m_prevGradientBucket[(hasherFunction(itPrevG) % BUCKET_SIZE) + (layerNumber * BUCKET_SIZE)]) = currGrad; + + if (std::fabs (currGrad) > maxGrad) + maxGrad = currGrad; + ++itG; ++itPrevG; + } } if (maxGrad > 1) { m_alpha /= 2; std::cout << "\nlearning rate reduced to " << m_alpha << std::endl; - std::for_each (weights.begin (), weights.end (), [maxGrad](double& w) + std::for_each (weightBucket.begin (), weightBucket.end (), [maxGrad](double& w) { w /= maxGrad; }); - m_prevGradients.clear (); + m_prevGradientBucket.clear (); } else { - auto itW = std::begin (weights); - std::for_each (std::begin (gradients), std::end (gradients), [&itW](double& g) + auto itW = std::begin (weightBucket); + std::for_each (gradientBucket.begin (), gradientBucket.end (), [&itW](double& g) { *itW += g; ++itW; @@ -398,6 +367,8 @@ namespace TMVA + + /*! 
\brief sum of squares error function * * @@ -412,7 +383,7 @@ namespace TMVA bool hasDeltas = (itDelta != itDeltaEnd); for (ItOutput itOutput = itOutputBegin; itOutput != itOutputEnd; ++itOutput, ++itTruth) { -// assert (itTruth != itTruthEnd); +// assert (itTruth != itTruthEnd); double output = (*itOutput); double error = output - (*itTruth); if (hasDeltas) @@ -449,7 +420,7 @@ namespace TMVA { double delta = probability - truth; (*itDelta) = delta*patternWeight; -// (*itDelta) = (*itInvActFnc)(probability) * delta * patternWeight; +// (*itDelta) = (*itInvActFnc)(probability) * delta * patternWeight; ++itDelta; } double error (0); @@ -488,13 +459,13 @@ namespace TMVA ItTruth itTruth = itTruthBegin; for (auto itProbability = itProbabilityBegin; itProbability != itProbabilityEnd; ++itProbability, ++itTruth) { -// assert (itTruth != itTruthEnd); +// assert (itTruth != itTruthEnd); double probability = (*itProbability); double truth = (*itTruth); if (hasDeltas) { (*itDelta) = probability - truth; -// (*itDelta) = (*itInvActFnc)(sm) * delta * patternWeight; +// (*itDelta) = (*itInvActFnc)(sm) * delta * patternWeight; ++itDelta; //++itInvActFnc; } double error (0); @@ -518,17 +489,18 @@ namespace TMVA * * */ - template - double weightDecay (double error, ItWeight itWeight, ItWeight itWeightEnd, double factorWeightDecay, EnumRegularization eRegularization) + template + double weightDecay (double error, int currLayerWeightIndex, int nextLayerWeightIndex, std::vector& weightBucket, double factorWeightDecay, EnumRegularization eRegularization, size_t layerNumber, int BUCKET_SIZE) { if (eRegularization == EnumRegularization::L1) { // weight decay (regularization) double w = 0; size_t n = 0; - for (; itWeight != itWeightEnd; ++itWeight, ++n) + int itWeight; + for (itWeight = currLayerWeightIndex; itWeight != nextLayerWeightIndex; ++itWeight, ++n) { - double weight = (*itWeight); + double weight = (weightBucket[(hasherFunction(itWeight) % BUCKET_SIZE) + (layerNumber * BUCKET_SIZE)]); w += std::fabs (weight); } return error + 0.5 * w * factorWeightDecay / n; @@ -538,9 +510,10 @@ namespace TMVA // weight decay (regularization) double w = 0; size_t n = 0; - for (; itWeight != itWeightEnd; ++itWeight, ++n) + int itWeight; + for (itWeight = currLayerWeightIndex; itWeight != nextLayerWeightIndex; ++itWeight, ++n) { - double weight = (*itWeight); + double weight = (weightBucket[(hasherFunction(itWeight) % BUCKET_SIZE) + (layerNumber * BUCKET_SIZE)]); w += weight*weight; } return error + 0.5 * w * factorWeightDecay / n; @@ -562,72 +535,57 @@ namespace TMVA -/*! \brief apply the weights in forward direction of the DNN +/*! 
\brief apply the weights (and functions) in forward direction of the DNN * * */ template - void forward (const LAYERDATA& prevLayerData, LAYERDATA& currLayerData) + void forward (const LAYERDATA& prevLayerData, LAYERDATA& currLayerData, std::vector& weightBucket, size_t layerNumber, int BUCKET_SIZE) { if (prevLayerData.hasDropOut ()) { - applyWeights (prevLayerData.valuesBegin (), prevLayerData.valuesEnd (), - currLayerData.weightsBegin (), + applyWeights (prevLayerData.valuesBegin (), prevLayerData.valuesEnd (), + currLayerData.weightsBegin (), weightBucket, layerNumber, BUCKET_SIZE, currLayerData.valuesBegin (), currLayerData.valuesEnd (), prevLayerData.dropOut ()); } else { - applyWeights (prevLayerData.valuesBegin (), prevLayerData.valuesEnd (), - currLayerData.weightsBegin (), - currLayerData.valuesBegin (), currLayerData.valuesEnd ()); + bool dummy = true; + applyWeights (prevLayerData.valuesBegin (), prevLayerData.valuesEnd (), + currLayerData.weightsBegin (), weightBucket, layerNumber, BUCKET_SIZE, + currLayerData.valuesBegin (), currLayerData.valuesEnd (), + &dummy); // dummy to turn on all nodes (no drop out) } } -/*! \brief apply weights (and functions) in forward direction and compute the gradients - * - * - */ - template - void forward_training (const LAYERDATA& prevLayerData, LAYERDATA& currLayerData) - { - if (prevLayerData.hasDropOut ()) - { - applyWeights (prevLayerData.valuesBegin (), prevLayerData.valuesEnd (), - currLayerData.weightsBegin (), - currLayerData.valuesBegin (), currLayerData.valuesEnd (), - prevLayerData.dropOut ()); - } - else - { - applyWeights (prevLayerData.valuesBegin (), prevLayerData.valuesEnd (), - currLayerData.weightsBegin (), - currLayerData.valuesBegin (), currLayerData.valuesEnd ()); - } - } /*! \brief backward application of the weights (back-propagation of the error) * * */ - template - void backward (LAYERDATA& prevLayerData, LAYERDATA& currLayerData) - { - if (prevLayerData.hasDropOut ()) - { - applyWeightsBackwards (currLayerData.deltasBegin (), currLayerData.deltasEnd (), - currLayerData.weightsBegin (), - prevLayerData.deltasBegin (), prevLayerData.deltasEnd (), - prevLayerData.dropOut ()); - } - else - { - applyWeightsBackwards (currLayerData.deltasBegin (), currLayerData.deltasEnd (), - currLayerData.weightsBegin (), - prevLayerData.deltasBegin (), prevLayerData.deltasEnd ()); - } - } +template + void backward (LAYERDATA& prevLayerData, LAYERDATA& currLayerData, std::vector& weightBucket, size_t layerNumber, int BUCKET_SIZE) +{ + if (prevLayerData.hasDropOut ()) + { + applyWeightsBackwards (currLayerData.deltasBegin (), currLayerData.deltasEnd (), + currLayerData.weightsBegin (), weightBucket, layerNumber, BUCKET_SIZE, + prevLayerData.deltasBegin (), prevLayerData.deltasEnd (), + prevLayerData.dropOut ()); + } + else + { + bool dummy = true; + applyWeightsBackwards (currLayerData.deltasBegin (), currLayerData.deltasEnd (), + currLayerData.weightsBegin (), weightBucket, layerNumber, BUCKET_SIZE, + prevLayerData.deltasBegin (), prevLayerData.deltasEnd (), + &dummy); // dummy to use all nodes (no drop out) + } +} + + @@ -636,36 +594,42 @@ namespace TMVA * */ template - void update (const LAYERDATA& prevLayerData, LAYERDATA& currLayerData, double factorWeightDecay, EnumRegularization regularization) + void update (const LAYERDATA& prevLayerData, LAYERDATA& currLayerData, double factorWeightDecay, EnumRegularization regularization, std::vector& weightBucket, std::vector& gradientBucket, size_t layerNumber, int BUCKET_SIZE) { // ! 
the "factorWeightDecay" has already to be scaled by 1/n where n is the number of weights if (factorWeightDecay != 0.0) // has weight regularization if (regularization == EnumRegularization::L1) // L1 regularization ( sum(|w|) ) { - update (prevLayerData.valuesBegin (), prevLayerData.valuesEnd (), - currLayerData.deltasBegin (), currLayerData.deltasEnd (), - currLayerData.valueGradientsBegin (), currLayerData.gradientsBegin (), - currLayerData.weightsBegin (), factorWeightDecay); + update (prevLayerData.valuesBegin (),prevLayerData.valuesEnd (), + currLayerData.deltasBegin (), + currLayerData.deltasEnd (), + currLayerData.valueGradientsBegin (), + currLayerData.gradientsBegin (), gradientBucket, + currLayerData.weightsBegin (), weightBucket, factorWeightDecay, layerNumber, BUCKET_SIZE); } else if (regularization == EnumRegularization::L2) // L2 regularization ( sum(w^2) ) { - update (prevLayerData.valuesBegin (), prevLayerData.valuesEnd (), - currLayerData.deltasBegin (), currLayerData.deltasEnd (), - currLayerData.valueGradientsBegin (), currLayerData.gradientsBegin (), - currLayerData.weightsBegin (), factorWeightDecay); + update (prevLayerData.valuesBegin (),prevLayerData.valuesEnd (), + currLayerData.deltasBegin (), + currLayerData.deltasEnd (), + currLayerData.valueGradientsBegin (), + currLayerData.gradientsBegin (), gradientBucket, + currLayerData.weightsBegin (), weightBucket, factorWeightDecay, layerNumber, BUCKET_SIZE); } else { update (prevLayerData.valuesBegin (), prevLayerData.valuesEnd (), currLayerData.deltasBegin (), currLayerData.deltasEnd (), - currLayerData.valueGradientsBegin (), currLayerData.gradientsBegin ()); + currLayerData.valueGradientsBegin (), + currLayerData.gradientsBegin (), gradientBucket, layerNumber, BUCKET_SIZE); } else { // no weight regularization update (prevLayerData.valuesBegin (), prevLayerData.valuesEnd (), - currLayerData.deltasBegin (), currLayerData.deltasEnd (), - currLayerData.valueGradientsBegin (), currLayerData.gradientsBegin ()); + currLayerData.deltasBegin (), currLayerData.deltasEnd (), + currLayerData.valueGradientsBegin (), + currLayerData.gradientsBegin (), gradientBucket, layerNumber, BUCKET_SIZE); } } @@ -687,28 +651,30 @@ namespace TMVA * the weights have to be adjusted to account for the different number of active nodes * this function computes the factor and applies it to the weights */ - template - void Net::dropOutWeightFactor (WeightsType& weights, + template + void Net::dropOutWeightFactor (std::vector& weightBucket, const DropProbabilities& drops, bool inverse) { - if (drops.empty () || weights.empty ()) + if (drops.empty () || weightBucket.empty ()) return; - auto itWeight = std::begin (weights); - auto itWeightEnd = std::end (weights); + int itWeightBucket = 0; + int itWeightBucketEnd = (int) weightBucket.size(); auto itDrop = std::begin (drops); auto itDropEnd = std::end (drops); - size_t numNodesPrev = inputSize (); + // size_t numNodesPrev = inputSize (); double dropFractionPrev = *itDrop; ++itDrop; + // size_t layerNumber = 0; + for (auto& layer : layers ()) { if (itDrop == itDropEnd) break; - size_t numNodes = layer.numNodes (); + // size_t _numNodes = layer.numNodes (); double dropFraction = *itDrop; double pPrev = 1.0 - dropFractionPrev; @@ -719,18 +685,19 @@ namespace TMVA { p = 1.0/p; } - size_t _numWeights = layer.numWeights (numNodesPrev); - for (size_t iWeight = 0; iWeight < _numWeights; ++iWeight) + // size_t _numWeights = layer.numWeights (numNodesPrev); + for (size_t iWeightBucket = 0; iWeightBucket < 
m_bucketSize; ++iWeightBucket) { - if (itWeight == itWeightEnd) + if (itWeightBucket == itWeightBucketEnd) break; - *itWeight *= p; - ++itWeight; + weightBucket[itWeightBucket] *= p; + ++itWeightBucket; } - numNodesPrev = numNodes; + // numNodesPrev = _numNodes; dropFractionPrev = dropFraction; ++itDrop; + // ++layerNumber; } } @@ -748,10 +715,11 @@ namespace TMVA * \param settings the settings for the training (e.g. multithreading or not, regularization etc.) */ template - double Net::train (std::vector& weights, + double Net::train (std::vector& weightBucket, std::vector& layerWeightNumber, std::vector& trainPattern, const std::vector& testPattern, - Minimizer& minimizer, Settings& settings) + Minimizer& minimizer, + Settings& settings) { // std::cout << "START TRAINING" << std::endl; settings.startTrainCycle (); @@ -760,6 +728,8 @@ namespace TMVA settings.create ("trainErrors", 100, 0, 100, 100, 0,1); settings.create ("testErrors", 100, 0, 100, 100, 0,1); + m_bucketSize = settings.bucketSize (); + size_t cycleCount = 0; size_t testCycleCount = 0; double testError = 1e20; @@ -771,7 +741,6 @@ namespace TMVA const std::vector& dropFractions = settings.dropFractions (); bool isWeightsForDrop = false; - // until convergence do { @@ -783,39 +752,40 @@ namespace TMVA { // fill the dropOut-container dropContainer.clear (); - size_t numNodes = inputSize (); + size_t _numNodes = inputSize (); double dropFraction = 0.0; dropFraction = dropFractions.at (dropIndex); ++dropIndex; - fillDropContainer (dropContainer, dropFraction, numNodes); + fillDropContainer (dropContainer, dropFraction, _numNodes); for (auto itLayer = begin (m_layers), itLayerEnd = end (m_layers); itLayer != itLayerEnd; ++itLayer, ++dropIndex) { auto& layer = *itLayer; - numNodes = layer.numNodes (); + _numNodes = layer.numNodes (); // how many nodes have to be dropped dropFraction = 0.0; if (dropFractions.size () > dropIndex) dropFraction = dropFractions.at (dropIndex); - fillDropContainer (dropContainer, dropFraction, numNodes); + fillDropContainer (dropContainer, dropFraction, _numNodes); } isWeightsForDrop = true; } // execute training cycle - trainError = trainCycle (minimizer, weights, begin (trainPattern), end (trainPattern), settings, dropContainer); + trainError = trainCycle (minimizer, weightBucket, layerWeightNumber, begin (trainPattern), end (trainPattern), settings, dropContainer); - // check if we execute a test + // ------ check if we have to execute a test ------------------ bool hasConverged = false; - if (testCycleCount % settings.testRepetitions () == 0) + if (testCycleCount % settings.testRepetitions () == 0) // we test only everye "testRepetitions" repetition { if (isWeightsForDrop) { - dropOutWeightFactor (weights, dropFractions); + dropOutWeightFactor (weightBucket, dropFractions); isWeightsForDrop = false; } + testError = 0; //double weightSum = 0; settings.startTestCycle (); @@ -825,13 +795,14 @@ namespace TMVA size_t patternPerThread = testPattern.size () / numThreads; std::vector batches; auto itPat = testPattern.begin (); - auto itPatEnd = testPattern.end (); + // auto itPatEnd = testPattern.end (); for (size_t idxThread = 0; idxThread < numThreads-1; ++idxThread) { batches.push_back (Batch (itPat, itPat + patternPerThread)); itPat += patternPerThread; } - batches.insert (batches.end (), Batch (itPat, itPatEnd)); + if (itPat != testPattern.end ()) + batches.push_back (Batch (itPat, testPattern.end ())); std::vector>>> futures; for (auto& batch : batches) @@ -841,62 +812,68 @@ namespace TMVA std::async 
(std::launch::async, [&]() { std::vector localOutput; - std::tuple passThrough (settings, batch, dropContainerTest); - double testBatchError = (*this) (passThrough, weights, ModeOutput::FETCH, localOutput); + pass_through_type passThrough (settings, batch, dropContainerTest); + double testBatchError = (*this) (passThrough, weightBucket, ModeOutput::FETCH, localOutput); return std::make_tuple (testBatchError, localOutput); }) ); } + auto itBatch = batches.begin (); for (auto& f : futures) { std::tuple> result = f.get (); testError += std::get<0>(result) / batches.size (); std::vector output = std::get<1>(result); - if (output.size () == testPattern.size ()) + + //if (output.size () == testPattern.size ()) { - auto it = begin (testPattern); + //auto it = begin (testPattern); + auto it = (*itBatch).begin (); for (double out : output) { settings.testSample (0, out, (*it).output ().at (0), (*it).weight ()); ++it; } } + ++itBatch; } } else { std::vector output; - for (auto it = begin (testPattern), itEnd = end (testPattern); it != itEnd; ++it) + //for (auto it = begin (testPattern), itEnd = end (testPattern); it != itEnd; ++it) { - const Pattern& p = (*it); - double weight = p.weight (); - Batch batch (it, it+1); + //const Pattern& p = (*it); + //double weight = p.weight (); + //Batch batch (it, it+1); + Batch batch (begin (testPattern), end (testPattern)); output.clear (); - std::tuple passThrough (settings, batch, dropContainerTest); - double testPatternError = (*this) (passThrough, weights, ModeOutput::FETCH, output); - if (output.size () == 1) + pass_through_type passThrough (settings, batch, dropContainerTest); + double testPatternError = (*this) (passThrough, weightBucket, ModeOutput::FETCH, output); + + auto it = batch.begin (); + for (double out : output) { - /* std::vector out = (*this).compute (p.input (), weights); */ - /* assert (output.at (0) == out.at (0)); */ - settings.testSample (testPatternError, output.at (0), p.output ().at (0), weight); + settings.testSample (0, out, (*it).output ().at (0), (*it).weight ()); + ++it; } //weightSum += fabs (weight); //testError += testPatternError*weight; - testError += testPatternError; + testError += testPatternError; /// batch.size (); } - testError /= testPattern.size (); + // testError /= testPattern.size (); } settings.endTestCycle (); // testError /= weightSum; - settings.computeResult (*this, weights); + settings.computeResult (*this, weightBucket); hasConverged = settings.hasConverged (testError); if (!hasConverged && !isWeightsForDrop) { - dropOutWeightFactor (weights, dropFractions, true); // inverse + dropOutWeightFactor (weightBucket, dropFractions, true); // inverse isWeightsForDrop = true; } } @@ -953,13 +930,14 @@ namespace TMVA * \param dropContainer the data for dropping-out nodes (regularization technique) */ template - inline double Net::trainCycle (Minimizer& minimizer, std::vector& weights, + double Net::trainCycle (Minimizer& minimizer, std::vector& weightBucket, std::vector& layerWeightNumber, Iterator itPatternBegin, Iterator itPatternEnd, Settings& settings, DropContainer& dropContainer) { double error = 0.0; size_t numPattern = std::distance (itPatternBegin, itPatternEnd); size_t numBatches = numPattern/settings.batchSize (); size_t numBatches_stored = numBatches; + const int const_m_bucketSize = m_bucketSize; std::random_shuffle (itPatternBegin, itPatternEnd); Iterator itPatternBatchBegin = itPatternBegin; @@ -1013,8 +991,8 @@ namespace TMVA for (auto it = batchRange.first, itEnd = batchRange.second; it != itEnd; 
++it) { Batch& batch = *it; - std::tuple settingsAndBatch (settings, batch, dropContainer); - localError += minimizer ((*this), weights, settingsAndBatch); /// call the minimizer + pass_through_type settingsAndBatch (settings, batch, dropContainer); + localError += minimizer ((*this), weightBucket, settingsAndBatch, numWeights (), layerWeightNumber, const_m_bucketSize); /// call the minimizer } return localError; }) @@ -1028,8 +1006,8 @@ namespace TMVA { for (auto& batch : batches) { - std::tuple settingsAndBatch (settings, batch, dropContainer); - error += minimizer ((*this), weights, settingsAndBatch); + pass_through_type settingsAndBatch (settings, batch, dropContainer); + error += minimizer ((*this), weightBucket, settingsAndBatch, numWeights (), layerWeightNumber, const_m_bucketSize); } } @@ -1050,15 +1028,17 @@ namespace TMVA * \param weights the weight data */ template - std::vector Net::compute (const std::vector& input, const Weights& weights) const + std::vector Net::compute (const std::vector& input, Weights& weightBucket, int BUCKET_SIZE) const { std::vector layerData; layerData.reserve (m_layers.size ()+1); - auto itWeight = begin (weights); + int itWeight = 0; auto itInputBegin = begin (input); auto itInputEnd = end (input); layerData.push_back (LayerData (itInputBegin, itInputEnd)); size_t numNodesPrev = input.size (); + + // -------------------- prepare layer data with one pattern ------------------------------- for (auto& layer: m_layers) { layerData.push_back (LayerData (layer.numNodes (), itWeight, @@ -1071,331 +1051,414 @@ namespace TMVA // --------- forward ------------- - size_t idxLayer = 0, idxLayerEnd = m_layers.size (); - for (; idxLayer < idxLayerEnd; ++idxLayer) - { - LayerData& prevLayerData = layerData.at (idxLayer); - LayerData& currLayerData = layerData.at (idxLayer+1); - - forward (prevLayerData, currLayerData); - applyFunctions (currLayerData.valuesBegin (), currLayerData.valuesEnd (), currLayerData.activationFunction ()); - } + forwardPattern (m_layers, layerData, weightBucket, BUCKET_SIZE); // ------------- fetch output ------------------ - if (TMVA::DNN::isFlagSet (ModeOutputValues::DIRECT, layerData.back ().outputMode ())) - { std::vector output; - output.assign (layerData.back ().valuesBegin (), layerData.back ().valuesEnd ()); - return output; - } - std::vector output (layerData.back ().probabilities ()); + fetchOutput (layerData.back (), output); return output; } - template - double Net::operator() (PassThrough& settingsAndBatch, const Weights& weights) const + template + double Net::operator() (PassThrough& settingsAndBatch, std::vector& weightBucket) const { std::vector nothing; // empty gradients; no backpropagation is done, just forward - assert (numWeights () == weights.size ()); - double error = forward_backward(m_layers, settingsAndBatch, std::begin (weights), std::begin (nothing), std::end (nothing), 10000, nothing, false); + + double error = forward_backward(m_layers, settingsAndBatch, 0, numWeights () - 1, 0, 0, 10000, nothing, false, weightBucket, nothing); return error; } - template - double Net::operator() (PassThrough& settingsAndBatch, const Weights& weights, ModeOutput /*eFetch*/, OutContainer& outputContainer) const + template + double Net::operator() (PassThrough& settingsAndBatch, std::vector& weightBucket, ModeOutput /*eFetch*/, OutContainer& outputContainer) const { std::vector nothing; // empty gradients; no backpropagation is done, just forward - assert (numWeights () == weights.size ()); - double error = 
forward_backward(m_layers, settingsAndBatch, std::begin (weights), std::begin (nothing), std::end (nothing), 10000, outputContainer, true); + + double error = forward_backward(m_layers, settingsAndBatch, 0, numWeights () - 1, 0, 0, 10000, outputContainer, true, weightBucket, nothing); return error; } - template - double Net::operator() (PassThrough& settingsAndBatch, const Weights& weights, Gradients& gradients) const + template + double Net::operator() (PassThrough& settingsAndBatch, std::vector& weightBucket, std::vector& gradientBucket) const { std::vector nothing; - assert (numWeights () == weights.size ()); - assert (weights.size () == gradients.size ()); - double error = forward_backward(m_layers, settingsAndBatch, std::begin (weights), std::begin (gradients), std::end (gradients), 0, nothing, false); + // std::cout<<"\nnumWeights = "< - double Net::operator() (PassThrough& settingsAndBatch, const Weights& weights, Gradients& gradients, ModeOutput eFetch, OutContainer& outputContainer) const + template + double Net::operator() (PassThrough& settingsAndBatch, std::vector& weightBucket, std::vector& gradientBucket, ModeOutput eFetch, OutContainer& outputContainer) const { MATH_UNUSED(eFetch); - assert (numWeights () == weights.size ()); - assert (weights.size () == gradients.size ()); - double error = forward_backward(m_layers, settingsAndBatch, std::begin (weights), std::begin (gradients), std::end (gradients), 0, outputContainer, true); + + assert (weightBucket.size () == gradientBucket.size ()); + double error = forward_backward(m_layers, settingsAndBatch, 0, numWeights () - 1, 0, numWeights () - 1, 0, outputContainer, true, weightBucket, gradientBucket); return error; } + template + std::vector> Net::prepareLayerData (LayerContainer& _layers, + Batch& batch, + const DropContainer& dropContainer, + int itWeightBegin, + int /*itWeightEnd*/, + int itGradientBegin, + int itGradientEnd, + size_t& totalNumWeights) const + { + LayerData::const_dropout_iterator itDropOut; + bool usesDropOut = !dropContainer.empty (); + if (usesDropOut) + itDropOut = std::begin (dropContainer); + + if (_layers.empty ()) + throw std::string ("no layers in this net"); + + + // ----------- create layer data ------------------------------------------------------- + assert (_layers.back ().numNodes () == outputSize ()); + totalNumWeights = 0; + size_t totalNumNodes = 0; + std::vector> layerPatternData; + layerPatternData.reserve (_layers.size ()+1); + int itWeight = itWeightBegin; + int itGradient = itGradientBegin; + size_t numNodesPrev = inputSize (); + typename Pattern::const_iterator itInputBegin; + typename Pattern::const_iterator itInputEnd; -/*! 
\brief forward propagation and backward propagation - * - * - */ - template - double Net::forward_backward (LayerContainer& _layers, PassThrough& settingsAndBatch, - ItWeight itWeightBegin, - ItGradient itGradientBegin, ItGradient itGradientEnd, - size_t trainFromLayer, - OutContainer& outputContainer, bool fetchOutput) const - { - Settings& settings = std::get<0>(settingsAndBatch); - Batch& batch = std::get<1>(settingsAndBatch); - DropContainer& dropContainer = std::get<2>(settingsAndBatch); - bool doBatchNormalization = settings.doBatchNormalization (); - bool usesDropOut = !dropContainer.empty (); + // ItWeight itGammaBegin = itWeightBegin + numWeights (); + // ItWeight itBetaBegin = itWeightBegin + numWeights () + numNodes (); + // ItGradient itGradGammaBegin = itGradientBegin + numWeights (); + // ItGradient itGradBetaBegin = itGradientBegin + numWeights () + numNodes (); - LayerData::const_dropout_iterator itDropOut; - if (usesDropOut) - itDropOut = std::begin (dropContainer); - if (_layers.empty ()) - { - std::cout << "no layers in this net" << std::endl; - throw std::string ("no layers in this net"); - } + // --------------------- prepare layer data for input layer ---------------------------- + layerPatternData.push_back (std::vector()); + for (const Pattern& _pattern : batch) + { + std::vector& layerData = layerPatternData.back (); + layerData.push_back (LayerData (numNodesPrev)); + itInputBegin = _pattern.beginInput (); + itInputEnd = _pattern.endInput (); + layerData.back ().setInput (itInputBegin, itInputEnd); + + if (usesDropOut) + layerData.back ().setDropOut (itDropOut); - double sumError = 0.0; - double sumWeights = 0.0; // ------------- + } + + + if (usesDropOut) + itDropOut += _layers.back ().numNodes (); - // ----------- create layer data ------------------------------------------------------- - assert (_layers.back ().numNodes () == outputSize ()); - size_t totalNumWeights = 0; - std::vector> layerPatternData; - layerPatternData.reserve (_layers.size ()+1); - ItWeight itWeight = itWeightBegin; - ItGradient itGradient = itGradientBegin; - size_t numNodesPrev = inputSize (); - typename Pattern::const_iterator itInputBegin; - typename Pattern::const_iterator itInputEnd; - - // --------------------- prepare layer data for input layer ---------------------------- + // ---------------- prepare subsequent layers --------------------------------------------- + // for each of the layers + for (auto itLayer = begin (_layers), itLayerEnd = end (_layers); itLayer != itLayerEnd; ++itLayer) + { + bool isOutputLayer = (itLayer+1 == itLayerEnd); + bool isFirstHiddenLayer = (itLayer == begin (_layers)); + + auto& layer = *itLayer; layerPatternData.push_back (std::vector()); - layerPatternData.back () . 
reserve(batch.size()); + // for each pattern, prepare a layerData for (const Pattern& _pattern : batch) { std::vector& layerData = layerPatternData.back (); - layerData.push_back (LayerData (numNodesPrev)); + //layerData.push_back (LayerData (numNodesPrev)); + + if (itGradientBegin == itGradientEnd) + { + layerData.push_back (LayerData (layer.numNodes (), itWeight, + layer.activationFunction (), + layer.modeOutputValues ())); + } + else + { + layerData.push_back (LayerData (layer.numNodes (), itWeight, itGradient, + layer.activationFunction (), + layer.inverseActivationFunction (), + layer.modeOutputValues ())); + } - itInputBegin = _pattern.beginInput (); - itInputEnd = _pattern.endInput (); - layerData.back ().setInput (itInputBegin, itInputEnd); - if (usesDropOut) { layerData.back ().setDropOut (itDropOut); } + } + if (usesDropOut) { - itDropOut += _layers.back ().numNodes (); + itDropOut += layer.numNodes (); } + size_t _numWeights = layer.numWeights (numNodesPrev); + totalNumWeights += _numWeights; + itWeight += _numWeights; + itGradient += _numWeights; + numNodesPrev = layer.numNodes (); + totalNumNodes += numNodesPrev; - // ---------------- prepare subsequent layers --------------------------------------------- - // for each of the layers - for (auto& layer: _layers) - { - layerPatternData.push_back (std::vector()); - layerPatternData.back () . reserve(batch.size()); - // for each pattern, prepare a layerData - for (const Pattern& _pattern : batch) - { - std::vector& layerData = layerPatternData.back (); - //layerData.push_back (LayerData (numNodesPrev)); + } + assert (totalNumWeights > 0); + return layerPatternData; +} - if (itGradientBegin == itGradientEnd) - layerData.push_back (LayerData (layer.numNodes (), itWeight, - layer.activationFunction (), - layer.modeOutputValues ())); - else - layerData.push_back (LayerData (layer.numNodes (), itWeight, itGradient, - layer.activationFunction (), - layer.inverseActivationFunction (), - layer.modeOutputValues ())); - if (usesDropOut) - { - layerData.back ().setDropOut (itDropOut); - } - } - - if (usesDropOut) - { - itDropOut += layer.numNodes (); - } - size_t _numWeights = layer.numWeights (numNodesPrev); - totalNumWeights += _numWeights; - itWeight += _numWeights; - itGradient += _numWeights; - numNodesPrev = layer.numNodes (); - } - assert (totalNumWeights > 0); + template + void Net::forwardPattern (const LayerContainer& _layers, + std::vector& layerData, std::vector& weightBucket, int BUCKET_SIZE) const + { + size_t idxLayer = 0, idxLayerEnd = _layers.size (); + size_t cumulativeNodeCount = 0; + for (; idxLayer < idxLayerEnd; ++idxLayer) + { + LayerData& prevLayerData = layerData.at (idxLayer); + LayerData& currLayerData = layerData.at (idxLayer+1); + + forward (prevLayerData, currLayerData, weightBucket, idxLayer, BUCKET_SIZE); + applyFunctions (currLayerData.valuesBegin (), currLayerData.valuesEnd (), currLayerData.activationFunction ()); + } + } - // ---------------------------------- loop over layers and pattern ------------------------------------------------------- - for (size_t idxLayer = 0, idxLayerEnd = layerPatternData.size (); idxLayer < idxLayerEnd-1; ++idxLayer) //std::vector& layerPattern : layerPatternData) - { - bool doTraining = idxLayer >= trainFromLayer; - // get layer-pattern data for this and the corresponding one from the next layer - std::vector& prevLayerPatternData = layerPatternData.at (idxLayer); - std::vector& currLayerPatternData = layerPatternData.at (idxLayer+1); - size_t numPattern = 
prevLayerPatternData.size (); - std::vector means (_layers.at (idxLayer).numNodes ()); - // ---------------- loop over layerDatas of pattern compute forward ---------------------------- - for (size_t idxPattern = 0; idxPattern < numPattern; ++idxPattern) - { - const LayerData& prevLayerData = prevLayerPatternData.at (idxPattern); - LayerData& currLayerData = currLayerPatternData.at (idxPattern); + template + void Net::forwardBatch (const LayerContainer& _layers, + LayerPatternContainer& layerPatternData, + std::vector& valuesMean, + std::vector& valuesStdDev, + size_t trainFromLayer, std::vector& weightBucket) const + { + valuesMean.clear (); + valuesStdDev.clear (); + + // ---------------------------------- loop over layers and pattern ------------------------------------------------------- + size_t cumulativeNodeCount = 0; + for (size_t idxLayer = 0, idxLayerEnd = layerPatternData.size (); idxLayer < idxLayerEnd-1; ++idxLayer) + { + bool doTraining = idxLayer >= trainFromLayer; + + // get layer-pattern data for this and the corresponding one from the next layer + std::vector& prevLayerPatternData = layerPatternData.at (idxLayer); + std::vector& currLayerPatternData = layerPatternData.at (idxLayer+1); + + size_t numPattern = prevLayerPatternData.size (); + size_t numNodesLayer = _layers.at (idxLayer).numNodes (); + + std::vector means (numNodesLayer); + // ---------------- loop over layerDatas of pattern compute forward ---------------------------- + for (size_t idxPattern = 0; idxPattern < numPattern; ++idxPattern) + { + const LayerData& prevLayerData = prevLayerPatternData.at (idxPattern); + LayerData& currLayerData = currLayerPatternData.at (idxPattern); - if (doTraining) - forward_training (prevLayerData, currLayerData); - else - forward (prevLayerData, currLayerData); + forward (prevLayerData, currLayerData, weightBucket, idxLayer, m_bucketSize); // feed forward + } + + // ---------------- loop over layerDatas of pattern apply non-linearities ---------------------------- + for (size_t idxPattern = 0; idxPattern < numPattern; ++idxPattern) + { + //const LayerData& prevLayerData = prevLayerPatternData.at (idxPattern); + LayerData& currLayerData = currLayerPatternData.at (idxPattern); + + if (doTraining) + applyFunctions (currLayerData.valuesBegin (), currLayerData.valuesEnd (), currLayerData.activationFunction (), + currLayerData.inverseActivationFunction (), currLayerData.valueGradientsBegin ()); + else + applyFunctions (currLayerData.valuesBegin (), currLayerData.valuesEnd (), currLayerData.activationFunction ()); + } + // accumulate node count + cumulativeNodeCount += numNodesLayer; + } +} - // -------- compute batch mean and variance if batch normalization is turned on ------------------ - if (doBatchNormalization && doTraining) - { -// means.at (idxPattern).add (*(prevLayerData.valuesBegin ()+idxPattern)); - } - } - // ---------------- do batch normalization ---------------------------- - if (doBatchNormalization) - { - if (doTraining) // take means and variances from batch - { - for (size_t idxPattern = 0; idxPattern < numPattern; ++idxPattern) - { - } - } - else // take average mean and variance for batch normalization - { - } - } - - // ---------------- loop over layerDatas of pattern apply non-linearities ---------------------------- - for (size_t idxPattern = 0; idxPattern < numPattern; ++idxPattern) - { -// const LayerData& prevLayerData = prevLayerPatternData.at (idxPattern); - LayerData& currLayerData = currLayerPatternData.at (idxPattern); - - if (doTraining) - 
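// --- Illustrative sketch, not part of the patch: forwardPattern/forwardBatch above call
// forward (prevLayerData, currLayerData, weightBucket, idxLayer, BUCKET_SIZE) once per layer,
// but the body of forward() is not in this hunk. The snippet below only shows what a dense
// forward step looks like when each logical weight index is resolved through a lookup such as
// the hashed-bucket access sketched earlier; the row-major index layout is an assumption.
#include <cstddef>
#include <functional>
#include <vector>

inline void denseForwardSketch (const std::vector<double>& input,
                                std::vector<double>& preActivations,  // one entry per output node
                                int firstLogicalIndex,                // first weight index of this layer
                                const std::function<double (int)>& weightAt)
{
   for (std::size_t iOut = 0; iOut < preActivations.size (); ++iOut)
   {
      double sum = 0.0;
      for (std::size_t iIn = 0; iIn < input.size (); ++iIn)
         sum += input[iIn] * weightAt (firstLogicalIndex
                                       + static_cast<int> (iOut * input.size () + iIn));
      preActivations[iOut] = sum;
   }
}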
applyFunctions (currLayerData.valuesBegin (), currLayerData.valuesEnd (), currLayerData.activationFunction (), - currLayerData.inverseActivationFunction (), currLayerData.valueGradientsBegin ()); - else - applyFunctions (currLayerData.valuesBegin (), currLayerData.valuesEnd (), currLayerData.activationFunction ()); - } - } + template + void Net::fetchOutput (const LayerData& lastLayerData, OutputContainer& outputContainer) const + { + ModeOutputValues eModeOutput = lastLayerData.outputMode (); + if (isFlagSet (ModeOutputValues::DIRECT, eModeOutput)) + { + outputContainer.insert (outputContainer.end (), lastLayerData.valuesBegin (), lastLayerData.valuesEnd ()); + } + else if (isFlagSet (ModeOutputValues::SIGMOID, eModeOutput) || + isFlagSet (ModeOutputValues::SOFTMAX, eModeOutput)) + { + const auto& prob = lastLayerData.probabilities (); + outputContainer.insert (outputContainer.end (), prob.begin (), prob.end ()) ; + } + else + assert (false); + } + + + + + template + void Net::fetchOutput (const std::vector& lastLayerPatternData, OutputContainer& outputContainer) const + { + for (const LayerData& lastLayerData : lastLayerPatternData) + fetchOutput (lastLayerData, outputContainer); + } + template + std::tuple Net::computeError (const Settings& settings, + std::vector& lastLayerData, + Batch& batch, Weights& weightBucket) const + { + typename std::vector::iterator itLayerData = lastLayerData.begin (); + + typename std::vector::iterator itLayerDataNext = itLayerData; + ++itLayerDataNext; + + typename std::vector::iterator itLayerDataEnd = lastLayerData.end (); + + typename std::vector::const_iterator itPattern = batch.begin (); + typename std::vector::const_iterator itPatternEnd = batch.end (); + + double sumWeights (0.0); + double sumError (0.0); - // ------------- fetch output ------------------ - if (fetchOutput) + size_t idxPattern = 0; + for ( ; itPattern != itPatternEnd; ++itPattern, ++itLayerData, ++itLayerDataNext, ++idxPattern) + { + + // compute E and the deltas of the computed output and the true output + LayerData& layerData = (*itLayerData); + LayerData& nextLayerData = (*itLayerDataNext); + const Pattern& _pattern = (*itPattern); + double error = errorFunction (layerData, nextLayerData, _pattern.output (), + _pattern.weight (), weightBucket, settings.factorWeightDecay (), + settings.regularization (), idxPattern); + sumWeights += fabs (_pattern.weight ()); + sumError += error; + } + return std::make_tuple (sumError, sumWeights); + } + + + + template + void Net::backPropagate (std::vector>& layerPatternData, std::vector& weightBucket, std::vector& gradientBucket, + const Settings& settings, + size_t trainFromLayer, + size_t totalNumWeights) const + { + bool doTraining = layerPatternData.size () > trainFromLayer; + if (doTraining) // training + { + // ------------- backpropagation ------------- + size_t idxLayer = layerPatternData.size (); + for (auto itLayerPatternData = layerPatternData.rbegin (), itLayerPatternDataBegin = layerPatternData.rend (); + itLayerPatternData != itLayerPatternDataBegin; ++itLayerPatternData) { - for (LayerData& lastLayerData : layerPatternData.back ()) + --idxLayer; + if (idxLayer <= trainFromLayer) // no training + break; + + std::vector& currLayerDataColl = *(itLayerPatternData); + std::vector& prevLayerDataColl = *(itLayerPatternData+1); + + size_t idxPattern = 0; + for (typename std::vector::iterator itCurrLayerData = begin (currLayerDataColl), itCurrLayerDataEnd = end (currLayerDataColl), + itPrevLayerData = begin (prevLayerDataColl), 
itPrevLayerDataEnd = end (prevLayerDataColl); + itCurrLayerData != itCurrLayerDataEnd; ++itCurrLayerData, ++itPrevLayerData, ++idxPattern) { - ModeOutputValues eModeOutput = lastLayerData.outputMode (); - if (TMVA::DNN::isFlagSet (ModeOutputValues::DIRECT, eModeOutput)) - { - outputContainer.insert (outputContainer.end (), lastLayerData.valuesBegin (), lastLayerData.valuesEnd ()); - } - else if (TMVA::DNN::isFlagSet (ModeOutputValues::SIGMOID, eModeOutput) || - TMVA::DNN::isFlagSet (ModeOutputValues::SOFTMAX, eModeOutput)) - { - const auto& probs = lastLayerData.probabilities (); - outputContainer.insert (outputContainer.end (), probs.begin (), probs.end ()); - } - else - assert (false); + LayerData& currLayerData = (*itCurrLayerData); + LayerData& prevLayerData = *(itPrevLayerData); + + backward (prevLayerData, currLayerData, weightBucket, idxLayer-1, m_bucketSize); + + // the factorWeightDecay has to be scaled by 1/n where n is the number of weights (synapses) + // because L1 and L2 regularization + // + // http://neuralnetworksanddeeplearning.com/chap3.html#overfitting_and_regularization + // + // L1 : -factorWeightDecay*sgn(w)/numWeights + // L2 : -factorWeightDecay/numWeights + update (prevLayerData, currLayerData, settings.factorWeightDecay ()/totalNumWeights, settings.regularization (), weightBucket, gradientBucket, idxLayer-1, m_bucketSize); } } + } + } - // ------------- error computation ------------- - std::vector& lastLayerData = layerPatternData.back (); - bool doTraining = layerPatternData.size () > trainFromLayer; +/*! \brief forward propagation and backward propagation + * + * + */ + template + double Net::forward_backward (LayerContainer& _layers, PassThrough& settingsAndBatch, + int itWeightBegin, int itWeightEnd, + int itGradientBegin, int itGradientEnd, + size_t trainFromLayer, + OutContainer& outputContainer, bool doFetchOutput, std::vector& weightBucket, std::vector& gradientBucket) const + { + Settings& settings = std::get<0>(settingsAndBatch); + Batch& batch = std::get<1>(settingsAndBatch); + DropContainer& dropContainer = std::get<2>(settingsAndBatch); + + double sumError = 0.0; + double sumWeights = 0.0; // ------------- - typename std::vector::iterator itLayerData = lastLayerData.begin (); - typename std::vector::iterator itLayerDataEnd = lastLayerData.end (); - typename std::vector::const_iterator itPattern = batch.begin (); - typename std::vector::const_iterator itPatternEnd = batch.end (); + // ----------------------------- prepare layer data ------------------------------------- + size_t totalNumWeights (0); + std::vector> layerPatternData = prepareLayerData (_layers, + batch, + dropContainer, + itWeightBegin, + itWeightEnd, + itGradientBegin, + itGradientEnd, + totalNumWeights); - size_t idxPattern = 0; - for ( ; itPattern != itPatternEnd; ++itPattern, ++itLayerData) - { - ++idxPattern; - - // compute E and the deltas of the computed output and the true output - LayerData& layerData = (*itLayerData); - const Pattern& _pattern = (*itPattern); - itWeight = itWeightBegin; - double error = errorFunction (layerData, _pattern.output (), - itWeight, itWeight + totalNumWeights, - _pattern.weight (), settings.factorWeightDecay (), - settings.regularization ()); - sumWeights += fabs (_pattern.weight ()); - sumError += error; - } - if (doTraining) // training + + // ---------------------------------- propagate forward ------------------------------------------------------------------ + std::vector valuesMean; + std::vector valuesStdDev; + forwardBatch (_layers, 
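// --- Illustrative sketch, not part of the patch: the comment in backPropagate above explains
// that factorWeightDecay is scaled by 1/totalNumWeights before reaching update(). The generic
// error-side form of that regularisation term is
//   L1 :  E -> E + (lambda/n) * sum_i |w_i|
//   L2 :  E -> E + (lambda/n) * 0.5 * sum_i w_i^2
// The exact convention used by weightDecay() is not visible in this hunk, so the helper below
// is an assumed illustration only.
#include <cmath>
#include <vector>

inline double addRegularisationSketch (double error, const std::vector<double>& weights,
                                       double lambdaOverN, bool useL1)
{
   double penalty = 0.0;
   for (double w : weights)
      penalty += useL1 ? std::fabs (w) : 0.5 * w * w;
   return error + lambdaOverN * penalty;
}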
layerPatternData, valuesMean, valuesStdDev, trainFromLayer, weightBucket); + + + // ------------- fetch output ------------------ + if (doFetchOutput) { + fetchOutput (layerPatternData.back (), outputContainer); + } + + + // ------------- error computation ------------- + std::tie (sumError, sumWeights) = computeError (settings, layerPatternData.back (), batch, weightBucket); + + // ------------- backpropagation ------------- - size_t idxLayer = layerPatternData.size (); - for (auto itLayerPatternData = layerPatternData.rbegin (), itLayerPatternDataBegin = layerPatternData.rend (); - itLayerPatternData != itLayerPatternDataBegin; ++itLayerPatternData) - { - --idxLayer; - if (idxLayer <= trainFromLayer) // no training - break; + backPropagate (layerPatternData, weightBucket, gradientBucket, settings, trainFromLayer, totalNumWeights); - std::vector& currLayerDataColl = *(itLayerPatternData); - std::vector& prevLayerDataColl = *(itLayerPatternData+1); - idxPattern = 0; - for (typename std::vector::iterator itCurrLayerData = begin (currLayerDataColl), itCurrLayerDataEnd = end (currLayerDataColl), - itPrevLayerData = begin (prevLayerDataColl), itPrevLayerDataEnd = end (prevLayerDataColl); - itCurrLayerData != itCurrLayerDataEnd; ++itCurrLayerData, ++itPrevLayerData, ++idxPattern) - { - LayerData& currLayerData = (*itCurrLayerData); - LayerData& prevLayerData = *(itPrevLayerData); - - backward (prevLayerData, currLayerData); - - // the factorWeightDecay has to be scaled by 1/n where n is the number of weights (synapses) - // because L1 and L2 regularization - // - // http://neuralnetworksanddeeplearning.com/chap3.html#overfitting_and_regularization - // - // L1 : -factorWeightDecay*sgn(w)/numWeights - // L2 : -factorWeightDecay/numWeights - update (prevLayerData, currLayerData, settings.factorWeightDecay ()/totalNumWeights, settings.regularization ()); - } - } - } - + // --- compile the measures double batchSize = std::distance (std::begin (batch), std::end (batch)); - for (auto it = itGradientBegin; it != itGradientEnd; ++it) + for (auto it = gradientBucket.begin(); it != gradientBucket.end(); ++it) (*it) /= batchSize; @@ -1410,7 +1473,7 @@ namespace TMVA * */ template - void Net::initializeWeights (WeightInitializationStrategy eInitStrategy, OutIterator itWeight) + void Net::initializeWeights (WeightInitializationStrategy eInitStrategy, OutIterator itWeight, std::vector& layerWeightNumber, int BUCKET_SIZE) { if (eInitStrategy == WeightInitializationStrategy::XAVIER) { @@ -1419,18 +1482,20 @@ namespace TMVA // compute variance and mean of input and output //... 
- + // compute the weights for (auto& layer: layers ()) { double nIn = numInput; double stdDev = sqrt (2.0/nIn); - for (size_t iWeight = 0, iWeightEnd = layer.numWeights (numInput); iWeight < iWeightEnd; ++iWeight) + // for (size_t iWeight = 0, iWeightEnd = (layer.numWeights (numInput) / BUCKET_SIZE); iWeight < iWeightEnd; ++iWeight) + for (size_t iWeight = 0, iWeightEnd = (BUCKET_SIZE); iWeight < iWeightEnd; ++iWeight) { (*itWeight) = DNN::gaussDouble (0.0, stdDev); // factor 2.0 for ReLU ++itWeight; } + layerWeightNumber.push_back((int)layer.numWeights (numInput)); numInput = layer.numNodes (); } return; @@ -1451,12 +1516,14 @@ namespace TMVA double nIn = numInput; double minVal = -sqrt(2.0/nIn); double maxVal = sqrt (2.0/nIn); - for (size_t iWeight = 0, iWeightEnd = layer.numWeights (numInput); iWeight < iWeightEnd; ++iWeight) + // for (size_t iWeight = 0, iWeightEnd = (layer.numWeights (numInput) / BUCKET_SIZE); iWeight < iWeightEnd; ++iWeight) + for (size_t iWeight = 0, iWeightEnd = (BUCKET_SIZE); iWeight < iWeightEnd; ++iWeight) { (*itWeight) = DNN::uniformDouble (minVal, maxVal); // factor 2.0 for ReLU ++itWeight; } + layerWeightNumber.push_back((int)layer.numWeights (numInput)); numInput = layer.numNodes (); } return; @@ -1475,11 +1542,13 @@ namespace TMVA for (auto& layer: layers ()) { // double nIn = numInput; - for (size_t iWeight = 0, iWeightEnd = layer.numWeights (numInput); iWeight < iWeightEnd; ++iWeight) + // for (size_t iWeight = 0, iWeightEnd = (layer.numWeights (numInput) / BUCKET_SIZE); iWeight < iWeightEnd; ++iWeight) + for (size_t iWeight = 0, iWeightEnd = (BUCKET_SIZE); iWeight < iWeightEnd; ++iWeight) { (*itWeight) = DNN::gaussDouble (0.0, 0.1); ++itWeight; } + layerWeightNumber.push_back((int)layer.numWeights (numInput)); numInput = layer.numNodes (); } return; @@ -1498,11 +1567,13 @@ namespace TMVA for (auto& layer: layers ()) { double nIn = numInput; - for (size_t iWeight = 0, iWeightEnd = layer.numWeights (numInput); iWeight < iWeightEnd; ++iWeight) + // for (size_t iWeight = 0, iWeightEnd = (layer.numWeights (numInput) / BUCKET_SIZE); iWeight < iWeightEnd; ++iWeight) + for (size_t iWeight = 0, iWeightEnd = (BUCKET_SIZE); iWeight < iWeightEnd; ++iWeight) { (*itWeight) = DNN::gaussDouble (0.0, sqrt (layer.numWeights (nIn))); // factor 2.0 for ReLU ++itWeight; } + layerWeightNumber.push_back((int)layer.numWeights (numInput)); numInput = layer.numNodes (); } return; @@ -1518,14 +1589,14 @@ namespace TMVA * * */ - template + template double Net::errorFunction (LayerData& layerData, + LayerData& nextLayerData, Container truth, - ItWeight itWeight, - ItWeight itWeightEnd, double patternWeight, + std::vector& weightBucket, double factorWeightDecay, - EnumRegularization eRegularization) const + EnumRegularization eRegularization, size_t layerNumber) const { double error (0); switch (m_eErrorFunction) @@ -1563,7 +1634,7 @@ namespace TMVA } if (factorWeightDecay != 0 && eRegularization != EnumRegularization::NONE) { - error = weightDecay (error, itWeight, itWeightEnd, factorWeightDecay, eRegularization); + error = weightDecay (error, layerData.weightsBegin (), nextLayerData.weightsBegin (), weightBucket, factorWeightDecay, eRegularization, layerNumber, m_bucketSize); } return error; } @@ -1574,118 +1645,115 @@ namespace TMVA - -/*! 
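// --- Illustrative sketch, not part of the patch: with bucketed weights, initializeWeights
// above draws only BUCKET_SIZE values per layer instead of one value per synapse, while
// layerWeightNumber records each layer's full (logical) weight count. The standalone snippet
// below mirrors only the XAVIER branch (w ~ N(0, sqrt(2/nIn)), factor 2.0 for ReLU); names
// such as nodesPerLayer and bucketSizePerLayer are illustrative.
#include <cmath>
#include <cstddef>
#include <random>
#include <vector>

inline std::vector<double> xavierBucketSketch (const std::vector<std::size_t>& nodesPerLayer,
                                               std::size_t numInput, int bucketSizePerLayer)
{
   std::vector<double> bucket;
   std::mt19937 generator (0);
   std::size_t nIn = numInput;
   for (std::size_t numNodes : nodesPerLayer)
   {
      std::normal_distribution<double> gauss (0.0, std::sqrt (2.0 / nIn)); // factor 2.0 for ReLU
      for (int i = 0; i < bucketSizePerLayer; ++i)
         bucket.push_back (gauss (generator));
      nIn = numNodes; // fan-in of the next layer
   }
   return bucket;
}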
\brief pre-training - * - * in development - */ - template - void Net::preTrain (std::vector& weights, - std::vector& trainPattern, - const std::vector& testPattern, - Minimizer& minimizer, Settings& settings) - { - auto itWeightGeneral = std::begin (weights); - std::vector prePatternTrain (trainPattern.size ()); - std::vector prePatternTest (testPattern.size ()); - - size_t _inputSize = inputSize (); - - // transform pattern using the created preNet - auto initializePrePattern = [&](const std::vector& pttrnInput, std::vector& pttrnOutput) - { - pttrnOutput.clear (); - std::transform (std::begin (pttrnInput), std::end (pttrnInput), - std::back_inserter (pttrnOutput), - [](const Pattern& p) - { - Pattern pat (p.input (), p.input (), p.weight ()); - return pat; - }); - }; - - initializePrePattern (trainPattern, prePatternTrain); - initializePrePattern (testPattern, prePatternTest); - - std::vector originalDropFractions = settings.dropFractions (); - - for (auto& _layer : layers ()) - { - // compute number of weights (as a function of the number of incoming nodes) - // fetch number of nodes - size_t numNodes = _layer.numNodes (); - size_t _numWeights = _layer.numWeights (_inputSize); - - // ------------------ - DNN::Net preNet; - if (!originalDropFractions.empty ()) - { - originalDropFractions.erase (originalDropFractions.begin ()); - settings.setDropOut (originalDropFractions.begin (), originalDropFractions.end (), settings.dropRepetitions ()); - } - std::vector preWeights; - - // define the preNet (pretraining-net) for this layer - // outputSize == inputSize, because this is an autoencoder; - preNet.setInputSize (_inputSize); - preNet.addLayer (DNN::Layer (numNodes, _layer.activationFunctionType ())); - preNet.addLayer (DNN::Layer (_inputSize, DNN::EnumFunction::LINEAR, DNN::ModeOutputValues::DIRECT)); - preNet.setErrorFunction (DNN::ModeErrorFunction::SUMOFSQUARES); - preNet.setOutputSize (_inputSize); // outputSize is the inputSize (autoencoder) - - // initialize weights - preNet.initializeWeights (DNN::WeightInitializationStrategy::XAVIERUNIFORM, - std::back_inserter (preWeights)); - - // overwrite already existing weights from the "general" weights - std::copy (itWeightGeneral, itWeightGeneral+_numWeights, preWeights.begin ()); - std::copy (itWeightGeneral, itWeightGeneral+_numWeights, preWeights.begin ()+_numWeights); // set identical weights for the temporary output layer +// /*! 
\brief pre-training +// * +// * in development +// */ +// template +// void Net::preTrain (std::vector& weights, +// std::vector& trainPattern, +// const std::vector& testPattern, +// Minimizer& minimizer, Settings& settings) +// { +// auto itWeightGeneral = std::begin (weights); +// std::vector prePatternTrain (trainPattern.size ()); +// std::vector prePatternTest (testPattern.size ()); + +// size_t _inputSize = inputSize (); + +// // transform pattern using the created preNet +// auto initializePrePattern = [&](const std::vector& pttrnInput, std::vector& pttrnOutput) +// { +// pttrnOutput.clear (); +// std::transform (std::begin (pttrnInput), std::end (pttrnInput), +// std::back_inserter (pttrnOutput), +// [](const Pattern& p) +// { +// Pattern pat (p.input (), p.input (), p.weight ()); +// return pat; +// }); +// }; + +// initializePrePattern (trainPattern, prePatternTrain); +// initializePrePattern (testPattern, prePatternTest); + +// std::vector originalDropFractions = settings.dropFractions (); + +// for (auto& _layer : layers ()) +// { +// // compute number of weights (as a function of the number of incoming nodes) +// // fetch number of nodes +// size_t numNodes = _layer.numNodes (); +// size_t _numWeights = _layer.numWeights (_inputSize); + +// // ------------------ +// DNN::Net preNet; +// if (!originalDropFractions.empty ()) +// { +// originalDropFractions.erase (originalDropFractions.begin ()); +// settings.setDropOut (originalDropFractions.begin (), originalDropFractions.end (), settings.dropRepetitions ()); +// } +// std::vector preWeights; + +// // define the preNet (pretraining-net) for this layer +// // outputSize == inputSize, because this is an autoencoder; +// preNet.setInputSize (_inputSize); +// preNet.addLayer (DNN::Layer (numNodes, _layer.activationFunctionType ())); +// preNet.addLayer (DNN::Layer (_inputSize, DNN::EnumFunction::LINEAR, DNN::ModeOutputValues::DIRECT)); +// preNet.setErrorFunction (DNN::ModeErrorFunction::SUMOFSQUARES); +// preNet.setOutputSize (_inputSize); // outputSize is the inputSize (autoencoder) + +// // initialize weights +// preNet.initializeWeights (DNN::WeightInitializationStrategy::XAVIERUNIFORM, +// std::back_inserter (preWeights)); + +// // overwrite already existing weights from the "general" weights +// std::copy (itWeightGeneral, itWeightGeneral+_numWeights, preWeights.begin ()); +// std::copy (itWeightGeneral, itWeightGeneral+_numWeights, preWeights.begin ()+_numWeights); // set identical weights for the temporary output layer - // train the "preNet" - preNet.train (preWeights, prePatternTrain, prePatternTest, minimizer, settings); +// // train the "preNet" +// preNet.train (preWeights, prePatternTrain, prePatternTest, minimizer, settings); - // fetch the pre-trained weights (without the output part of the autoencoder) - std::copy (std::begin (preWeights), std::begin (preWeights) + _numWeights, itWeightGeneral); +// // fetch the pre-trained weights (without the output part of the autoencoder) +// std::copy (std::begin (preWeights), std::begin (preWeights) + _numWeights, itWeightGeneral); - // advance the iterator on the incoming weights - itWeightGeneral += _numWeights; +// // advance the iterator on the incoming weights +// itWeightGeneral += _numWeights; - // remove the weights of the output layer of the preNet - preWeights.erase (preWeights.begin () + _numWeights, preWeights.end ()); +// // remove the weights of the output layer of the preNet +// preWeights.erase (preWeights.begin () + _numWeights, preWeights.end ()); - // 
remove the outputLayer of the preNet - preNet.removeLayer (); +// // remove the outputLayer of the preNet +// preNet.removeLayer (); - // set the output size to the number of nodes in the new output layer (== last hidden layer) - preNet.setOutputSize (numNodes); +// // set the output size to the number of nodes in the new output layer (== last hidden layer) +// preNet.setOutputSize (numNodes); - // transform pattern using the created preNet - auto proceedPattern = [&](std::vector& pttrn) - { - std::vector newPttrn; - std::for_each (std::begin (pttrn), std::end (pttrn), - [&preNet,&preWeights,&newPttrn](Pattern& p) - { - std::vector output = preNet.compute (p.input (), preWeights); - Pattern pat (output, output, p.weight ()); - newPttrn.push_back (pat); -// p = pat; - }); - return newPttrn; - }; - - - prePatternTrain = proceedPattern (prePatternTrain); - prePatternTest = proceedPattern (prePatternTest); - - - // the new input size is the output size of the already reduced preNet - _inputSize = preNet.layers ().back ().numNodes (); - } - } - - +// // transform pattern using the created preNet +// auto proceedPattern = [&](std::vector& pttrn) +// { +// std::vector newPttrn; +// std::for_each (std::begin (pttrn), std::end (pttrn), +// [&preNet,&preWeights,&newPttrn](Pattern& p) +// { +// std::vector output = preNet.compute (p.input (), preWeights); +// Pattern pat (output, output, p.weight ()); +// newPttrn.push_back (pat); +// // p = pat; +// }); +// return newPttrn; +// }; + + +// prePatternTrain = proceedPattern (prePatternTrain); +// prePatternTest = proceedPattern (prePatternTest); + + +// // the new input size is the output size of the already reduced preNet +// _inputSize = preNet.layers ().back ().numNodes (); +// } +// } diff --git a/tmva/tmva/src/MethodDNN.cxx b/tmva/tmva/src/MethodDNN.cxx index c16044730c6cc..838ab10bc78f4 100644 --- a/tmva/tmva/src/MethodDNN.cxx +++ b/tmva/tmva/src/MethodDNN.cxx @@ -1,5 +1,5 @@ // @(#)root/tmva $Id$ -// Author: Peter Speckmayer +// Authors: Peter Speckmayer, Aditya Sharma /********************************************************************************** * Project: TMVA - a Root-integrated toolkit for multivariate data analysis * @@ -10,7 +10,8 @@ * Description: * * A neural network implementation * * * - * Authors (alphabetical): * + * Authors (alphabetical): + * Aditya Sharma - CERN, Switzerland * Peter Speckmayer - CERN, Switzerland * * * * Copyright (c) 2005-2015: * @@ -79,9 +80,8 @@ ClassImp(TMVA::MethodDNN) TMVA::MethodDNN::MethodDNN( const TString& jobName, const TString& methodTitle, DataSetInfo& theData, - const TString& theOption, - TDirectory* theTargetDir ) - : MethodBase( jobName, Types::kDNN, methodTitle, theData, theOption, theTargetDir ) + const TString& theOption ) + : MethodBase( jobName, Types::kDNN, methodTitle, theData, theOption) , fResume (false) { // standard constructor @@ -89,9 +89,8 @@ TMVA::MethodDNN::MethodDNN( const TString& jobName, //______________________________________________________________________________ TMVA::MethodDNN::MethodDNN( DataSetInfo& theData, - const TString& theWeightFile, - TDirectory* theTargetDir ) - : MethodBase( Types::kDNN, theData, theWeightFile, theTargetDir ) + const TString& theWeightFile) + : MethodBase( Types::kDNN, theData, theWeightFile) , fResume (false) { // constructor from a weight file @@ -429,6 +428,8 @@ void TMVA::MethodDNN::ProcessOptions() std::vector dropConfig; dropConfig = fetchValue (block, "DropConfig", dropConfig); int dropRepetitions = fetchValue (block, 
"DropRepetitions", 3); + int bucketSize = fetchValue (block, "BucketSize", 8); + fBucketSize = bucketSize; TMVA::DNN::EnumRegularization eRegularization = TMVA::DNN::EnumRegularization::NONE; if (regularization == "L1") @@ -452,7 +453,7 @@ void TMVA::MethodDNN::ProcessOptions() std::shared_ptr ptrSettings = make_shared ( GetName (), convergenceSteps, batchSize, - testRepetitions, factorWeightDecay, + testRepetitions, factorWeightDecay, bucketSize, eRegularization, fScaleToNumEvents, TMVA::DNN::MinimizerType::fSteepest, learningRate, momentum, repetitions, multithreading); @@ -464,7 +465,7 @@ void TMVA::MethodDNN::ProcessOptions() std::shared_ptr ptrSettings = make_shared ( GetName (), convergenceSteps, batchSize, - testRepetitions, factorWeightDecay, + testRepetitions, factorWeightDecay, bucketSize, eRegularization, TMVA::DNN::MinimizerType::fSteepest, learningRate, momentum, repetitions, multithreading); @@ -475,7 +476,7 @@ void TMVA::MethodDNN::ProcessOptions() std::shared_ptr ptrSettings = make_shared ( GetName (), convergenceSteps, batchSize, - testRepetitions, factorWeightDecay, + testRepetitions, factorWeightDecay, bucketSize, eRegularization, TMVA::DNN::MinimizerType::fSteepest, learningRate, momentum, repetitions, multithreading); @@ -549,15 +550,17 @@ void TMVA::MethodDNN::Train() if (trainPattern.empty () || testPattern.empty ()) return; - // create net and weights + // create net and weight bucket fNet.clear (); - fWeights.clear (); + fWeightBucket.clear (); + + std::vector layerWeightNumber; // if "resume" from saved weights if (fResume) { std::cout << ".. resume" << std::endl; - // std::tie (fNet, fWeights) = ReadWeights (fFileName); + // std::tie (fNet, fWeightBucket) = ReadWeights (fFileName); } else // initialize weights and net { @@ -601,7 +604,7 @@ void TMVA::MethodDNN::Train() // initialize weights fNet.initializeWeights (fWeightInitializationStrategy, - std::back_inserter (fWeights)); + std::back_inserter (fWeightBucket), layerWeightNumber, fBucketSize); } @@ -637,7 +640,7 @@ void TMVA::MethodDNN::Train() if (ptrSettings->minimizerType () == TMVA::DNN::MinimizerType::fSteepest) { DNN::Steepest minimizer (ptrSettings->learningRate (), ptrSettings->momentum (), ptrSettings->repetitions ()); - /*E =*/fNet.train (fWeights, trainPattern, testPattern, minimizer, *ptrSettings.get ()); + /*E =*/fNet.train (fWeightBucket, layerWeightNumber, trainPattern, testPattern, minimizer, *ptrSettings.get ()); } ptrSettings.reset (); Log () << kINFO << Endl; @@ -652,13 +655,13 @@ void TMVA::MethodDNN::Train() //_______________________________________________________________________ Double_t TMVA::MethodDNN::GetMvaValue( Double_t* /*errLower*/, Double_t* /*errUpper*/ ) { - if (fWeights.empty ()) + if (fWeightBucket.empty ()) return 0.0; const std::vector& inputValues = GetEvent ()->GetValues (); std::vector input (inputValues.begin (), inputValues.end ()); input.push_back (1.0); // bias node - std::vector output = fNet.compute (input, fWeights); + std::vector output = fNet.compute (input, fWeightBucket, fBucketSize); if (output.empty ()) return 0.0; @@ -670,8 +673,8 @@ Double_t TMVA::MethodDNN::GetMvaValue( Double_t* /*errLower*/, Double_t* /*errUp const std::vector &TMVA::MethodDNN::GetRegressionValues() { - assert (!fWeights.empty ()); - if (fWeights.empty ()) + assert (!fWeightBucket.empty ()); + if (fWeightBucket.empty ()) return *fRegressionReturnVal; const Event * ev = GetEvent(); @@ -679,7 +682,7 @@ const std::vector &TMVA::MethodDNN::GetRegressionValues() const std::vector& 
inputValues = ev->GetValues (); std::vector input (inputValues.begin (), inputValues.end ()); input.push_back (1.0); // bias node - std::vector output = fNet.compute (input, fWeights); + std::vector output = fNet.compute (input, fWeightBucket, fBucketSize); if (fRegressionReturnVal == NULL) fRegressionReturnVal = new std::vector(); fRegressionReturnVal->clear(); @@ -717,13 +720,13 @@ const std::vector &TMVA::MethodDNN::GetRegressionValues() const std::vector &TMVA::MethodDNN::GetMulticlassValues() { - if (fWeights.empty ()) + if (fWeightBucket.empty ()) return *fRegressionReturnVal; const std::vector& inputValues = GetEvent ()->GetValues (); std::vector input (inputValues.begin (), inputValues.end ()); input.push_back (1.0); // bias node - std::vector output = fNet.compute (input, fWeights); + std::vector output = fNet.compute (input, fWeightBucket, fBucketSize); // check the output of the network @@ -787,10 +790,10 @@ void TMVA::MethodDNN::AddWeightsXMLTo( void* parent ) const void* weightsxml = gTools().xmlengine().NewChild(nn, 0, "Synapses"); gTools().xmlengine().NewAttr (weightsxml, 0, "InputSize", gTools().StringFromInt((int)fNet.inputSize ())); gTools().xmlengine().NewAttr (weightsxml, 0, "OutputSize", gTools().StringFromInt((int)fNet.outputSize ())); - gTools().xmlengine().NewAttr (weightsxml, 0, "NumberSynapses", gTools().StringFromInt((int)fWeights.size ())); + gTools().xmlengine().NewAttr (weightsxml, 0, "NumberSynapses", gTools().StringFromInt((int)fNet.numWeights ())); std::stringstream s(""); s.precision( 16 ); - for (std::vector::const_iterator it = fWeights.begin (), itEnd = fWeights.end (); it != itEnd; ++it) + for (std::vector::const_iterator it = fWeightBucket.begin (), itEnd = fWeightBucket.end (); it != itEnd; ++it) { s << std::scientific << (*it) << " "; } @@ -857,11 +860,11 @@ void TMVA::MethodDNN::ReadWeightsFromXML( void* wghtnode ) const char* content = gTools().GetContent (xmlWeights); std::stringstream sstr (content); - for (Int_t iWeight = 0; iWeight> weight; - fWeights.push_back (weight); + fWeightBucket.push_back (weight); } } @@ -1099,9 +1102,9 @@ void TMVA::MethodDNN::checkGradients () fNet.addLayer (DNN::Layer (outputSize, DNN::EnumFunction::LINEAR, DNN::ModeOutputValues::SIGMOID)); fNet.setErrorFunction (DNN::ModeErrorFunction::CROSSENTROPY); // net.setErrorFunction (ModeErrorFunction::SUMOFSQUARES); - + const int BUCKET_SIZE = 8; size_t numWeights = fNet.numWeights (inputSize); - std::vector weights (numWeights); + std::vector weightBucket (numWeights / BUCKET_SIZE); //weights.at (0) = 1000213.2; std::vector pattern; @@ -1121,7 +1124,7 @@ void TMVA::MethodDNN::checkGradients () } - DNN::Settings settings (TString ("checkGradients"), /*_convergenceSteps*/ 15, /*_batchSize*/ 1, /*_testRepetitions*/ 7, /*_factorWeightDecay*/ 0, /*regularization*/ TMVA::DNN::EnumRegularization::NONE); + DNN::Settings settings (TString ("checkGradients"), /*_convergenceSteps*/ 15, /*_batchSize*/ 1, /*_testRepetitions*/ 7, /*_factorWeightDecay*/ 0, /*_bucketSize*/ 8, /*regularization*/ TMVA::DNN::EnumRegularization::NONE); size_t improvements = 0; size_t worsenings = 0; @@ -1129,26 +1132,26 @@ void TMVA::MethodDNN::checkGradients () size_t largeDifferences = 0; for (size_t iTest = 0; iTest < 1000; ++iTest) { - TMVA::DNN::uniformDouble (weights, 0.7); - std::vector gradients (numWeights, 0); + TMVA::DNN::uniformDouble (weightBucket, 0.7); + std::vector gradientBucket (numWeights / BUCKET_SIZE, 0); DNN::Batch batch (begin (pattern), end (pattern)); DNN::DropContainer 
dropContainer; std::tuple settingsAndBatch (settings, batch, dropContainer); - double E = fNet (settingsAndBatch, weights, gradients); - std::vector changedWeights; - changedWeights.assign (weights.begin (), weights.end ()); + double E = fNet (settingsAndBatch, weightBucket, gradientBucket); + std::vector changedWeightBucket; + changedWeightBucket.assign (weightBucket.begin (), weightBucket.end ()); - int changeWeightPosition = TMVA::DNN::randomInt (numWeights); - double dEdw = gradients.at (changeWeightPosition); + int changeWeightPosition = TMVA::DNN::randomInt (numWeights / BUCKET_SIZE); + double dEdw = gradientBucket.at (changeWeightPosition); while (dEdw == 0.0) { - changeWeightPosition = TMVA::DNN::randomInt (numWeights); - dEdw = gradients.at (changeWeightPosition); + changeWeightPosition = TMVA::DNN::randomInt (numWeights / BUCKET_SIZE); + dEdw = gradientBucket.at (changeWeightPosition); } const double gamma = 0.01; double delta = gamma*dEdw; - changedWeights.at (changeWeightPosition) += delta; + changedWeightBucket.at (changeWeightPosition) += delta; if (dEdw == 0.0) { std::cout << "dEdw == 0.0 "; @@ -1156,7 +1159,7 @@ void TMVA::MethodDNN::checkGradients () } assert (dEdw != 0.0); - double Echanged = fNet (settingsAndBatch, changedWeights); + double Echanged = fNet (settingsAndBatch, changedWeightBucket); // double difference = fabs((E-Echanged) - delta*dEdw); double difference = fabs ((E+delta - Echanged)/E); @@ -1185,7 +1188,7 @@ void TMVA::MethodDNN::checkGradients () } else { - // for_each (begin (weights), end (weights), [](double w){ std::cout << w << ", "; }); + // for_each (begin (weightBucket), end (weightBucket), [](double w){ std::cout << w << ", "; }); // std::cout << std::endl; // assert (isOk); } diff --git a/tmva/tmva/src/NeuralNet.cxx b/tmva/tmva/src/NeuralNet.cxx index a8394f649ecc1..b8670cf1e1ccf 100644 --- a/tmva/tmva/src/NeuralNet.cxx +++ b/tmva/tmva/src/NeuralNet.cxx @@ -1,197 +1,213 @@ #include "TMVA/NeuralNet.h" - +#include namespace TMVA { - namespace DNN - { + namespace DNN + { + + int hasherFunction(int a) + { + a = (a+0x7ed55d16) + (a<<12); + a = (a^0xc761c23c) ^ (a>>19); + a = (a+0x165667b1) + (a<<5); + a = (a+0xd3a2646c) ^ (a<<9); + a = (a+0xfd7046c5) + (a<<3); + a = (a^0xb55a4f09) ^ (a>>16); + return std::abs(a); + } - double gaussDouble (double mean, double sigma) - { - static std::default_random_engine generator; - std::normal_distribution distribution (mean, sigma); - return distribution (generator); - } + double gaussDouble (double mean, double sigma) + { + static std::default_random_engine generator; + std::normal_distribution distribution (mean, sigma); + return distribution (generator); + } - double uniformDouble (double minValue, double maxValue) - { - static std::default_random_engine generator; - std::uniform_real_distribution distribution(minValue, maxValue); - return distribution(generator); - } + double uniformDouble (double minValue, double maxValue) + { + static std::default_random_engine generator; + std::uniform_real_distribution distribution(minValue, maxValue); + return distribution(generator); + } - int randomInt (int maxValue) - { - static std::default_random_engine generator; - std::uniform_int_distribution distribution(0,maxValue-1); - return distribution(generator); - } - - - double studenttDouble (double distributionParameter) - { - static std::default_random_engine generator; - std::student_t_distribution distribution (distributionParameter); - return distribution (generator); - } - - - LayerData::LayerData (size_t 
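// --- Illustrative sketch, not part of the patch: MethodDNN::checkGradients above perturbs a
// single bucket entry by delta = gamma * dE/dw and compares the measured change in E against
// what the backpropagated derivative predicts (first-order Taylor: E(w+d) ~ E(w) + d * dE/dw).
// A generic central-difference estimate that could be compared against a gradientBucket entry
// is sketched below; lossFn and eps are illustrative names.
#include <cstddef>
#include <functional>
#include <vector>

inline double numericalGradientSketch (const std::function<double (const std::vector<double>&)>& lossFn,
                                       std::vector<double> weights, std::size_t index,
                                       double eps = 1e-6)
{
   weights.at (index) += eps;
   const double up = lossFn (weights);    // E(w + eps)
   weights.at (index) -= 2.0 * eps;
   const double down = lossFn (weights);  // E(w - eps)
   return (up - down) / (2.0 * eps);      // central difference
}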
inputSize) - : m_isInputLayer (true) - , m_hasWeights (false) - , m_hasGradients (false) - , m_eModeOutput (ModeOutputValues::DIRECT) - { - m_size = inputSize; - m_deltas.assign (m_size, 0); - } - - - - LayerData::LayerData (const_iterator_type itInputBegin, const_iterator_type itInputEnd, ModeOutputValues eModeOutput) - : m_isInputLayer (true) - , m_hasWeights (false) - , m_hasGradients (false) - , m_eModeOutput (eModeOutput) - { - m_itInputBegin = itInputBegin; - m_itInputEnd = itInputEnd; - m_size = std::distance (itInputBegin, itInputEnd); - m_deltas.assign (m_size, 0); - } - - - - - LayerData::LayerData (size_t _size, - const_iterator_type itWeightBegin, - iterator_type itGradientBegin, - std::shared_ptr> _activationFunction, - std::shared_ptr> _inverseActivationFunction, - ModeOutputValues eModeOutput) - : m_size (_size) - , m_itConstWeightBegin (itWeightBegin) - , m_itGradientBegin (itGradientBegin) - , m_activationFunction (_activationFunction) - , m_inverseActivationFunction (_inverseActivationFunction) - , m_isInputLayer (false) - , m_hasWeights (true) - , m_hasGradients (true) - , m_eModeOutput (eModeOutput) - { - m_values.assign (_size, 0); - m_deltas.assign (_size, 0); - m_valueGradients.assign (_size, 0); - } - - - - - LayerData::LayerData (size_t _size, const_iterator_type itWeightBegin, - std::shared_ptr> _activationFunction, - ModeOutputValues eModeOutput) - : m_size (_size) - , m_itConstWeightBegin (itWeightBegin) - , m_activationFunction (_activationFunction) - , m_isInputLayer (false) - , m_hasWeights (true) - , m_hasGradients (false) - , m_eModeOutput (eModeOutput) - { - m_values.assign (_size, 0); - } - - - - typename LayerData::container_type LayerData::computeProbabilities () - { - container_type probabilitiesContainer; - if (TMVA::DNN::isFlagSet (ModeOutputValues::SIGMOID, m_eModeOutput)) + int randomInt (int maxValue) + { + static std::default_random_engine generator; + std::uniform_int_distribution distribution(0,maxValue-1); + return distribution(generator); + } + + + double studenttDouble (double distributionParameter) + { + static std::default_random_engine generator; + std::student_t_distribution distribution (distributionParameter); + return distribution (generator); + } + + + LayerData::LayerData (size_t inputSize) + : m_hasDropOut (false) + , m_isInputLayer (true) + , m_hasWeights (false) + , m_hasGradients (false) + , m_eModeOutput (ModeOutputValues::DIRECT) + { + m_size = inputSize; + m_deltas.assign (m_size, 0); + } + + + + LayerData::LayerData (const_iterator_type itInputBegin, const_iterator_type itInputEnd, ModeOutputValues eModeOutput) + : m_hasDropOut (false) + , m_isInputLayer (true) + , m_hasWeights (false) + , m_hasGradients (false) + , m_eModeOutput (eModeOutput) + { + m_itInputBegin = itInputBegin; + m_itInputEnd = itInputEnd; + m_size = std::distance (itInputBegin, itInputEnd); + m_deltas.assign (m_size, 0); + } + + + + + LayerData::LayerData (size_t _size, + int itWeightBegin, + int itGradientBegin, + std::shared_ptr> _activationFunction, + std::shared_ptr> _inverseActivationFunction, + ModeOutputValues eModeOutput) + : m_size (_size) + , m_hasDropOut (false) + , m_itConstWeightBegin (itWeightBegin) + , m_itGradientBegin (itGradientBegin) + , m_activationFunction (_activationFunction) + , m_inverseActivationFunction (_inverseActivationFunction) + , m_isInputLayer (false) + , m_hasWeights (true) + , m_hasGradients (true) + , m_eModeOutput (eModeOutput) + { + m_values.assign (_size, 0); + m_deltas.assign (_size, 0); + m_valueGradients.assign 
(_size, 0); + } + + + + + LayerData::LayerData (size_t _size, int itWeightBegin, + std::shared_ptr> _activationFunction, + ModeOutputValues eModeOutput) + : m_size (_size) + , m_hasDropOut (false) + , m_itConstWeightBegin (itWeightBegin) + , m_activationFunction (_activationFunction) + , m_inverseActivationFunction () + , m_isInputLayer (false) + , m_hasWeights (true) + , m_hasGradients (false) + , m_eModeOutput (eModeOutput) + { + m_values.assign (_size, 0); + } + + + + typename LayerData::container_type LayerData::computeProbabilities () const + { + container_type probabilitiesContainer; + if (TMVA::DNN::isFlagSet (ModeOutputValues::SIGMOID, m_eModeOutput)) { - std::transform (begin (m_values), end (m_values), std::back_inserter (probabilitiesContainer), (*Sigmoid.get ())); + std::transform (begin (m_values), end (m_values), std::back_inserter (probabilitiesContainer), (*Sigmoid.get ())); } - else if (TMVA::DNN::isFlagSet (ModeOutputValues::SOFTMAX, m_eModeOutput)) + else if (TMVA::DNN::isFlagSet (ModeOutputValues::SOFTMAX, m_eModeOutput)) { - double sum = 0; - probabilitiesContainer = m_values; - std::for_each (begin (probabilitiesContainer), end (probabilitiesContainer), [&sum](double& p){ p = std::exp (p); sum += p; }); - if (sum != 0) - std::for_each (begin (probabilitiesContainer), end (probabilitiesContainer), [sum ](double& p){ p /= sum; }); + double sum = 0; + probabilitiesContainer = m_values; + std::for_each (begin (probabilitiesContainer), end (probabilitiesContainer), [&sum](double& p){ p = std::exp (p); sum += p; }); + if (sum != 0) + std::for_each (begin (probabilitiesContainer), end (probabilitiesContainer), [sum ](double& p){ p /= sum; }); } - else + else { - probabilitiesContainer.assign (begin (m_values), end (m_values)); + probabilitiesContainer.assign (begin (m_values), end (m_values)); } - return probabilitiesContainer; - } + return probabilitiesContainer; + } - Layer::Layer (size_t _numNodes, EnumFunction _activationFunction, ModeOutputValues eModeOutputValues) - : m_numNodes (_numNodes) - , m_eModeOutputValues (eModeOutputValues) - , m_activationFunctionType (_activationFunction) - { - for (size_t iNode = 0; iNode < _numNodes; ++iNode) + Layer::Layer (size_t _numNodes, EnumFunction _activationFunction, ModeOutputValues eModeOutputValues) + : m_numNodes (_numNodes) + , m_eModeOutputValues (eModeOutputValues) + , m_activationFunctionType (_activationFunction) + { + for (size_t iNode = 0; iNode < _numNodes; ++iNode) { - auto actFnc = Linear; - auto invActFnc = InvLinear; - switch (_activationFunction) - { - case EnumFunction::ZERO: - actFnc = ZeroFnc; - invActFnc = ZeroFnc; - break; - case EnumFunction::LINEAR: - actFnc = Linear; - invActFnc = InvLinear; - break; - case EnumFunction::TANH: - actFnc = Tanh; - invActFnc = InvTanh; - break; - case EnumFunction::RELU: - actFnc = ReLU; - invActFnc = InvReLU; - break; - case EnumFunction::SYMMRELU: - actFnc = SymmReLU; - invActFnc = InvSymmReLU; - break; - case EnumFunction::TANHSHIFT: - actFnc = TanhShift; - invActFnc = InvTanhShift; - break; - case EnumFunction::SOFTSIGN: - actFnc = SoftSign; - invActFnc = InvSoftSign; - break; - case EnumFunction::SIGMOID: - actFnc = Sigmoid; - invActFnc = InvSigmoid; - break; - case EnumFunction::GAUSS: - actFnc = Gauss; - invActFnc = InvGauss; - break; - case EnumFunction::GAUSSCOMPLEMENT: - actFnc = GaussComplement; - invActFnc = InvGaussComplement; - break; - } - m_activationFunction = actFnc; - m_inverseActivationFunction = invActFnc; + auto actFnc = Linear; + auto invActFnc = 
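// --- Illustrative sketch, not part of the patch: computeProbabilities above maps raw node
// values to probabilities,
//   SIGMOID : p   = 1 / (1 + exp(-v))
//   SOFTMAX : p_i = exp(v_i) / sum_j exp(v_j)
// For very large |v| a plain softmax can overflow; a common, numerically safer variant
// subtracts the maximum value before exponentiating, shown below for illustration only.
#include <algorithm>
#include <cmath>
#include <vector>

inline std::vector<double> stableSoftmaxSketch (std::vector<double> values)
{
   if (values.empty ()) return values;
   const double vMax = *std::max_element (values.begin (), values.end ());
   double sum = 0.0;
   for (double& v : values) { v = std::exp (v - vMax); sum += v; }
   for (double& v : values) v /= sum;
   return values;
}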
InvLinear; + switch (_activationFunction) + { + case EnumFunction::ZERO: + actFnc = ZeroFnc; + invActFnc = ZeroFnc; + break; + case EnumFunction::LINEAR: + actFnc = Linear; + invActFnc = InvLinear; + break; + case EnumFunction::TANH: + actFnc = Tanh; + invActFnc = InvTanh; + break; + case EnumFunction::RELU: + actFnc = ReLU; + invActFnc = InvReLU; + break; + case EnumFunction::SYMMRELU: + actFnc = SymmReLU; + invActFnc = InvSymmReLU; + break; + case EnumFunction::TANHSHIFT: + actFnc = TanhShift; + invActFnc = InvTanhShift; + break; + case EnumFunction::SOFTSIGN: + actFnc = SoftSign; + invActFnc = InvSoftSign; + break; + case EnumFunction::SIGMOID: + actFnc = Sigmoid; + invActFnc = InvSigmoid; + break; + case EnumFunction::GAUSS: + actFnc = Gauss; + invActFnc = InvGauss; + break; + case EnumFunction::GAUSSCOMPLEMENT: + actFnc = GaussComplement; + invActFnc = InvGaussComplement; + break; + } + m_activationFunction = actFnc; + m_inverseActivationFunction = invActFnc; } - } + } @@ -202,40 +218,39 @@ namespace TMVA - Settings::Settings (TString name, - size_t _convergenceSteps, size_t _batchSize, size_t _testRepetitions, - double _factorWeightDecay, EnumRegularization eRegularization, - MinimizerType _eMinimizerType, double _learningRate, - double _momentum, int _repetitions, bool _useMultithreading, - bool _doBatchNormalization) - : m_timer (100, name) - , m_minProgress (0) - , m_maxProgress (100) - , m_convergenceSteps (_convergenceSteps) - , m_batchSize (_batchSize) - , m_testRepetitions (_testRepetitions) - , m_factorWeightDecay (_factorWeightDecay) - , count_E (0) - , count_dE (0) - , count_mb_E (0) - , count_mb_dE (0) - , m_regularization (eRegularization) - , fLearningRate (_learningRate) - , fMomentum (_momentum) - , fRepetitions (_repetitions) - , fMinimizerType (_eMinimizerType) - , m_convergenceCount (0) - , m_maxConvergenceCount (0) - , m_minError (1e10) - , m_useMultithreading (_useMultithreading) - , m_doBatchNormalization (_doBatchNormalization) - , fMonitoring (NULL) - { - } + Settings::Settings (TString name, + size_t _convergenceSteps, size_t _batchSize, size_t _testRepetitions, + double _factorWeightDecay, int _bucketSize, EnumRegularization eRegularization, + MinimizerType _eMinimizerType, double _learningRate, + double _momentum, int _repetitions, bool _useMultithreading) + : m_timer (100, name) + , m_minProgress (0) + , m_maxProgress (100) + , m_convergenceSteps (_convergenceSteps) + , m_batchSize (_batchSize) + , m_testRepetitions (_testRepetitions) + , m_factorWeightDecay (_factorWeightDecay) + , count_E (0) + , count_dE (0) + , count_mb_E (0) + , count_mb_dE (0) + , m_regularization (eRegularization) + , fLearningRate (_learningRate) + , fMomentum (_momentum) + , fRepetitions (_repetitions) + , fMinimizerType (_eMinimizerType) + , m_convergenceCount (0) + , m_maxConvergenceCount (0) + , m_minError (1e10) + , m_useMultithreading (_useMultithreading) + , fMonitoring (NULL) + , fBucketSize (_bucketSize) + { + } - Settings::~Settings () - { - } + Settings::~Settings () + { + } @@ -250,260 +265,260 @@ namespace TMVA - /** \brief action to be done when the training cycle is started (e.g. update some monitoring output) - * - */ - void ClassificationSettings::startTrainCycle () - { - if (fMonitoring) + /** \brief action to be done when the training cycle is started (e.g. 
update some monitoring output) + * + */ + void ClassificationSettings::startTrainCycle () + { + if (fMonitoring) { - create ("ROC", 100, 0, 1, 100, 0, 1); - create ("Significance", 100, 0, 1, 100, 0, 3); - create ("OutputSig", 100, 0, 1); - create ("OutputBkg", 100, 0, 1); - fMonitoring->ProcessEvents (); + create ("ROC", 100, 0, 1, 100, 0, 1); + create ("Significance", 100, 0, 1, 100, 0, 3); + create ("OutputSig", 100, 0, 1); + create ("OutputBkg", 100, 0, 1); + fMonitoring->ProcessEvents (); } - } - - /** \brief action to be done when the training cycle is ended (e.g. update some monitoring output) - * - */ - void ClassificationSettings::endTrainCycle (double /*error*/) - { - if (fMonitoring) fMonitoring->ProcessEvents (); - } - - /** \brief action to be done after the computation of a test sample (e.g. update some monitoring output) - * - */ - void ClassificationSettings::testSample (double /*error*/, double output, double target, double weight) - { + } + + /** \brief action to be done when the training cycle is ended (e.g. update some monitoring output) + * + */ + void ClassificationSettings::endTrainCycle (double /*error*/) + { + if (fMonitoring) fMonitoring->ProcessEvents (); + } + + /** \brief action to be done after the computation of a test sample (e.g. update some monitoring output) + * + */ + void ClassificationSettings::testSample (double /*error*/, double output, double target, double weight) + { - m_output.push_back (output); - m_targets.push_back (target); - m_weights.push_back (weight); - } - - - /** \brief action to be done when the test cycle is started (e.g. update some monitoring output) - * - */ - void ClassificationSettings::startTestCycle () - { - m_output.clear (); - m_targets.clear (); - m_weights.clear (); - } - - /** \brief action to be done when the training cycle is ended (e.g. update some monitoring output) - * - */ - void ClassificationSettings::endTestCycle () - { - if (m_output.empty ()) - return; - double minVal = *std::min_element (begin (m_output), end (m_output)); - double maxVal = *std::max_element (begin (m_output), end (m_output)); - const size_t numBinsROC = 1000; - const size_t numBinsData = 100; - - std::vector truePositives (numBinsROC+1, 0); - std::vector falsePositives (numBinsROC+1, 0); - std::vector trueNegatives (numBinsROC+1, 0); - std::vector falseNegatives (numBinsROC+1, 0); - - std::vector x (numBinsData, 0); - std::vector datSig (numBinsData+1, 0); - std::vector datBkg (numBinsData+1, 0); - - double binSizeROC = (maxVal - minVal)/(double)numBinsROC; - double binSizeData = (maxVal - minVal)/(double)numBinsData; - - double sumWeightsSig = 0.0; - double sumWeightsBkg = 0.0; - - for (size_t b = 0; b < numBinsData; ++b) + m_output.push_back (output); + m_targets.push_back (target); + m_weights.push_back (weight); + } + + + /** \brief action to be done when the test cycle is started (e.g. update some monitoring output) + * + */ + void ClassificationSettings::startTestCycle () + { + m_output.clear (); + m_targets.clear (); + m_weights.clear (); + } + + /** \brief action to be done when the training cycle is ended (e.g. 
update some monitoring output) + * + */ + void ClassificationSettings::endTestCycle () + { + if (m_output.empty ()) + return; + double minVal = *std::min_element (begin (m_output), end (m_output)); + double maxVal = *std::max_element (begin (m_output), end (m_output)); + const size_t numBinsROC = 1000; + const size_t numBinsData = 100; + + std::vector truePositives (numBinsROC+1, 0); + std::vector falsePositives (numBinsROC+1, 0); + std::vector trueNegatives (numBinsROC+1, 0); + std::vector falseNegatives (numBinsROC+1, 0); + + std::vector x (numBinsData, 0); + std::vector datSig (numBinsData+1, 0); + std::vector datBkg (numBinsData+1, 0); + + double binSizeROC = (maxVal - minVal)/(double)numBinsROC; + double binSizeData = (maxVal - minVal)/(double)numBinsData; + + double sumWeightsSig = 0.0; + double sumWeightsBkg = 0.0; + + for (size_t b = 0; b < numBinsData; ++b) { - double binData = minVal + b*binSizeData; - x.at (b) = binData; + double binData = minVal + b*binSizeData; + x.at (b) = binData; } - if (fabs(binSizeROC) < 0.0001) - return; + if (fabs(binSizeROC) < 0.0001) + return; - for (size_t i = 0, iEnd = m_output.size (); i < iEnd; ++i) + for (size_t i = 0, iEnd = m_output.size (); i < iEnd; ++i) { - double val = m_output.at (i); - double truth = m_targets.at (i); - double weight = m_weights.at (i); + double val = m_output.at (i); + double truth = m_targets.at (i); + double weight = m_weights.at (i); - bool isSignal = (truth > 0.5 ? true : false); + bool isSignal = (truth > 0.5 ? true : false); - if (m_sumOfSigWeights != 0 && m_sumOfBkgWeights != 0) - { - if (isSignal) + if (m_sumOfSigWeights != 0 && m_sumOfBkgWeights != 0) + { + if (isSignal) weight *= m_sumOfSigWeights; - else + else weight *= m_sumOfBkgWeights; - } - - size_t binROC = (val-minVal)/binSizeROC; - size_t binData = (val-minVal)/binSizeData; - - if (isSignal) - { - for (size_t n = 0; n <= binROC; ++n) - { - truePositives.at (n) += weight; - } - for (size_t n = binROC+1; n < numBinsROC; ++n) - { - falseNegatives.at (n) += weight; - } - - datSig.at (binData) += weight; - sumWeightsSig += weight; - } - else - { - for (size_t n = 0; n <= binROC; ++n) - { - falsePositives.at (n) += weight; - } - for (size_t n = binROC+1; n < numBinsROC; ++n) - { - trueNegatives.at (n) += weight; - } - - datBkg.at (binData) += weight; - sumWeightsBkg += weight; - } + } + + size_t binROC = (val-minVal)/binSizeROC; + size_t binData = (val-minVal)/binSizeData; + + if (isSignal) + { + for (size_t n = 0; n <= binROC; ++n) + { + truePositives.at (n) += weight; + } + for (size_t n = binROC+1; n < numBinsROC; ++n) + { + falseNegatives.at (n) += weight; + } + + datSig.at (binData) += weight; + sumWeightsSig += weight; + } + else + { + for (size_t n = 0; n <= binROC; ++n) + { + falsePositives.at (n) += weight; + } + for (size_t n = binROC+1; n < numBinsROC; ++n) + { + trueNegatives.at (n) += weight; + } + + datBkg.at (binData) += weight; + sumWeightsBkg += weight; + } } - std::vector sigEff; - std::vector backRej; + std::vector sigEff; + std::vector backRej; - double bestSignificance = 0; - double bestCutSignificance = 0; + double bestSignificance = 0; + double bestCutSignificance = 0; - double numEventsScaleFactor = 1.0; - if (m_scaleToNumEvents > 0) + double numEventsScaleFactor = 1.0; + if (m_scaleToNumEvents > 0) { - size_t numEvents = m_output.size (); - numEventsScaleFactor = double (m_scaleToNumEvents)/double (numEvents); + size_t numEvents = m_output.size (); + numEventsScaleFactor = double (m_scaleToNumEvents)/double (numEvents); } - clear 
("ROC"); - clear ("Significance"); + clear ("ROC"); + clear ("Significance"); - for (size_t i = 0; i < numBinsROC; ++i) + for (size_t i = 0; i < numBinsROC; ++i) { - double tp = truePositives.at (i) * numEventsScaleFactor; - double fp = falsePositives.at (i) * numEventsScaleFactor; - double tn = trueNegatives.at (i) * numEventsScaleFactor; - double fn = falseNegatives.at (i) * numEventsScaleFactor; + double tp = truePositives.at (i) * numEventsScaleFactor; + double fp = falsePositives.at (i) * numEventsScaleFactor; + double tn = trueNegatives.at (i) * numEventsScaleFactor; + double fn = falseNegatives.at (i) * numEventsScaleFactor; - double seff = (tp+fn == 0.0 ? 1.0 : (tp / (tp+fn))); - double brej = (tn+fp == 0.0 ? 0.0 : (tn / (tn+fp))); + double seff = (tp+fn == 0.0 ? 1.0 : (tp / (tp+fn))); + double brej = (tn+fp == 0.0 ? 0.0 : (tn / (tn+fp))); - sigEff.push_back (seff); - backRej.push_back (brej); + sigEff.push_back (seff); + backRej.push_back (brej); - // m_histROC->Fill (seff, brej); - addPoint ("ROC", seff, brej); // x, y + // m_histROC->Fill (seff, brej); + addPoint ("ROC", seff, brej); // x, y - double currentCut = (i * binSizeROC)+minVal; + double currentCut = (i * binSizeROC)+minVal; - double sig = tp; - double bkg = fp; - double significance = sig / sqrt (sig + bkg); - if (significance > bestSignificance) - { - bestSignificance = significance; - bestCutSignificance = currentCut; - } + double sig = tp; + double bkg = fp; + double significance = sig / sqrt (sig + bkg); + if (significance > bestSignificance) + { + bestSignificance = significance; + bestCutSignificance = currentCut; + } - addPoint ("Significance", currentCut, significance); - // m_histSignificance->Fill (currentCut, significance); + addPoint ("Significance", currentCut, significance); + // m_histSignificance->Fill (currentCut, significance); } - m_significances.push_back (bestSignificance); - static size_t testCycle = 0; + m_significances.push_back (bestSignificance); + static size_t testCycle = 0; - clear ("OutputSig"); - clear ("OutputBkg"); - for (size_t i = 0; i < numBinsData; ++i) + clear ("OutputSig"); + clear ("OutputBkg"); + for (size_t i = 0; i < numBinsData; ++i) { - addPoint ("OutputSig", x.at (i), datSig.at (i)/sumWeightsSig); - addPoint ("OutputBkg", x.at (i), datBkg.at (i)/sumWeightsBkg); - // m_histOutputSignal->Fill (x.at (i), datSig.at (1)/sumWeightsSig); - // m_histOutputBackground->Fill (x.at (i), datBkg.at (1)/sumWeightsBkg); + addPoint ("OutputSig", x.at (i), datSig.at (i)/sumWeightsSig); + addPoint ("OutputBkg", x.at (i), datBkg.at (i)/sumWeightsBkg); + // m_histOutputSignal->Fill (x.at (i), datSig.at (1)/sumWeightsSig); + // m_histOutputBackground->Fill (x.at (i), datBkg.at (1)/sumWeightsBkg); } - ++testCycle; + ++testCycle; - if (fMonitoring) + if (fMonitoring) { - plot ("ROC", "", 2, kRed); - plot ("Significance", "", 3, kRed); - plot ("OutputSig", "", 4, kRed); - plot ("OutputBkg", "same", 4, kBlue); - fMonitoring->ProcessEvents (); + plot ("ROC", "", 2, kRed); + plot ("Significance", "", 3, kRed); + plot ("OutputSig", "", 4, kRed); + plot ("OutputBkg", "same", 4, kBlue); + fMonitoring->ProcessEvents (); } - m_cutValue = bestCutSignificance; - } + m_cutValue = bestCutSignificance; + } - /** \brief check for convergence - * - */ - bool Settings::hasConverged (double testError) - { - // std::cout << "check convergence; minError " << m_minError << " current " << testError - // << " current convergence count " << m_convergenceCount << std::endl; - if (testError < m_minError*0.999) + /** \brief 
-    /** \brief check for convergence
-     *
-     */
-    bool Settings::hasConverged (double testError)
-    {
-        // std::cout << "check convergence; minError " << m_minError << " current " << testError
-        //           << " current convergence count " << m_convergenceCount << std::endl;
-        if (testError < m_minError*0.999)
+   /** \brief check for convergence
+    *
+    */
+   bool Settings::hasConverged (double testError)
+   {
+       // std::cout << "check convergence; minError " << m_minError << " current " << testError
+       //           << " current convergence count " << m_convergenceCount << std::endl;
+       if (testError < m_minError*0.999)
        {
-            m_convergenceCount = 0;
-            m_minError = testError;
+           m_convergenceCount = 0;
+           m_minError = testError;
        }
-        else
+       else
        {
-            ++m_convergenceCount;
-            m_maxConvergenceCount = std::max (m_convergenceCount, m_maxConvergenceCount);
+           ++m_convergenceCount;
+           m_maxConvergenceCount = std::max (m_convergenceCount, m_maxConvergenceCount);
        }
-        if (m_convergenceCount >= convergenceSteps () || testError <= 0)
-            return true;
+       if (m_convergenceCount >= convergenceSteps () || testError <= 0)
+           return true;
-        return false;
-    }
+       return false;
+   }
-    /** \brief set the weight sums to be scaled to (preparations for monitoring output)
-     *
-     */
-    void ClassificationSettings::setWeightSums (double sumOfSigWeights, double sumOfBkgWeights)
-    {
-        m_sumOfSigWeights = sumOfSigWeights; m_sumOfBkgWeights = sumOfBkgWeights;
-    }
+   /** \brief set the weight sums to be scaled to (preparations for monitoring output)
+    *
+    */
+   void ClassificationSettings::setWeightSums (double sumOfSigWeights, double sumOfBkgWeights)
+   {
+       m_sumOfSigWeights = sumOfSigWeights; m_sumOfBkgWeights = sumOfBkgWeights;
+   }
-    /** \brief preparation for monitoring output
-     *
-     */
-    void ClassificationSettings::setResultComputation (
-        std::string _fileNameNetConfig,
-        std::string _fileNameResult,
-        std::vector<Pattern>* _resultPatternContainer)
-    {
-        m_pResultPatternContainer = _resultPatternContainer;
-        m_fileNameResult = _fileNameResult;
-        m_fileNameNetConfig = _fileNameNetConfig;
-    }
+   /** \brief preparation for monitoring output
+    *
+    */
+   void ClassificationSettings::setResultComputation (
+       std::string _fileNameNetConfig,
+       std::string _fileNameResult,
+       std::vector<Pattern>* _resultPatternContainer)
+   {
+       m_pResultPatternContainer = _resultPatternContainer;
+       m_fileNameResult = _fileNameResult;
+       m_fileNameNetConfig = _fileNameNetConfig;
+   }
@@ -512,39 +527,51 @@ namespace TMVA
-    /** \brief compute the number of weights given the size of the input layer
-     *
-     */
-    size_t Net::numWeights (size_t trainingStartLayer) const
-    {
-        size_t num (0);
-        size_t index (0);
-        size_t prevNodes (inputSize ());
-        for (auto& layer : m_layers)
+   /** \brief compute the number of weights given the size of the input layer
+    *
+    */
+   size_t Net::numWeights (size_t trainingStartLayer) const
+   {
+       size_t num (0);
+       size_t index (0);
+       size_t prevNodes (inputSize ());
+       for (auto& layer : m_layers)
        {
-            if (index >= trainingStartLayer)
-                num += layer.numWeights (prevNodes);
-            prevNodes = layer.numNodes ();
-            ++index;
+           if (index >= trainingStartLayer)
+               num += layer.numWeights (prevNodes);
+           prevNodes = layer.numNodes ();
+           ++index;
        }
-        return num;
-    }
+       return num;
+   }
+   size_t Net::numNodes (size_t trainingStartLayer) const
+   {
+       size_t num (0);
+       size_t index (0);
+       for (auto& layer : m_layers)
+       {
+           if (index >= trainingStartLayer)
+               num += layer.numNodes ();
+           ++index;
+       }
+       return num;
+   }
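Net::numWeights above simply sums whatever Layer::numWeights(prevNodes) reports, layer by layer, starting from the input size, and the new Net::numNodes does the same for node counts. For a plain fully connected layout (ignoring any bias handling hidden inside Layer, which is not visible in this hunk) that reduces to products of consecutive layer sizes, as in this hypothetical sketch:

#include <cstdio>
#include <vector>

int main ()
{
    std::vector<std::size_t> nodes = {4, 8, 8, 1};   // input layer followed by three layers

    std::size_t numWeights = 0, numNodes = 0;
    for (std::size_t i = 1; i < nodes.size (); ++i)
    {
        numWeights += nodes.at (i-1) * nodes.at (i); // fully connected: prevNodes * thisNodes
        numNodes   += nodes.at (i);                  // counts only the non-input layers
    }
    std::printf ("weights: %zu, nodes: %zu\n", numWeights, numNodes);   // 4*8 + 8*8 + 8*1 = 104; 17 nodes
    return 0;
}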
-    /** \brief prepare the drop-out container given the provided drop-fractions
-     *
-     */
-    void Net::fillDropContainer (DropContainer& dropContainer, double dropFraction, size_t numNodes) const
-    {
-        size_t numDrops = dropFraction * numNodes;
-        if (numDrops >= numNodes) // maintain at least one node
-            numDrops = numNodes - 1;
-        dropContainer.insert (end (dropContainer), numNodes-numDrops, true); // add the markers for the nodes which are enabled
-        dropContainer.insert (end (dropContainer), numDrops, false); // add the markers for the disabled nodes
-        // shuffle
-        std::random_shuffle (end (dropContainer)-numNodes, end (dropContainer)); // shuffle enabled and disabled markers
-    }
+   /** \brief prepare the drop-out container given the provided drop-fractions
+    *
+    */
+   void Net::fillDropContainer (DropContainer& dropContainer, double dropFraction, size_t _numNodes) const
+   {
+       size_t numDrops = dropFraction * _numNodes;
+       if (numDrops >= _numNodes) // maintain at least one node
+           numDrops = _numNodes - 1;
+       dropContainer.insert (end (dropContainer), _numNodes-numDrops, true); // add the markers for the nodes which are enabled
+       dropContainer.insert (end (dropContainer), numDrops, false); // add the markers for the disabled nodes
+       // shuffle
+       std::random_shuffle (end (dropContainer)-_numNodes, end (dropContainer)); // shuffle enabled and disabled markers
+   }
@@ -553,6 +580,6 @@ namespace TMVA
-    }; // namespace DNN
+   }; // namespace DNN
 }; // namespace TMVA
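The drop-out mask that fillDropContainer appends for each layer is just a shuffled block of "enabled" and "disabled" flags, with at least one node kept active. A minimal standalone sketch of the same idea follows; it uses std::shuffle (std::random_shuffle, used in the patch, is deprecated in later C++ standards), and the layer size and drop fraction are made-up values rather than anything taken from the patch:

#include <algorithm>
#include <cstdio>
#include <random>
#include <vector>

int main ()
{
    const std::size_t numNodes     = 10;
    const double      dropFraction = 0.3;

    std::size_t numDrops = static_cast<std::size_t> (dropFraction * numNodes);
    if (numDrops >= numNodes)            // keep at least one active node
        numDrops = numNodes - 1;

    std::vector<char> dropMask (numNodes - numDrops, 1);       // markers for enabled nodes
    dropMask.insert (dropMask.end (), numDrops, 0);            // markers for disabled nodes

    std::mt19937 rng (42);
    std::shuffle (dropMask.begin (), dropMask.end (), rng);    // randomise which nodes are dropped

    for (char enabled : dropMask)
        std::printf ("%c", enabled ? '1' : '0');
    std::printf ("\n");
    return 0;
}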