diff --git a/tmva/tmva/CMakeLists.txt b/tmva/tmva/CMakeLists.txt index 449e9080999ad..9220b5fef07d2 100644 --- a/tmva/tmva/CMakeLists.txt +++ b/tmva/tmva/CMakeLists.txt @@ -35,6 +35,20 @@ set(headers4 TNeuron.h TSynapse.h TActivationChooser.h TActivation.h TActivation TNeuronInputSqSum.h TNeuronInputAbs.h Types.h Ranking.h RuleFit.h RuleFitAPI.h IMethod.h MsgLogger.h VariableTransformBase.h VariableIdentityTransform.h VariableDecorrTransform.h VariablePCATransform.h VariableGaussTransform.h VariableNormalizeTransform.h VariableRearrangeTransform.h VariableTransform.h ROCCalc.h ROCCurve.h) +set(dnn_files src/DNN/Architectures/Reference.cxx + src/DNN/Architectures/Reference/DataLoader.cxx) +set(dnn_cuda_files src/DNN/Architectures/Cuda/ActivationFunctions.cu + src/DNN/Architectures/Cuda/Arithmetic.cu + src/DNN/Architectures/Cuda/Buffers.cxx + src/DNN/Architectures/Cuda/CudaMatrix.cu + src/DNN/Architectures/Cuda/DataLoader.cu + src/DNN/Architectures/Cuda/Dropout.cu + src/DNN/Architectures/Cuda/Initialization.cu + src/DNN/Architectures/Cuda/Kernels.cu + src/DNN/Architectures/Cuda/LossFunctions.cu + src/DNN/Architectures/Cuda/OutputFunctions.cu + src/DNN/Architectures/Cuda/Propagation.cu + src/DNN/Architectures/Cuda/Regularization.cu) #---Need to suffix each header name by TMVA/ ----------------- foreach(hs headers1 headers2 headers3 headers4) @@ -45,7 +59,18 @@ endforeach() ROOT_GENERATE_DICTIONARY(G__TMVA ${theaders1} ${theaders2} ${theaders3} ${theaders4} MODULE TMVA LINKDEF LinkDef.h OPTIONS "-writeEmptyRootPCM") -ROOT_LINKER_LIBRARY(TMVA *.cxx G__TMVA.cxx LIBRARIES Core +#---Handle CUDA dependent code. ----------------- +find_package(CUDA) +if (CUDA_FOUND) + cuda_add_library(dnn_cuda ${dnn_cuda_files}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDNNCUDA") + set(cuda_libraries dnn_cuda ${CUDA_CUBLAS_LIBRARIES}) +else (CUDA_FOUND) + set(cuda_libraries) +endif(CUDA_FOUND) + +root_linker_library(TMVA *.cxx G__TMVA.cxx ${dnn_files} + LIBRARIES Core ${cuda_libraries} DEPENDENCIES RIO Hist Tree TreePlayer MLP Minuit XMLIO) install(DIRECTORY inc/TMVA/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/TMVA @@ -63,7 +88,7 @@ if(NOT gnuinstall) PATTERN "data" EXCLUDE) endif() -#ROOT_ADD_TEST_SUBDIRECTORY(test) +ROOT_ADD_TEST_SUBDIRECTORY(test/DNN) diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda.h new file mode 100644 index 0000000000000..a11f50d615cf4 --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda.h @@ -0,0 +1,296 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 05/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////////// +// Definition of the TCuda architecture, which provides an // +// implementation of the low-level functionality for neural // +// networks for the CUDA computing architectures. 
// +/////////////////////////////////////////////////////////////// + +#ifndef TMVA_DNN_ARCHITECTURES_CUDA +#define TMVA_DNN_ARCHITECTURES_CUDA + +#include + +#include "cuda.h" + +#include "Cuda/Types.h" +#include "Cuda/Kernels.h" +#include "Cuda/Buffers.h" +#include "Cuda/DataLoader.h" +#include "Cuda/CudaMatrix.h" +#include "TMVA/DNN/DataLoader.h" + +namespace TMVA +{ +namespace DNN +{ + +/** The TCuda architecture class. + * + * Low-level interface class for CUDA computing architecture. Contains as + * public types the declaration of the scalar, matrix and data loader types + * for this architecture as well as the remaining functions in the low-level + * interface in the form of static members. + */ +class TCuda +{ + +public: + + using Scalar_t = CudaDouble_t; + using Matrix_t = TCudaMatrix; + using DeviceBuffer_t = TCudaDeviceBuffer; + using HostBuffer_t = TCudaHostBuffer; + template + using DataLoader_t = TCudaDataLoader; + + //____________________________________________________________________________ + // + // Propagation + //____________________________________________________________________________ + + /** @name Forward Propagation + * Low-level functions required for the forward propagation of activations + * through the network. + */ + ///@{ + /** Matrix-multiply \p input with the transpose of \pweights and + * write the results into \p output. */ + static void MultiplyTranspose(TCudaMatrix &output, + const TCudaMatrix &input, + const TCudaMatrix &weights); + /** Add the vectors biases row-wise to the matrix output */ + static void AddRowWise(TCudaMatrix &output, + const TCudaMatrix &biases); + ///@} + + /** @name Backward Propagation + * Low-level functions required for the forward propagation of activations + * through the network. + */ + ///@{ + /** Perform the complete backward propagation step. If the provided + * \p activationGradientsBackward matrix is not empty, compute the + * gradients of the objective function with respect to the activations + * of the previous layer (backward direction). + * Also compute the weight and the bias gradients. Modifies the values + * in \p df and thus produces only a valid result, if it is applied the + * first time after the corresponding forward propagation has been per- + * formed. */ + static void Backward(TCudaMatrix & activationGradientsBackward, + TCudaMatrix & weightGradients, + TCudaMatrix & biasGradients, + TCudaMatrix & df, + const TCudaMatrix & activationGradients, + const TCudaMatrix & weights, + const TCudaMatrix & activationBackward); + /** Adds a the elements in matrix B scaled by c to the elements in + * the matrix A. This is required for the weight update in the gradient + * descent step.*/ + static void ScaleAdd(TCudaMatrix & A, + const TCudaMatrix & B, + Scalar_t beta = 1.0); + + static void Copy(TCudaMatrix & B, + const TCudaMatrix & A); + ///@} + + //____________________________________________________________________________ + // + // Activation Functions + //____________________________________________________________________________ + + /** @name Activation Functions + * For each activation function, the low-level interface contains two routines. + * One that applies the acitvation function to a matrix and one that evaluate + * the derivatives of the activation function at the elements of a given matrix + * and writes the results into the result matrix. 
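+ *
+ * As a minimal usage sketch (matrix names and dimensions m, n are assumed
+ * here, not part of the interface):
+ * \code
+ * TCudaMatrix X(m, n);            // linear activations of a layer
+ * TCudaMatrix dX(m, n);           // receives the derivatives
+ * TCuda::ReluDerivative(dX, X);   // evaluate f' at the elements of X
+ * TCuda::Relu(X);                 // then apply f in place
+ * \endcode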
+ */ + ///@{ + static void Identity(TCudaMatrix & B); + static void IdentityDerivative(TCudaMatrix & B, + const TCudaMatrix & A); + + static void Relu(TCudaMatrix & B); + static void ReluDerivative(TCudaMatrix & B, + const TCudaMatrix & A); + + static void Sigmoid(TCudaMatrix & B); + static void SigmoidDerivative(TCudaMatrix & B, + const TCudaMatrix & A); + + static void Tanh(TCudaMatrix & B); + static void TanhDerivative(TCudaMatrix & B, + const TCudaMatrix & A); + + static void SymmetricRelu(TCudaMatrix & B); + static void SymmetricReluDerivative(TCudaMatrix & B, + const TCudaMatrix & A); + + static void SoftSign(TCudaMatrix & B); + static void SoftSignDerivative(TCudaMatrix & B, + const TCudaMatrix & A); + + static void Gauss(TCudaMatrix & B); + static void GaussDerivative(TCudaMatrix & B, + const TCudaMatrix & A); + ///@} + + //____________________________________________________________________________ + // + // Loss Functions + //____________________________________________________________________________ + + /** @name Loss Functions + * Loss functions compute a scalar value given the \p output of the network + * for a given training input and the expected network prediction \p Y that + * quantifies the quality of the prediction. For each function also a routing + * that computes the gradients (suffixed by Gradients) must be provided for + * the starting of the backpropagation algorithm. + */ + ///@{ + + static CudaDouble_t MeanSquaredError(const TCudaMatrix &Y, + const TCudaMatrix &output); + static void MeanSquaredErrorGradients(TCudaMatrix & dY, + const TCudaMatrix &Y, + const TCudaMatrix &output); + + /** Sigmoid transformation is implicitly applied, thus \p output should + * hold the linear activations of the last layer in the net. */ + static CudaDouble_t CrossEntropy(const TCudaMatrix &Y, + const TCudaMatrix &output); + + static void CrossEntropyGradients(TCudaMatrix & dY, + const TCudaMatrix & Y, + const TCudaMatrix & output); + ///@} + + //____________________________________________________________________________ + // + // Output Functions + //____________________________________________________________________________ + + /** @name Output Functions + * Output functions transform the activations \p output of the + * output layer in the network to a valid prediction \p YHat for + * the desired usage of the network, e.g. the identity function + * for regression or the sigmoid transformation for two-class + * classification. + */ + ///@{ + static void Sigmoid(TCudaMatrix &YHat, + const TCudaMatrix & ); + ///@} + + //____________________________________________________________________________ + // + // Regularization + //____________________________________________________________________________ + + /** @name Regularization + * For each regularization type two functions are required, one named + * Regularization that evaluates the corresponding + * regularization functional for a given weight matrix and the + * AddRegularizationGradients, that adds the regularization + * component in the gradients to the provided matrix. 
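+ *
+ * A hedged sketch of the intended use in the weight update (names assumed):
+ * \code
+ * CudaDouble_t r = TCuda::L2Regularization(W);              // loss contribution
+ * TCuda::AddL2RegularizationGradients(dW, W, weightDecay);  // gradient term
+ * \endcode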
+ */
+ ///@{
+
+   static CudaDouble_t L1Regularization(const TCudaMatrix & W);
+   static void AddL1RegularizationGradients(TCudaMatrix & A,
+                                            const TCudaMatrix & W,
+                                            CudaDouble_t weightDecay);
+
+   static CudaDouble_t L2Regularization(const TCudaMatrix & W);
+   static void AddL2RegularizationGradients(TCudaMatrix & A,
+                                            const TCudaMatrix & W,
+                                            CudaDouble_t weightDecay);
+   ///@}
+
+   //____________________________________________________________________________
+   //
+   // Initialization
+   //____________________________________________________________________________
+
+   /** @name Initialization
+    * For each initialization method, one function in the low-level interface
+    * is provided. The naming scheme is Initialize<Type>
for a given + * initialization method Type. + */ + ///@{ + + static void InitializeGauss(TCudaMatrix & A); + static void InitializeUniform(TCudaMatrix & A); + static void InitializeIdentity(TCudaMatrix & A); + static void InitializeZero(TCudaMatrix & A); + + ///@} + + //____________________________________________________________________________ + // + // Dropout + //____________________________________________________________________________ + + /** @name Dropout + */ + ///@{ + + /** Apply dropout with activation probability \p p to the given + * matrix \p A and scale the result by reciprocal of \p p. */ + static void Dropout(TCudaMatrix & A, CudaDouble_t p); + + ///@} + + //____________________________________________________________________________ + // + // Additional Arithmetic Functions + //____________________________________________________________________________ + + /** @name Additional Arithmetic Functions + * + * Additional arithmetic on CUDA matrices used to implement the low-level + * interface. + */ + ///@{ + + /** Standard multiplication of two matrices \p A and \p B with the result being + * written into C. + */ + static void Multiply(TCudaMatrix &C, + const TCudaMatrix &A, + const TCudaMatrix &B); + /** Matrix multiplication of two matrices \p A and \p B^T (transposed) with the + * result being written into C. + */ + static void TransposeMultiply(TCudaMatrix &output, + const TCudaMatrix &input, + const TCudaMatrix &Weights); + /** In-place Hadamard (element-wise) product of matrices \p A and \p B + * with the result being written into \p A. + */ + static void Hadamard(TCudaMatrix &A, + const TCudaMatrix &B); + + /** Sum columns of (m x n) matrixx \p A and write the results into the first + * m elements in \p A. + */ + static void SumColumns(TCudaMatrix &B, const TCudaMatrix &A); + + /** Compute the sum of all elements in \p A */ + static CudaDouble_t Sum(const TCudaMatrix &A); +}; + +} // namespace DNN +} // namespace TMVA + +#endif diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Buffers.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Buffers.h new file mode 100644 index 0000000000000..14be9534039b5 --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Buffers.h @@ -0,0 +1,135 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 07/08/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +//////////////////////////////////////////////////// +// Device and host buffer for CUDA architectures. // +//////////////////////////////////////////////////// + +#ifndef TMVA_DNN_ARCHITECTURES_CUDA_BUFFERS +#define TMVA_DNN_ARCHITECTURES_CUDA_BUFFERS + +#include "cuda.h" +#include "cuda_runtime.h" +#include +#include "Types.h" + +namespace TMVA { +namespace DNN { + +class TCudaDeviceBuffer; + +/** Wrapper class for pinned memory double-precision buffers on the host. + * Uses std::shared_pointer with custom destructor to ensure consistent + * memory management and allow for easy copying/moving. Copying from device + * will set the corresponding data stream which can then be used to synchronize + * on the results. 
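+ *
+ * A short sketch of the intended usage (buffer size n is assumed):
+ * \code
+ * TCudaHostBuffer host(n);                       // pinned host memory
+ * for (size_t i = 0; i < n; i++) host[i] = 0.0;
+ * TCudaDeviceBuffer device(n);
+ * device.CopyFrom(host);                         // asynchronous host-to-device copy
+ * \endcode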
+ */ +class TCudaHostBuffer +{ +private: + + size_t fOffset; + mutable cudaStream_t fComputeStream; + std::shared_ptr fDevicePointer; + + struct TDestructor + { + TDestructor() = default; + TDestructor(const TDestructor &) = default; + TDestructor( TDestructor &&) = default; + TDestructor & operator=(const TDestructor &) = default; + TDestructor & operator=( TDestructor &&) = default; + void operator()(CudaDouble_t ** devicePointer); + friend TCudaDeviceBuffer; + } fDestructor; + + friend TCudaDeviceBuffer; + +public: + + TCudaHostBuffer(size_t size); + TCudaHostBuffer(CudaDouble_t *); + TCudaHostBuffer() = default; + TCudaHostBuffer(const TCudaHostBuffer &) = default; + TCudaHostBuffer( TCudaHostBuffer &&) = default; + TCudaHostBuffer & operator=(const TCudaHostBuffer &) = default; + TCudaHostBuffer & operator=( TCudaHostBuffer &&) = default; + + /** Return sub-buffer of the current buffer. */ + TCudaHostBuffer GetSubBuffer(size_t offset, size_t size); + + /** Convert to raw device data pointer.*/ + operator CudaDouble_t * () const; + CudaDouble_t & operator[](size_t index) + { + return (*fDevicePointer + fOffset)[index]; + } + CudaDouble_t operator[](size_t index) const + { + return (*fDevicePointer + fOffset)[index]; + } + +}; + +/** Wrapper class for on-device memory double-precision buffers. Uses + * std::shared_pointer with custom destructor to ensure consistent + * memory management and allow for easy copying/moving. A device + * buffer has an associated CUDA compute stream , which is used for + * implicit synchronization of data transfers. + */ +class TCudaDeviceBuffer +{ +private: + + size_t fOffset; + size_t fSize; + cudaStream_t fComputeStream; + std::shared_ptr fDevicePointer; + + struct TDestructor + { + TDestructor() = default; + TDestructor(const TDestructor &) = default; + TDestructor( TDestructor &&) = default; + TDestructor & operator=(const TDestructor &) = default; + TDestructor & operator=( TDestructor &&) = default; + void operator()(CudaDouble_t ** devicePointer); + friend TCudaDeviceBuffer; + } fDestructor; + +public: + + TCudaDeviceBuffer(size_t size); + TCudaDeviceBuffer(size_t size, cudaStream_t stream); + TCudaDeviceBuffer(CudaDouble_t *, size_t size, cudaStream_t stream); + TCudaDeviceBuffer() = default; + TCudaDeviceBuffer(const TCudaDeviceBuffer &) = default; + TCudaDeviceBuffer( TCudaDeviceBuffer &&) = default; + TCudaDeviceBuffer & operator=(const TCudaDeviceBuffer &) = default; + TCudaDeviceBuffer & operator=( TCudaDeviceBuffer &&) = default; + + /** Return sub-buffer of the current buffer. */ + TCudaDeviceBuffer GetSubBuffer(size_t offset, size_t size); + /** Convert to raw device data pointer.*/ + operator CudaDouble_t * () const; + + void CopyFrom(const TCudaHostBuffer &) const; + void CopyTo(const TCudaHostBuffer &) const; + + cudaStream_t GetComputeStream() const {return fComputeStream;} + void SetComputeStream(cudaStream_t stream) {fComputeStream = stream;} + +}; + + +} // namespace DNN +} // namespace TMVA +#endif diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/CudaMatrix.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/CudaMatrix.h new file mode 100644 index 0000000000000..ce5279e77f05c --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/CudaMatrix.h @@ -0,0 +1,258 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 13/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. 
* + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +////////////////////////////////////////////////////////////////////// +// Contains the TCudaMatrix class for the representation of matrices // +// on CUDA devices as well as the TCudaDeviceReference class which // +// is a helper class to emulate lvalue references to floating point // +// values on the device. // +////////////////////////////////////////////////////////////////////// + +#ifndef TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX +#define TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX + +#include + +#include "cuda.h" +#include "cuda_runtime.h" +#include "cublas_v2.h" +#include "curand_kernel.h" + +#include "TMatrixT.h" + +#include "Types.h" +#include "Buffers.h" + +#define CUDACHECK(ans) {cudaError((ans), __FILE__, __LINE__); } + +namespace TMVA { +namespace DNN { + +/** Function to check cuda return code. Taken from + * http://stackoverflow.com/questions/14038589/ + */ +inline void cudaError(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +//____________________________________________________________________________ +// +// Cuda Device Reference +//____________________________________________________________________________ + +/** TCudaDeviceReference + * + * Helper class emulating lvalue references for CudaDouble_t values that are + * physically on the device. Allows for example to assign to matrix elements. + * Not that device access through CudaDeviceReferences enforces synchronization + * with all streams and thus qualifies as performance killer. Only used for + * testing. + */ +class TCudaDeviceReference +{ +private: + + CudaDouble_t * fDevicePointer; + +public: + +TCudaDeviceReference(CudaDouble_t * devicePointer) + : fDevicePointer(devicePointer) + { + // Nothing to do here. + } + + operator CudaDouble_t() + { + CudaDouble_t buffer; + cudaMemcpy(& buffer, fDevicePointer, sizeof(CudaDouble_t), + cudaMemcpyDeviceToHost); + return buffer; + } + + void operator=(const TCudaDeviceReference &other) + { + cudaMemcpy(fDevicePointer, other.fDevicePointer, sizeof(CudaDouble_t), + cudaMemcpyDeviceToDevice); + } + + void operator=(CudaDouble_t value) + { + CudaDouble_t buffer = value; + cudaMemcpy(fDevicePointer, & buffer, sizeof(CudaDouble_t), + cudaMemcpyHostToDevice); + } + + void operator+=(CudaDouble_t value) + { + CudaDouble_t buffer; + cudaMemcpy(& buffer, fDevicePointer, sizeof(CudaDouble_t), + cudaMemcpyDeviceToHost); + buffer += value; + cudaMemcpy(fDevicePointer, & buffer, sizeof(CudaDouble_t), + cudaMemcpyHostToDevice); + } + + void operator-=(CudaDouble_t value) + { + CudaDouble_t buffer; + cudaMemcpy(& buffer, fDevicePointer, sizeof(CudaDouble_t), + cudaMemcpyDeviceToHost); + buffer -= value; + cudaMemcpy(fDevicePointer, & buffer, sizeof(CudaDouble_t), + cudaMemcpyHostToDevice); + } +}; + +//____________________________________________________________________________ +// +// Cuda Matrix +//____________________________________________________________________________ + +/** TCudaMatrix Class + * + * The TCudaMatrix class represents matrices on a CUDA device. The class takes + * care of allocating and freeing device memory. 
It also holds static handles
+ * to device resources such as the cuBLAS handle and the compute stream.
+ *
+ * Computations on all matrix instances are performed in a single,
+ * non-standard stream. That means that they are guaranteed to be
+ * executed in order, but may not wait for asynchronous data transfers
+ * in other streams.
+ *
+ * Each matrix also has an associated data stream, which defaults to the
+ * default (0) stream if not explicitly provided.
+ */
+class TCudaMatrix
+{
+private:
+
+   static size_t          fInstances;     ///< Number of existing TCudaMatrix instances.
+   static cublasHandle_t  fCublasHandle;
+   static CudaDouble_t  * fDeviceReturn;  ///< Buffer for kernel return values.
+   static curandState_t * fCurandStates;
+   static size_t          fNCurandStates;
+
+   size_t fNRows;
+   size_t fNCols;
+   TCudaDeviceBuffer fElementBuffer;
+
+public:
+
+   TCudaMatrix();
+   TCudaMatrix(size_t i, size_t j);
+   TCudaMatrix(const TMatrixT<CudaDouble_t> &);
+   TCudaMatrix(TCudaDeviceBuffer buffer, size_t m, size_t n);
+
+   TCudaMatrix(const TCudaMatrix  &) = default;
+   TCudaMatrix(      TCudaMatrix &&) = default;
+   TCudaMatrix & operator=(const TCudaMatrix  &) = default;
+   TCudaMatrix & operator=(      TCudaMatrix &&) = default;
+   ~TCudaMatrix() = default;
+
+   /** Return the compute stream associated with this matrix. */
+   inline cudaStream_t GetComputeStream() const;
+   inline void SetComputeStream(cudaStream_t stream);
+   /** Blocking synchronization with the associated compute stream, if it's
+    *  not the default stream. */
+   inline void Synchronize(const TCudaMatrix &) const;
+   /** Set the return buffer on the device to the specified value. This is
+    *  required for example for reductions in order to initialize the
+    *  accumulator. */
+   inline static void ResetDeviceReturn(CudaDouble_t value = 0.0);
+   /** Transfer the value in the device return buffer to the host. */
+   inline static CudaDouble_t GetDeviceReturn();
+
+   /** Return device pointer to the device return buffer. */
+   inline static CudaDouble_t  * GetDeviceReturnPointer() {return fDeviceReturn;}
+   inline static curandState_t * GetCurandStatesPointer() {return fCurandStates;}
+
+   /** Convert the CUDA matrix to ROOT's TMatrixT. */
+   operator TMatrixT<CudaDouble_t>() const;
+
+   void CopyToDevice(CudaDouble_t *source);
+   void CopyFromDevice(CudaDouble_t *dest);
+
+   size_t GetNrows()      const {return fNRows;}
+   size_t GetNcols()      const {return fNCols;}
+   size_t GetNoElements() const {return fNRows * fNCols;}
+   const CudaDouble_t *   GetDataPointer() const {return fElementBuffer;}
+   CudaDouble_t *         GetDataPointer()       {return fElementBuffer;}
+   const cublasHandle_t & GetCublasHandle() const {return fCublasHandle;}
+
+   /** Access to elements of device matrices provided through the
+    *  TCudaDeviceReference class. Note that access is synchronous and enforces
+    *  device synchronization on all streams. Only used for testing. */
+   TCudaDeviceReference operator()(size_t i, size_t j) const
+   {
+      CudaDouble_t * elementPointer = fElementBuffer;
+      elementPointer += j * fNRows + i;
+      return TCudaDeviceReference(elementPointer);
+   }
+
+private:
+
+   void InitializeCuda();
+   void InitializeCurandStates();
+
+};
+
+//
+// Inline Functions.
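+
+// A short usage sketch for TCudaMatrix (hedged; element access through
+// TCudaDeviceReference synchronizes the device and is meant for testing):
+//
+//   TMatrixT<CudaDouble_t> host(2, 2);
+//   TCudaMatrix device(host);             // copy the matrix to the device
+//   device(0, 0) = 1.0;                   // assign through TCudaDeviceReference
+//   TMatrixT<CudaDouble_t> back(device);  // convert back to a host matrix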
+//______________________________________________________________________________ +inline cudaStream_t TCudaMatrix::GetComputeStream() const +{ + return fElementBuffer.GetComputeStream(); +} + +//______________________________________________________________________________ +inline void TCudaMatrix::SetComputeStream(cudaStream_t stream) +{ + return fElementBuffer.SetComputeStream(stream); +} + +//______________________________________________________________________________ +inline void TCudaMatrix::Synchronize(const TCudaMatrix &A) const { + cudaEvent_t event; + cudaEventCreateWithFlags(&event, cudaEventDisableTiming); + cudaEventRecord(event, A.GetComputeStream()); + cudaStreamWaitEvent(fElementBuffer.GetComputeStream(), event, 0); + cudaEventDestroy(event); +} + +//______________________________________________________________________________ +inline void TCudaMatrix::ResetDeviceReturn(CudaDouble_t value) +{ + CudaDouble_t buffer = value; + cudaMemcpy(fDeviceReturn, & buffer, sizeof(CudaDouble_t), + cudaMemcpyHostToDevice); +} + +//______________________________________________________________________________ +inline CudaDouble_t TCudaMatrix::GetDeviceReturn() +{ + CudaDouble_t buffer; + cudaMemcpy(& buffer, fDeviceReturn, sizeof(CudaDouble_t), + cudaMemcpyDeviceToHost); + return buffer; +} + +} // namespace DNN +} // namespace TMVA + +#endif diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/DataLoader.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/DataLoader.h new file mode 100644 index 0000000000000..09a2e9c25e3b8 --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/DataLoader.h @@ -0,0 +1,220 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh 13/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////////////////////// +// Declaration of the TCudaDataLoader class, which implements a // +// prefetching data loader for CUDA architecture. Also contains // +// the TCudaBatch class and the TCudaBatchIterator representing batches // +// and an iterator over batches in a data set for CUDA architectures // +/////////////////////////////////////////////////////////////////////////// + +#ifndef TMVA_DNN_ARCHITECTURES_CUDA_DATA +#define TMVA_DNN_ARCHITECTURES_CUDA_DATA + +#include "CudaMatrix.h" +#include "Buffers.h" +#include "TMVA/Event.h" +#include +#include + +namespace TMVA { +namespace DNN { + +// Input Data Types +using MatrixInput_t = std::pair &, + const TMatrixT &>; +using TMVAInput_t = std::vector; + +using IndexIterator_t = typename std::vector::iterator; + +/** TCudaBatch class. + * + * Lightweight representation of a batch of data on the device. Holds + * pointer to the input (samples) and output data (labels) as well as + * to the data stream in which this batch is transferred. + * + * Provides GetInput() and GetOutput() member functions that return + * TCudaMatrix representations of the input and output data in the + * batch. + */ +class TCudaBatch +{ +private: + cudaStream_t fDataStream; ///< Cuda stream in which the data is transferred + CudaDouble_t * fInputMatrixData; ///< Pointer to the input data buffer. + CudaDouble_t * fOutputMatrixData; ///< Pointer to the ouput data buffer. 
+ + size_t fNinputFeatures; ///< Number of input features. + size_t fNoutputFeatures; ///< Number of output features. + size_t fBatchSize; ///< Size of the batch. +public: + + TCudaBatch(size_t batchSize, + size_t ninputFeatures, + size_t noutputFeatures, + CudaDouble_t * inputMatrixData, + CudaDouble_t * outputMatrixData, + cudaStream_t dataStream) + : fDataStream(dataStream), fInputMatrixData(inputMatrixData), + fOutputMatrixData(outputMatrixData), fNinputFeatures(ninputFeatures), + fNoutputFeatures(noutputFeatures), fBatchSize(batchSize) + { + // Nothing to do here. + } + + /** Return the batch input data as a TCudaMatrix. The TCudaMatrix is passed + * the data stream in which the async. data transfer to the corresponding + * buffer is performed, so that operations on the matrix can synchronize + * with it. */ + TCudaMatrix GetInput() + { + size_t size = fBatchSize * fNinputFeatures; + return TCudaMatrix(TCudaDeviceBuffer(fInputMatrixData, size, fDataStream), + fBatchSize, fNinputFeatures); + } + + /** Return the outpur data as a TCudaMatrix. Also forwards the data stream in + * which the async. data transfer is performed to the matrix. See above. + */ + TCudaMatrix GetOutput() + { + size_t size = fBatchSize * fNoutputFeatures; + return TCudaMatrix(TCudaDeviceBuffer(fOutputMatrixData, size, fDataStream), + fBatchSize, fNoutputFeatures); + } +}; + +template +class TCudaDataLoader; + +/** TCudaBatchIterator Class + * + * Class that implements an iterator over data sets on a CUDA device. The + * batch iterator has to take care of the preloading of the data which is + * why a special implementation is required. + */ +template +class TCudaBatchIterator +{ +private: + using SampleIndexIterator_t = typename std::vector::iterator; + + TCudaDataLoader & fDataLoader; ///< Dataloader managing data transfer. + IndexIterator_t fSampleIndexIterator; ///< Sample indices in this batch. + IndexIterator_t fSampleIndexIteratorEnd; ///< End of this batch. + + const size_t fNbatchesInEpoch; ///< Total number of batches in the data set. + const size_t fBatchSize; ///< Size of this batch. + const size_t fTransferBatchSize; ///< No. of. batches in a data transfer batch. + size_t fBatchIndex; ///< Index of this batch in the current epoch. + +public: + + TCudaBatchIterator(TCudaDataLoader &dataLoader, + IndexIterator_t sampleIndexIterator, + IndexIterator_t sampleIndexIteratorEnd); + + /** Preloads the number of transfer batches as specified by the + * corresponding TCudaDataLoader object. */ + void PrepareStream(); + /** Advance to the next batch and check if data should be preloaded. */ + TCudaBatchIterator & operator++(); + /** Return TCudaBatch object corresponding to the current iterator position. */ + TCudaBatch operator*(); + + bool operator==(const TCudaBatchIterator & other); + bool operator!=(const TCudaBatchIterator & other); +}; + +/** The TCudaDataLoader Class + * + * The TCudaDataLoader class takes care of transferring training and test data + * from the host to the device. The data transfer is performed asynchronously + * and multiple data set batches can be transferred combined into transfer batches, + * which contain a fixed number of data set batches. The range of the preloading + * is defined in multiples of transfer batches. */ +template +class TCudaDataLoader +{ +private: + + const Data_t & fInputData; + + size_t fNsamples; ///< No. of samples in the data set. + size_t fNinputFeatures; ///< No. of features in input sample. + size_t fNoutputFeatures; ///< No. 
of features in output sample (truth). + size_t fBatchSize; ///< No. of samples in a (mini-)batch + size_t fTransferBatchSize; ///< No. of batches combined in a single data transfer. + size_t fPreloadOffset; ///< How many batch-batches data is loaded in advance. + + size_t fNbatchesInEpoch; ///< No. of batches in one epoch. + size_t fInputMatrixSize; ///< No. of elements in input matrix. + size_t fOutputMatrixSize;///< No. of elements in output matrix. + size_t fTransferSize; ///< Total size of one data transfer in bytes; + + size_t fStreamIndex; ///< Transfer stream index. + + CudaDouble_t **fHostData; ///< Host-side buffer array. + CudaDouble_t **fDeviceData; ///< Device-side buffer array. + cudaStream_t * fDataStreams; ///< Data stream array. + + std::vector fSampleIndices; ///< Shuffled sample indices. + +public: + + TCudaDataLoader(const Data_t & inputData, + size_t nsamples, + size_t batchSize, + size_t ninputFeatures, + size_t noutputFeatures, + size_t batchBatchSize = 5, + size_t preloadOffset = 2); + + ~TCudaDataLoader(); + + /** Return iterator to batches in the training set. Samples in batches are + * are sampled randomly from the data set without replacement. */ + TCudaBatchIterator begin(); + + TCudaBatchIterator end(); + + /** Called by the iterator to indicate that the current buffer containing a + * transfer batch of batchs has been consumed. */ + void NextBuffer() {fStreamIndex = (fStreamIndex + 1) % (fTransferBatchSize + 1);} + /** Invoke transfer of the current buffer in the buffer cycle. */ + void InvokeTransfer(); + /** Return the batch object corresponding to the given batchIndex in the current + * epoch. */ + TCudaBatch GetCurrentBatch(size_t batchIndex); + + /** Copy data from the input data object to the pinned transfer buffer on + * the host. Must be specialized for any type of input data that is used. */ + inline static void CopyBatches(Data_t data, + IndexIterator_t sampleIndexIteratorBegin, + IndexIterator_t sampleIndexIteratorEnd, + size_t batchSize, + size_t batchBatchSize, + CudaDouble_t * inputBuffer, + CudaDouble_t * outputBuffer); + + size_t GetNBatchesInEpoch() const {return fNbatchesInEpoch;} + size_t GetPreloadOffset() const {return fPreloadOffset;} + size_t GetBatchSize() const {return fBatchSize;} + size_t GetBatchBatchSize() const {return fTransferBatchSize;} + const Data_t & GetInputData() const {return fInputData;} + inline CudaDouble_t * GetInputTransferBuffer() const; + inline CudaDouble_t * GetOutputTransferBuffer() const; +}; + +} // namespace TMVA +} // namespace DNN + +#endif diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Device.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Device.h new file mode 100644 index 0000000000000..2048a921f02c2 --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Device.h @@ -0,0 +1,69 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 13/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +////////////////////////////////////////////////////////////////////////// +// Declares the DeviceSettings struct which holdsx device specific // +// information to be defined at compile time. 
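+//                                                                      //
+// A typical launch sketch using these settings (usage assumed):        //
+//                                                                      //
+//   dim3 blocks = TDevice::BlockDims();                                //
+//   dim3 grid   = TDevice::GridDims(A);                                //
+//   Cuda::Relu<<<grid, blocks>>>(A.GetDataPointer(),                   //
+//                                (int) A.GetNrows(),                   //
+//                                (int) A.GetNcols());                  //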
// +////////////////////////////////////////////////////////////////////////// + +#ifndef TMVA_DNN_ARCHITECTURES_CUDA_DEVICE +#define TMVA_DNN_ARCHITECTURES_CUDA_DEVICE + +#include "cuda.h" +#include "vector_types.h" // definition of dim3 +#include "CudaMatrix.h" + +namespace TMVA +{ +namespace DNN +{ + +class TDevice +{ +public: + static constexpr int BlockDimX = 1; + static constexpr int BlockDimY = 32; + static constexpr int BlockSize = BlockDimX * BlockDimY; + + static dim3 BlockDims() + { + return dim3(BlockDimX, BlockDimY); + } + + static dim3 GridDims(const TCudaMatrix &A) + { + int gridDimX = A.GetNcols() / TDevice::BlockDimX; + if ((A.GetNcols() % TDevice::BlockDimX) != 0) + gridDimX += 1; + int gridDimY = A.GetNrows() / TDevice::BlockDimY; + if ((A.GetNrows() % TDevice::BlockDimY) != 0) + gridDimY += 1; + return dim3(gridDimX, gridDimY); + } + + static int NThreads(const TCudaMatrix &A) + { + int gridDimX = A.GetNcols() / TDevice::BlockDimX; + if ((A.GetNcols() % TDevice::BlockDimX) != 0) + gridDimX += 1; + int gridDimY = A.GetNrows() / TDevice::BlockDimY; + if ((A.GetNrows() % TDevice::BlockDimY) != 0) + gridDimY += 1; + return gridDimX * gridDimY * TDevice::BlockDimX * TDevice::BlockDimY; + } + +}; + + +} // namespace DNN +} // namespace TMVA + +#endif diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Kernels.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Kernels.h new file mode 100644 index 0000000000000..23c6814c0260e --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Kernels.h @@ -0,0 +1,236 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 13/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +////////////////////////////////////////////////////////////////////// +// Declaration of the device kernels for the CUDA implementation of // +// the low-level interface. // +////////////////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Cuda.h" +#include "cuda.h" +#include "curand_kernel.h" + +namespace TMVA { +namespace DNN { +namespace Cuda { + + +/** @name Device Kernels + * Utility device kernels that can only be called from the device. + */ +///@{ +/** Atomic addition of doubles, which is at the moment natively supported only + for floats. */ +//____________________________________________________________________________ +__device__ CudaDouble_t AtomicAdd(CudaDouble_t* address, CudaDouble_t val); + +/** Sum elements in columns of the 2D shared data array and accumulated the + * results in \p result. + * + * \param result Pointer to the result where the reduction results of each block + * will be accumulated using atomicAdd. The results of the jth column in this + * 2D block will be accumulated in the jth element. + * \param sdata 2D shared data array with the same dimensions as the current + * compute block. Contains the elements that will be row-wise accumulated. 
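+ *
+ * Sketch of a call from a kernel (the shared-memory layout is assumed to
+ * match the block dimensions defined in Device.h):
+ * \code
+ * __shared__ CudaDouble_t sdata[TDevice::BlockDimX * TDevice::BlockDimY];
+ * // ... fill sdata with this block's elements ...
+ * ReduceSumVertical(result, sdata);
+ * \endcode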
+ */ +//____________________________________________________________________________ +__device__ void ReduceSumVertical(CudaDouble_t *result, CudaDouble_t * sdata); + +/** Sum up all elements in the current compute block and accumulate the results + * into \p result using atomicAdd */ +//____________________________________________________________________________ +__device__ void ReduceSum(CudaDouble_t *result, CudaDouble_t * sdata); +///@} + +/** @name Forward and Backward Propagation + */ +///@{ +/** Add the \p n -element vector \p theta row-wise to the \p m x \p n matrix + * in W */ +//____________________________________________________________________________ +__global__ void AddRowWise(CudaDouble_t * W, const CudaDouble_t * theta, + int m, int n); +/** Compute the Hadamard product of the \p m x \p n matrix B and A and write + * results into B. */ +//____________________________________________________________________________ +__global__ void Hadamard(CudaDouble_t * B, + const CudaDouble_t * A, + int m, int n); +///@} + +/** @name Activation Functions + */ +///@{ +//____________________________________________________________________________ +__global__ void IdentityDerivative(CudaDouble_t * A, + int m, int n); +//____________________________________________________________________________ +__global__ void Relu(CudaDouble_t * A, + int m, int n); + +//____________________________________________________________________________ +__global__ void ReluDerivative(CudaDouble_t * B, + const CudaDouble_t * A, int m, int n); + +//____________________________________________________________________________ +__global__ void Sigmoid(CudaDouble_t * A, + int m, int n); +//____________________________________________________________________________ +__global__ void SigmoidDerivative(CudaDouble_t * B, + const CudaDouble_t * A, + int m, int n); + +//____________________________________________________________________________ +__global__ void Tanh(CudaDouble_t * A, + int m, int n); +//____________________________________________________________________________ +__global__ void TanhDerivative(CudaDouble_t * B, + const CudaDouble_t * A, + int m, int n); + +//____________________________________________________________________________ +__global__ void SymmetricRelu(CudaDouble_t * A, + int m, int n); +//____________________________________________________________________________ +__global__ void SymmetricReluDerivative(CudaDouble_t * B, + const CudaDouble_t * A, + int m, int n); + +//____________________________________________________________________________ +__global__ void SoftSign(CudaDouble_t * A, + int m, int n); +//____________________________________________________________________________ +__global__ void SoftSignDerivative(CudaDouble_t * B, + const CudaDouble_t * A, + int m, int n); + +//____________________________________________________________________________ +__global__ void Gauss(CudaDouble_t * A, + int m, int n); +//____________________________________________________________________________ +__global__ void GaussDerivative(CudaDouble_t * B, + const CudaDouble_t * A, + int m, int n); + +//____________________________________________________________________________ +///@} + +/** @name Loss Functions + */ +///@{ + +//____________________________________________________________________________ +__global__ void MeanSquaredError(CudaDouble_t * result, + const CudaDouble_t * Y, + const CudaDouble_t * output, + int m, int n); + 
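+
+// A hedged sketch of how such a reduction kernel can be driven from the
+// host side, using the static device return buffer from CudaMatrix.h
+// (usage assumed):
+//
+//   TCudaMatrix::ResetDeviceReturn(0.0);
+//   MeanSquaredError<<<TDevice::GridDims(Y), TDevice::BlockDims()>>>(
+//       TCudaMatrix::GetDeviceReturnPointer(),
+//       Y.GetDataPointer(), output.GetDataPointer(), m, n);
+//   CudaDouble_t result = TCudaMatrix::GetDeviceReturn();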
+//____________________________________________________________________________ +__global__ void MeanSquaredErrorGradients(CudaDouble_t * dY, + const CudaDouble_t * Y, + const CudaDouble_t * output, + int m, int n); +//____________________________________________________________________________ +__global__ void CrossEntropy(CudaDouble_t * result, + const CudaDouble_t * Y, + const CudaDouble_t * output, + int m, int n); + +//____________________________________________________________________________ +__global__ void CrossEntropyGradients(CudaDouble_t * dY, + const CudaDouble_t * Y, + const CudaDouble_t * output, + int m, int n); +///@} + +/** @name Regularization + */ +///@{ + +/** Compute the sum of the absolute values of the elements in the \p m x \p n + * matrix \p A and write the result into \p result. This is used to compute + * L1 regularization for weights matrices. */ +//____________________________________________________________________________ +__global__ void AbsoluteSum(CudaDouble_t * result, + const CudaDouble_t * A, + int m, int n); + +/** Compute the squared sum of the \p m x \p n matrix \p A and write the result + * into \p result. This is used to compute L2 regularization. */ +//____________________________________________________________________________ +__global__ void SquaredSum(CudaDouble_t * result, + const CudaDouble_t * A, + int m, int n); + +/** Add the gradients of L1 regularizatoin applied to the \p m x \p n matrix + * in \p A to the \p m x \p n matrix in B. \p weightDecay is the weight assigned + * to the L1 regularization term. */ +//____________________________________________________________________________ +__global__ void AddL1RegularizationGradients(CudaDouble_t * B, + const CudaDouble_t * A, + CudaDouble_t weightDecay, + int m, int n); + +/** Add the gradients of L2 regularizatoin applied to the \p m x \p n matrix + * in \p A to the \p m x \p n matrix in B. \p weightDecay is the weight assigned + * to the L1 regularization term. 
*/ +//____________________________________________________________________________ +__global__ void AddL2RegularizationGradients(CudaDouble_t * A, + const CudaDouble_t * B, + CudaDouble_t weightDecay, + int m, int n); + +///@} + +///@{ +/** @name Dropout + */ +//____________________________________________________________________________ +__global__ void InitializeCurandStates(unsigned long long seed, + curandState_t *states); + +//____________________________________________________________________________ +__global__ void Dropout(CudaDouble_t *A, + int m, int n, + CudaDouble_t dropoutProbability, + curandState_t *states); + +///@} + +///@{ +/** @name Output Functions + */ +//____________________________________________________________________________ +__global__ void Sigmoid(CudaDouble_t *A, + const CudaDouble_t *B, + int m, int n); + +///@} + +///@{ +/** @name Miscellaneous + */ +///@{ + +//____________________________________________________________________________ +__global__ void ReduceMatrix(CudaDouble_t *result, + const CudaDouble_t *A, + int m, int n); + +//____________________________________________________________________________ +__global__ void SumColumns(CudaDouble_t *B, + const CudaDouble_t *A, + int m, int n); +///@} + +} // namespace CudaKernels +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Types.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Types.h new file mode 100644 index 0000000000000..22a38cc8f8465 --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Types.h @@ -0,0 +1,22 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 13/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////////// +// Contains type definitions for types used in the CUDA // +// implementation of the neural network low-level interface. // +/////////////////////////////////////////////////////////////// + +#ifndef TMVA_DNN_ARCHITECTURES_CUDA_TYPES +#define TMVA_DNN_ARCHITECTURES_CUDA_TYPES + +typedef double CudaDouble_t; + +#endif diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Reference.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Reference.h new file mode 100644 index 0000000000000..6b1772a02c93d --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Reference.h @@ -0,0 +1,250 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 20/06/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////////////////// +// Declaration of the TReference architecture, which provides a // +// reference implementation of the low-level interface for the DNN // +// implementation based on ROOT's TMatrixT matrix type. 
// +/////////////////////////////////////////////////////////////////////// + +#ifndef TMVA_DNN_ARCHITECTURES_REFERENCE +#define TMVA_DNN_ARCHITECTURES_REFERENCE + +#include "TMatrix.h" +#include "Reference/DataLoader.h" + +namespace TMVA +{ +namespace DNN +{ + +/*! The reference architecture class. +* +* Class template that contains the reference implementation of the low-level +* interface for the DNN implementation. The reference implementation uses the +* TMatrixT class template to represent matrices. +* +* \tparam Real_t The floating point type used to represent scalars. +*/ +template +class TReference +{ +public: + + using Scalar_t = Real_t; + using Matrix_t = TMatrixT; + template + using DataLoader_t = TReferenceDataLoader; + + //____________________________________________________________________________ + // + // Propagation + //____________________________________________________________________________ + + /** @name Forward Propagation + * Low-level functions required for the forward propagation of activations + * through the network. + */ + ///@{ + /** Matrix-multiply \p input with the transpose of \pweights and + * write the results into \p output. */ + static void MultiplyTranspose(TMatrixT &output, + const TMatrixT &input, + const TMatrixT &weights); + /** Add the vectors biases row-wise to the matrix output */ + static void AddRowWise(TMatrixT &output, + const TMatrixT &biases); + ///@} + + /** @name Backward Propagation + * Low-level functions required for the forward propagation of activations + * through the network. + */ + ///@{ + /** Perform the complete backward propagation step. If the provided + * \p activationGradientsBackward matrix is not empty, compute the + * gradients of the objective function with respect to the activations + * of the previous layer (backward direction). + * Also compute the weight and the bias gradients. Modifies the values + * in \p df and thus produces only a valid result, if it is applied the + * first time after the corresponding forward propagation has been per- + * formed. */ + static void Backward(TMatrixT & activationGradientsBackward, + TMatrixT & weightGradients, + TMatrixT & biasGradients, + TMatrixT & df, + const TMatrixT & activationGradients, + const TMatrixT & weights, + const TMatrixT & activationBackward); + /** Adds a the elements in matrix B scaled by c to the elements in + * the matrix A. This is required for the weight update in the gradient + * descent step.*/ + static void ScaleAdd(TMatrixT & A, + const TMatrixT & B, + Scalar_t beta = 1.0); + ///@} + + //____________________________________________________________________________ + // + // Activation Functions + //____________________________________________________________________________ + + /** @name Activation Functions + * For each activation function, the low-level interface contains two routines. + * One that applies the acitvation function to a matrix and one that evaluate + * the derivatives of the activation function at the elements of a given matrix + * and writes the results into the result matrix. 
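+ *
+ * A minimal usage sketch (matrix dimensions m, n are assumed):
+ * \code
+ * TMatrixT<Double_t> X(m, n);                  // linear activations
+ * TMatrixT<Double_t> dX(m, n);                 // receives the derivatives
+ * TReference<Double_t>::TanhDerivative(dX, X); // evaluate f' at the elements of X
+ * TReference<Double_t>::Tanh(X);               // apply tanh in place
+ * \endcode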
+ */ + ///@{ + static void Identity(TMatrixT & B); + static void IdentityDerivative(TMatrixT & B, + const TMatrixT & A); + + static void Relu(TMatrixT & B); + static void ReluDerivative(TMatrixT & B, + const TMatrixT & A); + + static void Sigmoid(TMatrixT & B); + static void SigmoidDerivative(TMatrixT & B, + const TMatrixT & A); + + static void Tanh(TMatrixT & B); + static void TanhDerivative(TMatrixT & B, + const TMatrixT & A); + + static void SymmetricRelu(TMatrixT & B); + static void SymmetricReluDerivative(TMatrixT & B, + const TMatrixT & A); + + static void SoftSign(TMatrixT & B); + static void SoftSignDerivative(TMatrixT & B, + const TMatrixT & A); + + static void Gauss(TMatrixT & B); + static void GaussDerivative(TMatrixT & B, + const TMatrixT & A); + + ///@} + + //____________________________________________________________________________ + // + // Loss Functions + //____________________________________________________________________________ + + /** @name Loss Functions + * Loss functions compute a scalar value given the \p output of the network + * for a given training input and the expected network prediction \p Y that + * quantifies the quality of the prediction. For each function also a routing + * that computes the gradients (suffixed by Gradients) must be provided for + * the starting of the backpropagation algorithm. + */ + ///@{ + + static Real_t MeanSquaredError(const TMatrixT &Y, + const TMatrixT &output); + static void MeanSquaredErrorGradients(TMatrixT & dY, + const TMatrixT &Y, + const TMatrixT &output); + + /** Sigmoid transformation is implicitly applied, thus \p output should + * hold the linear activations of the last layer in the net. */ + static Real_t CrossEntropy(const TMatrixT &Y, + const TMatrixT &output); + + static void CrossEntropyGradients(TMatrixT & dY, + const TMatrixT & Y, + const TMatrixT & output); + ///@} + + //____________________________________________________________________________ + // + // Output Functions + //____________________________________________________________________________ + + /** @name Output Functions + * Output functions transform the activations \p output of the + * output layer in the network to a valid prediction \p YHat for + * the desired usage of the network, e.g. the identity function + * for regression or the sigmoid transformation for two-class + * classification. + */ + ///@{ + static void Sigmoid(TMatrixT &YHat, + const TMatrixT & ); + ///@} + + //____________________________________________________________________________ + // + // Regularization + //____________________________________________________________________________ + + /** @name Regularization + * For each regularization type two functions are required, one named + * Regularization that evaluates the corresponding + * regularization functional for a given weight matrix and the + * AddRegularizationGradients, that adds the regularization + * component in the gradients to the provided matrix. 
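+ *
+ * For example (a sketch; whether the scalar term is weighted by
+ * weightDecay internally is not fixed here):
+ * \code
+ * Double_t loss = TReference<Double_t>::MeanSquaredError(Y, output)
+ *               + weightDecay * TReference<Double_t>::L2Regularization(W);
+ * TReference<Double_t>::AddL2RegularizationGradients(dW, W, weightDecay);
+ * \endcode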
+ */
+ ///@{
+
+   static Real_t L1Regularization(const TMatrixT<Real_t> & W);
+   static void AddL1RegularizationGradients(TMatrixT<Real_t> & A,
+                                            const TMatrixT<Real_t> & W,
+                                            Real_t weightDecay);
+
+   static Real_t L2Regularization(const TMatrixT<Real_t> & W);
+   static void AddL2RegularizationGradients(TMatrixT<Real_t> & A,
+                                            const TMatrixT<Real_t> & W,
+                                            Real_t weightDecay);
+   ///@}
+
+   //____________________________________________________________________________
+   //
+   // Initialization
+   //____________________________________________________________________________
+
+   /** @name Initialization
+    * For each initialization method, one function in the low-level interface
+    * is provided. The naming scheme is Initialize<Type>
for a given + * initialization method Type. + */ + ///@{ + + static void InitializeGauss(TMatrixT & A); + + static void InitializeUniform(TMatrixT & A); + + static void InitializeIdentity(TMatrixT & A); + + static void InitializeZero(TMatrixT & A); + + ///@} + + //____________________________________________________________________________ + // + // Dropout + //____________________________________________________________________________ + + /** @name Dropout + */ + ///@{ + + /** Apply dropout with activation probability \p p to the given + * matrix \p A and scale the result by reciprocal of \p p. */ + static void Dropout(TMatrixT & A, Real_t dropoutProbability); + + ///@} +}; + +} // namespace DNN +} // namespace TMVA + +#endif diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Reference/DataLoader.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Reference/DataLoader.h new file mode 100644 index 0000000000000..c2c25dace4108 --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Reference/DataLoader.h @@ -0,0 +1,164 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 12/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +//////////////////////////////////////////////////////////////////// +// This file contains the declaration of the Batch, BatchIterator // +// and DataLoader classes for the reference implementation. // +//////////////////////////////////////////////////////////////////// + +#include +#include "TMatrix.h" +#include "TMVA/Event.h" + +#ifndef TMVA_DNN_ARCHITECTURES_REFERENCE_DATALOADER +#define TMVA_DNN_ARCHITECTURES_REFERENCE_DATALOADER + +namespace TMVA +{ +namespace DNN +{ + +// Input Data Types +using MatrixInput_t = std::pair &, + const TMatrixT &>; +using TMVAInput_t = std::vector; + +/** Reference Batch Class + * + * The Batch class for the reference implementation. Provides the required + * GetInput() and GetOutput() method which returns the input and output + * corresponding to this batch in matrix form. + * + * \tparam Real_t The floating point type to represent floating point numbers. + */ +template +class TReferenceBatch +{ + using Matrix_t = TMatrixT; + + const Matrix_t & fInput; + const Matrix_t & fOutput; + +public: + + /** Create a batch containing the provided input and output in matrix + * form. + * + * \param input Reference to the input data in matrix form. + * \param output Reference to the expected output (truth) in matrix form. + */ + TReferenceBatch(Matrix_t & input, Matrix_t & output) + : fInput(input), fOutput(output) + { + // Nothing to do here. + } + + Matrix_t GetInput() {return fInput;} + Matrix_t GetOutput() {return fOutput;} + +}; + +/** Reference DataLoader Class + * + * DataLoader class for the reference implementation. The + * TReferenceDataLoader class takes a reference to the input data and + * prepares the batches for the training and evaluation of the neural + * network. + * + * \tparam Data_t Defines the input data type, i.e. the class that + * provides access to the training and test data. To instantiate this + * class with a given input data type, the CopyBatch function + * template must be specialized for this type. + * + * \tparam Real_t The floating point type used to represent scalars. 
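+ *
+ * A usage sketch (assuming a MatrixInput_t data set; variable names are
+ * illustrative):
+ * \code
+ * TReferenceDataLoader<MatrixInput_t, Double_t> loader(input, nSamples,
+ *                                                      batchSize, nInputs,
+ *                                                      nOutputs);
+ * for (auto & batch : loader) {
+ *    auto X = batch.GetInput();
+ *    auto Y = batch.GetOutput();
+ * }
+ * \endcode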
+ */ +template +class TReferenceDataLoader +{ + using Scalar_t = Real_t; + using Matrix_t = TMatrixT; + using BatchIterator_t = typename std::vector>::iterator; + using IndexIterator_t = typename std::vector::iterator; + + const Data_t & fInput; + + size_t fNSamples; + size_t fBatchSize; + size_t fNInputFeatures; + size_t fNOutputFeatures; + size_t fNBatches; + + std::vector> fInputMatrices; + std::vector> fOutputMatrices; + std::vector> fBatches; + std::vector fSampleIndices; + +public: + + /** Create data loader from the given input data \p input. The data set + * shoule consists of \p nsamples samples. Will create a vector of + * batches that are sample randomly from the input data. + * + * \param nsamples Number of samples in the training set. + * \param batchSize The batch size. + * \param ninputFeatures Number of input features + * \param noutputFeatures Number of output features. + */ + TReferenceDataLoader(const Data_t &input, + size_t nsamples, + size_t batchSize, + size_t ninputFeatures, + size_t noutputFeatures); + + /** This function copies a batch from the input data into the the matrices + * used as input for the neural network. + * + * \param input Reference to the matrix to write the input samples into. + * \param output Reference to the matrix to write the output labels into. + * \param begin Index iterator containing the sample indices which to include + * in the current batch. + * \param end Iterator to the index of the last sample to include in thie + * batch. + */ + void CopyBatch(Matrix_t &inputMatrix, + Matrix_t &outputMatrix, + const Data_t & input, + IndexIterator_t begin, + IndexIterator_t end); + + /** Return iterator over batches in one epoch the data set. Shuffles + * the training sample before returning the iterator. */ + BatchIterator_t begin(); + /** Iterator pointing to the end of the epoch, required for C++ range + * loops */ + BatchIterator_t end(); + + size_t GetBatchSize() const {return fBatchSize;} + size_t GetNBatchesInEpoch() const {return fNBatches;} +}; + +template <> +void TReferenceDataLoader::CopyBatch(Matrix_t &inputMatrix, + Matrix_t &outputMatrix, + const MatrixInput_t & input, + IndexIterator_t begin, + IndexIterator_t end); + +template <> +void TReferenceDataLoader::CopyBatch(Matrix_t &inputMatrix, + Matrix_t &outputMatrix, + const TMVAInput_t & input, + IndexIterator_t begin, + IndexIterator_t end); +} // namespace TMVA +} // namespace DNN + +#endif diff --git a/tmva/tmva/inc/TMVA/DNN/DataLoader.h b/tmva/tmva/inc/TMVA/DNN/DataLoader.h new file mode 100644 index 0000000000000..7ff3d61591a93 --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/DataLoader.h @@ -0,0 +1,222 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 08/08/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +//////////////////////////////////////////////////////// +// Generic data loader for neural network input data. 
+////////////////////////////////////////////////////////
+
+#ifndef TMVA_DNN_DATALOADER
+#define TMVA_DNN_DATALOADER
+
+#include "TMatrix.h"
+#include <vector>
+
+#include "TMVA/Event.h"
+
+namespace TMVA {
+namespace DNN {
+
+//
+// Input Data Types
+//______________________________________________________________________________
+using MatrixInput_t = std::pair<const TMatrixT<Double_t> &,
+                                const TMatrixT<Double_t> &>;
+using TMVAInput_t   = std::vector<Event *>;
+
+//
+// TBatch Class.
+//______________________________________________________________________________
+template <typename Architecture_t>
+class TBatch
+{
+private:
+
+   using Matrix_t = typename Architecture_t::Matrix_t;
+
+   Matrix_t fInputMatrix;
+   Matrix_t fOutputMatrix;
+
+public:
+
+   TBatch(Matrix_t &&, Matrix_t &&);
+   TBatch(const TBatch  &) = default;
+   TBatch(      TBatch &&) = default;
+   TBatch & operator=(const TBatch  &) = default;
+   TBatch & operator=(      TBatch &&) = default;
+
+   Matrix_t & GetInput()  {return fInputMatrix;}
+   Matrix_t & GetOutput() {return fOutputMatrix;}
+};
+
+//
+// TBatchIterator Class.
+//______________________________________________________________________________
+
+template <typename Data_t, typename Architecture_t> class TDataLoader;
+
+template <typename Data_t, typename Architecture_t>
+class TBatchIterator
+{
+private:
+
+   TDataLoader<Data_t, Architecture_t> & fDataLoader;
+   size_t fBatchIndex;
+
+public:
+
+   TBatchIterator(TDataLoader<Data_t, Architecture_t> & dataLoader, size_t index = 0)
+   : fDataLoader(dataLoader), fBatchIndex(index)
+   {
+      // Nothing to do here.
+   }
+
+   TBatch<Architecture_t> operator*() {return fDataLoader.GetBatch();}
+   TBatchIterator operator++() {fBatchIndex++; return *this;}
+   bool operator!=(const TBatchIterator & other) {
+      return fBatchIndex != other.fBatchIndex;
+   }
+};
+
+//
+// TDataLoader Class.
+//______________________________________________________________________________
+template <typename Data_t, typename Architecture_t>
+class TDataLoader
+{
+private:
+
+   using HostBuffer_t    = typename Architecture_t::HostBuffer_t;
+   using DeviceBuffer_t  = typename Architecture_t::DeviceBuffer_t;
+   using IndexIterator_t = typename std::vector<size_t>::iterator;
+   using Matrix_t        = typename Architecture_t::Matrix_t;
+   using BatchIterator_t = TBatchIterator<Data_t, Architecture_t>;
+
+   const Data_t & fData;
+
+   size_t fNSamples;
+   size_t fBatchSize;
+   size_t fNInputFeatures;
+   size_t fNOutputFeatures;
+   size_t fBatchIndex;
+
+   size_t fNStreams;
+   std::vector<DeviceBuffer_t> fDeviceBuffers;
+   std::vector<HostBuffer_t>   fHostBuffers;
+
+   std::vector<size_t> fSampleIndices;
+
+public:
+
+   TDataLoader(const Data_t & data, size_t nSamples, size_t batchSize,
+               size_t nInputFeatures, size_t nOutputFeatures, size_t nStreams = 4);
+   TDataLoader(const TDataLoader  &) = default;
+   TDataLoader(      TDataLoader &&) = default;
+   TDataLoader & operator=(const TDataLoader  &) = default;
+   TDataLoader & operator=(      TDataLoader &&) = default;
+
+   /** Copy input matrix into the given host buffer. Function to be specialized by
+    * the architecture-specific backend. */
+   void CopyInput(HostBuffer_t &buffer, IndexIterator_t begin, size_t batchSize);
+   /** Copy output matrix into the given host buffer. Function to be specialized
+    * by the architecture-specific backend. */
+   void CopyOutput(HostBuffer_t &buffer, IndexIterator_t begin, size_t batchSize);
+
+   BatchIterator_t begin() {return TBatchIterator<Data_t, Architecture_t>(*this);}
+   BatchIterator_t end()
+   {
+      return TBatchIterator<Data_t, Architecture_t>(*this, (fNSamples / fBatchSize) + 1);
+   }
+
+   void Shuffle();
+   TBatch<Architecture_t> GetBatch();
+
+};
+
+//
+// TBatch Class.
+//______________________________________________________________________________
+template <typename Architecture_t>
+TBatch<Architecture_t>::TBatch(Matrix_t && inputMatrix, Matrix_t && outputMatrix)
+    : fInputMatrix(std::move(inputMatrix)), fOutputMatrix(std::move(outputMatrix))
+{
+   // Nothing to do here.
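+   // (The member matrices are initialized by moving the rvalue arguments,
+   // where the architecture's matrix type supports move construction.)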
+}
+
+//
+// TDataLoader Class.
+//______________________________________________________________________________
+template <typename Data_t, typename Architecture_t>
+TDataLoader<Data_t, Architecture_t>::TDataLoader(
+    const Data_t & data, size_t nSamples, size_t batchSize,
+    size_t nInputFeatures, size_t nOutputFeatures, size_t nStreams)
+    : fData(data), fNSamples(nSamples), fBatchSize(batchSize),
+      fNInputFeatures(nInputFeatures), fNOutputFeatures(nOutputFeatures),
+      fBatchIndex(0), fNStreams(nStreams), fDeviceBuffers(), fHostBuffers(),
+      fSampleIndices()
+{
+   size_t inputMatrixSize  = fBatchSize * fNInputFeatures;
+   size_t outputMatrixSize = fBatchSize * fNOutputFeatures;
+
+   for (size_t i = 0; i < fNStreams; i++)
+   {
+      fHostBuffers.push_back(HostBuffer_t(inputMatrixSize + outputMatrixSize));
+      fDeviceBuffers.push_back(DeviceBuffer_t(inputMatrixSize + outputMatrixSize));
+   }
+
+   fSampleIndices.reserve(fNSamples);
+   for (size_t i = 0; i < fNSamples; i++) {
+      fSampleIndices.push_back(i);
+   }
+}
+
+template <typename Data_t, typename Architecture_t>
+TBatch<Architecture_t> TDataLoader<Data_t, Architecture_t>::GetBatch()
+{
+   fBatchIndex %= (fNSamples / fBatchSize); // Cycle through samples.
+
+   size_t inputMatrixSize  = fBatchSize * fNInputFeatures;
+   size_t outputMatrixSize = fBatchSize * fNOutputFeatures;
+
+   size_t streamIndex = fBatchIndex % fNStreams;
+   HostBuffer_t   & hostBuffer   = fHostBuffers[streamIndex];
+   DeviceBuffer_t & deviceBuffer = fDeviceBuffers[streamIndex];
+
+   HostBuffer_t inputHostBuffer  = hostBuffer.GetSubBuffer(0, inputMatrixSize);
+   HostBuffer_t outputHostBuffer = hostBuffer.GetSubBuffer(inputMatrixSize,
+                                                           outputMatrixSize);
+
+   DeviceBuffer_t inputDeviceBuffer  = deviceBuffer.GetSubBuffer(0, inputMatrixSize);
+   DeviceBuffer_t outputDeviceBuffer = deviceBuffer.GetSubBuffer(inputMatrixSize,
+                                                                 outputMatrixSize);
+   size_t sampleIndex = fBatchIndex * fBatchSize;
+   IndexIterator_t sampleIndexIterator = fSampleIndices.begin() + sampleIndex;
+
+   CopyInput(inputHostBuffer, sampleIndexIterator, fBatchSize);
+   CopyOutput(outputHostBuffer, sampleIndexIterator, fBatchSize);
+
+   deviceBuffer.CopyFrom(hostBuffer);
+   Matrix_t  inputMatrix(inputDeviceBuffer,  fBatchSize, fNInputFeatures);
+   Matrix_t outputMatrix(outputDeviceBuffer, fBatchSize, fNOutputFeatures);
+
+   fBatchIndex++;
+   return TBatch<Architecture_t>(std::move(inputMatrix), std::move(outputMatrix));
+}
+
+template <typename Data_t, typename Architecture_t>
+void TDataLoader<Data_t, Architecture_t>::Shuffle()
+{
+   std::random_shuffle(fSampleIndices.begin(), fSampleIndices.end());
+}
+
+}
+}
+#endif
diff --git a/tmva/tmva/inc/TMVA/DNN/Functions.h b/tmva/tmva/inc/TMVA/DNN/Functions.h
new file mode 100644
index 0000000000000..72ac34f1f69d7
--- /dev/null
+++ b/tmva/tmva/inc/TMVA/DNN/Functions.h
@@ -0,0 +1,265 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 20/06/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+/////////////////////////////////////////////////////////////////////
+// Contains function enums for activation and output functions, as //
+// well as generic evaluation functions that delegate the call to  //
+// the corresponding evaluation kernel.                            //
+/////////////////////////////////////////////////////////////////////
+
+#ifndef TMVA_DNN_FUNCTIONS
+#define TMVA_DNN_FUNCTIONS
+
+namespace TMVA
+{
+namespace DNN
+{
+//______________________________________________________________________________
+//
+// Enum Definitions
+//______________________________________________________________________________
+
+/*! Enum that represents layer activation functions. */
+enum class EActivationFunction
+{
+   IDENTITY = 0,
+   RELU     = 1,
+   SIGMOID  = 2,
+   TANH     = 3,
+   SYMMRELU = 4,
+   SOFTSIGN = 5,
+   GAUSS    = 6
+};
+
+/*! Enum that represents output functions */
+enum class EOutputFunction
+{
+   IDENTITY = 'I',
+   SIGMOID  = 'S'
+};
+
+/*! Enum that represents objective functions for the net, i.e. functions
+* that take the output from the last layer in the net together with the
+* truth labels and return the value of the objective function that is to be
+* minimized in the training process. */
+enum class ELossFunction
+{
+   CROSSENTROPY     = 'C',
+   MEANSQUAREDERROR = 'R'
+};
+
+/*! Enum representing the regularization type applied for a given layer */
+enum class ERegularization
+{
+   NONE = '0',
+   L1   = '1',
+   L2   = '2'
+};
+
+/* Enum representing the initialization method used for this layer. */
+enum class EInitialization {
+   GAUSS    = 'G',
+   UNIFORM  = 'U',
+   IDENTITY = 'I',
+   ZERO     = 'Z'
+};
+
+//______________________________________________________________________________
+//
+// Activation Functions
+//______________________________________________________________________________
+
+/*! Apply the given activation function to each value in the given
+* matrix A. */
+template<typename Architecture_t>
+inline void evaluate(typename Architecture_t::Matrix_t &A,
+                     EActivationFunction f)
+{
+   switch(f)
+   {
+   case EActivationFunction::IDENTITY : break;
+   case EActivationFunction::RELU     : Architecture_t::Relu(A);
+      break;
+   case EActivationFunction::SIGMOID  : Architecture_t::Sigmoid(A);
+      break;
+   case EActivationFunction::TANH     : Architecture_t::Tanh(A);
+      break;
+   case EActivationFunction::SYMMRELU : Architecture_t::SymmetricRelu(A);
+      break;
+   case EActivationFunction::SOFTSIGN : Architecture_t::SoftSign(A);
+      break;
+   case EActivationFunction::GAUSS    : Architecture_t::Gauss(A);
+      break;
+   }
+}
+
+
+/*! Compute the first partial derivative of the activation function for
+* the values given in matrix A and write the results into B. */
+//______________________________________________________________________________
+template<typename Architecture_t>
+inline void evaluateDerivative(typename Architecture_t::Matrix_t & B,
+                               EActivationFunction f,
+                               const typename Architecture_t::Matrix_t & A)
+{
+   switch(f)
+   {
+   case EActivationFunction::IDENTITY : Architecture_t::IdentityDerivative(B, A);
+      break;
+   case EActivationFunction::RELU     : Architecture_t::ReluDerivative(B, A);
+      break;
+   case EActivationFunction::SIGMOID  : Architecture_t::SigmoidDerivative(B, A);
+      break;
+   case EActivationFunction::TANH     : Architecture_t::TanhDerivative(B, A);
+      break;
+   case EActivationFunction::SYMMRELU : Architecture_t::SymmetricReluDerivative(B, A);
+      break;
+   case EActivationFunction::SOFTSIGN : Architecture_t::SoftSignDerivative(B, A);
+      break;
+   case EActivationFunction::GAUSS    : Architecture_t::GaussDerivative(B, A);
+      break;
+   }
+}
+
+//______________________________________________________________________________
+//
+// Output Functions
+//______________________________________________________________________________
+
+/*! Apply the given output function to each value in the given
+* matrix A.
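+* For example (illustrative): evaluate<Architecture_t>(A, EOutputFunction::SIGMOID, X)
+* computes A = sigmoid(X) elementwise, leaving X unchanged.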
+*/
+template<typename Architecture_t>
+inline void evaluate(typename Architecture_t::Matrix_t &A,
+                     EOutputFunction f,
+                     const typename Architecture_t::Matrix_t &X)
+{
+   switch(f)
+   {
+   case EOutputFunction::IDENTITY : break;
+   case EOutputFunction::SIGMOID  : Architecture_t::Sigmoid(A, X);
+      break;
+   }
+}
+
+//______________________________________________________________________________
+//
+// Loss Functions
+//______________________________________________________________________________
+
+/*! Compute the value of the objective function f for given activations
+* of the output layer and the truth Y. */
+template<typename Architecture_t>
+inline auto evaluate(ELossFunction f,
+                     const typename Architecture_t::Matrix_t & Y,
+                     const typename Architecture_t::Matrix_t & output)
+-> decltype(Architecture_t::CrossEntropy(Y,output))
+{
+   switch(f)
+   {
+   case ELossFunction::CROSSENTROPY :
+      return Architecture_t::CrossEntropy(Y, output);
+   case ELossFunction::MEANSQUAREDERROR :
+      return Architecture_t::MeanSquaredError(Y, output);
+   }
+   return 0.0;
+}
+
+/*! Compute the gradient of the given loss function f for the activations
+* \p output of the output layer and the truth \p Y, and write the results
+* into dY. */
+//______________________________________________________________________________
+template<typename Architecture_t>
+inline void evaluateGradients(typename Architecture_t::Matrix_t & dY,
+                              ELossFunction f,
+                              const typename Architecture_t::Matrix_t &Y,
+                              const typename Architecture_t::Matrix_t &output)
+{
+   switch(f)
+   {
+   case ELossFunction::CROSSENTROPY :
+      Architecture_t::CrossEntropyGradients(dY, Y, output);
+      break;
+   case ELossFunction::MEANSQUAREDERROR :
+      Architecture_t::MeanSquaredErrorGradients(dY, Y, output);
+      break;
+   }
+}
+
+
+//______________________________________________________________________________
+//
+// Regularization
+//______________________________________________________________________________
+
+/*! Evaluate the regularization functional for a given weight matrix. */
+template<typename Architecture_t>
+inline auto regularization(const typename Architecture_t::Matrix_t &A,
+                           ERegularization R)
+-> decltype(Architecture_t::L1Regularization(A))
+{
+   switch(R)
+   {
+   case ERegularization::NONE :
+      return 0.0;
+   case ERegularization::L1 :
+      return Architecture_t::L1Regularization(A);
+   case ERegularization::L2 :
+      return Architecture_t::L2Regularization(A);
+   }
+   return 0.0;
+}
+
+/*! Add the regularization gradient corresponding to weight matrix W to
+* the matrix A.
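+* For L2 regularization, for instance, this amounts to A += 2 * weightDecay * W
+* (up to the factor convention used by the backend); for L1 it adds
+* weightDecay * sign(W).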
+*/
+//______________________________________________________________________________
+template<typename Architecture_t>
+inline void addRegularizationGradients(typename Architecture_t::Matrix_t &A,
+                                       const typename Architecture_t::Matrix_t &W,
+                                       typename Architecture_t::Scalar_t weightDecay,
+                                       ERegularization R)
+{
+   switch(R)
+   {
+   case ERegularization::NONE :
+      break;
+   case ERegularization::L1 :
+      Architecture_t::AddL1RegularizationGradients(A, W, weightDecay);
+      break;
+   case ERegularization::L2 :
+      Architecture_t::AddL2RegularizationGradients(A, W, weightDecay);
+      break;
+   }
+}
+
+//______________________________________________________________________________
+//
+// Initialization
+//______________________________________________________________________________
+
+template<typename Architecture_t>
+inline void initialize(typename Architecture_t::Matrix_t & A,
+                       EInitialization m)
+{
+   switch(m) {
+   case EInitialization::GAUSS    : Architecture_t::InitializeGauss(A);
+      break;
+   case EInitialization::UNIFORM  : Architecture_t::InitializeUniform(A);
+      break;
+   case EInitialization::IDENTITY : Architecture_t::InitializeIdentity(A);
+      break;
+   case EInitialization::ZERO     : Architecture_t::InitializeZero(A);
+      break;
+   }
+}
+
+} // namespace DNN
+} // namespace TMVA
+
+#endif
diff --git a/tmva/tmva/inc/TMVA/DNN/Layer.h b/tmva/tmva/inc/TMVA/DNN/Layer.h
new file mode 100644
index 0000000000000..da53fb37df246
--- /dev/null
+++ b/tmva/tmva/inc/TMVA/DNN/Layer.h
@@ -0,0 +1,399 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 20/06/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////////
+// Contains Layer and SharedLayer classes that represent layers in  //
+// neural networks.                                                 //
+//////////////////////////////////////////////////////////////////////
+
+#ifndef TMVA_DNN_LAYER
+#define TMVA_DNN_LAYER
+
+#include <iostream>
+
+#include "TMatrix.h"
+#include "Functions.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//______________________________________________________________________________
+//
+// The Layer Class
+//______________________________________________________________________________
+
+/** \class TLayer
+
+    Generic layer class.
+
+    This generic layer class represents a layer of a neural network with
+    a given width n and activation function f. The activations of each
+    layer are given by \f$\mathbf{u} = f(\mathbf{W}\mathbf{x} +
+    \boldsymbol{\theta})\f$.
+
+    In addition to the weight and bias matrices, each layer allocates memory
+    for its activations and the corresponding first partial derivatives of
+    the activation function, as well as the gradients of the weights and biases.
+
+    The layer provides member functions for the forward propagation of
+    activations through the given layer.
+*/
+template<typename Architecture_t>
+   class TLayer
+{
+
+public:
+   using Scalar_t = typename Architecture_t::Scalar_t;
+   using Matrix_t = typename Architecture_t::Matrix_t;
+
+private:
+
+   size_t fBatchSize;  ///< Batch size used for training and evaluation.
+   size_t fInputWidth; ///< Number of neurons of the previous layer.
+   size_t fWidth;      ///< Number of neurons of this layer.
+
+   Scalar_t fDropoutProbability; ///< Probability that an input is active.
+
+   Matrix_t fWeights;             ///< The weights of this layer.
+   Matrix_t fBiases;              ///< The bias values of this layer.
+   Matrix_t fOutput;              ///< Activations of this layer.
+   Matrix_t fDerivatives;         ///< First derivatives of the activations of this layer.
+   Matrix_t fWeightGradients;     ///< Gradients w.r.t. the weights of this layer.
+   Matrix_t fBiasGradients;       ///< Gradients w.r.t. the bias values of this layer.
+   Matrix_t fActivationGradients; ///< Gradients w.r.t. the activations of this layer.
+
+   EActivationFunction fF; ///< Activation function of the layer.
+
+public:
+
+   TLayer(size_t batchSize,
+          size_t inputWidth,
+          size_t width,
+          EActivationFunction f,
+          Scalar_t dropoutProbability);
+   TLayer(const TLayer &);
+
+   /*! Initialize fWeights according to the given initialization
+    *  method. */
+   void Initialize(EInitialization m);
+   /*! Compute activation of the layer for the given input. The input
+    * must be in matrix form with the different rows corresponding to
+    * different events in the batch. Computes activations as well as
+    * the first partial derivative of the activation function at those
+    * activations. */
+   void inline Forward(Matrix_t & input, bool applyDropout = false);
+   /*! Compute weight, bias and activation gradients. Uses the precomputed
+    *  first partial derivatives of the activation function computed during
+    *  forward propagation and modifies them. Must only be called directly
+    *  after the corresponding call to Forward(...). */
+   void inline Backward(Matrix_t & gradients_backward,
+                        const Matrix_t & activations_backward,
+                        ERegularization r,
+                        Scalar_t weightDecay);
+
+   void Print() const;
+
+   size_t GetBatchSize() const           {return fBatchSize;}
+   size_t GetInputWidth() const          {return fInputWidth;}
+   size_t GetWidth() const               {return fWidth;}
+   Scalar_t GetDropoutProbability() const {return fDropoutProbability;}
+
+   void SetDropoutProbability(Scalar_t p) {fDropoutProbability = p;}
+
+   EActivationFunction GetActivationFunction() const {return fF;}
+
+   Matrix_t       & GetOutput()              {return fOutput;}
+   const Matrix_t & GetOutput() const        {return fOutput;}
+   Matrix_t       & GetWeights()             {return fWeights;}
+   const Matrix_t & GetWeights() const       {return fWeights;}
+   Matrix_t       & GetBiases()              {return fBiases;}
+   const Matrix_t & GetBiases() const        {return fBiases;}
+   Matrix_t       & GetActivationGradients()       {return fActivationGradients;}
+   const Matrix_t & GetActivationGradients() const {return fActivationGradients;}
+   Matrix_t       & GetBiasGradients()       {return fBiasGradients;}
+   const Matrix_t & GetBiasGradients() const {return fBiasGradients;}
+   Matrix_t       & GetWeightGradients()       {return fWeightGradients;}
+   const Matrix_t & GetWeightGradients() const {return fWeightGradients;}
+
+};
+
+//______________________________________________________________________________
+//
+// The Shared Layer Class
+//______________________________________________________________________________
+
+/** \class TSharedLayer
+
+    Layer class with shared weight and bias matrices.
+
+    Like the Layer class, except that the weight matrices are shared between
+    different instances of the net, which can be used to implement
+    multithreading 'Hogwild' style.
+*/
+
+template<typename Architecture_t>
+class TSharedLayer
+{
+
+public:
+
+   using Scalar_t = typename Architecture_t::Scalar_t;
+   using Matrix_t = typename Architecture_t::Matrix_t;
+
+private:
+
+   size_t fBatchSize;  ///< Batch size used for training and evaluation.
+   size_t fInputWidth; ///< Number of neurons of the previous layer.
+   size_t fWidth;      ///< Number of neurons of this layer.
+
+   Scalar_t fDropoutProbability; ///< Probability that an input is active.
+
+   Matrix_t & fWeights;           ///< Reference to the weight matrix of this layer.
+   Matrix_t & fBiases;            ///< Reference to the bias vectors of this layer.
+   Matrix_t fOutput;              ///< Activations of this layer.
+   Matrix_t fDerivatives;         ///< First derivatives of the activations of this layer.
+   Matrix_t fWeightGradients;     ///< Gradients w.r.t. the weights of this layer.
+   Matrix_t fBiasGradients;       ///< Gradients w.r.t. the bias values of this layer.
+   Matrix_t fActivationGradients; ///< Gradients w.r.t. the activations of this layer.
+
+   EActivationFunction fF; ///< Activation function of the layer.
+
+public:
+
+   TSharedLayer(size_t batchSize,
+                TLayer<Architecture_t> & layer);
+   TSharedLayer(const TSharedLayer & layer);
+
+   /*! Compute activation of the layer for the given input. The input
+    * must be in matrix form with the different rows corresponding to
+    * different events in the batch. Computes activations as well as
+    * the first partial derivative of the activation function at those
+    * activations. */
+   void inline Forward(Matrix_t & input, bool applyDropout = false);
+   /*! Compute weight, bias and activation gradients. Uses the precomputed
+    *  first partial derivatives of the activation function computed during
+    *  forward propagation and modifies them. Must only be called directly
+    *  after the corresponding call to Forward(...). */
+   void inline Backward(Matrix_t & gradients_backward,
+                        const Matrix_t & activations_backward,
+                        ERegularization r,
+                        Scalar_t weightDecay);
+
+   void Print() const;
+
+   size_t GetBatchSize() const            {return fBatchSize;}
+   size_t GetInputWidth() const           {return fInputWidth;}
+   size_t GetWidth() const                {return fWidth;}
+   Scalar_t GetDropoutProbability() const {return fDropoutProbability;}
+
+   void SetDropoutProbability(Scalar_t p) {fDropoutProbability = p;}
+
+   EActivationFunction GetActivationFunction() const {return fF;}
+
+   Matrix_t       & GetOutput()        {return fOutput;}
+   const Matrix_t & GetOutput() const  {return fOutput;}
+   Matrix_t       & GetWeights() const {return fWeights;}
+   Matrix_t       & GetBiases()        {return fBiases;}
+   const Matrix_t & GetBiases() const  {return fBiases;}
+   Matrix_t       & GetActivationGradients()       {return fActivationGradients;}
+   const Matrix_t & GetActivationGradients() const {return fActivationGradients;}
+   Matrix_t       & GetBiasGradients()       {return fBiasGradients;}
+   const Matrix_t & GetBiasGradients() const {return fBiasGradients;}
+   Matrix_t       & GetWeightGradients()       {return fWeightGradients;}
+   const Matrix_t & GetWeightGradients() const {return fWeightGradients;}
+
+};
+
+//______________________________________________________________________________
+//
+// The Layer Class - Implementation
+//______________________________________________________________________________
+
+template<typename Architecture_t>
+   TLayer<Architecture_t>::TLayer(size_t batchSize,
+                                  size_t inputWidth,
+                                  size_t width,
+                                  EActivationFunction f,
+                                  Scalar_t dropoutProbability)
+   : fBatchSize(batchSize), fInputWidth(inputWidth), fWidth(width),
+   fDropoutProbability(dropoutProbability), fWeights(width, fInputWidth),
+   fBiases(width, 1), fOutput(fBatchSize, width), fDerivatives(fBatchSize, width),
+   fWeightGradients(width, fInputWidth), fBiasGradients(width, 1),
+   fActivationGradients(fBatchSize, width), fF(f)
+{
+   // Nothing to do here.
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+TLayer<Architecture_t>::TLayer(const TLayer &layer)
+   : fBatchSize(layer.fBatchSize),
+   fInputWidth(layer.fInputWidth), fWidth(layer.fWidth),
+   fDropoutProbability(layer.fDropoutProbability),
+   fWeights(layer.fWidth, layer.fInputWidth), fBiases(layer.fWidth, 1),
+   fOutput(layer.fBatchSize, layer.fWidth),
+   fDerivatives(layer.fBatchSize, layer.fWidth),
+   fWeightGradients(layer.fWidth, layer.fInputWidth),
+   fBiasGradients(layer.fWidth, 1),
+   fActivationGradients(layer.fBatchSize, layer.fWidth),
+   fF(layer.fF)
+{
+   // Nothing to do here.
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+auto TLayer<Architecture_t>::Initialize(EInitialization m)
+-> void
+{
+   initialize<Architecture_t>(fWeights, m);
+   initialize<Architecture_t>(fBiases, EInitialization::ZERO);
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+auto inline TLayer<Architecture_t>::Forward(Matrix_t & input,
+                                            bool applyDropout)
+-> void
+{
+   if (applyDropout && (fDropoutProbability != 1.0)) {
+      Architecture_t::Dropout(input, fDropoutProbability);
+   }
+   Architecture_t::MultiplyTranspose(fOutput, input, fWeights);
+   Architecture_t::AddRowWise(fOutput, fBiases);
+   evaluateDerivative<Architecture_t>(fDerivatives, fF, fOutput);
+   evaluate<Architecture_t>(fOutput, fF);
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+auto TLayer<Architecture_t>::Backward(Matrix_t & gradients_backward,
+                                      const Matrix_t & activations_backward,
+                                      ERegularization r,
+                                      Scalar_t weightDecay)
+-> void
+{
+   Architecture_t::Backward(gradients_backward,
+                            fWeightGradients,
+                            fBiasGradients,
+                            fDerivatives,
+                            fActivationGradients,
+                            fWeights,
+                            activations_backward);
+   addRegularizationGradients<Architecture_t>(fWeightGradients,
+                                              fWeights,
+                                              weightDecay, r);
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+   void TLayer<Architecture_t>::Print() const
+{
+   std::cout << "Width: " << fWeights.GetNrows();
+   std::cout << ", activation function: ";
+   std::cout << static_cast<int>(fF) << std::endl;
+   std::cout << "Weights: " << std::endl;
+   ((TMatrixT<Scalar_t>) fWeights).Print();
+   std::cout << "Biases: " << std::endl;
+   ((TMatrixT<Scalar_t>) fBiases).Print();
+   std::cout << "Output: " << std::endl;
+   ((TMatrixT<Scalar_t>) fOutput).Print();
+   std::cout << "Weight gradients: " << std::endl;
+   ((TMatrixT<Scalar_t>) fWeightGradients).Print();
+   std::cout << "Bias gradients: " << std::endl;
+   ((TMatrixT<Scalar_t>) fBiasGradients).Print();
+}
+
+//______________________________________________________________________________
+//
+// The Shared Layer Class - Implementation
+//______________________________________________________________________________
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+TSharedLayer<Architecture_t>::TSharedLayer(size_t batchSize,
+                                           TLayer<Architecture_t> &layer)
+: fBatchSize(batchSize),
+fInputWidth(layer.GetInputWidth()), fWidth(layer.GetWidth()),
+fDropoutProbability(layer.GetDropoutProbability()),
+fWeights(layer.GetWeights()), fBiases(layer.GetBiases()),
+fOutput(fBatchSize, fWidth), fDerivatives(fBatchSize, fWidth),
+fWeightGradients(fWidth, fInputWidth), fBiasGradients(fWidth, 1),
+fActivationGradients(fBatchSize, fWidth), fF(layer.GetActivationFunction())
+{
+   // Nothing to do here.
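+   // (The weight and bias matrices are shared by reference with the given
+   // layer; only activations and gradients are allocated per instance.)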
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+TSharedLayer<Architecture_t>::TSharedLayer(const TSharedLayer &layer)
+   : fBatchSize(layer.fBatchSize),
+   fInputWidth(layer.GetInputWidth()), fWidth(layer.GetWidth()),
+   fDropoutProbability(layer.fDropoutProbability),
+   fWeights(layer.fWeights), fBiases(layer.fBiases),
+   fOutput(layer.fBatchSize, fWidth), fDerivatives(layer.fBatchSize, fWidth),
+   fWeightGradients(fWidth, fInputWidth), fBiasGradients(fWidth, 1),
+   fActivationGradients(layer.fBatchSize, fWidth),
+   fF(layer.fF)
+{
+   // Nothing to do here.
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+auto inline TSharedLayer<Architecture_t>::Forward(Matrix_t & input,
+                                                  bool applyDropout)
+-> void
+{
+   if (applyDropout && (fDropoutProbability != 1.0)) {
+      Architecture_t::Dropout(input, fDropoutProbability);
+   }
+   Architecture_t::MultiplyTranspose(fOutput, input, fWeights);
+   Architecture_t::AddRowWise(fOutput, fBiases);
+   evaluateDerivative<Architecture_t>(fDerivatives, fF, fOutput);
+   evaluate<Architecture_t>(fOutput, fF);
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+auto inline TSharedLayer<Architecture_t>::Backward(Matrix_t & gradients_backward,
+                                                   const Matrix_t & activations_backward,
+                                                   ERegularization r,
+                                                   Scalar_t weightDecay)
+-> void
+{
+   Architecture_t::Backward(gradients_backward,
+                            fWeightGradients,
+                            fBiasGradients,
+                            fDerivatives,
+                            fActivationGradients,
+                            fWeights,
+                            activations_backward);
+   addRegularizationGradients<Architecture_t>(fWeightGradients,
+                                              fWeights,
+                                              weightDecay, r);
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+void TSharedLayer<Architecture_t>::Print() const
+{
+   std::cout << "Width: " << fWeights.GetNrows();
+   std::cout << ", activation function: ";
+   std::cout << static_cast<int>(fF) << std::endl;
+}
+
+} // namespace DNN
+} // namespace TMVA
+
+#endif
diff --git a/tmva/tmva/inc/TMVA/DNN/Minimizers.h b/tmva/tmva/inc/TMVA/DNN/Minimizers.h
new file mode 100644
index 0000000000000..e54dd3a78a483
--- /dev/null
+++ b/tmva/tmva/inc/TMVA/DNN/Minimizers.h
@@ -0,0 +1,407 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh 21/06/16
+
+/*************************************************************************
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+#ifndef TMVA_DNN_MINIMIZERS
+#define TMVA_DNN_MINIMIZERS
+
+#include "DataLoader.h"
+#include "Functions.h"
+#include <chrono>
+
+namespace TMVA {
+namespace DNN {
+
+//______________________________________________________________________________
+//
+// Generic Gradient Descent Class
+//______________________________________________________________________________
+//
+
+/** \class TGradientDescent
+
+    Generic implementation of gradient descent minimization.
+
+    The TGradientDescent class implements an architecture- and input-data-
+    independent implementation of the gradient descent minimization algorithm.
+
+    The Train(...) method trains a given neural network using the provided training
+    and test (validation) data. The interface between input data and the matrix
+    representation required by the net is given by the prepare_batches(...) and
+    prepare_test_data(...) methods as well as the TBatch class of the
+    architecture-specific back end.
+
+    The prepare_batches(...) method is expected to generate an iterable of batches.
+    On each of these batches, the Step(...) routine of the minimizer is called,
+    which for the gradient descent method just adds the corresponding gradients
+    scaled by \f$-\alpha\f$ to the weights and biases of each layer. Here
+    \f$\alpha\f$ is the learning rate of the gradient descent method.
+
+    The prepare_test_data(...) routine should return a batch representing
+    the test data, which is used to evaluate the performance of the net
+    every testInterval steps.
+
+    \tparam Architecture_t Type representing which implementation of the low-level
+    interface to use.
+ */
+template<typename Architecture_t>
+class TGradientDescent
+{
+public:
+   using Scalar_t = typename Architecture_t::Scalar_t;
+   using Matrix_t = typename Architecture_t::Matrix_t;
+
+private:
+   size_t fBatchSize;        ///< Batch size to use for the training.
+   size_t fStepCount;        ///< Number of steps performed in the current training session.
+   size_t fConvergenceSteps; ///< Number of training epochs without considerable decrease in the test error for convergence.
+   size_t fConvergenceCount; ///< Current number of training epochs without considerable decrease in the test error.
+   size_t fTestInterval;     ///< Interval for the computation of the test error.
+   Scalar_t fTrainingError;  ///< Holds the most recently computed training loss.
+   Scalar_t fTestError;      ///< Holds the most recently computed test loss.
+   Scalar_t fLearningRate;   ///< Learning rate \f$\alpha\f$
+   Scalar_t fMinimumError;   ///< The minimum loss achieved on the training set during the current training session.
+
+public:
+   TGradientDescent();
+   TGradientDescent(Scalar_t learningRate,
+                    size_t convergenceSteps,
+                    size_t testInterval);
+   /*! Reset minimizer object to initial state. Does nothing for this minimizer. */
+   void Reset() {}
+   /*! Train the given net using the given training input data (events), training
+     output data (labels), test input data (events), test output data (labels). */
+   template <typename Data_t, typename Net_t>
+   Scalar_t Train(const Data_t & TrainingDataIn, size_t nTrainingSamples,
+                  const Data_t & TestDataIn, size_t nTestSamples,
+                  Net_t & net, size_t nThreads = 1);
+   /*! Perform a single optimization step on a given batch. Propagates the input
+     matrix forward through the net, evaluates the loss and propagates the gradients
+     backward through the net. The computed gradients are scaled by the learning
+     rate \f$\alpha\f$ and subtracted from the weights and bias values of each
+     layer. */
+   template <typename Net_t>
+   void Step(Net_t &net, Matrix_t &input, const Matrix_t &output);
+   template <typename Net_t>
+   void Step(Net_t &master,
+             std::vector<Net_t> &nets,
+             std::vector<TBatch<Architecture_t>> &batches);
+   /** Does not evaluate the loss and therefore does not trigger a possible
+    *  synchronization with the device. Trains the weights of each layer, but
+    *  only the bias terms of the first layer for compatibility with the
+    *  previous implementation. */
+   template <typename Net_t>
+   void StepReducedWeights(Net_t &net,
+                           Matrix_t &input,
+                           const Matrix_t &output);
+   /** Similar to StepReducedWeights(...) but also evaluates the loss. May trigger
+    *  synchronization with the device.
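+    *  For example (illustrative): Scalar_t loss =
+    *  minimizer.StepReducedWeightsLoss(net, batch.GetInput(), batch.GetOutput());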
+    */
+   template <typename Net_t>
+   Scalar_t StepReducedWeightsLoss(Net_t &net,
+                                   Matrix_t &input,
+                                   const Matrix_t &output);
+   template <typename Net_t>
+   inline void TestError(Net_t &net,
+                         Matrix_t &input,
+                         const Matrix_t &output);
+   bool HasConverged();
+
+   size_t   GetConvergenceCount() const {return fConvergenceCount;}
+   size_t   GetConvergenceSteps() const {return fConvergenceSteps;}
+   Scalar_t GetTrainingError() const    {return fTrainingError;}
+   Scalar_t GetTestError() const        {return fTestError;}
+   size_t   GetTestInterval() const     {return fTestInterval;}
+
+   void SetConvergenceSteps(size_t steps)  {fConvergenceSteps = steps;}
+   void SetTestInterval(size_t interval)   {fTestInterval = interval;}
+   void SetLearningRate(Scalar_t rate)     {fLearningRate = rate;}
+};
+
+//______________________________________________________________________________
+//
+// Implementation
+//______________________________________________________________________________
+template<typename Architecture_t>
+   TGradientDescent<Architecture_t>::TGradientDescent()
+   : fBatchSize(0), fStepCount(0), fConvergenceSteps(0),
+     fConvergenceCount(0), fTestInterval(0), fLearningRate(0),
+     fMinimumError(1e100)
+{
+   // Nothing to do here.
+}
+//______________________________________________________________________________
+template<typename Architecture_t>
+TGradientDescent<Architecture_t>::TGradientDescent(Scalar_t learningRate,
+                                                   size_t convergenceSteps,
+                                                   size_t testInterval)
+   : fBatchSize(0), fStepCount(0), fConvergenceSteps(convergenceSteps),
+     fConvergenceCount(0), fTestInterval(testInterval), fLearningRate(learningRate),
+     fMinimumError(1e100)
+{
+   // Nothing to do here.
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+template <typename Data_t, typename Net_t>
+   auto TGradientDescent<Architecture_t>::Train(const Data_t & trainingData,
+                                                size_t nTrainingSamples,
+                                                const Data_t & testData,
+                                                size_t nTestSamples,
+                                                Net_t & net,
+                                                size_t nThreads)
+   -> Scalar_t
+{
+   // Reset iteration state.
+   fMinimumError     = 1e100;
+   fConvergenceCount = 0;
+   fStepCount        = 0;
+
+   // Prepare training data.
+   bool converged = false;
+
+   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
+                                                   net.GetBatchSize(),
+                                                   net.GetInputWidth(),
+                                                   net.GetOutputWidth(), nThreads);
+   auto testNet = net.CreateClone(nTestSamples);
+   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
+                                                  testNet.GetBatchSize(),
+                                                  testNet.GetInputWidth(),
+                                                  net.GetOutputWidth());
+   std::vector<Net_t> nets{};
+   nets.reserve(nThreads);
+   for (size_t i = 0; i < nThreads; i++) {
+      nets.push_back(net);
+      for (size_t j = 0; j < net.GetDepth(); j++)
+      {
+         auto &masterLayer = net.GetLayer(j);
+         auto &layer       = nets.back().GetLayer(j);
+         Architecture_t::Copy(layer.GetWeights(),
+                              masterLayer.GetWeights());
+         Architecture_t::Copy(layer.GetBiases(),
+                              masterLayer.GetBiases());
+      }
+   }
+
+   std::chrono::time_point<std::chrono::system_clock> start, end;
+   start = std::chrono::system_clock::now();
+
+   while (!converged)
+   {
+      fStepCount++;
+
+      std::vector<TBatch<Architecture_t>> batches{};
+      batches.reserve(nThreads);
+      for (size_t i = 0; i < nTrainingSamples / net.GetBatchSize(); i += nThreads) {
+         batches.clear();
+         for (size_t j = 0; j < nThreads; j++) {
+            batches.push_back(trainLoader.GetBatch());
+         }
+         Step(net, nets, batches);
+      }
+
+      // Compute test error.
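+      // The block below also reports the training throughput, estimated from
+      // the flop count of the net and the wall-clock time per test interval.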
+      if ((fStepCount % fTestInterval) == 0) {
+         end = std::chrono::system_clock::now();
+         std::chrono::duration<double> elapsed_seconds = end - start;
+         start = std::chrono::system_clock::now();
+         double seconds = elapsed_seconds.count();
+         double nFlops  = (double) (fTestInterval * (nTrainingSamples / net.GetBatchSize()));
+         nFlops *= net.GetNFlops();
+         std::cout << "Elapsed time for " << fTestInterval << " Epochs: "
+                   << seconds << " [s] => " << nFlops * 1e-9 / seconds
+                   << " GFlop/s" << std::endl;
+         auto b = *testLoader.begin();
+         auto inputMatrix  = b.GetInput();
+         auto outputMatrix = b.GetOutput();
+
+         Scalar_t loss = testNet.Loss(inputMatrix, outputMatrix);
+         std::cout << fStepCount << ": " << loss << std::endl;
+         converged = HasConverged();
+      }
+
+   }
+   return fMinimumError;
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+   template <typename Net_t>
+   void inline TGradientDescent<Architecture_t>::Step(Net_t & net,
+                                                      Matrix_t &input,
+                                                      const Matrix_t &output)
+{
+   //Scalar_t loss = net.Loss(input, output);
+   //fTrainingError = loss;
+   net.Forward(input);
+   net.Backward(input, output);
+
+   for (size_t i = 0; i < net.GetDepth(); i++)
+   {
+      auto &layer = net.GetLayer(i);
+      Architecture_t::ScaleAdd(layer.GetWeights(),
+                               layer.GetWeightGradients(),
+                               -fLearningRate);
+      Architecture_t::ScaleAdd(layer.GetBiases(),
+                               layer.GetBiasGradients(),
+                               -fLearningRate);
+   }
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+   template <typename Net_t>
+   void inline TGradientDescent<Architecture_t>::Step(
+       Net_t & master,
+       std::vector<Net_t> & nets,
+       std::vector<TBatch<Architecture_t>> & batches)
+{
+   typename Architecture_t::Matrix_t dummy(0,0);
+   size_t depth = master.GetDepth();
+
+   // Forward
+   for (size_t j = 0; j < nets.size(); j++) {
+      nets[j].GetLayer(0).Forward(batches[j].GetInput());
+   }
+
+   for (size_t i = 1; i < depth; i++)
+   {
+      for (size_t j = 0; j < nets.size(); j++) {
+         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput());
+      }
+   }
+   // Gradients
+   for (size_t j = 0; j < nets.size(); j++) {
+      evaluateGradients<Architecture_t>(
+          nets[j].GetLayer(depth-1).GetActivationGradients(),
+          nets[j].GetLossFunction(),
+          batches[j].GetOutput(),
+          nets[j].GetLayer(depth-1).GetOutput());
+   }
+   // Backward
+   for (size_t i = depth - 1; i > 0; i--)
+   {
+      for (size_t j = 0; j < nets.size(); j++) {
+         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
+                                      nets[j].GetLayer(i-1).GetOutput(),
+                                      nets[j].GetRegularization(),
+                                      nets[j].GetWeightDecay());
+      }
+   }
+   for (size_t j = 0; j < nets.size(); j++) {
+      nets[j].GetLayer(0).Backward(dummy,
+                                   batches[j].GetInput(),
+                                   nets[j].GetRegularization(),
+                                   nets[j].GetWeightDecay());
+   }
+
+   for (size_t j = 0; j < nets.size(); j++) {
+      for (size_t i = 0; i < depth; i++)
+      {
+         auto &masterLayer = master.GetLayer(i);
+         auto &layer       = nets[j].GetLayer(i);
+         Architecture_t::ScaleAdd(masterLayer.GetWeights(),
+                                  layer.GetWeightGradients(),
+                                  -fLearningRate);
+         Architecture_t::Copy(layer.GetWeights(),
+                              masterLayer.GetWeights());
+         Architecture_t::ScaleAdd(masterLayer.GetBiases(),
+                                  layer.GetBiasGradients(),
+                                  -fLearningRate);
+         Architecture_t::Copy(layer.GetBiases(),
+                              masterLayer.GetBiases());
+      }
+   }
+}
+
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+template <typename Net_t>
+void inline TGradientDescent<Architecture_t>::StepReducedWeights(
+    Net_t & net,
+    Matrix_t &input,
+    const Matrix_t &output)
+{
+   net.Forward(input);
+   net.Backward(input, output);
+
+   for (size_t i = 0; i < net.GetDepth(); i++)
+   {
+      auto &layer = net.GetLayer(i);
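+      // Update the weights of every layer but, for compatibility with the
+      // previous implementation, the biases of the first layer only.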
+      Architecture_t::ScaleAdd(layer.GetWeights(),
+                               layer.GetWeightGradients(),
+                               -fLearningRate);
+      if (i == 0) {
+         Architecture_t::ScaleAdd(layer.GetBiases(),
+                                  layer.GetBiasGradients(),
+                                  -fLearningRate);
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+   template <typename Net_t>
+   auto inline TGradientDescent<Architecture_t>::StepReducedWeightsLoss(
+       Net_t & net,
+       Matrix_t &input,
+       const Matrix_t &output)
+   -> Scalar_t
+{
+   Scalar_t loss = net.Loss(input, output);
+   fTrainingError = loss;
+   net.Backward(input, output);
+
+   for (size_t i = 0; i < net.GetDepth(); i++)
+   {
+      auto &layer = net.GetLayer(i);
+      Architecture_t::ScaleAdd(layer.GetWeights(),
+                               layer.GetWeightGradients(),
+                               -fLearningRate);
+      if (i == 0) {
+         Architecture_t::ScaleAdd(layer.GetBiases(),
+                                  layer.GetBiasGradients(),
+                                  -fLearningRate);
+      }
+   }
+   return loss;
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+   template <typename Net_t>
+   inline void TGradientDescent<Architecture_t>::TestError(Net_t & net,
+                                                           Matrix_t &input,
+                                                           const Matrix_t &output)
+{
+   fTestError = net.Loss(input, output, false);
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+bool inline TGradientDescent<Architecture_t>::HasConverged()
+{
+   if (fTestError < fMinimumError * 0.999) {
+      fConvergenceCount = 0;
+      fMinimumError     = fTestError;
+   } else {
+      fConvergenceCount++;
+   }
+
+   return (fConvergenceCount >= fConvergenceSteps);
+}
+
+} // namespace DNN
+} // namespace TMVA
+
+#endif
diff --git a/tmva/tmva/inc/TMVA/DNN/Net.h b/tmva/tmva/inc/TMVA/DNN/Net.h
new file mode 100644
index 0000000000000..4a4ae9957ef72
--- /dev/null
+++ b/tmva/tmva/inc/TMVA/DNN/Net.h
@@ -0,0 +1,368 @@
+// @(#)root/tmva: $Id$
+// Author: Simon Pfreundschuh 20/06/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+#ifndef TMVA_DNN_NET
+#define TMVA_DNN_NET
+
+#include <vector>
+#include <iostream>
+
+#include "Layer.h"
+
+namespace TMVA {
+namespace DNN {
+
+/** \class TNet
+
+    Generic neural network class.
+
+    This generic neural network class represents a concrete neural
+    network through a vector of layers and coordinates the forward
+    and backward propagation through the net.
+
+    The net takes as input a batch from the training data given in
+    matrix form, with each row corresponding to a certain training
+    event.
+
+    On construction, the neural network allocates all the memory
+    required for the training of the neural net and keeps it until
+    its destruction.
+
+    The Architecture type argument simply holds the
+    architecture-specific data types, which are the matrix type
+    Matrix_t and the scalar type Scalar_t.
+
+    \tparam Architecture_t The architecture type, which holds the
+    architecture-specific data types for the network.
+    \tparam Layer_t The type used for the layers; can be either
+    TLayer or TSharedLayer.
+*/
+template<typename Architecture_t,
+         typename Layer_t = TLayer<Architecture_t>>
+   class TNet {
+
+public:
+   using Matrix_t        = typename Architecture_t::Matrix_t;
+   using Scalar_t        = typename Architecture_t::Scalar_t;
+   using LayerIterator_t = typename std::vector<Layer_t>::iterator;
+
+private:
+   size_t fBatchSize;  ///< Batch size for training and evaluation of the Network.
+   size_t fInputWidth; ///< Number of features in a single input event.
+
+   std::vector<Layer_t> fLayers; ///< Layers in the network.
+
+   Matrix_t fDummy;         ///< Empty matrix for last step in back propagation.
+   ELossFunction fJ;        ///< The loss function of the network.
+   ERegularization fR;      ///< The regularization used for the network.
+   Scalar_t fWeightDecay;   ///< The weight decay factor.
+
+public:
+   TNet();
+   TNet(const TNet & other);
+   /*! Construct a neural net for a given batch size and input width, with
+    *  the given loss function and regularization. */
+   TNet(size_t batchSize,
+        size_t inputWidth,
+        ELossFunction J,
+        ERegularization R = ERegularization::NONE,
+        Scalar_t weightDecay = 0.0);
+   /*! Create a clone that uses the same weight and bias matrices but
+    *  potentially a different batch size. */
+   TNet<Architecture_t, TSharedLayer<Architecture_t>> CreateClone(size_t batchSize);
+
+   /*! Add a layer of the given size to the neural net. */
+   void AddLayer(size_t width, EActivationFunction f,
+                 Scalar_t dropoutProbability = 1.0);
+
+   /*! Remove all layers from the network. */
+   void Clear();
+
+   /*! Add a layer which shares its weights with another TNet instance. */
+   template <typename SharedLayer>
+   void AddLayer(SharedLayer & layer);
+
+   /*! Iterator to the first layer of the net. */
+   LayerIterator_t LayersBegin() {return fLayers.begin();}
+
+   /*! Iterator to the last layer of the net. */
+   LayerIterator_t LayersEnd() {return fLayers.end();}
+
+   /*! Initialize the weights in the net with the given
+    *  initialization method. */
+   inline void Initialize(EInitialization m);
+
+   /*! Forward a given input through the neural net. Computes
+    *  all layer activations up to the output layer. */
+   inline void Forward(Matrix_t& X, bool applyDropout = false);
+
+   /*! Compute the weight gradients in the net from the given training
+    *  samples X and training labels Y. */
+   inline void Backward(const Matrix_t &X, const Matrix_t &Y);
+
+   /*! Evaluate the loss function of the net using the activations
+    *  that are currently stored in the output layer. */
+   inline Scalar_t Loss(const Matrix_t &Y) const;
+
+   /*! Propagate the input batch X through the net and evaluate the
+    *  error function for the resulting activations of the output
+    *  layer. */
+   inline Scalar_t Loss(Matrix_t &X, const Matrix_t &Y, bool applyDropout = false);
+
+   /*! Compute the neural network prediction obtained from forwarding the
+    *  batch X through the neural network and applying the output function
+    *  f to the activation of the last layer in the network. */
+   inline void Prediction(Matrix_t &Y_hat, Matrix_t &X, EOutputFunction f);
+
+   /*! Compute the neural network prediction obtained from applying the output
+    *  function f to the activation of the last layer in the network.
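+    *  For example (illustrative): net.Prediction(Yhat, EOutputFunction::SIGMOID)
+    *  maps the output-layer activations through a sigmoid, e.g. to obtain
+    *  class probabilities.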
+    */
+   inline void Prediction(Matrix_t &Y_hat, EOutputFunction f) const;
+
+   Scalar_t GetNFlops();
+
+   size_t GetDepth() const          {return fLayers.size();}
+   size_t GetBatchSize() const      {return fBatchSize;}
+   Layer_t & GetLayer(size_t i)     {return fLayers[i];}
+   const Layer_t & GetLayer(size_t i) const {return fLayers[i];}
+   ELossFunction GetLossFunction() const    {return fJ;}
+   Matrix_t & GetOutput()           {return fLayers.back().GetOutput();}
+   size_t GetInputWidth() const     {return fInputWidth;}
+   size_t GetOutputWidth() const    {return fLayers.back().GetWidth();}
+   ERegularization GetRegularization() {return fR;}
+   Scalar_t GetWeightDecay()           {return fWeightDecay;}
+
+   void SetInputWidth(size_t InputWidth)   {fInputWidth = InputWidth;}
+   void SetRegularization(ERegularization R) {fR = R;}
+   void SetLossFunction(ELossFunction J)     {fJ = J;}
+   void SetWeightDecay(Scalar_t weightDecay) {fWeightDecay = weightDecay;}
+
+   void Print();
+};
+
+//______________________________________________________________________________
+
+template<typename Architecture_t, typename Layer_t>
+   TNet<Architecture_t, Layer_t>::TNet()
+   : fBatchSize(0), fInputWidth(0), fDummy(0,0),
+   fJ(ELossFunction::MEANSQUAREDERROR), fR(ERegularization::NONE),
+   fWeightDecay(0.0)
+{
+   // Nothing to do here.
+}
+
+//______________________________________________________________________________
+
+template<typename Architecture_t, typename Layer_t>
+   TNet<Architecture_t, Layer_t>::TNet(const TNet & other)
+   : fBatchSize(other.fBatchSize), fInputWidth(other.fInputWidth),
+   fLayers(other.fLayers), fDummy(0,0), fJ(other.fJ), fR(other.fR),
+   fWeightDecay(other.fWeightDecay)
+{
+   // Nothing to do here.
+}
+
+//______________________________________________________________________________
+
+template<typename Architecture_t, typename Layer_t>
+   TNet<Architecture_t, Layer_t>::TNet(size_t batchSize,
+                                       size_t inputWidth,
+                                       ELossFunction J,
+                                       ERegularization R,
+                                       Scalar_t weightDecay)
+   : fBatchSize(batchSize), fInputWidth(inputWidth), fDummy(0,0),
+   fJ(J), fR(R), fWeightDecay(weightDecay)
+{
+   // Nothing to do here.
+}
+
+//______________________________________________________________________________
+
+template<typename Architecture_t, typename Layer_t>
+   auto TNet<Architecture_t, Layer_t>::CreateClone(size_t batchSize)
+   -> TNet<Architecture_t, TSharedLayer<Architecture_t>>
+{
+   TNet<Architecture_t, TSharedLayer<Architecture_t>> other(batchSize,
+                                                            fInputWidth,
+                                                            fJ, fR);
+   for (auto &l : fLayers) {
+      other.AddLayer(l);
+   }
+   return other;
+}
+
+//______________________________________________________________________________
+
+template<typename Architecture_t, typename Layer_t>
+   void TNet<Architecture_t, Layer_t>::AddLayer(size_t width,
+                                                EActivationFunction f,
+                                                Scalar_t dropoutProbability)
+{
+   if (fLayers.size() == 0) {
+      fLayers.emplace_back(fBatchSize, fInputWidth, width, f, dropoutProbability);
+   } else {
+      size_t prevWidth = fLayers.back().GetWidth();
+      fLayers.emplace_back(fBatchSize, prevWidth, width, f, dropoutProbability);
+   }
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+   void TNet<Architecture_t, Layer_t>::Clear()
+{
+   fLayers.clear();
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+   template <typename SharedLayer_t>
+   inline void TNet<Architecture_t, Layer_t>::AddLayer(SharedLayer_t & layer)
+{
+   fLayers.emplace_back(fBatchSize, layer);
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+   inline void TNet<Architecture_t, Layer_t>::Initialize(EInitialization m)
+{
+   for (auto &l : fLayers) {
+      l.Initialize(m);
+   }
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+inline void TNet<Architecture_t, Layer_t>::Forward(Matrix_t &input,
+                                                   bool applyDropout)
+{
+   fLayers.front().Forward(input, applyDropout);
+
+   for (size_t i = 1; i < fLayers.size(); i++) {
+      fLayers[i].Forward(fLayers[i-1].GetOutput(), applyDropout);
+   }
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+   inline void TNet<Architecture_t, Layer_t>::Backward(const Matrix_t &X,
+                                                       const Matrix_t &Y)
+{
+
+   evaluateGradients<Architecture_t>(fLayers.back().GetActivationGradients(),
+                                     fJ, Y, fLayers.back().GetOutput());
+
+   for (size_t i = fLayers.size()-1; i > 0; i--) {
+      auto & activation_gradient_backward
+         = fLayers[i-1].GetActivationGradients();
+      auto & activations_backward
+         = fLayers[i-1].GetOutput();
+      fLayers[i].Backward(activation_gradient_backward,
+                          activations_backward, fR, fWeightDecay);
+   }
+   fLayers[0].Backward(fDummy, X, fR, fWeightDecay);
+
+}
+
+//______________________________________________________________________________
+
+template<typename Architecture_t, typename Layer_t>
+   inline auto TNet<Architecture_t, Layer_t>::Loss(const Matrix_t &Y) const
+   -> Scalar_t
+{
+   auto loss = evaluate<Architecture_t>(fJ, Y, fLayers.back().GetOutput());
+   for (auto &l : fLayers) {
+      loss += fWeightDecay * regularization<Architecture_t>(l.GetWeights(), fR);
+   }
+   return loss;
+}
+
+//______________________________________________________________________________
+
+template<typename Architecture_t, typename Layer_t>
+   inline auto TNet<Architecture_t, Layer_t>::Loss(Matrix_t &X,
+                                                   const Matrix_t &Y,
+                                                   bool applyDropout)
+   -> Scalar_t
+{
+   Forward(X, applyDropout);
+   return Loss(Y);
+}
+
+//______________________________________________________________________________
+
+template<typename Architecture_t, typename Layer_t>
+   inline void TNet<Architecture_t, Layer_t>::Prediction(Matrix_t &Yhat,
+                                                         Matrix_t &X,
+                                                         EOutputFunction f)
+{
+   Forward(X, false);
+   evaluate<Architecture_t>(Yhat, f, fLayers.back().GetOutput());
+}
+
+//______________________________________________________________________________
+
+template<typename Architecture_t, typename Layer_t>
+   inline void TNet<Architecture_t, Layer_t>::Prediction(Matrix_t &Y_hat,
+                                                         EOutputFunction f) const
+{
+   evaluate<Architecture_t>(Y_hat, f, fLayers.back().GetOutput());
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+auto TNet<Architecture_t, Layer_t>::GetNFlops()
+   -> Scalar_t
+{
+   Scalar_t flops = 0;
+
+   Scalar_t nb  = (Scalar_t) fBatchSize;
+   Scalar_t nlp = (Scalar_t) fInputWidth;
+
+   for(size_t i = 0; i < fLayers.size(); i++) {
+      Layer_t & layer = fLayers[i];
+      Scalar_t nl = (Scalar_t) layer.GetWidth();
+
+      // Forward propagation.
+      flops += nb * nl * (2.0 * nlp - 1); // Matrix mult.
+      flops += nb * nl;                   // Add bias values.
+      flops += 2 * nb * nl;               // Apply activation function and compute
+                                          // derivative.
+      // Backward propagation.
+      flops += nb * nl;                       // Hadamard
+      flops += nl * nlp * (2.0 * nb  - 1.0);  // Weight gradients
+      flops += nl * (nb - 1);                 // Bias gradients
+      if (i > 0) {
+         flops += nlp * nb * (2.0 * nl - 1.0); // Previous layer gradients.
+      }
+      nlp = nl;
+   }
+   return flops;
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+   void TNet<Architecture_t, Layer_t>::Print()
+{
+   std::cout << "DEEP NEURAL NETWORK:";
+   std::cout << " Loss function: " << static_cast<char>(fJ);
+   std::cout << ", size: " << fLayers.size() << std::endl;
+
+   size_t i = 1;
+   for (auto & l : fLayers) {
+      std::cout << "DNN Layer " << i << ":" << std::endl;
+      l.Print();
+      i++;
+   }
+
+}
+
+//______________________________________________________________________________
+
+} // namespace DNN
+} // namespace TMVA
+
+#endif
diff --git a/tmva/tmva/inc/TMVA/MethodDNN.h b/tmva/tmva/inc/TMVA/MethodDNN.h
index 3b6bdc13971bb..0325459a4b706 100644
--- a/tmva/tmva/inc/TMVA/MethodDNN.h
+++ b/tmva/tmva/inc/TMVA/MethodDNN.h
@@ -57,6 +57,12 @@
 #include "TMVA/NeuralNet.h"
 #endif
 
+#include "TMVA/DNN/Net.h"
+#include "TMVA/DNN/Minimizers.h"
+
+#ifdef DNNCUDA
+#include "TMVA/DNN/Architectures/Cuda.h"
+#endif
 
 namespace TMVA {
 
@@ -82,6 +88,7 @@
       std::vector<std::map<TString, TString>> ParseKeyValueString(TString parseString,
                                                                   TString blockDelim,
                                                                   TString tokenDelim);
       void Train();
+      void TrainGPU();
 
       virtual Double_t GetMvaValue( Double_t* err=0, Double_t* errUpper=0 );
       virtual const std::vector<Float_t>& GetRegressionValues();
@@ -142,6 +149,8 @@
       bool fResume;
       TString fWeightInitializationStrategyString;
      TMVA::DNN::WeightInitializationStrategy fWeightInitializationStrategy;
+      TString fGPUString;
+      bool fGPU;
 
       std::vector<std::map<TString, TString>> fSettings;
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/ActivationFunctions.cu b/tmva/tmva/src/DNN/Architectures/Cuda/ActivationFunctions.cu
new file mode 100644
index 0000000000000..dad198013880c
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/ActivationFunctions.cu
@@ -0,0 +1,194 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 13/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+ //////////////////////////////////////////////////////////////////
+ // Implementation of the activation functions for the TCuda     //
+ // implementation of the low-level interface.                   //
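+ // Each function launches a CUDA kernel on the compute stream   //
+ // of its input matrix, with launch dimensions from TDevice.    //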
+ //////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/Architectures/Cuda/Device.h"
+#include "TMVA/DNN/Architectures/Cuda/Kernels.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//______________________________________________________________________________
+void TCuda::IdentityDerivative(TCudaMatrix & B, const TCudaMatrix & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(B);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::IdentityDerivative<<<gridDims, blockDims, 0, s>>>(
+       B.GetDataPointer(),
+       (int) B.GetNrows(),
+       (int) B.GetNcols());
+   B.SetComputeStream(s);
+}
+
+//______________________________________________________________________________
+void TCuda::Relu(TCudaMatrix & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(A);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::Relu<<<gridDims, blockDims, 0, s>>>(A.GetDataPointer(),
+                                                          (int) A.GetNrows(),
+                                                          (int) A.GetNcols());
+}
+
+//______________________________________________________________________________
+void TCuda::ReluDerivative(TCudaMatrix & B, const TCudaMatrix & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(B);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::ReluDerivative<<<gridDims, blockDims, 0, s>>>(
+       B.GetDataPointer(),
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+   B.SetComputeStream(s);
+}
+
+//______________________________________________________________________________
+void TCuda::Sigmoid(TCudaMatrix & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(A);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::Sigmoid<<<gridDims, blockDims, 0, s>>>(
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+}
+
+//______________________________________________________________________________
+void TCuda::SigmoidDerivative(TCudaMatrix & B, const TCudaMatrix & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(B);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::SigmoidDerivative<<<gridDims, blockDims, 0, s>>>(
+       B.GetDataPointer(),
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+   B.SetComputeStream(s);
+}
+
+//______________________________________________________________________________
+void TCuda::Tanh(TCudaMatrix & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(A);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::Tanh<<<gridDims, blockDims, 0, s>>>(
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+}
+
+//______________________________________________________________________________
+void TCuda::TanhDerivative(TCudaMatrix & B, const TCudaMatrix & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(B);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::TanhDerivative<<<gridDims, blockDims, 0, s>>>(
+       B.GetDataPointer(),
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+   B.SetComputeStream(s);
+}
+
+//______________________________________________________________________________
+void TCuda::SymmetricRelu(TCudaMatrix & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(A);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::SymmetricRelu<<<gridDims, blockDims, 0, s>>>(A.GetDataPointer(),
+                                                                   (int) A.GetNrows(),
+                                                                   (int) A.GetNcols());
+}
+
+//______________________________________________________________________________
+void TCuda::SymmetricReluDerivative(TCudaMatrix & B, const TCudaMatrix & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(B); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::SymmetricReluDerivative<<>>( + B.GetDataPointer(), + A.GetDataPointer(), + (int) A.GetNrows(), + (int) A.GetNcols()); + B.SetComputeStream(s); +} + +//______________________________________________________________________________ +void TCuda::SoftSign(TCudaMatrix & A) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(A); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::SoftSign<<>>( + A.GetDataPointer(), + (int) A.GetNrows(), + (int) A.GetNcols()); +} + +//______________________________________________________________________________ +void TCuda::SoftSignDerivative(TCudaMatrix & B, const TCudaMatrix & A) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(B); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::SoftSignDerivative<<>>( + B.GetDataPointer(), + A.GetDataPointer(), + (int) A.GetNrows(), + (int) A.GetNcols()); + B.SetComputeStream(s); +} + +//______________________________________________________________________________ +void TCuda::Gauss(TCudaMatrix & A) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(A); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::Gauss<<>>( + A.GetDataPointer(), + (int) A.GetNrows(), + (int) A.GetNcols()); +} + +//______________________________________________________________________________ +void TCuda::GaussDerivative(TCudaMatrix & B, const TCudaMatrix & A) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(B); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::GaussDerivative<<>>( + B.GetDataPointer(), + A.GetDataPointer(), + (int) A.GetNrows(), + (int) A.GetNcols()); + B.SetComputeStream(s); +} + +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu new file mode 100644 index 0000000000000..d7a366ec18ce6 --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu @@ -0,0 +1,134 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 13/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////////////// +// Contains additional arithmetic functions required by the CUDA // +// neural network implementation. 
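All of these wrappers follow the same pattern: query the launch geometry from TDevice, fetch the matrix's compute stream, and enqueue the element-wise kernel on that stream. A usage sketch (not part of the patch; sizes and values are illustrative) shows the intended call order, in particular that the derivative must be evaluated before the in-place activation overwrites its input, which is exactly the order the backward pass relies on:

   // Hypothetical usage sketch for the activation interface.
   TMatrixT<Double_t> host(32, 16);     // ... filled with pre-activations ...
   TCudaMatrix A(host);                 // copy to the device
   TCudaMatrix B(32, 16);               // receives df/dx evaluated at A
   TCuda::ReluDerivative(B, A);         // B(i,j) = 0 or 1, element-wise
   TCuda::Relu(A);                      // now apply ReLU to A in place
   TMatrixT<Double_t> derivatives = B;  // copy back for inspection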
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu
new file mode 100644
index 0000000000000..d7a366ec18ce6
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu
@@ -0,0 +1,134 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 13/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+///////////////////////////////////////////////////////////////////
+// Contains additional arithmetic functions required by the CUDA //
+// neural network implementation.                                //
+///////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/Architectures/Cuda/Kernels.h"
+#include "TMVA/DNN/Architectures/Cuda/Device.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//____________________________________________________________________________
+void TCuda::Multiply(TCudaMatrix &C,
+                     const TCudaMatrix &A,
+                     const TCudaMatrix &B)
+{
+   int m, n, k;
+   m = A.GetNrows();
+   k = A.GetNcols();
+   n = B.GetNcols();
+   CudaDouble_t alpha = 1.0, beta = 0.0;
+
+   cudaStream_t s = A.GetComputeStream();
+   cublasSetStream(A.GetCublasHandle(), s);
+
+   // Compute C = beta * C + alpha * (A * B)
+   cublasDgemm(A.GetCublasHandle(),
+               CUBLAS_OP_N, CUBLAS_OP_N,
+               m, n, k, &alpha,
+               A.GetDataPointer(), m,   // *A, lda
+               B.GetDataPointer(), k,   // *B, ldb
+               &beta,                   // beta
+               C.GetDataPointer(), m);  // *C, ldc
+
+   C.SetComputeStream(s);
+}
+
+//____________________________________________________________________________
+void TCuda::TransposeMultiply(TCudaMatrix & C,
+                              const TCudaMatrix & A,
+                              const TCudaMatrix & B)
+{
+   int m, n, k;
+   k = A.GetNrows();
+   m = A.GetNcols();
+   n = B.GetNcols();
+   CudaDouble_t alpha = 1.0, beta = 0.0;
+
+   cudaStream_t s = A.GetComputeStream();
+   cublasSetStream(A.GetCublasHandle(), s);
+
+   // Compute C = beta * C + alpha * (A^T * B)
+   cublasDgemm(A.GetCublasHandle(),
+               CUBLAS_OP_T, CUBLAS_OP_N,
+               m, n, k, &alpha,
+               A.GetDataPointer(), k,   // *A, lda
+               B.GetDataPointer(), k,   // *B, ldb
+               &beta,                   // beta
+               C.GetDataPointer(), m);  // *C, ldc
+
+   C.SetComputeStream(s);
+}
+
+//____________________________________________________________________________
+void TCuda::Hadamard(TCudaMatrix &B,
+                     const TCudaMatrix &A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(B);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::Hadamard<<<gridDims, blockDims, 0, s>>>(
+       B.GetDataPointer(),
+       A.GetDataPointer(),
+       A.GetNrows(),
+       A.GetNcols());
+   B.SetComputeStream(s);
+}
+
+//____________________________________________________________________________
+CudaDouble_t TCuda::Sum(const TCudaMatrix &A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(A);
+   cudaStream_t s = A.GetComputeStream();
+
+   TCudaMatrix::ResetDeviceReturn();
+   ::TMVA::DNN::Cuda::ReduceMatrix<<<gridDims, blockDims, 0, s>>>(
+       TCudaMatrix::GetDeviceReturnPointer(),
+       A.GetDataPointer(),
+       A.GetNrows(),
+       A.GetNcols());
+   return TCudaMatrix::GetDeviceReturn();
+}
+
+//____________________________________________________________________________
+void TCuda::SumColumns(TCudaMatrix &B, const TCudaMatrix &A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(A);
+
+   cudaStream_t s = A.GetComputeStream();
+   cudaMemsetAsync(B.GetDataPointer(), 0, A.GetNcols() * sizeof(CudaDouble_t), s);
+   ::TMVA::DNN::Cuda::SumColumns<<<gridDims, blockDims, 0, s>>>(
+       B.GetDataPointer(),
+       A.GetDataPointer(),
+       A.GetNrows(),
+       A.GetNcols());
+   B.SetComputeStream(s);
+}
+
+//____________________________________________________________________________
+void TCuda::ScaleAdd(TCudaMatrix &B, const TCudaMatrix &A, CudaDouble_t alpha)
+{
+   cudaStream_t s = 0; //A.GetComputeStream();
+   cublasDaxpy(A.GetCublasHandle(), A.GetNoElements(), &alpha,
+               A.GetDataPointer(), 1,
+               B.GetDataPointer(), 1);
+   //B.SetComputeStream(s);
+}
+
+} // DNN
+} // TMVA
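cublasDgemm interprets every buffer as column-major, which is why TCudaMatrix packs its elements column by column and why the row counts are passed as leading dimensions above. A self-contained sketch of the same C = A * B call, with hypothetical 2x3 and 3x2 matrices, makes the convention explicit:

   // Standalone cuBLAS sketch (illustrative sizes, not from the patch).
   #include <cublas_v2.h>
   #include <cuda_runtime.h>

   void GemmSketch()
   {
      const int m = 2, k = 3, n = 2;
      double hA[m * k] = {1, 4, 2, 5, 3, 6};  // column-major 2x3: [[1,2,3],[4,5,6]]
      double hB[k * n] = {1, 0, 0, 0, 1, 0};  // column-major 3x2: [[1,0],[0,1],[0,0]]
      double hC[m * n];
      double *dA, *dB, *dC;
      cudaMalloc(&dA, sizeof(hA));
      cudaMalloc(&dB, sizeof(hB));
      cudaMalloc(&dC, sizeof(hC));
      cudaMemcpy(dA, hA, sizeof(hA), cudaMemcpyHostToDevice);
      cudaMemcpy(dB, hB, sizeof(hB), cudaMemcpyHostToDevice);

      cublasHandle_t handle;
      cublasCreate(&handle);
      double alpha = 1.0, beta = 0.0;
      // Leading dimensions are the row counts, exactly as in TCuda::Multiply.
      cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                  &alpha, dA, m, dB, k, &beta, dC, m);
      cudaMemcpy(hC, dC, sizeof(hC), cudaMemcpyDeviceToHost); // hC = [[1,2],[4,5]]

      cublasDestroy(handle);
      cudaFree(dA); cudaFree(dB); cudaFree(dC);
   }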
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Buffers.cxx b/tmva/tmva/src/DNN/Architectures/Cuda/Buffers.cxx
new file mode 100644
index 0000000000000..cc8044934b80c
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/Buffers.cxx
@@ -0,0 +1,126 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 07/08/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+////////////////////////////////////////////////////////////////////////
+// Implementation of device and host buffers for CUDA architectures.  //
+////////////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cuda/Buffers.h"
+#include "cuda_runtime.h"
+#include <memory>
+
+namespace TMVA {
+namespace DNN {
+
+//
+// TCudaHostBuffer
+//______________________________________________________________________________
+void TCudaHostBuffer::TDestructor::operator()(CudaDouble_t ** devicePointer)
+{
+   cudaFreeHost(*devicePointer);
+   delete[] devicePointer;
+}
+
+//______________________________________________________________________________
+TCudaHostBuffer::TCudaHostBuffer(size_t size)
+    : fOffset(0), fComputeStream(0), fDestructor()
+{
+   CudaDouble_t ** pointer = new CudaDouble_t * [1];
+   cudaMallocHost(pointer, size * sizeof(CudaDouble_t));
+   fDevicePointer = std::shared_ptr<CudaDouble_t *>(pointer, fDestructor);
+}
+
+//______________________________________________________________________________
+TCudaHostBuffer::operator CudaDouble_t * () const
+{
+   return *fDevicePointer + fOffset;
+}
+
+//______________________________________________________________________________
+TCudaHostBuffer TCudaHostBuffer::GetSubBuffer(size_t offset, size_t /*size*/)
+{
+   TCudaHostBuffer buffer = *this;
+   buffer.fOffset = offset;
+   return buffer;
+}
+
+//
+// TCudaDeviceBuffer
+//______________________________________________________________________________
+void TCudaDeviceBuffer::TDestructor::operator()(CudaDouble_t ** devicePointer)
+{
+   cudaFree(*devicePointer);
+   delete[] devicePointer;
+}
+
+//______________________________________________________________________________
+TCudaDeviceBuffer::TCudaDeviceBuffer(size_t size)
+    : fOffset(0), fSize(size), fDestructor()
+{
+   CudaDouble_t ** pointer = new CudaDouble_t * [1];
+   cudaMalloc(pointer, size * sizeof(CudaDouble_t));
+   fDevicePointer = std::shared_ptr<CudaDouble_t *>(pointer, fDestructor);
+   cudaStreamCreate(&fComputeStream);
+}
+
+//______________________________________________________________________________
+TCudaDeviceBuffer::TCudaDeviceBuffer(size_t size, cudaStream_t stream)
+    : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
+{
+   CudaDouble_t ** pointer = new CudaDouble_t * [1];
+   cudaMalloc(pointer, size * sizeof(CudaDouble_t));
+   fDevicePointer = std::shared_ptr<CudaDouble_t *>(pointer, fDestructor);
+}
+
+//______________________________________________________________________________
+TCudaDeviceBuffer::TCudaDeviceBuffer(CudaDouble_t * devicePointer,
+                                     size_t size,
+                                     cudaStream_t stream)
+    : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
+{
+   CudaDouble_t ** pointer = new CudaDouble_t * [1];
+   *pointer = devicePointer;
+   fDevicePointer = std::shared_ptr<CudaDouble_t *>(pointer, fDestructor);
+}
+
+//______________________________________________________________________________
+TCudaDeviceBuffer TCudaDeviceBuffer::GetSubBuffer(size_t offset, size_t size)
+{
+   TCudaDeviceBuffer buffer = *this;
+   buffer.fOffset = offset;
+   buffer.fSize   = size;
+   return buffer;
+}
+
+//______________________________________________________________________________
+TCudaDeviceBuffer::operator CudaDouble_t * () const
+{
+   return *fDevicePointer + fOffset;
+}
+
+//______________________________________________________________________________
+void TCudaDeviceBuffer::CopyFrom(const TCudaHostBuffer &buffer) const
+{
+   cudaStreamSynchronize(fComputeStream);
+   cudaMemcpyAsync(*this, buffer, fSize * sizeof(CudaDouble_t),
+                   cudaMemcpyHostToDevice, fComputeStream);
+}
+
+//______________________________________________________________________________
+void TCudaDeviceBuffer::CopyTo(const TCudaHostBuffer &buffer) const
+{
+   // Destination is the host buffer, source is this device buffer.
+   cudaMemcpyAsync(buffer, *this, fSize * sizeof(CudaDouble_t),
+                   cudaMemcpyDeviceToHost, fComputeStream);
+   buffer.fComputeStream = fComputeStream;
+}
+
+} // namespace DNN
+} // namespace TMVA
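The buffers wrap the raw allocation in a std::shared_ptr with a custom deleter, so sub-buffers obtained from GetSubBuffer share ownership and the memory is released exactly once, when the last copy disappears. Reduced to a minimal sketch (the pointer-to-pointer indirection used above, which lets sub-buffers carry an offset, is omitted here):

   // Ownership pattern behind TCudaDeviceBuffer, as a sketch: copies of
   // the returned shared_ptr share the allocation; cudaFree runs once,
   // when the last copy goes out of scope.
   #include <memory>
   #include <cuda_runtime.h>

   std::shared_ptr<double> MakeDeviceBuffer(size_t size)
   {
      double *raw = nullptr;
      cudaMalloc(&raw, size * sizeof(double));
      return std::shared_ptr<double>(raw, [](double *p) { cudaFree(p); });
   }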
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/CudaMatrix.cu b/tmva/tmva/src/DNN/Architectures/Cuda/CudaMatrix.cu
new file mode 100644
index 0000000000000..e68db8150d199
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/CudaMatrix.cu
@@ -0,0 +1,127 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 13/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////
+// Implementation of the TCudaMatrix class.     //
+//////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cuda/CudaMatrix.h"
+#include "TMVA/DNN/Architectures/Cuda/Device.h"
+#include "TMVA/DNN/Architectures/Cuda/Kernels.h"
+
+namespace TMVA {
+namespace DNN {
+
+// Static members.
+//____________________________________________________________________________
+size_t          TCudaMatrix::fInstances     = 0;
+cublasHandle_t  TCudaMatrix::fCublasHandle  = nullptr;
+CudaDouble_t *  TCudaMatrix::fDeviceReturn  = nullptr;
+curandState_t * TCudaMatrix::fCurandStates  = nullptr;
+size_t          TCudaMatrix::fNCurandStates = 0;
+
+// Constructors.
+//____________________________________________________________________________
+TCudaMatrix::TCudaMatrix()
+    : fNRows(0), fNCols(0), fElementBuffer()
+{
+   InitializeCuda();
+}
+
+//____________________________________________________________________________
+TCudaMatrix::TCudaMatrix(size_t m, size_t n)
+    : fNRows(m), fNCols(n), fElementBuffer(m * n, 0)
+{
+   InitializeCuda();
+}
+
+//____________________________________________________________________________
+TCudaMatrix::TCudaMatrix(const TMatrixT<Double_t> & Host)
+    : fNRows(Host.GetNrows()), fNCols(Host.GetNcols()),
+      fElementBuffer(Host.GetNoElements(), 0)
+{
+   InitializeCuda();
+
+   CudaDouble_t * buffer = new CudaDouble_t[fNRows * fNCols];
+   size_t index = 0;
+   for (size_t j = 0; j < fNCols; j++) {
+      for (size_t i = 0; i < fNRows; i++) {
+         buffer[index] = Host(i, j);
+         index++;
+      }
+   }
+
+   cudaMemcpy(fElementBuffer, buffer, fNRows * fNCols * sizeof(CudaDouble_t),
+              cudaMemcpyHostToDevice);
+   delete[] buffer;
+}
+
+//____________________________________________________________________________
+TCudaMatrix::TCudaMatrix(TCudaDeviceBuffer buffer,
+                         size_t m, size_t n)
+    : fNRows(m), fNCols(n), fElementBuffer(buffer)
+{
+   InitializeCuda();
+}
+
+//____________________________________________________________________________
+inline void TCudaMatrix::InitializeCuda()
+{
+   if (fInstances == 0) {
+      cublasCreate(&fCublasHandle);
+      CUDACHECK(cudaMalloc(&fDeviceReturn, sizeof(CudaDouble_t)));
+      CUDACHECK(cudaMalloc(&fCurandStates,
+                           TDevice::NThreads(*this) * sizeof(curandState_t)));
+   }
+   if (TDevice::NThreads(*this) > (int) fNCurandStates) {
+      fNCurandStates = TDevice::NThreads(*this);
+      if (fCurandStates) {
+         cudaFree(fCurandStates);
+      }
+      cudaMalloc(&fCurandStates, TDevice::NThreads(*this) * sizeof(curandState_t));
+      InitializeCurandStates();
+   }
+   fInstances++;
+}
+
+//____________________________________________________________________________
+void TCudaMatrix::InitializeCurandStates()
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(*this);
+   ::TMVA::DNN::Cuda::InitializeCurandStates<<<gridDims, blockDims>>>(
+       time(nullptr), fCurandStates);
+}
+
+
+// Conversion to TMatrixT.
+//____________________________________________________________________________
+TCudaMatrix::operator TMatrixT<Double_t>() const
+{
+   TMatrixT<Double_t> hostMatrix(GetNrows(), GetNcols());
+
+   CudaDouble_t * buffer = new CudaDouble_t[fNRows * fNCols];
+   cudaMemcpy(buffer, fElementBuffer, fNRows * fNCols * sizeof(CudaDouble_t),
+              cudaMemcpyDeviceToHost);
+
+   size_t index = 0;
+   for (size_t j = 0; j < fNCols; j++) {
+      for (size_t i = 0; i < fNRows; i++) {
+         hostMatrix(i, j) = buffer[index];
+         index++;
+      }
+   }
+
+   delete[] buffer;
+   return hostMatrix;
+}
+
+} // namespace DNN
+} // namespace TMVA
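Because the element buffer is packed column-major, a TMatrixT can be round-tripped through the device and compared element by element; a usage sketch (sizes illustrative):

   // Round-trip sketch for TCudaMatrix.
   TMatrixT<Double_t> host(2, 3);
   for (Int_t i = 0; i < 2; i++)
      for (Int_t j = 0; j < 3; j++)
         host(i, j) = i + 0.1 * j;

   TCudaMatrix device(host);          // packs column-major, copies to the GPU
   TMatrixT<Double_t> back = device;  // conversion operator copies back
   // back(i, j) == host(i, j) for all i, j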
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/DataLoader.cu b/tmva/tmva/src/DNN/Architectures/Cuda/DataLoader.cu
new file mode 100644
index 0000000000000..05dc5be445559
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/DataLoader.cu
@@ -0,0 +1,377 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 13/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+///////////////////////////////////////////////////////////////
+// Implementation of the data loader for Cuda architectures. //
+///////////////////////////////////////////////////////////////
+
+#include <algorithm>
+
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/DataLoader.h"
+
+namespace TMVA {
+namespace DNN {
+
+
+// Inline Function Bodies
+//____________________________________________________________________________
+
+
+
+// TCudaBatchIterator
+//____________________________________________________________________________
+template <typename Data_t>
+TCudaBatchIterator<Data_t>::TCudaBatchIterator(TCudaDataLoader<Data_t> &dataLoader,
+                                               IndexIterator_t sampleIndexIterator,
+                                               IndexIterator_t sampleIndexIteratorEnd)
+    : fDataLoader(dataLoader),
+      fSampleIndexIterator(sampleIndexIterator),
+      fSampleIndexIteratorEnd(sampleIndexIteratorEnd),
+      fNbatchesInEpoch(dataLoader.GetNBatchesInEpoch()),
+      fBatchSize(dataLoader.GetBatchSize()),
+      fTransferBatchSize(dataLoader.GetBatchBatchSize())
+{
+   size_t distance = std::distance(fSampleIndexIterator,
+                                   fSampleIndexIteratorEnd);
+   fBatchIndex = fNbatchesInEpoch - (distance / fBatchSize);
+}
+
+//____________________________________________________________________________
+template <typename Data_t>
+inline void TCudaBatchIterator<Data_t>::PrepareStream()
+{
+   for (size_t i = 0; i < fDataLoader.GetPreloadOffset() + 1; i++) {
+      TCudaDataLoader<Data_t>::CopyBatches(fDataLoader.GetInputData(),
+                                           fSampleIndexIterator,
+                                           fSampleIndexIteratorEnd,
+                                           fBatchSize,
+                                           fTransferBatchSize,
+                                           fDataLoader.GetInputTransferBuffer(),
+                                           fDataLoader.GetOutputTransferBuffer());
+      fDataLoader.InvokeTransfer();
+      fSampleIndexIterator += fTransferBatchSize * fBatchSize;
+   }
+}
+
+//____________________________________________________________________________
+template <typename Data_t>
+inline TCudaBatchIterator<Data_t> & TCudaBatchIterator<Data_t>::operator++()
+{
+   fBatchIndex++;
+   if ((fBatchIndex % fTransferBatchSize) == 0) {
+      if (fBatchIndex < fNbatchesInEpoch) {
+         TCudaDataLoader<Data_t>::CopyBatches(fDataLoader.GetInputData(),
+                                              fSampleIndexIterator,
+                                              fSampleIndexIteratorEnd,
+                                              fBatchSize,
+                                              fTransferBatchSize,
+                                              fDataLoader.GetInputTransferBuffer(),
+                                              fDataLoader.GetOutputTransferBuffer());
+         fDataLoader.InvokeTransfer();
+         fSampleIndexIterator += fTransferBatchSize * fBatchSize;
+      }
+   }
+   return *this;
+}
+
+//____________________________________________________________________________
+template <typename Data_t>
+inline TCudaBatch TCudaBatchIterator<Data_t>::operator*()
+{
+   return fDataLoader.GetCurrentBatch(fBatchIndex);
+}
+
+//____________________________________________________________________________
+template <typename Data_t>
+inline bool TCudaBatchIterator<Data_t>::operator==(const TCudaBatchIterator & other)
+{
+   return fBatchIndex == other.fBatchIndex;
+}
+
+//____________________________________________________________________________
+template <typename Data_t>
+inline bool TCudaBatchIterator<Data_t>::operator!=(const TCudaBatchIterator & other)
+{
+   return fBatchIndex != other.fBatchIndex;
+}
+
+// TCudaDataLoader
+//____________________________________________________________________________
+template <typename Data_t>
+TCudaDataLoader<Data_t>::TCudaDataLoader(const Data_t & inputData,
+                                         size_t nsamples,
+                                         size_t batchSize,
+                                         size_t ninputFeatures,
+                                         size_t noutputFeatures,
+                                         size_t batchBatchSize,
+                                         size_t preloadOffset)
+    : fInputData(inputData), fNsamples(nsamples), fNinputFeatures(ninputFeatures),
+      fNoutputFeatures(noutputFeatures), fBatchSize(batchSize),
+      fTransferBatchSize(batchBatchSize), fPreloadOffset(preloadOffset),
+      fStreamIndex(0), fSampleIndices(nsamples)
+{
+   fNbatchesInEpoch = fNsamples / fBatchSize;
+
+   fInputMatrixSize  = fNinputFeatures * fBatchSize;
+   fOutputMatrixSize = fNoutputFeatures * fBatchSize;
+   fTransferSize  = fTransferBatchSize * (fInputMatrixSize + fOutputMatrixSize);
+   fTransferSize *= sizeof(CudaDouble_t);
+
+   fHostData    = new CudaDouble_t * [fPreloadOffset + 1];
+   fDeviceData  = new CudaDouble_t * [fPreloadOffset + 1];
+   fDataStreams = new cudaStream_t  [fPreloadOffset + 1];
+
+   for (size_t i = 0; i < fPreloadOffset + 1; i++) {
+      CUDACHECK(cudaMallocHost(fHostData + i, fTransferSize));
+      CUDACHECK(cudaMalloc(fDeviceData + i, fTransferSize));
+      cudaStreamCreate(fDataStreams + i);
+   }
+
+   for (size_t i = 0; i < fNsamples; i++)
+      fSampleIndices[i] = i;
+}
+
+//____________________________________________________________________________
+template <typename Data_t>
+TCudaDataLoader<Data_t>::~TCudaDataLoader()
+{
+   // Release the pinned staging buffers, device buffers and streams.
+   for (size_t i = 0; i < fPreloadOffset + 1; i++) {
+      cudaFreeHost(fHostData[i]);
+      cudaFree(fDeviceData[i]);
+      cudaStreamDestroy(fDataStreams[i]);
+   }
+   delete[] fHostData;
+   delete[] fDeviceData;
+   delete[] fDataStreams;
+}
+
+//____________________________________________________________________________
+template <typename Data_t>
+inline TCudaBatchIterator<Data_t> TCudaDataLoader<Data_t>::begin() {
+   std::random_shuffle(fSampleIndices.begin(), fSampleIndices.end());
+   TCudaBatchIterator<Data_t> iterator(*this, fSampleIndices.begin(),
+                                       fSampleIndices.end());
+   iterator.PrepareStream();
+   return iterator;
+}
+
+//____________________________________________________________________________
+template <typename Data_t>
+inline TCudaBatchIterator<Data_t> TCudaDataLoader<Data_t>::end() {
+   return TCudaBatchIterator<Data_t>(*this, fSampleIndices.end(),
+                                     fSampleIndices.end());
+}
+
+
+//____________________________________________________________________________
+template <typename Data_t>
+inline CudaDouble_t * TCudaDataLoader<Data_t>::GetInputTransferBuffer() const
+{
+   return fHostData[fStreamIndex];
+}
+
+//____________________________________________________________________________
+template <typename Data_t>
+inline CudaDouble_t * TCudaDataLoader<Data_t>::GetOutputTransferBuffer() const
+{
+   return fHostData[fStreamIndex] + fTransferBatchSize * fInputMatrixSize;
+}
+
+//____________________________________________________________________________
+template <typename Data_t>
+void TCudaDataLoader<Data_t>::InvokeTransfer()
+{
+   cudaMemcpyAsync(fDeviceData[fStreamIndex],
+                   fHostData[fStreamIndex],
+                   fTransferSize,
+                   cudaMemcpyHostToDevice,
+                   fDataStreams[fStreamIndex]);
+
+   // Cycle through buffers and data streams.
+   fStreamIndex = (fStreamIndex + 1) % (fPreloadOffset + 1);
+}
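InvokeTransfer cycles through fPreloadOffset + 1 pinned staging buffers and streams, so the host-to-device copy of the next transfer batch can overlap computation on the current one; asynchronous copies only overlap when the host memory is pinned, which is why the constructor uses cudaMallocHost. The generic pattern, stripped of the TMVA classes, looks roughly like this:

   // Generic two-buffer overlap sketch (illustrative, not from the patch):
   // while the GPU consumes chunk c-1, the host uploads chunk c.
   #include <cuda_runtime.h>

   void PipelinedTransfers(double *hostChunks[2], double *deviceChunks[2],
                           size_t chunkBytes, int nChunks)
   {
      cudaStream_t streams[2];
      cudaStreamCreate(&streams[0]);
      cudaStreamCreate(&streams[1]);

      for (int c = 0; c < nChunks; c++) {
         int s = c % 2;
         // Upload chunk c on its stream; kernels enqueued on streams[s]
         // after this call see the data without blocking the host.
         cudaMemcpyAsync(deviceChunks[s], hostChunks[s], chunkBytes,
                         cudaMemcpyHostToDevice, streams[s]);
         // ... enqueue compute work on streams[s] here ...
      }
      cudaStreamSynchronize(streams[0]);
      cudaStreamSynchronize(streams[1]);
      cudaStreamDestroy(streams[0]);
      cudaStreamDestroy(streams[1]);
   }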
+
+//____________________________________________________________________________
+template <typename Data_t>
+auto TCudaDataLoader<Data_t>::GetCurrentBatch(size_t batchIndex)
+    -> TCudaBatch
+{
+   size_t bufferIndex     = batchIndex % fTransferBatchSize;
+   size_t nextStreamIndex = fStreamIndex;
+
+   CudaDouble_t * inputDataPointer = fDeviceData[nextStreamIndex];
+   inputDataPointer += bufferIndex * fInputMatrixSize;
+   CudaDouble_t * outputDataPointer = fDeviceData[nextStreamIndex];
+   outputDataPointer += fTransferBatchSize * fInputMatrixSize;
+   outputDataPointer += bufferIndex * fOutputMatrixSize;
+
+   cudaStreamSynchronize(fDataStreams[nextStreamIndex]);
+   return TCudaBatch(fBatchSize, fNinputFeatures, fNoutputFeatures,
+                     inputDataPointer, outputDataPointer,
+                     fDataStreams[nextStreamIndex]);
+}
+
+//____________________________________________________________________________
+template <>
+inline void TCudaDataLoader<MatrixInput_t>::CopyBatches(
+    MatrixInput_t data,
+    IndexIterator_t sampleIndexIteratorBegin,
+    IndexIterator_t sampleIndexIteratorEnd,
+    size_t batchSize,
+    size_t batchBatchSize,
+    CudaDouble_t * inputBuffer,
+    CudaDouble_t * outputBuffer)
+{
+
+   const TMatrixT<Double_t> &inputData  = std::get<0>(data);
+   const TMatrixT<Double_t> &outputData = std::get<1>(data);
+
+   size_t n_input  = inputData.GetNcols();
+   size_t n_output = outputData.GetNcols();
+
+   // Copy input matrices.
+
+   auto sampleIndexIterator = sampleIndexIteratorBegin;
+   size_t bufferOffset = 0;
+   for (size_t b = 0; b < batchBatchSize; b++) {
+      for (size_t i = 0; i < batchSize; i++) {
+         if (sampleIndexIterator < sampleIndexIteratorEnd) {
+            size_t sampleIndex = *sampleIndexIterator;
+            for (size_t j = 0; j < n_input; j++) {
+               size_t bufferIndex = bufferOffset + j * batchSize + i;
+               inputBuffer[bufferIndex] = inputData(sampleIndex, j);
+            }
+            sampleIndexIterator++;
+         }
+      }
+      bufferOffset += batchSize * n_input;
+   }
+
+   // Copy output matrices.
+
+   sampleIndexIterator = sampleIndexIteratorBegin;
+   bufferOffset = 0;
+   for (size_t b = 0; b < batchBatchSize; b++) {
+      for (size_t i = 0; i < batchSize; i++) {
+         if (sampleIndexIterator < sampleIndexIteratorEnd) {
+            size_t sampleIndex = *sampleIndexIterator;
+            for (size_t j = 0; j < n_output; j++) {
+               size_t bufferIndex = bufferOffset + j * batchSize + i;
+               outputBuffer[bufferIndex] = outputData(sampleIndex, j);
+            }
+            sampleIndexIterator++;
+         }
+      }
+      bufferOffset += batchSize * n_output;
+   }
+}
+
+//____________________________________________________________________________
+template <>
+inline void TCudaDataLoader<TMVAInput_t>::CopyBatches(
+    TMVAInput_t data,
+    IndexIterator_t sampleIndexIteratorBegin,
+    IndexIterator_t sampleIndexIteratorEnd,
+    size_t batchSize,
+    size_t batchBatchSize,
+    CudaDouble_t * inputBuffer,
+    CudaDouble_t * outputBuffer)
+{
+
+   Event * event = data.front();
+   size_t nInput  = event->GetNVariables();
+   size_t nOutput = (event->GetNTargets() == 0) ? 1 : event->GetNTargets();
+
+   // Copy input matrices.
+
+   auto sampleIndexIterator = sampleIndexIteratorBegin;
+   size_t bufferOffset = 0;
+   for (size_t b = 0; b < batchBatchSize; b++) {
+      for (size_t i = 0; i < batchSize; i++) {
+         if (sampleIndexIterator < sampleIndexIteratorEnd) {
+            size_t sampleIndex = *sampleIndexIterator;
+            event = data[sampleIndex];
+            for (size_t j = 0; j < nInput; j++) {
+               size_t bufferIndex = bufferOffset + j * batchSize + i;
+               inputBuffer[bufferIndex] = event->GetValue(j);
+            }
+            sampleIndexIterator++;
+         }
+      }
+      bufferOffset += batchSize * nInput;
+   }
+
+   // Copy output matrices.
+
+   sampleIndexIterator = sampleIndexIteratorBegin;
+   bufferOffset = 0;
+   for (size_t b = 0; b < batchBatchSize; b++) {
+      for (size_t i = 0; i < batchSize; i++) {
+         if (sampleIndexIterator < sampleIndexIteratorEnd) {
+            size_t sampleIndex = *sampleIndexIterator;
+            event = data[sampleIndex];
+            for (size_t j = 0; j < nOutput; j++) {
+               size_t bufferIndex = bufferOffset + j * batchSize + i;
+               if (event->GetNTargets() == 0) {
+                  outputBuffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0;
+               } else {
+                  outputBuffer[bufferIndex] = event->GetTarget(j);
+               }
+            }
+            sampleIndexIterator++;
+         }
+      }
+      bufferOffset += batchSize * nOutput;
+   }
+}
+
+// Explicit instantiation.
+template class TCudaBatchIterator<MatrixInput_t>;
+template class TCudaBatchIterator<TMVAInput_t>;
+template class TCudaDataLoader<MatrixInput_t>;
+template class TCudaDataLoader<TMVAInput_t>;
+
+template<>
+void TDataLoader<MatrixInput_t, TCuda>::CopyInput(TCudaHostBuffer & buffer,
+                                                  IndexIterator_t sampleIterator,
+                                                  size_t batchSize)
+{
+   const TMatrixT<Double_t> &inputMatrix = std::get<0>(fData);
+   size_t n = inputMatrix.GetNcols();
+
+   for (size_t i = 0; i < batchSize; i++) {
+      size_t sampleIndex = *sampleIterator;
+      for (size_t j = 0; j < n; j++) {
+         size_t bufferIndex = j * batchSize + i;
+         buffer[bufferIndex] = inputMatrix(sampleIndex, j);
+      }
+      sampleIterator++;
+   }
+}
+
+template<>
+void TDataLoader<MatrixInput_t, TCuda>::CopyOutput(TCudaHostBuffer & buffer,
+                                                   IndexIterator_t sampleIterator,
+                                                   size_t batchSize)
+{
+   const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
+   size_t n = outputMatrix.GetNcols();
+
+   for (size_t i = 0; i < batchSize; i++) {
+      size_t sampleIndex = *sampleIterator;
+      for (size_t j = 0; j < n; j++) {
+         size_t bufferIndex = j * batchSize + i;
+         buffer[bufferIndex] = outputMatrix(sampleIndex, j);
+      }
+      sampleIterator++;
+   }
+}
+
+template class TDataLoader<MatrixInput_t, TCuda>;
+
+} // namespace DNN
+} // namespace TMVA
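A hypothetical epoch loop over the loader (argument order follows the constructor above; the concrete sizes, transfer-batch size and preload offset are illustrative):

   // Usage sketch for the matrix-backed data loader.
   TMatrixT<Double_t> X(1000, 20), Y(1000, 1);   // ... filled with training data ...
   MatrixInput_t input(X, Y);
   TCudaDataLoader<MatrixInput_t> loader(input, 1000, /*batchSize=*/50,
                                         /*ninputFeatures=*/20, /*noutputFeatures=*/1,
                                         /*batchBatchSize=*/5, /*preloadOffset=*/1);
   for (auto it = loader.begin(); it != loader.end(); ++it) {
      TCudaBatch batch = *it;   // device-resident input and output matrices
      // ... forward/backward pass on batch ...
   }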
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Dropout.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Dropout.cu
new file mode 100644
index 0000000000000..76aa842b8f560
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/Dropout.cu
@@ -0,0 +1,37 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 14/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/Architectures/Cuda/Device.h"
+
+/////////////////////////////////////////////////////////////////////
+// Implementation of the Dropout function for TCuda architectures. //
+/////////////////////////////////////////////////////////////////////
+
+namespace TMVA {
+namespace DNN {
+
+//____________________________________________________________________________
+void TCuda::Dropout(TCudaMatrix &A, CudaDouble_t dropoutProbability)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(A);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::Dropout<<<gridDims, blockDims, 0, s>>>(
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols(),
+       dropoutProbability,
+       TCudaMatrix::GetCurandStatesPointer());
+}
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Initialization.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Initialization.cu
new file mode 100644
index 0000000000000..d66a05048d6bc
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/Initialization.cu
@@ -0,0 +1,103 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 14/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+ /////////////////////////////////////////////////////////////
+ // Implementation of the initialization functions for CUDA //
+ // architectures.                                          //
+ /////////////////////////////////////////////////////////////
+
+#include "TRandom.h"
+#include "TMatrix.h"
+#include "TMVA/DNN/Architectures/Cuda.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//______________________________________________________________________________
+void TCuda::InitializeGauss(TCudaMatrix & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   TRandom rand(time(nullptr));
+   TMatrixT<Double_t> B(m, n);
+
+   Real_t sigma = sqrt(2.0 / ((Real_t) n));
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         B(i,j) = rand.Gaus(0.0, sigma);
+      }
+   }
+   A = B;
+}
+
+//______________________________________________________________________________
+void TCuda::InitializeUniform(TCudaMatrix & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   TRandom rand(time(nullptr));
+   TMatrixT<Double_t> B(m, n);
+
+   Real_t range = sqrt(2.0 / ((Real_t) n));
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         B(i,j) = rand.Uniform(-range, range);
+      }
+   }
+   A = B;
+}
+
+//______________________________________________________________________________
+void TCuda::InitializeIdentity(TCudaMatrix & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+   TMatrixT<Double_t> B(m, n);
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n ; j++) {
+         B(i,j) = 0.0;
+      }
+
+      if (i < n) {
+         B(i,i) = 1.0;
+      }
+   }
+   A = B;
+}
+
+//______________________________________________________________________________
+void TCuda::InitializeZero(TCudaMatrix & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+   TMatrixT<Double_t> B(m, n);
+
+   // Iterate over rows and columns, not over m * n rows.
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n ; j++) {
+         B(i,j) = 0.0;
+      }
+   }
+   A = B;
+}
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Kernels.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Kernels.cu
new file mode 100644
index 0000000000000..1c75e7be4d4d4
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/Kernels.cu
@@ -0,0 +1,638 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 13/07/16
+
+/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +///////////////////////////////////////////////////////////////////////// +// Implementation of the device kernels for the CUDA implementation of // +// the low-level interface. // +///////////////////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Cuda.h" +#include "TMVA/DNN/Architectures/Cuda/Device.h" +#include "cuda.h" +#include "math.h" + +namespace TMVA { +namespace DNN { +namespace Cuda { + +//____________________________________________________________________________ +__device__ CudaDouble_t AtomicAdd(CudaDouble_t* address, CudaDouble_t val) +{ + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + } while (assumed != old); + return __longlong_as_double(old); +} + +//____________________________________________________________________________ +__device__ void ReduceSumVertical(CudaDouble_t *result, + CudaDouble_t * sdata, + int n) +{ + // i,j are block row and column indices. + int i = threadIdx.y; + int j = threadIdx.x; + int index = i * blockDim.x + j; + + __syncthreads(); + if ((blockDim.y > 512) && (i < 512)) { + if ((i + 512) < blockDim.y) { + sdata[index] += sdata[index + 512 * blockDim.x]; + } + } + + __syncthreads(); + if ((blockDim.y > 256) && (i < 256)) { + if ((i + 256) < blockDim.y) { + sdata[index] += sdata[index + 256 * blockDim.x]; + } + } + __syncthreads(); + if ((blockDim.y > 128) && (i < 128)) { + if ((i + 128) < blockDim.y) { + sdata[index] += sdata[index + 128 * blockDim.x]; + } + } + __syncthreads(); + if ((blockDim.y > 64) && (i < 64)) { + if ((i + 64) < blockDim.y) { + sdata[index] += sdata[index + 64 * blockDim.x]; + } + } + __syncthreads(); + if ((blockDim.y > 32) && (i < 32)) { + if ((i + 32) < blockDim.y) { + sdata[index] += sdata[index + 32 * blockDim.x]; + } + } + __syncthreads(); + if ((blockDim.y > 16) && (i < 16)) { + if ((i + 16) < blockDim.y) { + sdata[index] += sdata[index + 16 * blockDim.x]; + } + } + __syncthreads(); + if ((blockDim.y > 8) && (i < 8)) { + if ((i + 8) < blockDim.y) { + sdata[index] += sdata[index + 8 * blockDim.x]; + } + } + __syncthreads(); + if ((blockDim.y > 4) && (i < 4)) { + if ((i + 4) < blockDim.y) { + sdata[index] += sdata[index + 4 * blockDim.x]; + } + } + __syncthreads(); + if ((blockDim.y > 2) && (i < 2)) { + if ((i + 2) < blockDim.y) { + sdata[index] += sdata[index + 2 * blockDim.x]; + } + } + __syncthreads(); + if ((blockDim.y > 1) && (i < 1)) { + if ((i + 1) < blockDim.y) { + sdata[index] += sdata[index + 1 * blockDim.x]; + } + } + __syncthreads(); + if ((i == 0) && ((blockIdx.x * blockDim.x + threadIdx.x) < n)) { + AtomicAdd(result + j, sdata[index]); + } + __syncthreads(); +} + +//____________________________________________________________________________ +__device__ void ReduceSum(CudaDouble_t *result, CudaDouble_t * sdata) +{ + int tid = threadIdx.x + threadIdx.y * blockDim.x; + + __syncthreads(); + if ((TDevice::BlockSize > 512) && (tid < 512)) { + if ((tid + 512) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 
512]; + } + } + + __syncthreads(); + if ((TDevice::BlockSize > 256) && (tid < 256)) { + if ((tid + 256) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 256]; + } + } + __syncthreads(); + if ((TDevice::BlockSize > 128) && (tid < 128)) { + if ((tid + 128) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 128]; + } + } + __syncthreads(); + if ((TDevice::BlockSize > 64) && (tid < 64)) { + if ((tid + 64) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 64]; + } + } + __syncthreads(); + if ((TDevice::BlockSize > 32) && (tid < 32)) { + if ((tid + 32) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 32]; + } + } + __syncthreads(); + if ((TDevice::BlockSize > 16) && (tid < 16)) { + if ((tid + 16) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 16]; + } + } + __syncthreads(); + if ((TDevice::BlockSize > 8) && (tid < 8)) { + if ((tid + 8) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 8]; + } + } + __syncthreads(); + if ((TDevice::BlockSize > 4) && (tid < 4)) { + if ((tid + 4) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 4]; + } + } + __syncthreads(); + if ((TDevice::BlockSize > 2) && (tid < 2)) { + if ((tid + 2) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 2]; + } + } + __syncthreads(); + if ((TDevice::BlockSize > 1) && (tid < 1)) { + if ((tid + 1) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 1]; + } + } + if (tid == 0) { + AtomicAdd(result, sdata[0]); + } + + __syncthreads(); +} + +//____________________________________________________________________________ +__global__ void AddRowWise(CudaDouble_t * W, + const CudaDouble_t * theta, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) + W[index] += theta[j]; +} + +//____________________________________________________________________________ +__global__ void Hadamard(CudaDouble_t * B, + const CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) + B[index] *= A[index]; +} + +//____________________________________________________________________________ +__global__ void IdentityDerivative(CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) + A[index] = 1.0; +} + +//____________________________________________________________________________ +__global__ void Relu(CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + CudaDouble_t x = A[index]; + A[index] = (x < 0.0) ? 0.0 : x; + } +} + +//____________________________________________________________________________ +__global__ void ReluDerivative(CudaDouble_t * B, + const CudaDouble_t * A, int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + CudaDouble_t x = A[index]; + B[index] = (x < 0.0) ? 
0.0 : 1.0; + } +} + +//____________________________________________________________________________ +__global__ void Sigmoid(CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + CudaDouble_t sig = 1.0 / (1.0 + exp(-A[index])); + A[index] = sig; + } +} + +//____________________________________________________________________________ +__global__ void Sigmoid(CudaDouble_t * B, + const CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + CudaDouble_t sig = 1.0 / (1.0 + exp(-A[index])); + B[index] = sig; + } +} +//____________________________________________________________________________ +__global__ void SigmoidDerivative(CudaDouble_t * B, + const CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + CudaDouble_t sig = 1.0 / (1.0 + exp(-A[index])); + B[index] = sig * (1.0 - sig); + } +} + +//____________________________________________________________________________ +__global__ void Tanh(CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + CudaDouble_t t = ::tanh(A[index]); + A[index] = t; + } +} + +//____________________________________________________________________________ +__global__ void TanhDerivative(CudaDouble_t * B, + const CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + CudaDouble_t t = ::tanh(A[index]); + B[index] = 1 - t*t; + } +} + +//____________________________________________________________________________ +__global__ void SymmetricRelu(CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + A[index] = abs(A[index]); + } +} + +//____________________________________________________________________________ +__global__ void SymmetricReluDerivative(CudaDouble_t * B, + const CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + B[index] = (A[index] < 0.0) ? 
-1.0 : 1.0; + } +} + +//____________________________________________________________________________ +__global__ void SoftSign(CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + CudaDouble_t x = A[index]; + A[index] = x / (1.0 + abs(x)); + } +} + +//____________________________________________________________________________ +__global__ void SoftSignDerivative(CudaDouble_t * B, + const CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + CudaDouble_t x = 1.0 + fabs(A[index]); + B[index] = 1 / (x * x); + } +} + +//____________________________________________________________________________ +__global__ void Gauss(CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + CudaDouble_t x = A[index]; + A[index] = exp(- x * x); + } +} + +//____________________________________________________________________________ +__global__ void GaussDerivative(CudaDouble_t * B, + const CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + CudaDouble_t x = A[index]; + B[index] = - 2.0 * x * exp(- x * x); + } +} + +//____________________________________________________________________________ +__global__ void MeanSquaredError(CudaDouble_t * result, + const CudaDouble_t * Y, + const CudaDouble_t * output, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int tid = blockDim.x * threadIdx.y + threadIdx.x; + int index = j * m + i; + + __shared__ CudaDouble_t sdata[TDevice::BlockSize]; + + if ((i < m) && (j < n)) { + CudaDouble_t norm = 1 / ((CudaDouble_t) (m * n)); + CudaDouble_t e = Y[index] - output[index]; + sdata[tid] = norm * e * e; + } else { + sdata[tid] = 0.0; + } + ReduceSum(result, sdata); +} + +//____________________________________________________________________________ +__global__ void SquaredSum(CudaDouble_t * result, + const CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int tid = blockDim.x * threadIdx.y + threadIdx.x; + int index = j * m + i; + + __shared__ CudaDouble_t sdata[TDevice::BlockSize]; + + if ((i < m) && (j < n)) { + CudaDouble_t e = A[index]; + sdata[tid] = e * e; + } else { + sdata[tid] = 0.0; + } + ReduceSum(result, sdata); +} + +//____________________________________________________________________________ +__global__ void AbsoluteSum(CudaDouble_t * result, + const CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int tid = blockDim.x * threadIdx.y + threadIdx.x; + int index = j * m + i; + + __shared__ CudaDouble_t sdata[TDevice::BlockSize]; + + if ((i < m) && (j < n)) { + sdata[tid] = abs(A[index]); + } else { + sdata[tid] = 0.0; + } + ReduceSum(result, sdata); +} + +//____________________________________________________________________________ +__global__ void MeanSquaredErrorGradients(CudaDouble_t * dY, + const CudaDouble_t * Y, + const CudaDouble_t * output, + int m, int n) +{ + int i = 
blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) + dY[index] = 2.0 / ((CudaDouble_t) (m * n)) * (output[index] - Y[index]); +} + +//____________________________________________________________________________ +__global__ void AddL1RegularizationGradients(CudaDouble_t * A, + const CudaDouble_t * B, + CudaDouble_t weightDecay, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + CudaDouble_t sign = (B[index] < 0.0) ? -1.0 : 1.0; + A[index] += sign * weightDecay; + } +} + +//____________________________________________________________________________ +__global__ void AddL2RegularizationGradients(CudaDouble_t * A, + const CudaDouble_t * B, + CudaDouble_t weightDecay, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + A[index] += 2.0 * weightDecay * B[index]; + } +} + +//____________________________________________________________________________ +__global__ void CrossEntropy(CudaDouble_t * result, + const CudaDouble_t * Y, + const CudaDouble_t * output, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int tid = blockDim.x * threadIdx.y + threadIdx.x; + int index = j * m + i; + + __shared__ CudaDouble_t sdata[TDevice::BlockSize]; + + if ((i < m) && (j < n)) { + CudaDouble_t norm = 1 / ((CudaDouble_t) (m * n)); + CudaDouble_t sig = 1.0 / (1.0 + exp(-output[index])); + CudaDouble_t ce = Y[index] * log(sig) + (1.0 - Y[index]) * log(1.0 - sig); + sdata[tid] = - norm * ce; + } else { + sdata[tid] = 0.0; + } + + ReduceSum(result, sdata); +} + +//____________________________________________________________________________ +__global__ void CrossEntropyGradients(CudaDouble_t * dY, + const CudaDouble_t * Y, + const CudaDouble_t * output, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + CudaDouble_t norm = 1 / ((CudaDouble_t) (m * n)); + CudaDouble_t y = Y[index]; + CudaDouble_t sig = 1.0 / (1.0 + exp(-output[index])); + dY[index] = norm * (sig - y); + } +} + +//____________________________________________________________________________ +__global__ void ReduceMatrix(CudaDouble_t *result, + const CudaDouble_t *A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int tid = threadIdx.y * blockDim.x + threadIdx.x; + int index = j * m + i; + + __shared__ CudaDouble_t smem[TDevice::BlockSize]; + if ((i < m) && (j < n)) + smem[tid] = A[index]; + else + smem[tid] = 0.0; + + ReduceSum(result, smem); +} + +//____________________________________________________________________________ +__global__ void SumColumns(CudaDouble_t *B, + const CudaDouble_t *A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int matrixIndex = j * m + i; + int blockIndex = blockDim.x * threadIdx.y + threadIdx.x; + + + __shared__ CudaDouble_t smem[TDevice::BlockSize]; + + if ((i < m) && (j < n)) { + smem[blockIndex] = A[matrixIndex]; + } else { + smem[blockIndex] = 0.0; + } + + ReduceSumVertical(B + blockDim.x * blockIdx.x, smem, n); +} + 
+//____________________________________________________________________________ +__global__ void InitializeCurandStates(unsigned long long seed, + curandState_t *state) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int tid = i * gridDim.x + j; + curand_init(seed, 0, tid, state + tid); +} + +//____________________________________________________________________________ +__global__ void Dropout(CudaDouble_t *A, + int m, int n, + CudaDouble_t dropoutProbability, + curandState_t *state) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int tid = i * gridDim.x + j; + if ((i < m) && (j < n)) { + float r = curand_uniform(state + tid); + if (r > dropoutProbability) { + A[j * m + i] = 0.0; + } else { + A[j * m + i] /= dropoutProbability; + } + } +} + +} // namespace Cuda +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/LossFunctions.cu b/tmva/tmva/src/DNN/Architectures/Cuda/LossFunctions.cu new file mode 100644 index 0000000000000..1f5479fce75b6 --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Cuda/LossFunctions.cu @@ -0,0 +1,95 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 13/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +////////////////////////////////////////////////////////////////////// +// Implementation of the loss functions for the TCuda implementation // +// of the low-level interface. 
//
+//////////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/Architectures/Cuda/Device.h"
+#include "TMVA/DNN/Architectures/Cuda/Kernels.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//____________________________________________________________________________
+CudaDouble_t TCuda::MeanSquaredError(const TCudaMatrix & Y,
+                                     const TCudaMatrix & output)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(Y);
+   cudaStream_t s = Y.GetComputeStream();
+   TCudaMatrix::ResetDeviceReturn();
+   ::TMVA::DNN::Cuda::MeanSquaredError<<<gridDims, blockDims, 0, s>>>(
+       TCudaMatrix::GetDeviceReturnPointer(),
+       Y.GetDataPointer(),
+       output.GetDataPointer(),
+       (int) Y.GetNrows(),
+       (int) Y.GetNcols());
+   return TCudaMatrix::GetDeviceReturn();
+}
+
+//____________________________________________________________________________
+void TCuda::MeanSquaredErrorGradients(TCudaMatrix & dY,
+                                      const TCudaMatrix & Y,
+                                      const TCudaMatrix & output)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(Y);
+   cudaStream_t s = output.GetComputeStream();
+   ::TMVA::DNN::Cuda::MeanSquaredErrorGradients<<<gridDims, blockDims, 0, s>>>(
+       dY.GetDataPointer(),
+       Y.GetDataPointer(),
+       output.GetDataPointer(),
+       (int) Y.GetNrows(),
+       (int) Y.GetNcols());
+   dY.SetComputeStream(s);
+}
+
+//____________________________________________________________________________
+CudaDouble_t TCuda::CrossEntropy(const TCudaMatrix & Y,
+                                 const TCudaMatrix & output)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(Y);
+   TCudaMatrix::ResetDeviceReturn();
+   cudaStream_t s = Y.GetComputeStream();
+   ::TMVA::DNN::Cuda::CrossEntropy<<<gridDims, blockDims, 0, s>>>(
+       TCudaMatrix::GetDeviceReturnPointer(),
+       Y.GetDataPointer(),
+       output.GetDataPointer(),
+       (int) Y.GetNrows(),
+       (int) Y.GetNcols());
+   return TCudaMatrix::GetDeviceReturn();
+}
+
+//____________________________________________________________________________
+void TCuda::CrossEntropyGradients(TCudaMatrix & dY,
+                                  const TCudaMatrix & Y,
+                                  const TCudaMatrix & output)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(Y);
+   cudaStream_t s = output.GetComputeStream();
+   ::TMVA::DNN::Cuda::CrossEntropyGradients<<<gridDims, blockDims, 0, s>>>(
+       dY.GetDataPointer(),
+       Y.GetDataPointer(),
+       output.GetDataPointer(),
+       (int) Y.GetNrows(),
+       (int) Y.GetNcols());
+   dY.SetComputeStream(s);
+}
+
+} // namespace DNN
+} // namespace TMVA
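Both loss functions reduce into a single device-side scalar via ResetDeviceReturn / GetDeviceReturn. A host-side reference implementation of the mean squared error, mirroring the 1/(m*n) normalization of the kernel, makes a convenient sanity check (a sketch, not part of the patch):

   // Host-side reference for the device MSE result.
   Double_t MeanSquaredErrorHost(const TMatrixT<Double_t> &Y,
                                 const TMatrixT<Double_t> &output)
   {
      Double_t sum = 0.0;
      Int_t m = Y.GetNrows(), n = Y.GetNcols();
      for (Int_t i = 0; i < m; i++) {
         for (Int_t j = 0; j < n; j++) {
            Double_t e = Y(i, j) - output(i, j);
            sum += e * e;
         }
      }
      return sum / (m * n);
   }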
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/OutputFunctions.cu b/tmva/tmva/src/DNN/Architectures/Cuda/OutputFunctions.cu
new file mode 100644
index 0000000000000..09dc46a3e8d1b
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/OutputFunctions.cu
@@ -0,0 +1,39 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 11/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+////////////////////////////////////////////////////////////////
+// Implementation of the output functions for the TCuda       //
+// architecture.                                              //
+////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/Architectures/Cuda/Device.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+void TCuda::Sigmoid(TCudaMatrix & B,
+                    const TCudaMatrix & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(B);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::Sigmoid<<<gridDims, blockDims, 0, s>>>(
+       B.GetDataPointer(),
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+   B.SetComputeStream(s);
+}
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Propagation.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Propagation.cu
new file mode 100644
index 0000000000000..f8e2b3bf7c8a3
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/Propagation.cu
@@ -0,0 +1,96 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 13/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+ //////////////////////////////////////////////////////////////////
+ // Implementation of the functions required for the forward and //
+ // backward propagation of activations through a neural network //
+ // for CUDA architectures.                                      //
+ //////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/Architectures/Cuda/Device.h"
+#include "TMVA/DNN/Architectures/Cuda/Kernels.h"
+
+namespace TMVA {
+namespace DNN {
+
+
+void TCuda::MultiplyTranspose(TCudaMatrix &output,
+                              const TCudaMatrix &input,
+                              const TCudaMatrix &Weights)
+{
+   int m, n, k;
+   k = input.GetNcols();
+   m = input.GetNrows();
+   n = Weights.GetNrows();
+   CudaDouble_t alpha = 1.0, beta = 0.0;
+
+   // Compute C = beta * C + alpha * (A * B^T)
+   cudaStream_t s = input.GetComputeStream();
+   cublasSetStream(input.GetCublasHandle(), s);
+   cublasDgemm(input.GetCublasHandle(),
+               CUBLAS_OP_N, CUBLAS_OP_T,
+               m, n, k, &alpha,
+               input.GetDataPointer(), m,     // *A, lda
+               Weights.GetDataPointer(), n,   // *B, ldb
+               &beta,                         // beta
+               output.GetDataPointer(), m);   // *C, ldc
+   output.SetComputeStream(s);
+}
+
+void TCuda::AddRowWise(TCudaMatrix &Weights,
+                       const TCudaMatrix &theta)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(Weights);
+   cudaStream_t s = Weights.GetComputeStream();
+   ::TMVA::DNN::Cuda::AddRowWise<<<gridDims, blockDims, 0, s>>>(
+       Weights.GetDataPointer(),
+       theta.GetDataPointer(),
+       Weights.GetNrows(),
+       Weights.GetNcols());
+}
+
+void TCuda::Backward(TCudaMatrix & activation_gradients_backward,
+                     TCudaMatrix & weight_gradients,
+                     TCudaMatrix & bias_gradients,
+                     TCudaMatrix & df,
+                     const TCudaMatrix & activation_gradients,
+                     const TCudaMatrix & weights,
+                     const TCudaMatrix & activation_backward)
+{
+   // Compute element-wise product.
+   TCuda::Hadamard(df, activation_gradients);
+
+   // Activation gradients.
+   if (activation_gradients_backward.GetNoElements() > 0)
+      TCuda::Multiply(activation_gradients_backward, df, weights);
+
+   // Weight gradients.
+   if (weight_gradients.GetNoElements() > 0)
+      TCuda::TransposeMultiply(weight_gradients, df, activation_backward);
+
+   // Bias gradients.
+   if (bias_gradients.GetNoElements() > 0)
+      TCuda::SumColumns(bias_gradients, df);
+
+}
+
+void TCuda::Copy(TCudaMatrix & B, const TCudaMatrix & A)
+{
+   size_t m = B.GetNrows();
+   size_t n = B.GetNcols();
+   cudaMemcpyAsync(B.GetDataPointer(), A.GetDataPointer(),
+                   m * n * sizeof(CudaDouble_t), cudaMemcpyDeviceToDevice,
+                   0);
+}
+
+} // namespace DNN
+} // namespace TMVA
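With events stored as rows, the forward step computes output = input * W^T plus a row-wise bias. A host-side sketch of the same computation (assuming the biases are stored as a single-column matrix, as the AddRowWise kernel suggests):

   // Host-side reference for MultiplyTranspose + AddRowWise:
   // output(i,k) = sum_j input(i,j) * W(k,j) + theta(k).
   void ForwardHost(TMatrixT<Double_t> &output,
                    const TMatrixT<Double_t> &input,
                    const TMatrixT<Double_t> &W,
                    const TMatrixT<Double_t> &theta)
   {
      for (Int_t i = 0; i < input.GetNrows(); i++) {
         for (Int_t k = 0; k < W.GetNrows(); k++) {
            Double_t sum = 0.0;
            for (Int_t j = 0; j < input.GetNcols(); j++)
               sum += input(i, j) * W(k, j);
            output(i, k) = sum + theta(k, 0);
         }
      }
   }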
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Regularization.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Regularization.cu
new file mode 100644
index 0000000000000..161b7361260fa
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/Regularization.cu
@@ -0,0 +1,88 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 13/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////
+// Contains the definitions of the kernel calling functions for //
+// computation of regularization functionals and gradients      //
+// functions for CUDA architectures.                            //
+//////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/Architectures/Cuda/Device.h"
+#include "TMVA/DNN/Architectures/Cuda/Kernels.h"
+
+namespace TMVA {
+namespace DNN {
+
+//______________________________________________________________________________
+CudaDouble_t TCuda::L1Regularization(const TCudaMatrix & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(A);
+   cudaStream_t s = A.GetComputeStream();
+   TCudaMatrix::ResetDeviceReturn();
+   ::TMVA::DNN::Cuda::AbsoluteSum<<<gridDims, blockDims, 0, s>>>(
+       TCudaMatrix::GetDeviceReturnPointer(),
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+   return TCudaMatrix::GetDeviceReturn();
+}
+
+//______________________________________________________________________________
+void TCuda::AddL1RegularizationGradients(TCudaMatrix & B,
+                                         const TCudaMatrix & A,
+                                         CudaDouble_t weightDecay)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(B);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::AddL1RegularizationGradients<<<gridDims, blockDims, 0, s>>>(
+       B.GetDataPointer(),
+       A.GetDataPointer(),
+       weightDecay,
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+}
+
+//______________________________________________________________________________
+CudaDouble_t TCuda::L2Regularization(const TCudaMatrix & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(A);
+   cudaStream_t s = A.GetComputeStream();
+   TCudaMatrix::ResetDeviceReturn();
+   ::TMVA::DNN::Cuda::SquaredSum<<<gridDims, blockDims, 0, s>>>(
+       TCudaMatrix::GetDeviceReturnPointer(),
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+   return TCudaMatrix::GetDeviceReturn();
+}
+
+//______________________________________________________________________________
+void TCuda::AddL2RegularizationGradients(TCudaMatrix & B,
+                                         const TCudaMatrix & A,
+                                         CudaDouble_t weightDecay)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(B);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::AddL2RegularizationGradients<<<gridDims, blockDims, 0, s>>>(
+       B.GetDataPointer(),
+       A.GetDataPointer(),
+       weightDecay,
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+}
+
+} // namespace DNN
+} // namespace TMVA
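For reference, the quantities these kernels accumulate and differentiate: L1 sums |w| and contributes weightDecay * sign(w) to each gradient entry, L2 sums w^2 and contributes 2 * weightDecay * w, matching the kernels defined earlier. A host-side sketch of the L1 term:

   // Host-side reference for the L1 regularization functional.
   #include <cmath>

   Double_t L1Host(const TMatrixT<Double_t> &W)
   {
      Double_t sum = 0.0;
      for (Int_t i = 0; i < W.GetNrows(); i++)
         for (Int_t j = 0; j < W.GetNcols(); j++)
            sum += std::fabs(W(i, j));
      return sum;
   }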
diff --git a/tmva/tmva/src/DNN/Architectures/Reference.cxx b/tmva/tmva/src/DNN/Architectures/Reference.cxx
new file mode 100644
index 0000000000000..0d8eaf2b422fe
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Reference.cxx
@@ -0,0 +1,32 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 10/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+////////////////////////////////////////////////////////////////
+// Explicit instantiation of the TReference architecture class //
+// template for Double_t scalar types.                         //
+////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Reference.h"
+
+#include "Reference/Propagation.cxx"
+#include "Reference/ActivationFunctions.cxx"
+#include "Reference/OutputFunctions.cxx"
+#include "Reference/LossFunctions.cxx"
+#include "Reference/Regularization.cxx"
+#include "Reference/Initialization.cxx"
+#include "Reference/Dropout.cxx"
+
+namespace TMVA {
+namespace DNN {
+template class TReference<Double_t>;
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Reference/ActivationFunctions.cxx b/tmva/tmva/src/DNN/Architectures/Reference/ActivationFunctions.cxx
new file mode 100644
index 0000000000000..93afd86bbabc8
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Reference/ActivationFunctions.cxx
@@ -0,0 +1,237 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 10/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+ //////////////////////////////////////////////////////////////////
+ // Implementation of the activation functions for the reference //
+ // implementation.                                               //
+ //////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Reference.h"
+#include <cmath>
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//______________________________________________________________________________
+template<typename Real_t>
+void TReference<Real_t>::IdentityDerivative(TMatrixT<Real_t> & B,
+                                            const TMatrixT<Real_t> &A)
+{
+   size_t m,n;
+   m = B.GetNrows();
+   n = B.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         B(i,j) = 1.0;
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+void TReference<Real_t>::Relu(TMatrixT<Real_t> &A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         A(i,j) = std::max((Real_t) 0.0, A(i,j));
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::ReluDerivative(TMatrixT<Real_t> & B,
+                                               const TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++)
+   {
+      for (size_t j = 0; j < n; j++)
+      {
+         B(i,j) = (A(i,j) < 0) ? 0.0 : 1.0;
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+void TReference<Real_t>::Sigmoid(TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t sig = 1.0 / (1.0 + std::exp(-A(i,j)));
+         A(i,j) = sig;
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::SigmoidDerivative(TMatrixT<Real_t> & B,
+                                                  const TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t sig = 1.0 / (1.0 + std::exp(-A(i,j)));
+         B(i,j) = sig * (1.0 - sig);
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::Tanh(TMatrixT<Real_t> & B)
+{
+   size_t m,n;
+   m = B.GetNrows();
+   n = B.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t t = tanh(B(i,j));
+         B(i,j) = t;
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::TanhDerivative(TMatrixT<Real_t> & B,
+                                               const TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t t = tanh(A(i,j));
+         B(i,j) = 1 - t * t;
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::SymmetricRelu(TMatrixT<Real_t> & B)
+{
+   size_t m,n;
+   m = B.GetNrows();
+   n = B.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         B(i,j) = fabs(B(i,j));
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::SymmetricReluDerivative(TMatrixT<Real_t> & B,
+                                                        const TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         B(i,j) = (A(i,j) < 0.0) ? -1.0 : 1.0;
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::SoftSign(TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t x = A(i,j);
+         A(i,j) = x / (1 + fabs(x));
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::SoftSignDerivative(TMatrixT<Real_t> & B,
+                                                   const TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t x = 1.0 + fabs(A(i,j));
+         B(i,j) = 1.0 / (x * x);
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::Gauss(TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t x = A(i,j);
+         A(i,j) = exp(- x * x);
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::GaussDerivative(TMatrixT<Real_t> & B,
+                                                const TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t x = A(i,j);
+         B(i,j) = - 2.0 * x * exp(- x * x);
+      }
+   }
+}
+} // namespace DNN
+} // namespace TMVA
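For quick reference, the three less common activations implemented above, together with the derivatives coded next to them, are

$$f_{\text{symmrelu}}(x) = |x|, \qquad f_{\text{softsign}}(x) = \frac{x}{1+|x|}, \qquad f_{\text{gauss}}(x) = e^{-x^{2}}$$

$$f'_{\text{symmrelu}}(x) = \operatorname{sign}(x), \qquad f'_{\text{softsign}}(x) = \frac{1}{(1+|x|)^{2}}, \qquad f'_{\text{gauss}}(x) = -2x\,e^{-x^{2}}$$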
diff --git a/tmva/tmva/src/DNN/Architectures/Reference/DataLoader.cxx b/tmva/tmva/src/DNN/Architectures/Reference/DataLoader.cxx
new file mode 100644
index 0000000000000..9a287cd9b743c
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Reference/DataLoader.cxx
@@ -0,0 +1,142 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 12/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+/////////////////////////////////////////////////////////
+// Implementation of the data loader for the reference //
+// implementation.                                      //
+/////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Reference/DataLoader.h"
+#include "TMVA/Event.h"
+#include <algorithm>
+
+namespace TMVA
+{
+namespace DNN
+{
+
+template <typename Data_t, typename Real_t>
+TReferenceDataLoader<Data_t, Real_t>::TReferenceDataLoader(const Data_t &input,
+                                                           size_t nsamples,
+                                                           size_t batchSize,
+                                                           size_t ninputFeatures,
+                                                           size_t noutputFeatures)
+    : fInput(input), fNSamples(nsamples), fBatchSize(batchSize),
+      fNInputFeatures(ninputFeatures), fNOutputFeatures(noutputFeatures),
+      fNBatches(nsamples / batchSize), fInputMatrices(), fOutputMatrices(),
+      fBatches(), fSampleIndices()
+{
+   fInputMatrices.reserve(fNBatches);
+   fOutputMatrices.reserve(fNBatches);
+   for (size_t i = 0; i < fNBatches; i++) {
+      fInputMatrices.emplace_back(fBatchSize, fNInputFeatures);
+      fOutputMatrices.emplace_back(fBatchSize, fNOutputFeatures);
+   }
+
+   fBatches.reserve(fNBatches);
+   fSampleIndices.reserve(fNSamples);
+
+   for (size_t i = 0; i < fNSamples; i++) {
+      fSampleIndices.emplace_back(i);
+   }
+}
+
+template <typename Data_t, typename Real_t>
+auto TReferenceDataLoader<Data_t, Real_t>::begin()
+    -> BatchIterator_t
+{
+   std::random_shuffle(fSampleIndices.begin(), fSampleIndices.end());
+   fBatches.clear();
+   fBatches.reserve(fNBatches);
+
+   size_t sampleIndex = 0;
+   for (size_t batchIndex = 0; batchIndex < fNBatches; batchIndex++) {
+
+      CopyBatch(fInputMatrices[batchIndex],
+                fOutputMatrices[batchIndex],
+                fInput,
+                fSampleIndices.begin() + sampleIndex,
+                fSampleIndices.begin() + sampleIndex + fBatchSize);
+      fBatches.emplace_back(fInputMatrices[batchIndex],
+                            fOutputMatrices[batchIndex]);
+      sampleIndex += fBatchSize;
+   }
+   return fBatches.begin();
+}
+
+template <typename Data_t, typename Real_t>
+auto TReferenceDataLoader<Data_t, Real_t>::end()
+    -> BatchIterator_t
+{
+   return fBatches.end();
+}
+
+using MatrixInput_t = std::pair<const TMatrixT<Double_t> &,
+                                const TMatrixT<Double_t> &>;
+
+template <>
+void TReferenceDataLoader<MatrixInput_t, Double_t>::CopyBatch(
+    Matrix_t &inputMatrix,
+    Matrix_t &outputMatrix,
+    const MatrixInput_t &input,
+    IndexIterator_t indexBegin,
+    IndexIterator_t indexEnd)
+{
+   const Matrix_t &in  = std::get<0>(input);
+   const Matrix_t &out = std::get<1>(input);
+
+   size_t batchIndex = 0;
+   for (IndexIterator_t i = indexBegin; i != indexEnd; i++) {
+      size_t index = *i;
+      for (size_t j = 0; j < (size_t) in.GetNcols(); j++) {
+         inputMatrix(batchIndex, j) = in(index, j);
+      }
+      for (size_t j = 0; j < (size_t) out.GetNcols(); j++) {
+         outputMatrix(batchIndex, j) = out(index, j);
+      }
+      batchIndex++;
+   }
+}
+
+using TMVAInput_t = std::vector<Event *>;
+
+template <>
+void TReferenceDataLoader<TMVAInput_t, Double_t>::CopyBatch(
+    Matrix_t &inputMatrix,
+    Matrix_t &outputMatrix,
+    const TMVAInput_t &input,
+    IndexIterator_t indexBegin,
+    IndexIterator_t indexEnd)
+{
+   size_t batchIndex = 0;
+   for (IndexIterator_t i = indexBegin; i != indexEnd; i++) {
+      size_t index = *i;
+      Event *event = input.at(index);
+      for (size_t j = 0; j < event->GetNVariables(); j++) {
+         inputMatrix(batchIndex, j) = event->GetValue(j);
+      }
+      if (event->GetNTargets() > 0) {
+         for (size_t j = 0; j < event->GetNTargets(); j++) {
+            outputMatrix(batchIndex, j) = event->GetTarget(j);
+         }
+      } else {
+         outputMatrix(batchIndex, 0) = (event->GetClass() == 0) ? 1.0 : 0.0;
+      }
+      batchIndex++;
+   }
+}
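+
+// Usage sketch (hedged, not part of the sources): one begin()/end() pass
+// visits each of the fNBatches shuffled batches exactly once, e.g.
+//
+//    TMatrixT<Double_t> X(100, 4), Y(100, 1);
+//    MatrixInput_t input(X, Y);
+//    TReferenceDataLoader<MatrixInput_t, Double_t> loader(input, 100, 20, 4, 1);
+//    for (auto it = loader.begin(); it != loader.end(); ++it) {
+//       // *it refers to a batch holding a 20 x 4 input and a 20 x 1 output
+//       // matrix; the batch accessor names are declared in DataLoader.h.
+//    }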
+
+// Explicit instantiation.
+template class TReferenceDataLoader<MatrixInput_t, Double_t>;
+template class TReferenceDataLoader<TMVAInput_t, Double_t>;
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Reference/Dropout.cxx b/tmva/tmva/src/DNN/Architectures/Reference/Dropout.cxx
new file mode 100644
index 0000000000000..04e14811436c0
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Reference/Dropout.cxx
@@ -0,0 +1,50 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 10/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+ //////////////////////////////////////////////////////////////////
+ // Implementation of the dropout function for the reference     //
+ // implementation.                                               //
+ //////////////////////////////////////////////////////////////////
+
+
+#include "TMVA/DNN/Architectures/Reference.h"
+#include "TRandom.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//______________________________________________________________________________
+
+template<typename Real_t>
+void TReference<Real_t>::Dropout(TMatrixT<Real_t> & B, Real_t dropoutProbability)
+{
+   size_t m,n;
+   m = B.GetNrows();
+   n = B.GetNcols();
+
+   TRandom rand(time(nullptr));
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t r = rand.Uniform();
+         if (r >= dropoutProbability) {
+            B(i,j) = 0.0;
+         } else {
+            B(i,j) /= dropoutProbability;
+         }
+      }
+   }
+}
+
+} // namespace DNN
+} // namespace TMVA
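One subtlety worth spelling out: in the implementation above, dropoutProbability is the probability of *keeping* an element, and surviving entries are rescaled by its inverse (inverted dropout), so activations keep their expectation:

$$B_{ij} \leftarrow \begin{cases} B_{ij}/p & \text{with probability } p \\ 0 & \text{with probability } 1-p \end{cases} \qquad\Longrightarrow\qquad \mathbb{E}\left[B_{ij}\right] = B_{ij}$$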
diff --git a/tmva/tmva/src/DNN/Architectures/Reference/Initialization.cxx b/tmva/tmva/src/DNN/Architectures/Reference/Initialization.cxx
new file mode 100644
index 0000000000000..edd6ecaedb700
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Reference/Initialization.cxx
@@ -0,0 +1,97 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 10/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+ //////////////////////////////////////////////////////////////////////
+ // Implementation of the initialization functions for the reference //
+ // implementation.                                                   //
+ //////////////////////////////////////////////////////////////////////
+
+#include "TRandom.h"
+#include "TMVA/DNN/Architectures/Reference.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//______________________________________________________________________________
+template<typename Real_t>
+void TReference<Real_t>::InitializeGauss(TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   TRandom rand(time(nullptr));
+
+   Real_t sigma = sqrt(2.0 / ((Real_t) n));
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         A(i,j) = rand.Gaus(0.0, sigma);
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+void TReference<Real_t>::InitializeUniform(TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   TRandom rand(time(nullptr));
+
+   Real_t range = sqrt(2.0 / ((Real_t) n));
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         A(i,j) = rand.Uniform(-range, range);
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+void TReference<Real_t>::InitializeIdentity(TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n ; j++) {
+         A(i,j) = 0.0;
+      }
+
+      if (i < n) {
+         A(i,i) = 1.0;
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+void TReference<Real_t>::InitializeZero(TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n ; j++) {
+         A(i,j) = 0.0;
+      }
+   }
+}
+
+} // namespace DNN
+} // namespace TMVA
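Both random initializations above scale with the fan-in $n$ of the layer (the number of columns of the weight matrix), in the spirit of He/Xavier initialization:

$$\sigma = \sqrt{\frac{2}{n}}, \qquad W_{ij} \sim \mathcal{N}(0,\sigma) \quad\text{or}\quad W_{ij} \sim \mathcal{U}(-\sigma,\sigma)$$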
diff --git a/tmva/tmva/src/DNN/Architectures/Reference/LossFunctions.cxx b/tmva/tmva/src/DNN/Architectures/Reference/LossFunctions.cxx
new file mode 100644
index 0000000000000..aa0b144be2053
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Reference/LossFunctions.cxx
@@ -0,0 +1,101 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 10/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+ ////////////////////////////////////////////////////////////
+ // Implementation of the loss functions for the reference //
+ // implementation.                                         //
+ ////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Reference.h"
+#include <cmath>
+
+namespace TMVA
+{
+namespace DNN
+{
+//______________________________________________________________________________
+template<typename Real_t>
+Real_t TReference<Real_t>::MeanSquaredError(const TMatrixT<Real_t> &Y,
+                                            const TMatrixT<Real_t> &output)
+{
+   size_t m,n;
+   m = Y.GetNrows();
+   n = Y.GetNcols();
+   Real_t result = 0.0;
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t dY = (Y(i,j) - output(i,j));
+         result += dY * dY;
+      }
+   }
+   result /= (Real_t) (m * n);
+   return result;
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+void TReference<Real_t>::MeanSquaredErrorGradients(TMatrixT<Real_t> & dY,
+                                                   const TMatrixT<Real_t> & Y,
+                                                   const TMatrixT<Real_t> & output)
+{
+   size_t m,n;
+   m = Y.GetNrows();
+   n = Y.GetNcols();
+
+   dY.Minus(Y, output);
+   dY *= - 2.0 / ((Real_t) (m*n));
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+Real_t TReference<Real_t>::CrossEntropy(const TMatrixT<Real_t> &Y,
+                                        const TMatrixT<Real_t> &output)
+{
+   size_t m,n;
+   m = Y.GetNrows();
+   n = Y.GetNcols();
+   Real_t result = 0.0;
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t sig = 1.0 / (1.0 + std::exp(-output(i,j)));
+         result += Y(i,j) * std::log(sig)
+                   + (1.0 - Y(i,j)) * std::log(1.0 - sig);
+      }
+   }
+   result /= - (Real_t) (m * n);
+   return result;
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+void TReference<Real_t>::CrossEntropyGradients(TMatrixT<Real_t> & dY,
+                                               const TMatrixT<Real_t> & Y,
+                                               const TMatrixT<Real_t> & output)
+{
+   size_t m,n;
+   m = Y.GetNrows();
+   n = Y.GetNcols();
+
+   Real_t norm = 1.0 / ((Real_t) (m * n));
+   for (size_t i = 0; i < m; i++)
+   {
+      for (size_t j = 0; j < n; j++)
+      {
+         Real_t y = Y(i,j);
+         Real_t sig = 1.0 / (1.0 + std::exp(-output(i,j)));
+         dY(i,j) = norm * (sig - y);
+      }
+   }
+}
+
+} // namespace DNN
+} // namespace TMVA
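Written out, with $Y$ the targets and $\hat{Y}$ the raw network output, the two losses above are

$$L_{\mathrm{MSE}} = \frac{1}{mn}\sum_{i,j}\bigl(Y_{ij}-\hat{Y}_{ij}\bigr)^{2}, \qquad \frac{\partial L_{\mathrm{MSE}}}{\partial \hat{Y}_{ij}} = \frac{2}{mn}\bigl(\hat{Y}_{ij}-Y_{ij}\bigr),$$

$$L_{\mathrm{CE}} = -\frac{1}{mn}\sum_{i,j}\Bigl[Y_{ij}\ln\sigma(\hat{Y}_{ij}) + (1-Y_{ij})\ln\bigl(1-\sigma(\hat{Y}_{ij})\bigr)\Bigr], \qquad \frac{\partial L_{\mathrm{CE}}}{\partial \hat{Y}_{ij}} = \frac{\sigma(\hat{Y}_{ij})-Y_{ij}}{mn}.$$

Note that CrossEntropy applies the sigmoid itself, i.e. it expects raw outputs; this is also why the GPU training path further down pairs ELossFunction::CROSSENTROPY with an IDENTITY output layer.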
diff --git a/tmva/tmva/src/DNN/Architectures/Reference/OutputFunctions.cxx b/tmva/tmva/src/DNN/Architectures/Reference/OutputFunctions.cxx
new file mode 100644
index 0000000000000..731c95713d404
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Reference/OutputFunctions.cxx
@@ -0,0 +1,37 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 11/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+////////////////////////////////////////////////////////////////
+// Implementation of the output functions for the reference   //
+// implementation.                                             //
+////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Reference.h"
+#include <cmath>
+
+namespace TMVA {
+namespace DNN {
+
+template<typename Real_t>
+void TReference<Real_t>::Sigmoid(TMatrixT<Real_t> & B,
+                                 const TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t sig = 1.0 / (1.0 + std::exp(-A(i,j)));
+         B(i,j) = sig;
+      }
+   }
+}
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Reference/Propagation.cxx b/tmva/tmva/src/DNN/Architectures/Reference/Propagation.cxx
new file mode 100644
index 0000000000000..1b360810603f3
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Reference/Propagation.cxx
@@ -0,0 +1,95 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 10/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+/////////////////////////////////////////////////////////////////////
+// Implementation of the functions required for the forward and    //
+// backward propagation of activations through a neural network in //
+// the reference implementation.                                   //
+/////////////////////////////////////////////////////////////////////
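+//
+// For a single layer with input activations X (one event per row),
+// weight matrix W and incoming gradients G = dL/dY, Backward() below
+// computes, following the usual backpropagation algebra:
+//
+//    df <- f'(Z) (.) G          element-wise product
+//    dX <- df W                 gradients w.r.t. the previous activations
+//    dW <- df^T X               weight gradients
+//    db <- column sums of df    bias gradients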
+
+#include "TMVA/DNN/Architectures/Reference.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+template<typename Real_t>
+void TReference<Real_t>::MultiplyTranspose(TMatrixT<Real_t> &output,
+                                           const TMatrixT<Real_t> &input,
+                                           const TMatrixT<Real_t> &weights)
+{
+   output.MultT(input, weights);
+}
+
+template<typename Real_t>
+void TReference<Real_t>::AddRowWise(TMatrixT<Real_t> &output,
+                                    const TMatrixT<Real_t> &biases)
+{
+   for (size_t i = 0; i < (size_t) output.GetNrows(); i++) {
+      for (size_t j = 0; j < (size_t) output.GetNcols(); j++) {
+         output(i,j) += biases(j,0);
+      }
+   }
+}
+
+template<typename Real_t>
+void TReference<Real_t>::Backward(TMatrixT<Real_t> & activation_gradients_backward,
+                                  TMatrixT<Real_t> & weight_gradients,
+                                  TMatrixT<Real_t> & bias_gradients,
+                                  TMatrixT<Real_t> & df,
+                                  const TMatrixT<Real_t> & activation_gradients,
+                                  const TMatrixT<Real_t> & weights,
+                                  const TMatrixT<Real_t> & activations_backward)
+{
+
+   // Compute element-wise product.
+   for (size_t i = 0; i < (size_t) df.GetNrows(); i++) {
+      for (size_t j = 0; j < (size_t) df.GetNcols(); j++) {
+         df(i,j) *= activation_gradients(i,j);
+      }
+   }
+
+   // Activation gradients.
+   if (activation_gradients_backward.GetNoElements() > 0) {
+      activation_gradients_backward.Mult(df, weights);
+   }
+
+   // Weights gradients.
+   if (weight_gradients.GetNoElements() > 0) {
+      weight_gradients.TMult(df, activations_backward);
+   }
+
+   // Bias gradients.
+   if (bias_gradients.GetNoElements() > 0) {
+      for (size_t j = 0; j < (size_t) df.GetNcols(); j++) {
+         Scalar_t sum = 0.0;
+         for (size_t i = 0; i < (size_t) df.GetNrows(); i++) {
+            sum += df(i,j);
+         }
+         bias_gradients(j,0) = sum;
+      }
+   }
+}
+
+template<typename Real_t>
+void TReference<Real_t>::ScaleAdd(TMatrixT<Real_t> & A,
+                                  const TMatrixT<Real_t> & B,
+                                  Scalar_t beta)
+{
+   for (size_t i = 0; i < (size_t) A.GetNrows(); i++) {
+      for (size_t j = 0; j < (size_t) A.GetNcols(); j++) {
+         A(i,j) += beta * B(i,j);
+      }
+   }
+}
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Reference/Regularization.cxx b/tmva/tmva/src/DNN/Architectures/Reference/Regularization.cxx
new file mode 100644
index 0000000000000..ffe0f19ec7db1
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Reference/Regularization.cxx
@@ -0,0 +1,98 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 10/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+ //////////////////////////////////////////////////////////////////////
+ // Implementation of the regularization functions for the reference //
+ // implementation.
// + ////////////////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Reference.h" + +namespace TMVA +{ +namespace DNN +{ + +//______________________________________________________________________________ +template +Real_t TReference::L1Regularization(const TMatrixT & W) +{ + size_t m,n; + m = W.GetNrows(); + n = W.GetNcols(); + + Real_t result = 0.0; + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + result += std::abs(W(i,j)); + } + } + return result; +} + +//______________________________________________________________________________ +template +void TReference::AddL1RegularizationGradients(TMatrixT & A, + const TMatrixT & W, + Real_t weightDecay) +{ + size_t m,n; + m = W.GetNrows(); + n = W.GetNcols(); + + Real_t sign = 0.0; + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + sign = (W(i,j) > 0.0) ? 1.0 : -1.0; + A(i,j) += sign * weightDecay; + } + } +} + +//______________________________________________________________________________ +template +Real_t TReference::L2Regularization(const TMatrixT & W) +{ + size_t m,n; + m = W.GetNrows(); + n = W.GetNcols(); + + Real_t result = 0.0; + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + result += W(i,j) * W(i,j); + } + } + return result; +} + +//______________________________________________________________________________ +template +void TReference::AddL2RegularizationGradients(TMatrixT & A, + const TMatrixT & W, + Real_t weightDecay) +{ + size_t m,n; + m = W.GetNrows(); + n = W.GetNcols(); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + A(i,j) += weightDecay * 2.0 * W(i,j); + } + } +} + +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/src/MethodDNN.cxx b/tmva/tmva/src/MethodDNN.cxx index d3f0e547ca349..c5fb4a9ac9722 100644 --- a/tmva/tmva/src/MethodDNN.cxx +++ b/tmva/tmva/src/MethodDNN.cxx @@ -42,6 +42,9 @@ #include "TMVA/Config.h" #include "TMVA/Ranking.h" +#include "TMVA/DNN/Net.h" +#include "TMVA/DNN/Architectures/Reference.h" + #include "TMVA/NeuralNet.h" #include "TMVA/Monitoring.h" @@ -52,7 +55,10 @@ REGISTER_METHOD(DNN) ClassImp(TMVA::MethodDNN) - +using TMVA::DNN::EActivationFunction; +using TMVA::DNN::ELossFunction; +using TMVA::DNN::EInitialization; +using TMVA::DNN::EOutputFunction; namespace TMVA @@ -160,6 +166,7 @@ void TMVA::MethodDNN::DeclareOptions() AddPreDefVal(TString("XAVIERUNIFORM")); AddPreDefVal(TString("LAYERSIZE")); + DeclareOptionRef(fGPUString="True", "GPU", "Use GPU for training."); DeclareOptionRef(fTrainingStrategy="LearningRate=1e-1,Momentum=0.3,Repetitions=3,ConvergenceSteps=50,BatchSize=30,TestRepetitions=7,WeightDecay=0.0,Renormalize=L2,DropConfig=0.0,DropRepetitions=5|LearningRate=1e-4,Momentum=0.3,Repetitions=3,ConvergenceSteps=50,BatchSize=20,TestRepetitions=7,WeightDecay=0.001,Renormalize=L2,DropConfig=0.0+0.5+0.5,DropRepetitions=5,Multithreading=True", "TrainingStrategy", "defines the training strategies"); @@ -388,6 +395,12 @@ void TMVA::MethodDNN::ProcessOptions() else fWeightInitializationStrategy = TMVA::DNN::WeightInitializationStrategy::XAVIER; + fGPUString.ToUpper (); + if (fGPUString.BeginsWith ("T")) + fGPU = true; + else + fGPU = false; + // create settings if (fAnalysisType == Types::kClassification) { @@ -492,160 +505,382 @@ void TMVA::MethodDNN::ProcessOptions() //______________________________________________________________________________ void TMVA::MethodDNN::Train() { - - fMonitoring = NULL; - // if (!fMonitoring) - // { - // 
fMonitoring = make_shared(); - // fMonitoring->Start (); - // } + if (fGPU) { + TrainGPU(); + } else { + + fMonitoring = NULL; + // if (!fMonitoring) + // { + // fMonitoring = make_shared(); + // fMonitoring->Start (); + // } + + // INITIALIZATION + // create pattern + std::vector trainPattern; + std::vector testPattern; + + const std::vector& eventCollectionTraining = GetEventCollection (Types::kTraining); + const std::vector& eventCollectionTesting = GetEventCollection (Types::kTesting); + + for (size_t iEvt = 0, iEvtEnd = eventCollectionTraining.size (); iEvt < iEvtEnd; ++iEvt) + { + const TMVA::Event* event = eventCollectionTraining.at (iEvt); + const std::vector& values = event->GetValues (); + if (fAnalysisType == Types::kClassification) + { + double outputValue = event->GetClass () == 0 ? 0.9 : 0.1; + trainPattern.push_back (Pattern (values.begin (), values.end (), outputValue, event->GetWeight ())); + trainPattern.back ().addInput (1.0); // bias node + } + else + { + const std::vector& targets = event->GetTargets (); + trainPattern.push_back (Pattern (values.begin (), values.end (), targets.begin (), targets.end (), event->GetWeight ())); + trainPattern.back ().addInput (1.0); // bias node + } + } + + for (size_t iEvt = 0, iEvtEnd = eventCollectionTesting.size (); iEvt < iEvtEnd; ++iEvt) + { + const TMVA::Event* event = eventCollectionTesting.at (iEvt); + const std::vector& values = event->GetValues (); + if (fAnalysisType == Types::kClassification) + { + double outputValue = event->GetClass () == 0 ? 0.9 : 0.1; + testPattern.push_back (Pattern (values.begin (), values.end (), outputValue, event->GetWeight ())); + testPattern.back ().addInput (1.0); // bias node + } + else + { + const std::vector& targets = event->GetTargets (); + testPattern.push_back (Pattern (values.begin (), values.end (), targets.begin (), targets.end (), event->GetWeight ())); + testPattern.back ().addInput (1.0); // bias node + } + } + + if (trainPattern.empty () || testPattern.empty ()) + return; + + // create net and weights + fNet.clear (); + fWeights.clear (); + + // if "resume" from saved weights + if (fResume) + { + std::cout << ".. resume" << std::endl; + // std::tie (fNet, fWeights) = ReadWeights (fFileName); + } + else // initialize weights and net + { + size_t inputSize = GetNVariables (); //trainPattern.front ().input ().size (); + size_t outputSize = fAnalysisType == Types::kClassification ? 1 : GetNTargets (); //trainPattern.front ().output ().size (); + fNet.setInputSize (inputSize + 1); // num vars + bias node + fNet.setOutputSize (outputSize); // num vars + bias node + + // configure neural net + auto itLayout = std::begin (fLayout), itLayoutEnd = std::end (fLayout)-1; // all layers except the last one + for ( ; itLayout != itLayoutEnd; ++itLayout) + { + fNet.addLayer (DNN::Layer ((*itLayout).first, (*itLayout).second)); + Log() << kINFO + << "Add Layer with " << (*itLayout).first << " nodes." + << Endl; + } + + DNN::ModeOutputValues eModeOutputValues = DNN::ModeOutputValues::SIGMOID; + if (fAnalysisType == Types::kRegression) + { + eModeOutputValues = DNN::ModeOutputValues::DIRECT; + } + else if ((fAnalysisType == Types::kClassification || + fAnalysisType == Types::kMulticlass) && + fModeErrorFunction == TMVA::DNN::ModeErrorFunction::SUMOFSQUARES) + { + eModeOutputValues = DNN::ModeOutputValues::DIRECT; + } + fNet.addLayer (DNN::Layer (outputSize, (*itLayout).second, eModeOutputValues)); + Log() << kINFO + << "Add Layer with " << outputSize << " nodes." 
+ << Endl << Endl; + fNet.setErrorFunction (fModeErrorFunction); + + size_t numWeights = fNet.numWeights (); + Log() << kINFO + << "Total number of Synapses = " + << numWeights + << Endl; + + // initialize weights + fNet.initializeWeights (fWeightInitializationStrategy, + std::back_inserter (fWeights)); + } + + + // loop through settings + // and create "settings" and minimizer + int idxSetting = 0; + for (auto itSettings = std::begin (fSettings), itSettingsEnd = std::end (fSettings); itSettings != itSettingsEnd; ++itSettings, ++idxSetting) + { + std::shared_ptr ptrSettings = *itSettings; + ptrSettings->setMonitoring (fMonitoring); + Log() << kINFO + << "Training with learning rate = " << ptrSettings->learningRate () + << ", momentum = " << ptrSettings->momentum () + << ", repetitions = " << ptrSettings->repetitions () + << Endl; + + ptrSettings->setProgressLimits ((idxSetting)*100.0/(fSettings.size ()), (idxSetting+1)*100.0/(fSettings.size ())); + + const std::vector& dropConfig = ptrSettings->dropFractions (); + if (!dropConfig.empty ()) + { + Log () << kINFO << "Drop configuration" << Endl + << " drop repetitions = " << ptrSettings->dropRepetitions () << Endl; + } + int idx = 0; + for (auto f : dropConfig) + { + Log () << kINFO << " Layer " << idx << " = " << f << Endl; + ++idx; + } + Log () << kINFO << Endl; + + if (ptrSettings->minimizerType () == TMVA::DNN::MinimizerType::fSteepest) + { + DNN::Steepest minimizer (ptrSettings->learningRate (), ptrSettings->momentum (), ptrSettings->repetitions ()); + /*E =*/fNet.train (fWeights, trainPattern, testPattern, minimizer, *ptrSettings.get ()); + } + ptrSettings.reset (); + Log () << kINFO << Endl; + } + fMonitoring = 0; + } +} - // INITIALIZATION - // create pattern - std::vector trainPattern; - std::vector testPattern; +void TMVA::MethodDNN::TrainGPU() +{ - const std::vector& eventCollectionTraining = GetEventCollection (Types::kTraining); - const std::vector& eventCollectionTesting = GetEventCollection (Types::kTesting); +#ifdef DNNCUDA // Included only if DNNCUDA flag is set. - for (size_t iEvt = 0, iEvtEnd = eventCollectionTraining.size (); iEvt < iEvtEnd; ++iEvt) - { - const TMVA::Event* event = eventCollectionTraining.at (iEvt); - const std::vector& values = event->GetValues (); - if (fAnalysisType == Types::kClassification) - { - double outputValue = event->GetClass () == 0 ? 0.9 : 0.1; - trainPattern.push_back (Pattern (values.begin (), values.end (), outputValue, event->GetWeight ())); - trainPattern.back ().addInput (1.0); // bias node - } - else - { - const std::vector& targets = event->GetTargets (); - trainPattern.push_back (Pattern (values.begin (), values.end (), targets.begin (), targets.end (), event->GetWeight ())); - trainPattern.back ().addInput (1.0); // bias node - } - } + TMVA::DNN::TNet GPUNet{}; - for (size_t iEvt = 0, iEvtEnd = eventCollectionTesting.size (); iEvt < iEvtEnd; ++iEvt) - { - const TMVA::Event* event = eventCollectionTesting.at (iEvt); - const std::vector& values = event->GetValues (); - if (fAnalysisType == Types::kClassification) - { - double outputValue = event->GetClass () == 0 ? 
0.9 : 0.1; - testPattern.push_back (Pattern (values.begin (), values.end (), outputValue, event->GetWeight ())); - testPattern.back ().addInput (1.0); // bias node - } - else - { - const std::vector& targets = event->GetTargets (); - testPattern.push_back (Pattern (values.begin (), values.end (), targets.begin (), targets.end (), event->GetWeight ())); - testPattern.back ().addInput (1.0); // bias node - } - } + size_t inputSize = GetNVariables (); + size_t outputSize = (GetNTargets() == 0) ? 1 : GetNTargets(); - if (trainPattern.empty () || testPattern.empty ()) - return; + GPUNet.SetInputWidth(inputSize); - // create net and weights - fNet.clear (); - fWeights.clear (); + // Also need to set standard net structure. + fNet.setInputSize (inputSize + 1); + fNet.setOutputSize (outputSize); - // if "resume" from saved weights - if (fResume) + // configure neural net + auto itLayout = std::begin (fLayout), itLayoutEnd = std::end (fLayout)-1; // all layers except the last one + for ( ; itLayout != itLayoutEnd; ++itLayout) + { + fNet.addLayer (DNN::Layer ((*itLayout).first, (*itLayout).second)); + TMVA::DNN::EnumFunction f = (*itLayout).second; + switch(f) { - std::cout << ".. resume" << std::endl; - // std::tie (fNet, fWeights) = ReadWeights (fFileName); + case DNN::EnumFunction::RELU : + GPUNet.AddLayer((*itLayout).first, EActivationFunction::RELU); + break; + case DNN::EnumFunction::TANH : + GPUNet.AddLayer((*itLayout).first, EActivationFunction::TANH); + break; + case DNN::EnumFunction::SYMMRELU : + GPUNet.AddLayer((*itLayout).first, EActivationFunction::SYMMRELU); + break; + case DNN::EnumFunction::SOFTSIGN : + GPUNet.AddLayer((*itLayout).first, EActivationFunction::SOFTSIGN); + break; + case DNN::EnumFunction::SIGMOID : + GPUNet.AddLayer((*itLayout).first, EActivationFunction::SIGMOID); + break; + case DNN::EnumFunction::LINEAR : + GPUNet.AddLayer((*itLayout).first, EActivationFunction::IDENTITY); + break; + case DNN::EnumFunction::GAUSS : + GPUNet.AddLayer((*itLayout).first, EActivationFunction::GAUSS); + break; + default : + GPUNet.AddLayer((*itLayout).first, EActivationFunction::IDENTITY); + break; } - else // initialize weights and net - { - size_t inputSize = GetNVariables (); //trainPattern.front ().input ().size (); - size_t outputSize = fAnalysisType == Types::kClassification ? 1 : GetNTargets (); //trainPattern.front ().output ().size (); - fNet.setInputSize (inputSize + 1); // num vars + bias node - fNet.setOutputSize (outputSize); // num vars + bias node - - // configure neural net - auto itLayout = std::begin (fLayout), itLayoutEnd = std::end (fLayout)-1; // all layers except the last one - for ( ; itLayout != itLayoutEnd; ++itLayout) - { - fNet.addLayer (DNN::Layer ((*itLayout).first, (*itLayout).second)); - Log() << kINFO - << "Add Layer with " << (*itLayout).first << " nodes." - << Endl; - } + } - DNN::ModeOutputValues eModeOutputValues = DNN::ModeOutputValues::SIGMOID; - if (fAnalysisType == Types::kRegression) - { - eModeOutputValues = DNN::ModeOutputValues::DIRECT; - } - else if ((fAnalysisType == Types::kClassification || - fAnalysisType == Types::kMulticlass) && - fModeErrorFunction == TMVA::DNN::ModeErrorFunction::SUMOFSQUARES) - { - eModeOutputValues = DNN::ModeOutputValues::DIRECT; - } - fNet.addLayer (DNN::Layer (outputSize, (*itLayout).second, eModeOutputValues)); - Log() << kINFO - << "Add Layer with " << outputSize << " nodes." 
- << Endl << Endl; - fNet.setErrorFunction (fModeErrorFunction); - - size_t numWeights = fNet.numWeights (); - Log() << kINFO - << "Total number of Synapses = " - << numWeights - << Endl; + DNN::ModeOutputValues eModeOutputValues = DNN::ModeOutputValues::SIGMOID; + if (fAnalysisType == Types::kRegression) + { + eModeOutputValues = DNN::ModeOutputValues::DIRECT; + GPUNet.AddLayer(outputSize, EActivationFunction::IDENTITY); + GPUNet.SetLossFunction(ELossFunction::MEANSQUAREDERROR); + } else if ((fAnalysisType == Types::kClassification || + fAnalysisType == Types::kMulticlass) && + fModeErrorFunction == TMVA::DNN::ModeErrorFunction::SUMOFSQUARES) { + GPUNet.AddLayer(outputSize, EActivationFunction::IDENTITY); + GPUNet.SetLossFunction(ELossFunction::MEANSQUAREDERROR); + } else { + eModeOutputValues = DNN::ModeOutputValues::DIRECT; + GPUNet.AddLayer(outputSize, EActivationFunction::IDENTITY); + GPUNet.SetLossFunction(ELossFunction::CROSSENTROPY); + } - // initialize weights - fNet.initializeWeights (fWeightInitializationStrategy, - std::back_inserter (fWeights)); - } + fNet.addLayer (DNN::Layer (outputSize, (*itLayout).second, eModeOutputValues)); + fNet.setErrorFunction (fModeErrorFunction); + switch(fWeightInitializationStrategy) + { + case DNN::WeightInitializationStrategy::XAVIER : + GPUNet.Initialize(EInitialization::GAUSS); + break; + case DNN::WeightInitializationStrategy::XAVIERUNIFORM: + GPUNet.Initialize(EInitialization::UNIFORM); + break; + default : + GPUNet.Initialize(EInitialization::GAUSS); + break; + } + + size_t nTrainingSamples = GetEventCollection(Types::kTraining).size(); + size_t nTestSamples = GetEventCollection(Types::kTesting).size(); - // loop through settings - // and create "settings" and minimizer int idxSetting = 0; - for (auto itSettings = std::begin (fSettings), itSettingsEnd = std::end (fSettings); itSettings != itSettingsEnd; ++itSettings, ++idxSetting) + for (auto itSettings = std::begin (fSettings), itSettingsEnd = std::end (fSettings); + itSettings != itSettingsEnd; ++itSettings, ++idxSetting) { - std::shared_ptr ptrSettings = *itSettings; - ptrSettings->setMonitoring (fMonitoring); + + TMVA::DNN::Settings settings = **itSettings; + settings.setMonitoring (fMonitoring); + Log() << kINFO - << "Training with learning rate = " << ptrSettings->learningRate () - << ", momentum = " << ptrSettings->momentum () - << ", repetitions = " << ptrSettings->repetitions () + << "Training on GPU with learning rate = " + << settings.learningRate () + << ", momentum = " << settings.momentum () + << ", repetitions = " << settings.repetitions () << Endl; - ptrSettings->setProgressLimits ((idxSetting)*100.0/(fSettings.size ()), (idxSetting+1)*100.0/(fSettings.size ())); + settings.setProgressLimits ((idxSetting)*100.0/(fSettings.size ()), + (idxSetting+1)*100.0/(fSettings.size ())); - const std::vector& dropConfig = ptrSettings->dropFractions (); + const std::vector& dropConfig = settings.dropFractions (); if (!dropConfig.empty ()) - { - Log () << kINFO << "Drop configuration" << Endl - << " drop repetitions = " << ptrSettings->dropRepetitions () << Endl; - } + { + Log () << kINFO << "Drop configuration" << Endl + << " drop repetitions = " + << settings.dropRepetitions () << Endl; + } + + auto trainNet = GPUNet.CreateClone(settings.batchSize()); int idx = 0; for (auto f : dropConfig) - { - Log () << kINFO << " Layer " << idx << " = " << f << Endl; - ++idx; - } + { + Log () << kINFO << " Layer " << idx << " = " << f << Endl; + trainNet.GetLayer(idx).SetDropoutProbability(f); + 
++idx; + } Log () << kINFO << Endl; - - if (ptrSettings->minimizerType () == TMVA::DNN::MinimizerType::fSteepest) - { - DNN::Steepest minimizer (ptrSettings->learningRate (), ptrSettings->momentum (), ptrSettings->repetitions ()); - /*E =*/fNet.train (fWeights, trainPattern, testPattern, minimizer, *ptrSettings.get ()); + + using DataLoader_t = typename DNN::TCuda::DataLoader_t; + DataLoader_t trainingData(GetEventCollection(Types::kTraining), + nTrainingSamples, + trainNet.GetBatchSize(), + trainNet.GetInputWidth(), + trainNet.GetOutputWidth()); + + DataLoader_t testData(GetEventCollection(Types::kTesting), + nTestSamples, + nTestSamples, + trainNet.GetInputWidth(), + trainNet.GetOutputWidth()); + auto testNet = GPUNet.CreateClone(testData.GetBatchSize()); + DNN::TGradientDescent minimizer{}; + + minimizer.Reset(); + minimizer.SetLearningRate(settings.learningRate()); + minimizer.SetTestInterval(settings.testRepetitions()); + minimizer.SetConvergenceSteps(settings.convergenceSteps()); + + bool converged = false; + size_t stepCount = 0; + + while (!converged) + { + // Perform minimization steps for a full epoch. + if ((stepCount % minimizer.GetTestInterval()) != 0) { + for (auto batch : trainingData) { + auto inputMatrix = batch.GetInput(); + auto outputMatrix = batch.GetOutput(); + minimizer.StepReducedWeights(trainNet, inputMatrix, outputMatrix); + } + } else { + Double_t trainingError = 0.0; + for (auto batch : trainingData) { + auto inputMatrix = batch.GetInput(); + auto outputMatrix = batch.GetOutput(); + trainingError += minimizer.StepReducedWeightsLoss( + trainNet, + inputMatrix, + outputMatrix); + } + trainingError /= (Double_t) trainingData.GetNBatchesInEpoch(); + + auto testBatch = *testData.begin(); + auto testInput = testBatch.GetInput(); + auto testOutput = testBatch.GetOutput(); + minimizer.TestError(testNet, testInput, testOutput); + + TString convText = Form("(train/test/epo/conv/maxco): %.3g/%.3g/%d/%d", + trainingError, + minimizer.GetTestError(), + (int) stepCount, + (int) minimizer.GetConvergenceCount ()); + Double_t progress = minimizer.GetConvergenceCount() + / settings.convergenceSteps(); + settings.cycle(progress, convText); + converged = minimizer.HasConverged(); } - ptrSettings.reset (); - Log () << kINFO << Endl; + stepCount++; + } + fMonitoring = 0; } - fMonitoring = 0; -} + fWeights.clear(); + size_t weightIndex = 0; + size_t prevLayerWidth = GPUNet.GetInputWidth(); + for (size_t l = 0; l < GPUNet.GetDepth(); l++) { + auto &layer = GPUNet.GetLayer(l); + size_t layerWidth = layer.GetWidth(); + size_t layerSize = prevLayerWidth * layerWidth; + fWeights.reserve(fWeights.size() + layerSize); + TMatrixT Weights(layer.GetWeights()); + for (size_t j = 0; j < (size_t) Weights.GetNcols(); j++) { + for (size_t i = 0; i < (size_t) Weights.GetNrows(); i++) { + fWeights.push_back(Weights(i,j)); + } + } + if (l == 0) { + fWeights.reserve(fWeights.size() + layerWidth); + TMatrixT theta(layer.GetBiases()); + for (size_t i = 0; i < layerWidth; i++) { + fWeights.push_back(theta(i,0)); + } + } + prevLayerWidth = layerWidth; + } + +#else // DNNCUDA flag not set. + + Log() << kFATAL << "CUDA backend not enabled. Please make sure " + "you have CUDA installed and it was successfully " + "detected by CMAKE." 
<< Endl; +#endif // DNNCUDA +} //_______________________________________________________________________ Double_t TMVA::MethodDNN::GetMvaValue( Double_t* /*errLower*/, Double_t* /*errUpper*/ ) diff --git a/tmva/tmva/test/CMakeLists.txt b/tmva/tmva/test/CMakeLists.txt new file mode 100644 index 0000000000000..fc2a3e633d359 --- /dev/null +++ b/tmva/tmva/test/CMakeLists.txt @@ -0,0 +1,9 @@ +############################################################################ +# CMakeLists.txt file for building ROOT TMVA tests. +# @author Simon Pfreundschuh +############################################################################ + +project(tmva-tests) +find_package(ROOT REQUIRED) + +ROOT_ADD_TEST_SUBDIRECTORY(DNN) diff --git a/tmva/tmva/test/DNN/CMakeLists.txt b/tmva/tmva/test/DNN/CMakeLists.txt new file mode 100644 index 0000000000000..d9c36a272c93a --- /dev/null +++ b/tmva/tmva/test/DNN/CMakeLists.txt @@ -0,0 +1,89 @@ +############################################################################ +# CMakeLists.txt file for building TMVA/DNN tests. +# @author Simon Pfreundschuh +############################################################################ + +project(tmva-tests) +find_package(ROOT REQUIRED) + +set(Libraries Core MathCore Matrix TMVA) +include_directories(${ROOT_INCLUDE_DIRS}) + +# DNN - Backpropagation +ROOT_EXECUTABLE(testBackpropagation TestBackpropagation.cxx + LIBRARIES ${Libraries}) +ROOT_ADD_TEST(TMVA-DNN-Backpropagation COMMAND testBackpropagation) + +# DNN - Activation Functions +ROOT_EXECUTABLE(testActivationFunctions TestActivationFunctions.cxx + LIBRARIES ${Libraries}) +ROOT_ADD_TEST(TMVA-DNN-ActivationFunctions COMMAND testActivationFunctions) + +# DNN - Loss Functions +ROOT_EXECUTABLE(testLossFunctions TestLossFunctions.cxx + LIBRARIES ${Libraries}) +ROOT_ADD_TEST(TMVA-DNN-LossFunctions COMMAND testLossFunctions) + +# DNN - Derivatives +ROOT_EXECUTABLE(testDerivatives TestDerivatives.cxx + LIBRARIES ${Libraries}) +ROOT_ADD_TEST(TMVA-DNN-Derivatives COMMAND testDerivatives) + +# DNN - Minimization +ROOT_EXECUTABLE(testMinimization TestMinimization.cxx + LIBRARIES ${Libraries}) +ROOT_ADD_TEST(TMVA-DNN-Minimization COMMAND testMinimization) + +# DNN - Data Loader +ROOT_EXECUTABLE(testDataLoader TestDataLoader.cxx + LIBRARIES ${Libraries}) +ROOT_ADD_TEST(TMVA-DNN-DataLoader COMMAND testDataLoader) + +find_package(CUDA) +if (CUDA_FOUND) + # DNN - Cuda + CUDA_ADD_EXECUTABLE(testCuda TestCuda.cxx) + TARGET_LINK_LIBRARIES(testCuda ${Libraries} dnn_cuda ${CUDA_CUBLAS_LIBRARIES}) + ROOT_ADD_TEST(TMVA-DNN-CUDA COMMAND testCuda) + + # DNN - Activation Functions Cuda + CUDA_ADD_EXECUTABLE(testActivationFunctionsCuda TestActivationFunctionsCuda.cxx) + TARGET_LINK_LIBRARIES(testActivationFunctionsCuda ${Libraries} dnn_cuda + ${CUDA_CUBLAS_LIBRARIES}) + ROOT_ADD_TEST(TMVA-DNN-ActivationFunctionsCuda COMMAND testActivationFunctionsCuda) + + # DNN - Loss Functions Cuda + CUDA_ADD_EXECUTABLE(testLossFunctionsCuda TestLossFunctionsCuda.cxx) + TARGET_LINK_LIBRARIES(testLossFunctionsCuda ${Libraries} dnn_cuda + ${CUDA_CUBLAS_LIBRARIES}) + ROOT_ADD_TEST(TMVA-DNN-LossFunctionsCuda COMMAND testLossFunctionsCuda) + + # DNN - Derivatives Cuda + CUDA_ADD_EXECUTABLE(testDerivativesCuda TestDerivativesCuda.cxx) + TARGET_LINK_LIBRARIES(testDerivativesCuda ${Libraries} dnn_cuda + ${CUDA_CUBLAS_LIBRARIES} dnn_cuda) + ROOT_ADD_TEST(TMVA-DNN-DerivativesCuda COMMAND testDerivativesCuda) + + # DNN - Backpropagation Cuda + CUDA_ADD_EXECUTABLE(testBackpropagationCuda TestBackpropagationCuda.cxx) + 
TARGET_LINK_LIBRARIES(testBackpropagationCuda ${Libraries} dnn_cuda
+ ${CUDA_CUBLAS_LIBRARIES})
+ ROOT_ADD_TEST(TMVA-DNN-BackpropagationCuda COMMAND testBackpropagationCuda)
+
+ # DNN - Minimization Cuda
+ CUDA_ADD_EXECUTABLE(testMinimizationCuda TestMinimizationCuda.cxx)
+ TARGET_LINK_LIBRARIES(testMinimizationCuda ${Libraries} dnn_cuda
+ ${CUDA_CUBLAS_LIBRARIES})
+ ROOT_ADD_TEST(TMVA-DNN-MinimizationCuda COMMAND testMinimizationCuda)
+
+ # DNN - Arithmetic Cuda
+ CUDA_ADD_EXECUTABLE(testArithmeticCuda TestMatrixArithmeticCuda.cxx)
+ TARGET_LINK_LIBRARIES(testArithmeticCuda ${Libraries} dnn_cuda
+ ${CUDA_CUBLAS_LIBRARIES})
+ ROOT_ADD_TEST(TMVA-DNN-ArithmeticCuda COMMAND testArithmeticCuda)
+
+ # DNN - Data Loader Cuda
+ CUDA_ADD_EXECUTABLE(testDataLoaderCuda TestDataLoaderCuda.cxx)
+ TARGET_LINK_LIBRARIES(testDataLoaderCuda ${Libraries} dnn_cuda
+ ${CUDA_CUBLAS_LIBRARIES})
+endif (CUDA_FOUND)
\ No newline at end of file
diff --git a/tmva/tmva/test/DNN/TestActivationFunctions.cxx b/tmva/tmva/test/DNN/TestActivationFunctions.cxx
new file mode 100644
index 0000000000000..e5b9cc769c33e
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestActivationFunctions.cxx
@@ -0,0 +1,127 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////////
+// Concrete instantiation of the generic activation function test   //
+// for the reference architecture.                                   //
+//////////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TestActivationFunctions.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+    std::cout << "Testing Activation Functions:" << std::endl;
+
+    double error;
+
+    // Identity.
+
+    error = testIdentity<TReference<double>>(10);
+    std::cout << "Testing identity activation: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    error = testIdentityDerivative<TReference<double>>(10);
+    std::cout << "Testing identity activation derivative: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    // ReLU.
+
+    error = testRelu<TReference<double>>(10);
+    std::cout << "Testing ReLU activation: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    error = testReluDerivative<TReference<double>>(10);
+    std::cout << "Testing ReLU activation derivative: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    // Sigmoid.
+
+    error = testSigmoid<TReference<double>>(10);
+    std::cout << "Testing Sigmoid activation: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    error = testSigmoidDerivative<TReference<double>>(10);
+    std::cout << "Testing Sigmoid activation derivative: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
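+    // The tests above and below are templated on the architecture, so the
+    // CUDA test drivers registered in the CMake file reuse the same headers
+    // and only swap the template argument, e.g. testSigmoid<TCuda>(10)
+    // (assuming a build with the DNNCUDA backend enabled).
+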
+    // TanH.
+
+    error = testTanh<TReference<double>>(10);
+    std::cout << "Testing TanH activation: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    error = testTanhDerivative<TReference<double>>(10);
+    std::cout << "Testing TanH activation derivative: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    // Symmetric ReLU.
+
+    error = testSymmetricRelu<TReference<double>>(10);
+    std::cout << "Testing Symm. ReLU activation: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    error = testSymmetricReluDerivative<TReference<double>>(10);
+    std::cout << "Testing Symm. ReLU activation derivative: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    // Soft Sign.
+
+    error = testSoftSign<TReference<double>>(10);
+    std::cout << "Testing Soft Sign activation: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    error = testSoftSignDerivative<TReference<double>>(10);
+    std::cout << "Testing Soft Sign activation derivative: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    // Gauss.
+
+    error = testGauss<TReference<double>>(10);
+    std::cout << "Testing Gauss activation: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    error = testGaussDerivative<TReference<double>>(10);
+    std::cout << "Testing Gauss activation derivative: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    return 0;
+}
diff --git a/tmva/tmva/test/DNN/TestActivationFunctions.h b/tmva/tmva/test/DNN/TestActivationFunctions.h
new file mode 100644
index 0000000000000..b50a5598d8b8f
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestActivationFunctions.h
@@ -0,0 +1,457 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////////
+// Generic tests of the layer activation functions                  //
+//                                                                   //
+// Contains tests for each of the layer activation functions that   //
+// test the evaluation of the function using the evaluate(...)      //
+// method and the computation of the derivatives using              //
+// evaluateDerivative(...) on a randomly generated matrix. Each     //
+// function returns the maximum relative error between the expected //
+// result and the result obtained for the given architecture.       //
+//////////////////////////////////////////////////////////////////////
+
+#ifndef TMVA_TEST_DNN_TEST_ACTIVATION_FUNCTIONS
+#define TMVA_TEST_DNN_TEST_ACTIVATION_FUNCTIONS
+
+#include "TMatrixT.h"
+#include "TMVA/DNN/Architectures/Reference.h"
+#include "TMVA/DNN/Functions.h"
+#include "TMVA/DNN/Net.h"
+#include "Utility.h"
+
+using namespace TMVA::DNN;
+
+//______________________________________________________________________________
+//
+// Identity Activation Function
+//______________________________________________________________________________
+
+/*! Test application of identity function to matrix. */
+//______________________________________________________________________________
+template <typename Architecture>
+auto testIdentity(size_t ntests)
+-> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   Double_t maximumError = 0.0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m = rand() % 100 + 1;
+      size_t n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> ARef(m, n);
+      randomMatrix(ARef);
+      Matrix_t AArch(ARef);
+
+      evaluate<Architecture>(AArch, EActivationFunction::IDENTITY);
+
+      TMatrixT<Double_t> A = AArch;
+      Double_t error = maximumRelativeError(A, ARef);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+
+/*! Test computation of the first derivative of the identity function. */
+//______________________________________________________________________________
+template <typename Architecture>
+auto testIdentityDerivative(size_t ntests)
+    -> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   Double_t maximumError = 0.0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m = rand() % 100 + 1;
+      size_t n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> ARef(m, n), BRef(m, n);
+      randomMatrix(ARef);
+      Matrix_t AArch(ARef), BArch(BRef);
+
+      evaluateDerivative<Architecture>(BArch, EActivationFunction::IDENTITY, AArch);
+      evaluateDerivative<TReference<Double_t>>(BRef, EActivationFunction::IDENTITY,
+                                               ARef);
+
+      TMatrixT<Double_t> B = BArch;
+      Double_t error = maximumRelativeError(B, BRef);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+
+//______________________________________________________________________________
+//
+// ReLU Activation Function
+//______________________________________________________________________________
+
+/*! Test application of ReLU function to matrix. */
+//______________________________________________________________________________
+template <typename Architecture>
+auto testRelu(size_t ntests)
+-> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   Double_t maximumError = 0.0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m = rand() % 100 + 1;
+      size_t n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> ARef(m, n);
+      randomMatrix(ARef);
+      Matrix_t AArch(ARef);
+
+      evaluate<Architecture>(AArch, EActivationFunction::RELU);
+      applyMatrix(ARef, [](double x){return x < 0.0 ? 0.0 : x;});
+
+      TMatrixT<Double_t> A = AArch;
+      Double_t error = maximumRelativeError(A, ARef);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+
+/*! Test computation of the first derivative of the ReLU function. */
+//______________________________________________________________________________
+template <typename Architecture>
+auto testReluDerivative(size_t ntests)
+-> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   Double_t maximumError = 0.0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m = rand() % 100 + 1;
+      size_t n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> ARef(m, n), BRef(m, n);
+      randomMatrix(ARef);
+      Matrix_t AArch(ARef), BArch(BRef);
+
+      evaluateDerivative<Architecture>(BArch, EActivationFunction::RELU, AArch);
+      applyMatrix(ARef, [](double x){return x > 0.0 ? 1.0 : 0.0;});
+
+      TMatrixT<Double_t> B = BArch;
+      Double_t error = maximumRelativeError(B, ARef);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+
+//______________________________________________________________________________
+//
+// Sigmoid Activation Function
+//______________________________________________________________________________
+
+/*! Test application of Sigmoid function to matrix. */
*/ +//______________________________________________________________________________ +template +auto testSigmoid(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT ARef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef); + + evaluate(AArch, EActivationFunction::SIGMOID); + applyMatrix(ARef, [](double x){return 1.0 / (1.0 + std::exp(-x));}); + + TMatrixT A = AArch; + Double_t error = maximumRelativeError(A, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +/*! Test computation of the first derivative of the ReLU function. */ +//______________________________________________________________________________ +template +auto testSigmoidDerivative(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT ARef(m, n), BRef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef), BArch(BRef); + + evaluateDerivative(BArch, EActivationFunction::SIGMOID, AArch); + applyMatrix(ARef, [](Double_t x){ + Double_t sig = 1.0 / (1.0 + std::exp(-x)); + return sig * (1.0 - sig); + }); + + TMatrixT B = BArch; + Double_t error = maximumRelativeError(B, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//______________________________________________________________________________ +// +// Tanh Activation Function +//______________________________________________________________________________ + +/*! Test application of tanh function to matrix. */ +//______________________________________________________________________________ +template +auto testTanh(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT ARef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef); + + evaluate(AArch, EActivationFunction::TANH); + applyMatrix(ARef, [](double x){return tanh(x);}); + + TMatrixT A = AArch; + Double_t error = maximumRelativeError(A, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +/*! Test computation of the first derivative of the tanh function. */ +//______________________________________________________________________________ +template +auto testTanhDerivative(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT ARef(m, n), BRef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef), BArch(BRef); + + evaluateDerivative(BArch, EActivationFunction::TANH, AArch); + applyMatrix(ARef, [](Double_t x){ + Double_t t = tanh(x); + return 1 - t * t; + }); + + TMatrixT B = BArch; + Double_t error = maximumRelativeError(B, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//______________________________________________________________________________ +// +// Symmetric ReLU Activation Function +//______________________________________________________________________________ + +/*! 
Test application of symmetric ReLU function to matrix. */ +//______________________________________________________________________________ +template +auto testSymmetricRelu(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT ARef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef); + + evaluate(AArch, EActivationFunction::SYMMRELU); + applyMatrix(ARef, [](double x){return fabs(x);}); + + TMatrixT A = AArch; + Double_t error = maximumRelativeError(A, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +/*! Test computation of the first derivative of the symmetric ReLU function. */ +//______________________________________________________________________________ +template +auto testSymmetricReluDerivative(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT ARef(m, n), BRef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef), BArch(BRef); + + evaluateDerivative(BArch, EActivationFunction::SYMMRELU, AArch); + applyMatrix(ARef, [](Double_t x){ + return (x < 0) ? -1.0 : 1.0; + }); + + TMatrixT B = BArch; + Double_t error = maximumRelativeError(B, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//______________________________________________________________________________ +// +// Soft Sign Activation Function +//______________________________________________________________________________ + +/*! Test application of symmetric soft sign function to matrix. */ +//______________________________________________________________________________ +template +auto testSoftSign(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT ARef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef); + + evaluate(AArch, EActivationFunction::SOFTSIGN); + applyMatrix(ARef, [](double x){return x / (1 + fabs(x));}); + + TMatrixT A = AArch; + Double_t error = maximumRelativeError(A, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +/*! Test computation of the first derivative of the soft sign function. 
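+ * Analytically, d/dx [ x / (1 + |x|) ] = 1 / (1 + |x|)^2, which is exactly
+ * the expression encoded in the lambda passed to applyMatrix below.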
*/ +//______________________________________________________________________________ +template +auto testSoftSignDerivative(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT ARef(m, n), BRef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef), BArch(BRef); + + evaluateDerivative(BArch, EActivationFunction::SOFTSIGN, AArch); + applyMatrix(ARef, [](Double_t x){ + Double_t y = 1 + fabs(x); + return 1.0 / (y * y); + }); + + TMatrixT B = BArch; + Double_t error = maximumRelativeError(B, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//______________________________________________________________________________ +// +// Gauss Activation Functions +//______________________________________________________________________________ + +/*! Test application of Gauss activation function to matrix. */ +//______________________________________________________________________________ +template +auto testGauss(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT ARef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef); + + evaluate(AArch, EActivationFunction::GAUSS); + applyMatrix(ARef, [](double x){return exp(- x * x);}); + + TMatrixT A = AArch; + Double_t error = maximumRelativeError(A, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +/*! Test computation of the first derivative of the Gauss activation function. */ +//______________________________________________________________________________ +template +auto testGaussDerivative(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT ARef(m, n), BRef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef), BArch(BRef); + + evaluateDerivative(BArch, EActivationFunction::GAUSS, AArch); + applyMatrix(ARef, [](Double_t x){return -2.0 * x * exp(- x * x);}); + + TMatrixT B = BArch; + Double_t error = maximumRelativeError(B, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} +#endif diff --git a/tmva/tmva/test/DNN/TestActivationFunctionsCuda.cxx b/tmva/tmva/test/DNN/TestActivationFunctionsCuda.cxx new file mode 100644 index 0000000000000..14fdea79e7a5a --- /dev/null +++ b/tmva/tmva/test/DNN/TestActivationFunctionsCuda.cxx @@ -0,0 +1,72 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +////////////////////////////////////////////////////////////////////// +// Concrete instantiation of the generic activation function test // +// for the TCuda implementation. 
// +////////////////////////////////////////////////////////////////////// + +#include +#include "TMVA/DNN/Architectures/Cuda.h" +#include "Utility.h" +#include "TestActivationFunctions.h" + +using namespace TMVA::DNN; + +int main() +{ + std::cout << "Testing Activation Functions:" << std::endl; + + double error; + + // Identity. + + error = testIdentity(10); + std::cout << "Testing identity activation: "; + std::cout << "maximum relative error = " << error << std::endl; + if (error > 1e-10) + return 1; + + error = testIdentityDerivative(10); + std::cout << "Testing identity activation derivative: "; + std::cout << "maximum relative error = " << error << std::endl; + if (error > 1e-10) + return 1; + + // ReLU. + + error = testRelu(10); + std::cout << "Testing ReLU activation: "; + std::cout << "maximum relative error = " << error << std::endl; + if (error > 1e-10) + return 1; + + error = testReluDerivative(10); + std::cout << "Testing ReLU activation derivative: "; + std::cout << "maximum relative error = " << error << std::endl; + if (error > 1e-10) + return 1; + + // Sigmoid. + + error = testSigmoid(10); + std::cout << "Testing Sigmoid activation: "; + std::cout << "maximum relative error = " << error << std::endl; + if (error > 1e-10) + return 1; + + error = testSigmoidDerivative(10); + std::cout << "Testing Sigmoid activation derivative: "; + std::cout << "maximum relative error = " << error << std::endl; + if (error > 1e-10) + return 1; + return 0; +} diff --git a/tmva/tmva/test/DNN/TestBackpropagation.cxx b/tmva/tmva/test/DNN/TestBackpropagation.cxx new file mode 100644 index 0000000000000..638c222609329 --- /dev/null +++ b/tmva/tmva/test/DNN/TestBackpropagation.cxx @@ -0,0 +1,50 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +//////////////////////////////////////////////////////////////////// +// Concrete instantiation of the generic backpropagation test for // +// the reference architecture. // +//////////////////////////////////////////////////////////////////// + +#include +#include "TMVA/DNN/Architectures/Reference.h" +#include "TestBackpropagation.h" + +using namespace TMVA::DNN; + +int main() +{ + std::cout << "Testing Backpropagation:" << std::endl; + + double error; + + // + // Test backpropagation for linear net. + // + + error = testBackpropagationWeightsLinear>(1.0); + if (error > 1e-7) + return 1; + + error = testBackpropagationL1Regularization>(1e-2); + if (error > 1e-7) + return 1; + + error = testBackpropagationL2Regularization>(1.0); + if (error > 1e-7) + return 1; + + error = testBackpropagationBiasesLinear>(1.0); + if (error > 1e-7) + return 1; + + return 0; +} diff --git a/tmva/tmva/test/DNN/TestBackpropagation.h b/tmva/tmva/test/DNN/TestBackpropagation.h new file mode 100644 index 0000000000000..9082d3a33e315 --- /dev/null +++ b/tmva/tmva/test/DNN/TestBackpropagation.h @@ -0,0 +1,416 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. 
* + *************************************************************************/ + +//////////////////////////////////////////////////////////////////// +// Generic tests of the backpropagation algorithm. // +// // +// All tests randomly generate a net with identity activation // +// functions, i.e. which is completely linear and then tests the // +// computed gradients for each layer using numerical // +// derivation. The restriction to linear nets is to avoid the // +// required division by the finite difference interval used to // +// approximate the numerical derivatives, which would otherwise // +// cause precision loss. // +//////////////////////////////////////////////////////////////////// + +#include +#include "TMVA/DNN/Functions.h" +#include "TMVA/DNN/Net.h" +#include "Utility.h" + +using namespace TMVA::DNN; + +/*! Compute the loss of the net as a function of the weight at index (i,j) in + * layer l. dx is added as an offset to the current value of the weight. */ +//______________________________________________________________________________ +template +auto evaluate_net_weight(TNet &net, + typename Architecture::Matrix_t &X, + const typename Architecture::Matrix_t &Y, + size_t l, + size_t i, + size_t j, + typename Architecture::Scalar_t dx) + -> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + + net.GetLayer(l).GetWeights().operator()(i,j) += dx; + Scalar_t res = net.Loss(X, Y); + net.GetLayer(l).GetWeights().operator()(i,j) -= dx; + return res; +} + +/*! Compute the loss of the net as a function of the weight at index i in + * layer l. dx is added as an offset to the current value of the weight. */ +//______________________________________________________________________________ +template +auto evaluate_net_bias(TNet &net, + typename Architecture::Matrix_t &X, + const typename Architecture::Matrix_t &Y, + size_t l, + size_t i, + typename Architecture::Scalar_t dx) + -> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + + net.GetLayer(l).GetBiases().operator()(i,0) += dx; + Scalar_t res = net.Loss(X, Y); + net.GetLayer(l).GetBiases().operator()(i,0) -= dx; + return res; +} + +/*! Generate a random net, perform forward and backward propagation and check + * the weight gradients using numerical differentiation. Returns the maximum + * relative gradient error and also prints it to stdout. */ +//______________________________________________________________________________ +template +auto testBackpropagationWeightsLinear(typename Architecture::Scalar_t dx) +-> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + using Matrix_t = typename Architecture::Matrix_t; + using Net_t = TNet; + + + Net_t net(50, 50, ELossFunction::MEANSQUAREDERROR); + + // Random net. + constructRandomLinearNet(net); + net.Initialize(EInitialization::GAUSS); + + // Random training data. + Matrix_t X(50, 50); + randomBatch(X); + + Matrix_t Y(50, net.GetOutputWidth()); + randomMatrix(Y); + + net.Forward(X); + net.Backward(X,Y); + + Scalar_t maximum_error = 0.0; + + // Compute derivatives for all weights using finite differences and + // compare to result obtained from backpropagation. 
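+   // The comparison uses a central difference: with f the loss as a function
+   // of a single weight, dy = (f(w + dx) - f(w - dx)) / (2 * dx) is checked
+   // against the backpropagated gradient W(i,j). For a linear net the loss
+   // is exactly quadratic in each weight, so a large step such as dx = 1.0
+   // introduces no truncation error.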
+ for (size_t l = 0; l < net.GetDepth(); l++) + { + std::cout << "\rTesting weight gradients: layer: " << l << " / " << net.GetDepth(); + std::cout << std::flush; + auto & layer = net.GetLayer(l); + auto & W = layer.GetWeightGradients(); + + for (size_t i = 0; i < layer.GetWidth(); i++) + { + for (size_t j = 0; j < layer.GetInputWidth(); j++) + { + auto f = [& net, & X, &Y, l, i, j](Scalar_t x) + { + return evaluate_net_weight(net, X, Y, l, i, j, x); + }; + Scalar_t dy = finiteDifference(f, dx) / (2.0 * dx); + Scalar_t dy_ref = W(i,j); + + // Compute the relative error if dy != 0. + Scalar_t error; + if (std::fabs(dy_ref) > 1e-15) + { + error = std::fabs((dy - dy_ref) / dy_ref); + } + else + { + error = std::fabs(dy - dy_ref); + } + + maximum_error = std::max(error, maximum_error); + } + } + } + + std::cout << "\rTesting weight gradients: "; + std::cout << "maximum relative error: " << print_error(maximum_error) << std::endl; + return maximum_error; +} + +/*! Generate a random, linear net, perform forward and backward propagation with + * L1 regularization and check the weight gradients using numerical + * differentiation. Returns the maximum relative gradient error and + * also prints it to stdout. */ +//______________________________________________________________________________ +template +auto testBackpropagationL1Regularization(typename Architecture::Scalar_t dx) +-> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + using Matrix_t = typename Architecture::Matrix_t; + using Net_t = TNet; + + Net_t net(50, 50, ELossFunction::MEANSQUAREDERROR, ERegularization::L1, 0.1); + + // Random net. + constructRandomLinearNet(net); + net.Initialize(EInitialization::GAUSS); + + // Random training data. + Matrix_t X(50, 50); + randomBatch(X); + + Matrix_t Y(50, net.GetOutputWidth()); + randomMatrix(Y); + + net.Forward(X); + net.Backward(X,Y); + + Scalar_t maximum_error = 0.0; + + // Compute derivatives for all weights using finite differences and + // compare to result obtained from backpropagation. + for (size_t l = 0; l < net.GetDepth(); l++) + { + std::cout << "\rTesting weight gradients (L1): layer: " << l << " / " << net.GetDepth(); + std::cout << std::flush; + auto & layer = net.GetLayer(l); + auto & W = layer.GetWeights(); + auto & dW = layer.GetWeightGradients(); + + for (size_t i = 0; i < layer.GetWidth(); i++) { + for (size_t j = 0; j < layer.GetInputWidth(); j++) { + // Avoid running into the non-derivable point at 0.0. + if (std::abs(W(i,j)) > dx) { + auto f = [& net, & X, &Y, l, i, j](Scalar_t x) + { + return evaluate_net_weight(net, X, Y, l, i, j, x); + }; + Scalar_t dy = finiteDifference(f, dx) / (2.0 * dx); + Scalar_t dy_ref = dW(i,j); + + // Compute the relative error if dy != 0. + Scalar_t error; + if (std::fabs(dy_ref) > 1e-15) + { + error = std::fabs((dy - dy_ref) / dy_ref); + } + else + { + error = std::fabs(dy - dy_ref); + } + + maximum_error = std::max(error, maximum_error); + } + } + } + } + + std::cout << "\rTesting weight gradients (L1): "; + std::cout << "maximum relative error: " << print_error(maximum_error) << std::endl; + return maximum_error; +} + +/*! Generate a random, linear net, perform forward and backward propagation with + * L2 regularization and check the weight gradients using numerical + * differentiation. Returns the maximum relative gradient error and + * also prints it to stdout. 
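+ * Since the L2 penalty is part of the loss, the numerical derivative also
+ * exercises the regularization contribution to the weight gradients.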
*/ +//______________________________________________________________________________ +template +auto testBackpropagationL2Regularization(typename Architecture::Scalar_t dx) +-> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + using Matrix_t = typename Architecture::Matrix_t; + using Net_t = TNet; + + Net_t net(50, 50, ELossFunction::MEANSQUAREDERROR, ERegularization::L2, 0.1); + + // Random net. + constructRandomLinearNet(net); + net.Initialize(EInitialization::GAUSS); + + // Random training data. + Matrix_t X(50, 50); + randomBatch(X); + + Matrix_t Y(50, net.GetOutputWidth()); + randomMatrix(Y); + + net.Forward(X); + net.Backward(X,Y); + + Scalar_t maximum_error = 0.0; + + // Compute derivatives for all weights using finite differences and + // compare to result obtained from backpropagation. + for (size_t l = 0; l < net.GetDepth(); l++) + { + std::cout << "\rTesting weight gradients (L2): layer: " << l << " / " << net.GetDepth(); + std::cout << std::flush; + auto & layer = net.GetLayer(l); + auto & W = layer.GetWeightGradients(); + + for (size_t i = 0; i < layer.GetWidth(); i++) + { + for (size_t j = 0; j < layer.GetInputWidth(); j++) + { + auto f = [& net, & X, &Y, l, i, j](Scalar_t x) + { + return evaluate_net_weight(net, X, Y, l, i, j, x); + }; + Scalar_t dy = finiteDifference(f, dx) / (2.0 * dx); + Scalar_t dy_ref = W(i,j); + + // Compute the relative error if dy != 0. + Scalar_t error; + if (std::fabs(dy_ref) > 1e-15) + { + error = std::fabs((dy - dy_ref) / dy_ref); + } + else + { + error = std::fabs(dy - dy_ref); + } + + maximum_error = std::max(error, maximum_error); + } + } + } + + std::cout << "\rTesting weight gradients (L2): "; + std::cout << "maximum relative error: " << print_error(maximum_error) << std::endl; + return maximum_error; +} + +/*! Generate a random net, perform forward and backward propagation and check + * the bias gradients using numerical differentiation. Returns the maximum + * relative gradient error and also prints it to stdout. */ +//______________________________________________________________________________ +template +auto testBackpropagationBiasesLinear(typename Architecture::Scalar_t dx) +-> typename Architecture::Scalar_t +{ + using Net_t = TNet; + using Scalar_t = typename Architecture::Scalar_t; + using Matrix_t = typename Architecture::Matrix_t; + + + Net_t net(50, 50, ELossFunction::MEANSQUAREDERROR); + + // Random net. + constructRandomLinearNet(net); + net.Initialize(EInitialization::GAUSS); + + // Random training data. + Matrix_t X(50, 50); + randomBatch(X); + + Matrix_t Y(50, net.GetOutputWidth()); + randomMatrix(Y); + + net.Forward(X); + net.Backward(X,Y); + + Scalar_t maximum_error = 0.0; + + // Compute derivatives for all bias terms using finite differences and + // compare to result obtained from backpropagation. + for (size_t l = 0; l < net.GetDepth(); l++) + { + std::cout << "\rTesting bias gradients: layer: " << l << " / " << net.GetDepth(); + std::cout << std::flush; + auto & layer = net.GetLayer(l); + auto & dtheta = layer.GetBiasGradients(); + + for (size_t i = 0; i < layer.GetWidth(); i++) + { + auto f = [& net, & X, &Y, l, i](Scalar_t x) + { + return evaluate_net_bias(net, X, Y, l, i, x); + }; + Scalar_t dy = finiteDifference(f, dx); + Scalar_t dy_ref = dtheta(i,0) * 2.0 * dx; + + // Compute the relative error if dy != 0. 
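+      // Note: dy is deliberately not divided by 2 * dx here; the factor is
+      // folded into dy_ref instead, which saves a division per bias term.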
+ Scalar_t error; + if (std::fabs(dy_ref) > 1e-10) + { + error = std::fabs((dy - dy_ref) / dy_ref); + } + else + { + error = std::fabs(dy - dy_ref); + } + + maximum_error = std::max(error, maximum_error); + } + } + + std::cout << "\rTesting bias gradients: "; + std::cout << "maximum relative error: " << print_error(maximum_error) << std::endl; + return maximum_error; +} + +/*! Generate a random net, perform forward and backward propagation and check + * the weight gradients using numerical differentiation. Returns the maximum + * relative gradient error and also prints it to stdout. */ +//______________________________________________________________________________ +template +auto testNumerics(TNet & net, + typename Architecture::Matrix_t X, + typename Architecture::Matrix_t Y, + typename Architecture::Scalar_t dx) + -> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + + net.Forward(X); + net.Backward(X,Y); + + Scalar_t maximum_error = 0.0; + + // Compute derivatives for all weight using finite differences and + // compare to result obtained from backpropagation. + for (size_t l = 0; l < net.GetDepth(); l++) + { + std::cout << "\rTesting weight gradients: layer: " << l << " / " << net.GetDepth(); + std::cout << std::flush; + auto & layer = net.GetLayer(l); + auto & W = layer.GetWeightGradients(); + + for (size_t i = 0; i < layer.GetWidth(); i++) + { + for (size_t j = 0; j < layer.GetInputWidth(); j++) + { + auto f = [& net, & X, &Y, l, i, j](Scalar_t x) + { + return evaluate_net_weight(net, X, Y, l, i, j, x); + }; + Scalar_t dy = finiteDifference(f, dx) / (2.0 * dx); + Scalar_t dy_ref = W(i,j); + + // Compute the relative error if dy != 0. + Scalar_t error; + if (std::fabs(dy_ref) > 1e-15) + { + error = std::fabs((dy - dy_ref) / dy_ref); + } + else + { + error = std::fabs(dy - dy_ref); + } + + maximum_error = std::max(error, maximum_error); + } + } + } + + std::cout << "\rTesting weight gradients: "; + std::cout << "maximum relative error: " << print_error(maximum_error) << std::endl; + return maximum_error; +} diff --git a/tmva/tmva/test/DNN/TestBackpropagationCuda.cxx b/tmva/tmva/test/DNN/TestBackpropagationCuda.cxx new file mode 100644 index 0000000000000..d2a875565d918 --- /dev/null +++ b/tmva/tmva/test/DNN/TestBackpropagationCuda.cxx @@ -0,0 +1,47 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +//////////////////////////////////////////////////////////////////// +// Concrete instantiation of the generic backpropagation test for // +// CUDA architectures. 
// +//////////////////////////////////////////////////////////////////// + +#include +#include "TMVA/DNN/Architectures/Cuda.h" +#include "TMatrix.h" +#include "TestBackpropagation.h" + +using namespace TMVA::DNN; + +int main() +{ + std::cout << "Testing Backpropagation:" << std::endl; + + double error; + + error = testBackpropagationWeightsLinear(1.0); + if (error > 1e-7) + return 1; + + error = testBackpropagationL1Regularization(1e-2); + if (error > 1e-7) + return 1; + + error = testBackpropagationL2Regularization(1.0); + if (error > 1e-7) + return 1; + + error = testBackpropagationBiasesLinear(1.0); + if (error > 1e-7) + return 1; + + return 0; +} diff --git a/tmva/tmva/test/DNN/TestCuda.cxx b/tmva/tmva/test/DNN/TestCuda.cxx new file mode 100644 index 0000000000000..140b2d324bdcd --- /dev/null +++ b/tmva/tmva/test/DNN/TestCuda.cxx @@ -0,0 +1,196 @@ +#include "Utility.h" +#include "TMVA/DNN/Architectures/Cuda.h" +#include "TMVA/DNN/Architectures/Reference.h" +#include + +using namespace TMVA::DNN; + +//_________________________________________________________________________________ +Double_t testMultiply() +{ + const size_t ntests = 100; + + Double_t maximumError = 0; + + for (size_t i = 0; i < ntests; i++) { + size_t m, n, k; + m = rand() % 50 + 1; + n = rand() % 50 + 1; + k = rand() % 50 + 1; + + TMatrixT A(m,k), AT(k,m) , B(k,n), BT(n,k), C(m,n); + randomMatrix(A); + randomMatrix(AT); + randomMatrix(B); + randomMatrix(BT); + TCudaMatrix ACuda(A), ATCuda(AT), BCuda(B), BTCuda(BT), CCuda(C); + + TReference::MultiplyTranspose(C, A, BT); + TCuda::MultiplyTranspose(CCuda, ACuda, BTCuda); + TMatrixT CRef(CCuda); + Double_t error = maximumRelativeError(C, CRef); + maximumError = std::max(error, maximumError); + + C.Mult(A,B); + TCuda::Multiply(CCuda, ACuda, BCuda); + CRef = CCuda; + error = maximumRelativeError(C, CRef); + maximumError = std::max(error, maximumError); + + C.TMult(AT,B); + TCuda::TransposeMultiply(CCuda, ATCuda, BCuda); + CRef = CCuda; + error = maximumRelativeError(C, CRef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//_________________________________________________________________________________ +Double_t testAddRowWise() +{ + const size_t ntests = 10; + + Double_t maximumError = 0; + + for (size_t i = 0; i < ntests; i++) { + size_t m, n; + m = rand() % 50 + 1; + n = rand() % 50 + 1; + + TMatrixT A(m,n), B(m,n), theta(n,1); + //randomMatrix(A); + randomMatrix(theta); + TCudaMatrix ACuda(A), BCuda(B), thetaCuda(theta); + + TReference::AddRowWise(A, theta); + TCuda::AddRowWise(ACuda,thetaCuda); + TMatrixT ARef(ACuda); + + Double_t error = maximumRelativeError(A, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//_________________________________________________________________________________ +Double_t testHadamard() +{ + const size_t ntests = 10; + Double_t maximumError = 0; + + for (size_t i = 0; i < ntests; i++) { + size_t m, n; + m = rand() % 10 + 1; + n = rand() % 10 + 1; + + TMatrixT A(m,n), B(m,n); + randomMatrix(A); + randomMatrix(B); + TCudaMatrix ACuda(A), BCuda(B); + + for (size_t j = 0; j < (size_t) A.GetNrows(); j++) { + for (size_t k = 0; k < (size_t) A.GetNcols(); k++) { + A(j,k) *= B(j,k); + } + } + + TCuda::Hadamard(ACuda, BCuda); + TMatrixT ARef(ACuda); + Double_t error = maximumRelativeError(A, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//_________________________________________________________________________________ +Double_t 
testReduction()
+{
+    const size_t ntests = 10;
+    Double_t maximumError = 0;
+
+    for (size_t i = 0; i < ntests; i++) {
+        size_t m, n;
+        m = rand() % 1000 + 1;
+        n = rand() % 1000 + 1;
+
+        TMatrixT<Double_t> A(m,n);
+
+        for (size_t j = 0; j < m; j++) {
+            for (size_t k = 0; k < n; k++) {
+                A(j,k) = 1.0;
+            }
+        }
+        TCudaMatrix ACuda(A);
+
+        TCudaMatrix BCuda(1,n);
+        TCuda::InitializeZero(BCuda);
+        Double_t s = TCuda::Sum(ACuda);
+        TCuda::SumColumns(BCuda, ACuda);
+        TMatrixT<Double_t> B(BCuda);
+
+        Double_t error = std::abs(s - ((Double_t) m * n));
+        maximumError = std::max(error, maximumError);
+
+        for (size_t j = 0; j < n; j++) {
+            //std::cout << B(0,j) << " / " << j * m << std::endl;
+            error = std::abs(B(0,j) - m);
+            maximumError = std::max(error, maximumError);
+        }
+    }
+    return maximumError;
+}
+
+//_________________________________________________________________________________
+Double_t testScaleAdd()
+{
+    const size_t ntests = 10;
+    Double_t maximumError = 0;
+
+    for (size_t i = 0; i < ntests; i++) {
+        size_t m, n;
+        m = rand() % 1000 + 1;
+        n = rand() % 1000 + 1;
+
+        TMatrixT<Double_t> A(m,n), B(m,n);
+
+        randomMatrix(A);
+        randomMatrix(B);
+
+        TCudaMatrix ACuda(A);
+        TCudaMatrix BCuda(B);
+
+        Double_t beta = ((Double_t) rand()) / ((Double_t) RAND_MAX);
+        TReference<Double_t>::ScaleAdd(A, B, beta);
+        TCuda::ScaleAdd(ACuda, BCuda, beta);
+
+        Double_t error = maximumRelativeError(A, (TMatrixT<Double_t>) ACuda);
+        maximumError = std::max(error, maximumError);
+    }
+    return maximumError;
+}
+
+//_________________________________________________________________________________
+int main()
+{
+    Double_t error;
+    error = testReduction();
+    std::cout << "Testing reduction: max. rel. error = ";
+    std::cout << error << std::endl;
+
+    error = testScaleAdd();
+    std::cout << "Testing scale_add: max. rel. error = ";
+    std::cout << error << std::endl;
+
+    error = testHadamard();
+    std::cout << "Testing hadamard: max. rel. error = ";
+    std::cout << error << std::endl;
+
+    error = testMultiply();
+    std::cout << "Testing multiplication: max. rel. error = ";
+    std::cout << error << std::endl;
+
+    error = testAddRowWise();
+    std::cout << "Testing add_row_wise: max. rel. error = ";
+    std::cout << error << std::endl;
+}
diff --git a/tmva/tmva/test/DNN/TestDataLoader.cxx b/tmva/tmva/test/DNN/TestDataLoader.cxx
new file mode 100644
index 0000000000000..b68b383303cfa
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestDataLoader.cxx
@@ -0,0 +1,26 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 12/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+////////////////////////////////////////////////////
+// Test the reference data loader implementation. //
+////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Reference.h"
+#include "TestDataLoader.h"
+
+using namespace TMVA::DNN;
+
+int main ()
+{
+    Double_t error = testDataLoader<TReference<Double_t>>();
+    std::cout << "Testing reference data loader: Max. rel. 
error = " << error; + std::cout << std::endl; +} diff --git a/tmva/tmva/test/DNN/TestDataLoader.h b/tmva/tmva/test/DNN/TestDataLoader.h new file mode 100644 index 0000000000000..e3664a8adb787 --- /dev/null +++ b/tmva/tmva/test/DNN/TestDataLoader.h @@ -0,0 +1,56 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 12/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +////////////////////////////////////////////////// +// Generic test for DataLoader implementations. // +////////////////////////////////////////////////// + +#include "TMVA/DNN/Net.h" +#include "Utility.h" + +namespace TMVA +{ +namespace DNN +{ + +template +auto testDataLoader() + -> typename Architecture_t::Scalar_t +{ + + using Scalar_t = typename Architecture_t::Scalar_t; + using Matrix_t = typename Architecture_t::Matrix_t; + using Net_t = TNet; + + using DataLoader_t = typename Architecture_t::template DataLoader_t; + + Matrix_t X(2000, 100); randomMatrix(X); + MatrixInput_t input(X, X); + DataLoader_t loader(input, 2000, 20, 100, 100); + + Net_t net(20, 100, ELossFunction::MEANSQUAREDERROR); + net.AddLayer(100, EActivationFunction::IDENTITY); + net.AddLayer(100, EActivationFunction::IDENTITY); + net.Initialize(EInitialization::IDENTITY); + + Scalar_t maximumError = 0.0; + for (auto b : loader) { + Matrix_t inputMatrix = b.GetInput(); + Matrix_t outputMatrix = b.GetOutput(); + Scalar_t error = net.Loss(inputMatrix, outputMatrix); + maximumError = std::max(error, maximumError); + } + + return maximumError; +} + +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/test/DNN/TestDataLoaderCuda.cxx b/tmva/tmva/test/DNN/TestDataLoaderCuda.cxx new file mode 100644 index 0000000000000..6c5c78027c881 --- /dev/null +++ b/tmva/tmva/test/DNN/TestDataLoaderCuda.cxx @@ -0,0 +1,79 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 08/08/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////////// +// Test the generic data loader for the CUDA implementation. 
// +/////////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Net.h" +#include "TMVA/DNN/DataLoader.h" +#include "TMVA/DNN/Architectures/Cuda.h" +#include "Utility.h" +#include "TMatrix.h" + +using namespace TMVA::DNN; + +int main() +{ + using Matrix_t = typename TCuda::Matrix_t; + using DataLoader_t = TDataLoader; + + TMatrixT A(20, 5), B(20, 5); + randomMatrix(A); + randomMatrix(B); + + for(size_t i = 0; i < 20; i++) { + for (size_t j = 0; j < 5; j++) { + A(i,j) = i; + B(i,j) = j; + } + } + + MatrixInput_t data(A, B); + DataLoader_t loader(data, 20, 5, 5, 5, 4); + + TNet net(5, 5, ELossFunction::MEANSQUAREDERROR, ERegularization::NONE); + net.AddLayer(5, EActivationFunction::IDENTITY); + net.Initialize(EInitialization::IDENTITY); + + std::vector> nets{}; + + std::vector ms{}; + nets.reserve(4); + ms.reserve(4); + for (size_t i = 0; i < 4; i++) { + nets.push_back(net); + ms.push_back(TCudaMatrix(5,5)); + for (size_t j = 0; j < net.GetDepth(); j++) { + TCuda::Copy(nets[i].GetLayer(j).GetWeights(), net.GetLayer(j).GetWeights()); + TCuda::Copy(nets[i].GetLayer(j).GetBiases(), net.GetLayer(j).GetBiases()); + } + } + + std::cout << "net size: " << nets.size() << std::endl; + for (size_t i = 0; i < 4; i++) { + } + std::cout << "ms size: " << ms.size() << std::endl; + + for (size_t i = 0; i < 4; i++) { + auto b = loader.GetBatch(); + nets[i].Forward(b.GetInput()); + TCuda::Copy(ms[i], nets[i].GetOutput()); + } + for (size_t i = 0; i < 4; i++) { + ((TMatrixT) ms[i]).Print(); + } +} + + + + + diff --git a/tmva/tmva/test/DNN/TestDataStreamCuda.cxx b/tmva/tmva/test/DNN/TestDataStreamCuda.cxx new file mode 100644 index 0000000000000..d67612595c310 --- /dev/null +++ b/tmva/tmva/test/DNN/TestDataStreamCuda.cxx @@ -0,0 +1,50 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////////////// +// Test TCudaDataLoader class for streaming data to CUDA devices. // +/////////////////////////////////////////////////////////////////// + +#include "TMatrix.h" +#include "TMVA/DNN/Architectures/Cuda.h" +#include "Utility.h" + +using namespace TMVA::DNN; + +int main() +{ + TMatrixT X(20,5), Y(20,10); + randomMatrix(X); + randomMatrix(Y); + MatrixInput_t inputData(X, Y); + + TCudaDataLoader dataLoader(inputData, 20, 5, 10, 2, 2, 1); + + CudaDouble_t maximumError = 0.0; + CudaDouble_t error = 0.0; + + for (auto b : dataLoader) { + TMatrixT X2(b.GetInput()); + TMatrixT Y2(b.GetOutput()); + + std::cout << "::: BATCH :::" << std::endl; + + X.Print(); + X2.Print(); + Y.Print(); + Y2.Print(); + maximumError = std::max(error, maximumError); + maximumError = std::max(error, maximumError); + } + + std::cout << "Test data stream: Max. rel. 
error = " << maximumError << std::endl;
+
+}
diff --git a/tmva/tmva/test/DNN/TestDerivatives.cxx b/tmva/tmva/test/DNN/TestDerivatives.cxx
new file mode 100644
index 0000000000000..3a3e742469a69
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestDerivatives.cxx
@@ -0,0 +1,65 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+///////////////////////////////////////////////////////////////////
+// Concrete instantiation of the generic derivative test for the //
+// reference implementation.                                     //
+///////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Reference.h"
+#include "TestDerivatives.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+
+    double error;
+
+    //
+    // Activation Functions
+    //
+
+    std::cout << "Activation Functions:" << std::endl;
+    error = testActivationFunctionDerivatives<TReference<double>>();
+    std::cout << "Total : ";
+    std::cout << "Maximum Relative Error = " << print_error(error);
+    std::cout << std::endl << std::endl;
+    if (error > 1e-5)
+        return 1;
+
+    //
+    // Loss Functions
+    //
+
+    std::cout << "Loss Functions:" << std::endl;
+    error = testLossFunctionGradients<TReference<double>>();
+    std::cout << "Total : ";
+    std::cout << "Maximum Relative Error = " << print_error(error);
+    std::cout << std::endl << std::endl;
+    if (error > 1e-5)
+        return 1;
+
+    //
+    // Regularization Functions
+    //
+
+    std::cout << "Regularization:" << std::endl;
+    error = testRegularizationGradients<TReference<double>>();
+    std::cout << "Total : ";
+    std::cout << "Maximum Relative Error = " << print_error(error);
+    std::cout << std::endl << std::endl;
+    if (error > 1e-5)
+        return 1;
+
+    return 0;
+}
diff --git a/tmva/tmva/test/DNN/TestDerivatives.h b/tmva/tmva/test/DNN/TestDerivatives.h
new file mode 100644
index 0000000000000..07688e668642d
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestDerivatives.h
@@ -0,0 +1,260 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////////
+// Generic tests for the derivatives and gradients of activation,   //
+// loss and regularization functions. Each function generates a     //
+// random 10 x 10 matrix and uses a central finite difference to    //
+// numerically compute the derivative of the function w.r.t. the    //
+// (0,0) element. The result is compared to the result obtained by  //
+// the corresponding analytic derivative implemented by the         //
+// evaluateDerivative(...), evaluateGradients(...),                  //
+// addRegularizationGradients(...) functions.
// +////////////////////////////////////////////////////////////////////// + +#include +#include "TMVA/DNN/Functions.h" +#include "TMVA/DNN/Net.h" +#include "Utility.h" + +using namespace TMVA::DNN; + +//______________________________________________________________________________ +// +// Activation Functions +//______________________________________________________________________________ + +/*! Generic function that numerically computes the derivative of a matrix + * function f and the analytical solution given by df the function signatures + * are assumed to be + * - void f(Matrix_t &X) + * - void df(Matrix_t &Y, const Matrix_t &X) -> derivative of f at X(i,j) is + * The function f is supposed to apply the corresponding mathematical function + * to each element in the provided matrix X. The function df is expected to + * set each element in Y to the derivative of the corresponding mathematical + * function evaluated at the corresponding element in X. + */ +template + auto testDerivatives(F f, dF df, + typename Architecture::Scalar_t dx) + -> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + using Matrix_t = typename Architecture::Matrix_t; + + Scalar_t maximum_error = 0.0; + + for (size_t i = 0; i < 100; i++) + { + Matrix_t X(10,10), Y(10,10); + randomMatrix(Y); + + df(X, Y); + Scalar_t dy = X(0,0); + + copyMatrix(X, Y); + X(0,0) += dx; + f(X); + Scalar_t y1 = X(0,0); + copyMatrix(X, Y); + X(0,0) -= dx; + f(X); + Scalar_t y0 = X(0,0); + Scalar_t dy_num = (y1 - y0) / (2.0 * dx); + + Scalar_t error = 0.0; + if (std::fabs(dy) > 0) + { + error = std::fabs((dy_num - dy) / dy); + } + else + error = dy_num - dy; + + maximum_error = std::max(maximum_error, error); + } + + return maximum_error; +} + +/*! Test derivatives of all activation functions and return the maximum relative + * error. Prints the result for each function to the stdout. */ +//______________________________________________________________________________ +template +auto testActivationFunctionDerivatives() + -> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + using Matrix_t = typename Architecture::Matrix_t; + + std::vector EActivationFunctions + = {EActivationFunction::IDENTITY, + EActivationFunction::SIGMOID, + EActivationFunction::TANH, + EActivationFunction::SOFTSIGN, + EActivationFunction::GAUSS}; + + Scalar_t error, maximum_error; + maximum_error = 0.0; + + for (auto & af : EActivationFunctions) + { + auto f = [& af](Matrix_t &X){ evaluate(X, af);}; + auto df = [& af](Matrix_t &X, const Matrix_t &Y) + { + evaluateDerivative(X, af, Y); + }; + error = testDerivatives(f, df, 5e-6); + + std::cout << "Testing " << static_cast(af) << ": "; + std::cout << "Maximum Relative Error = " << print_error(error) << std::endl; + + maximum_error = std::max(maximum_error, error); + } + + return maximum_error; +} + +//______________________________________________________________________________ +// +// Loss functions. +//______________________________________________________________________________ + +/*! Similar to testDerivatives only that here the mathematical function is + * expected to be a matrix functional, i.e. to be mapping a matrix to a + * scalar value. The scalar value is supposed to be computed by the provided + * function object f, while the function object is just like above. 
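+ * Only the (0,0) element of X is perturbed, so the numerical gradient is
+ * compared against the (0,0) element of the analytic gradient matrix Z.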
*/ +template + auto testGradients(F f, dF df, + typename Architecture::Scalar_t dx) + -> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + using Matrix_t = typename Architecture::Matrix_t; + + Scalar_t maximum_error = 0.0; + + for (size_t i = 0; i < 100; i++) + { + Matrix_t X(10,10), Y(10,10), Z(10,10); + randomMatrix(X); + randomMatrix(Y); + + df(Z, Y, X); + Scalar_t dy = Z(0,0); + + X(0,0) += dx; + Scalar_t y1 = f(Y,X); + X(0,0) -= 2.0 * dx; + Scalar_t y0 = f(Y,X); + Scalar_t dy_num = (y1 - y0) / (2.0 * dx); + + Scalar_t error = 0.0; + if (std::fabs(dy) > 0) + { + error = std::fabs((dy_num - dy) / dy); + } + else + error = dy_num - dy; + + maximum_error = std::max(maximum_error, error); + } + + return maximum_error; +} + +/*! Test gradients of all loss function for the given architecture type and + * return the maximum relative error. Prints results for each function to + * standard out. */ +//______________________________________________________________________________ +template +auto testLossFunctionGradients() + -> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + using Matrix_t = typename Architecture::Matrix_t; + + std::vector LossFunctions + = {ELossFunction::MEANSQUAREDERROR, + ELossFunction::CROSSENTROPY}; + + Scalar_t error, maximum_error; + maximum_error = 0.0; + + for (auto & lf : LossFunctions) + { + auto f = [lf](const Matrix_t &Y, const Matrix_t &Z) + { + return evaluate(lf, Y, Z); + }; + auto df = [& lf](Matrix_t &X, + const Matrix_t &Y, + const Matrix_t &Z) + { + evaluateGradients(X, lf, Y, Z); + }; + + error = testGradients(f, df, 5e-6); + + std::cout << "Testing " << static_cast(lf) << ": "; + std::cout << "Maximum Relative Error = " << print_error(error) << std::endl; + + maximum_error = std::max(maximum_error, error); + } + + return maximum_error; +} + +//______________________________________________________________________________ +// +// Regularization. +//______________________________________________________________________________ + +/*! 
Test the computation of gradients for all differentiable regularization types, + * which is so far only L2 and no regularization and print the results to standard + * out */ +template +auto testRegularizationGradients() + -> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + using Matrix_t = typename Architecture::Matrix_t; + + std::vector Regularizations + = {ERegularization::NONE, + ERegularization::L2}; + + Scalar_t error, maximum_error; + maximum_error = 0.0; + + for (auto & r : Regularizations) + { + auto f = [r](const Matrix_t & , const Matrix_t & Y) + { + return regularization(Y, r); + }; + auto df = [& r](Matrix_t &X, + const Matrix_t & , + const Matrix_t & Y) + { + applyMatrix(X, [](double){return 0.0;}); + addRegularizationGradients(X, Y, (Scalar_t) 1.0, r); + }; + + error = testGradients(f, df, 1.0); + + std::cout << "Testing " << static_cast(r) << ": "; + std::cout << "Maximum Relative Error = " << print_error(error) << std::endl; + + maximum_error = std::max(maximum_error, error); + } + + return maximum_error; +} diff --git a/tmva/tmva/test/DNN/TestDerivativesCuda.cxx b/tmva/tmva/test/DNN/TestDerivativesCuda.cxx new file mode 100644 index 0000000000000..fec8986e689da --- /dev/null +++ b/tmva/tmva/test/DNN/TestDerivativesCuda.cxx @@ -0,0 +1,64 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////////////// +// Concrete instantiation of the generic derivative test for the // +// reference implementation. // +/////////////////////////////////////////////////////////////////// + +#include +#include "TMVA/DNN/Architectures/Cuda.h" +#include "TestDerivatives.h" + +using namespace TMVA::DNN; + +int main() +{ + double error; + + // + // Activation Functions + // + + std::cout << "Activation Functions:" << std::endl; + error = testActivationFunctionDerivatives(); + std::cout << "Total : "; + std::cout << "Maximum Relative Error = " << print_error(error); + std::cout << std::endl << std::endl; + if (error > 1e-5) + return 1; + + // + // Loss Functions + // + + std::cout << "Loss Functions:" << std::endl; + error = testLossFunctionGradients(); + std::cout << "Total : "; + std::cout << "Maximum Relative Error = " << print_error(error); + std::cout << std::endl << std::endl; + if (error > 1e-5) + return 1; + + // + // Regularization Functions + // + + std::cout << "Regularization:" << std::endl; + error = testRegularizationGradients(); + std::cout << "Total : "; + std::cout << "Maximum Relative Error = " << print_error(error); + std::cout << std::endl << std::endl; + if (error > 1e-5) + return 1; + + return 0; +} diff --git a/tmva/tmva/test/DNN/TestLossFunctions.cxx b/tmva/tmva/test/DNN/TestLossFunctions.cxx new file mode 100644 index 0000000000000..831ecf22c9c57 --- /dev/null +++ b/tmva/tmva/test/DNN/TestLossFunctions.cxx @@ -0,0 +1,60 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. 
*
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+///////////////////////////////////////////////////////////////////
+// Test for the loss function reference implementation using the //
+// generic test defined in TestLossFunctions.h.                  //
+///////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Reference.h"
+#include "TestLossFunctions.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+    std::cout << "Testing Loss Functions:" << std::endl << std::endl;
+
+    double error;
+
+    //
+    // Mean Squared Error.
+    //
+
+    error = testMeanSquaredError<TReference<double>>(10);
+    std::cout << "Testing mean squared error loss: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    error = testMeanSquaredErrorGradients<TReference<double>>(10);
+    std::cout << "Testing mean squared error gradient: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    //
+    // Cross Entropy.
+    //
+
+    error = testCrossEntropy<TReference<double>>(10);
+    std::cout << "Testing cross entropy loss: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    error = testCrossEntropyGradients<TReference<double>>(10);
+    std::cout << "Testing cross entropy gradient: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+}
diff --git a/tmva/tmva/test/DNN/TestLossFunctions.h b/tmva/tmva/test/DNN/TestLossFunctions.h
new file mode 100644
index 0000000000000..179c92d2638de
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestLossFunctions.h
@@ -0,0 +1,195 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////////
+// Generic tests of the loss functions                              //
+//                                                                  //
+// Contains generic tests for architecture-specific implementations //
+// of the loss functions. Requires the architecture-specific matrix //
+// type to be constructible and convertible from/to the             //
+// TMatrixT<Double_t> type.
// +////////////////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Reference.h" +#include "TMVA/DNN/Functions.h" +#include "TMVA/DNN/Net.h" +#include "Utility.h" + +using namespace TMVA::DNN; + +//______________________________________________________________________________ +// +// Mean Squared Error +//______________________________________________________________________________ + +template +auto testMeanSquaredError(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + using Scalar_t = typename Architecture::Scalar_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT X(m, n); + TMatrixT Y(m, n); + TMatrixT Z(m, n); + + randomMatrix(X); + randomMatrix(Y); + + Matrix_t XArch(X); + Matrix_t YArch(Y); + + Scalar_t mse = evaluate(ELossFunction::MEANSQUAREDERROR, + YArch, XArch); + zipWithMatrix(Z, [](Scalar_t x, Scalar_t y){return x - y;}, X, Y); + auto squaredSum = [](Scalar_t x, Scalar_t y){return x + y * y;}; + Scalar_t mseReference = reduceMean(squaredSum, 0.0, Z); + + Double_t error; + if (mseReference != 0.0) + error = std::fabs((mse - mseReference) / mseReference); + else + error = std::fabs(mse - mseReference); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//______________________________________________________________________________ +template +auto testMeanSquaredErrorGradients(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + using Scalar_t = typename Architecture::Scalar_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { +/* size_t m = rand() % 100 + 1; */ +/* size_t n = rand() % 100 + 1; */ + size_t m = 8; //rand() % 100 + 1; + size_t n = 8; //rand() % 100 + 1; + + TMatrixT X(m, n); + TMatrixT Y(m, n); + TMatrixT ZRef(m, n); + + randomMatrix(X); + randomMatrix(Y); + + Matrix_t XArch(X); + Matrix_t YArch(Y); + Matrix_t ZArch(Y); + + evaluateGradients(ZArch, ELossFunction::MEANSQUAREDERROR, + XArch, YArch); + auto normedDifference = [m, n](Scalar_t x, Scalar_t y) { + return 2.0 * (y - x) / (m * n); + }; + zipWithMatrix(ZRef, normedDifference, X, Y); + TMatrixT Z(ZArch); + Double_t error = maximumRelativeError(Z, ZRef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//______________________________________________________________________________ +// +// Cross Entropy +//______________________________________________________________________________ + +template +auto testCrossEntropy(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + using Scalar_t = typename Architecture::Scalar_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT X(m, n); + TMatrixT Y(m, n); + TMatrixT Z(m, n); + + randomMatrix(X); + randomMatrix(Y); + + Matrix_t XArch(X); + Matrix_t YArch(Y); + + Scalar_t ce = evaluate(ELossFunction::CROSSENTROPY, + YArch, XArch); + + auto crossCorrelation = [](Scalar_t x, Scalar_t y) { + Scalar_t sig = 1.0 / (1.0 + std::exp(-x)); + return y * std::log(sig) + (1 - y) * std::log(1 - sig); + }; + zipWithMatrix(Z, crossCorrelation, X, Y); + auto sum = [](Scalar_t x, Scalar_t y) {return x + y;}; + Scalar_t ceReference = - reduceMean(sum, 0.0, Z); + + Double_t error; + if (ceReference != 
0.0) + error = std::fabs((ce - ceReference) / ceReference); + else + error = std::fabs(ce - ceReference); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//______________________________________________________________________________ +template +auto testCrossEntropyGradients(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + using Scalar_t = typename Architecture::Scalar_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = 8; //rand() % 100 + 1; + size_t n = 8; //rand() % 100 + 1; + + TMatrixT X(m, n); + TMatrixT Y(m, n); + TMatrixT ZRef(m, n); + + randomMatrix(X); + randomMatrix(Y); + + Matrix_t XArch(X); + Matrix_t YArch(Y); + Matrix_t ZArch(Y); + + evaluateGradients(ZArch, ELossFunction::CROSSENTROPY, + YArch, XArch); + auto crossCorrelationGradient = [m, n](Scalar_t x, Scalar_t y) { + Scalar_t sig = 1.0 / (1.0 + std::exp(-x)); + Scalar_t norm = 1.0 / ((Scalar_t) m * n); + return (sig - y) * norm;}; + zipWithMatrix(ZRef, crossCorrelationGradient, X, Y); + + TMatrixT Z(ZArch); + Double_t error = maximumRelativeError(Z, ZRef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} diff --git a/tmva/tmva/test/DNN/TestLossFunctionsCuda.cxx b/tmva/tmva/test/DNN/TestLossFunctionsCuda.cxx new file mode 100644 index 0000000000000..e4802770966b5 --- /dev/null +++ b/tmva/tmva/test/DNN/TestLossFunctionsCuda.cxx @@ -0,0 +1,60 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////////////// +// Test for the loss function reference implementation using the // +// generic test defined in TestLossFunctions.h. // +/////////////////////////////////////////////////////////////////// + +#include +#include "TMVA/DNN/Architectures/Cuda.h" +#include "TestLossFunctions.h" + +using namespace TMVA::DNN; + +int main() +{ + std::cout << "Testing Loss Functions:" << std::endl << std::endl; + + double error; + + // + // Mean Squared Error. + // + + error = testMeanSquaredError(10); + std::cout << "Testing mean squared error loss: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-10) + return 1; + + error = testMeanSquaredErrorGradients(10); + std::cout << "Testing mean squared error gradient: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-10) + return 1; + + // + // Cross Entropy. 
+   //
+
+   error = testCrossEntropy<TCuda>(10);
+   std::cout << "Testing cross entropy loss:          ";
+   std::cout << "maximum relative error = " << print_error(error) << std::endl;
+   if (error > 1e-10)
+      return 1;
+
+   error = testCrossEntropyGradients<TCuda>(10);
+   std::cout << "Testing cross entropy gradient:      ";
+   std::cout << "maximum relative error = " << print_error(error) << std::endl;
+   if (error > 1e-10)
+      return 1;
+
+   return 0;
+}
diff --git a/tmva/tmva/test/DNN/TestMatrixArithmetic.h b/tmva/tmva/test/DNN/TestMatrixArithmetic.h
new file mode 100644
index 0000000000000..7bd351682b4f9
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestMatrixArithmetic.h
@@ -0,0 +1,119 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 20/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+///////////////////////////////////////////////////////////////////
+// Test arithmetic functions defined on matrices and compare the //
+// results to the reference implementation.                      //
+///////////////////////////////////////////////////////////////////
+
+#include "TMatrix.h"
+#include "Utility.h"
+#include "TMVA/DNN/Architectures/Reference.h"
+
+/** Test multiplication (standard, transposed, Hadamard) operations on
+ * architecture-specific matrix types and compare the results with those
+ * obtained with TMatrixT.
+ */
+//______________________________________________________________________________
+template <typename Architecture_t>
+auto testMultiplication(size_t ntests)
+    -> typename Architecture_t::Scalar_t
+{
+   using Scalar_t = typename Architecture_t::Scalar_t;
+   using Matrix_t = typename Architecture_t::Matrix_t;
+
+   Scalar_t maximumError = 0.0;
+   for (size_t t = 0; t < ntests; t++) {
+
+      size_t m, n, k;
+      m = rand() % 100 + 1;
+      n = rand() % 100 + 1;
+      k = rand() % 100 + 1;
+
+      TMatrixT<Double_t> ARef(m,k), A2Ref(m,k), ATRef(k,m), BRef(k,n),
+          BTRef(n,k), CRef(m,n);
+      TMVA::DNN::randomMatrix(ARef);
+      TMVA::DNN::randomMatrix(A2Ref);
+      TMVA::DNN::randomMatrix(ATRef);
+      TMVA::DNN::randomMatrix(BRef);
+      TMVA::DNN::randomMatrix(BTRef);
+      Matrix_t A(ARef), A2(A2Ref), AT(ATRef), B(BRef), BT(BTRef), C(CRef);
+
+      // A * B
+      CRef.Mult(ARef, BRef);
+      Architecture_t::Multiply(C, A, B);
+      Scalar_t error = TMVA::DNN::maximumRelativeError((TMatrixT<Double_t>) C, CRef);
+      maximumError = std::max(error, maximumError);
+
+      // A^T * B
+      CRef.TMult(ATRef, BRef);
+      Architecture_t::TransposeMultiply(C, AT, B);
+      error = TMVA::DNN::maximumRelativeError((TMatrixT<Double_t>) C, CRef);
+      maximumError = std::max(error, maximumError);
+
+      // A * B^T
+      CRef.MultT(ARef, BTRef);
+      Architecture_t::MultiplyTranspose(C, A, BT);
+      error = TMVA::DNN::maximumRelativeError((TMatrixT<Double_t>) C, CRef);
+      maximumError = std::max(error, maximumError);
+
+      // A .* B (element-wise Hadamard product)
+      for (size_t i = 0; i < (size_t) ARef.GetNrows(); i++) {
+         for (size_t j = 0; j < (size_t) ARef.GetNcols(); j++) {
+            ARef(i,j) *= A2Ref(i,j);
+         }
+      }
+      Architecture_t::Hadamard(A, A2);
+      error = TMVA::DNN::maximumRelativeError((TMatrixT<Double_t>) A, ARef);
+      maximumError = std::max(error, maximumError);
+   }
+
+   return maximumError;
+}
+
+/** Test the summing over columns by comparing the computed column sums
+ * with those expected for a matrix whose elements are the column indices.
+ */
+//______________________________________________________________________________
+template <typename Architecture_t>
+auto testSumColumns(size_t ntests)
+    -> typename Architecture_t::Scalar_t
+{
+   using Scalar_t = typename Architecture_t::Scalar_t;
+   using Matrix_t = typename Architecture_t::Matrix_t;
+
+   Scalar_t maximumError = 0.0;
+   for (size_t t = 0; t < ntests; t++) {
+
+      Scalar_t error;
+
+      size_t m, n;
+      m = rand() % 100 + 1;
+      n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> ARef(m,n), BRef(n,1);
+
+      // Fill column j with the value j; the sum over column j is then m * j.
+      for (size_t i = 0; i < (size_t) ARef.GetNrows(); i++) {
+         for (size_t j = 0; j < (size_t) ARef.GetNcols(); j++) {
+            ARef(i,j) = j;
+            if (i == 0) BRef(j, 0) = m * j;
+         }
+      }
+
+      Matrix_t A(ARef), B(n, 1);
+      Architecture_t::SumColumns(B, A);
+
+      error = TMVA::DNN::maximumRelativeError((TMatrixT<Double_t>) B, BRef);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
diff --git a/tmva/tmva/test/DNN/TestMatrixArithmeticCuda.cxx b/tmva/tmva/test/DNN/TestMatrixArithmeticCuda.cxx
new file mode 100644
index 0000000000000..58617a28c66ca
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestMatrixArithmeticCuda.cxx
@@ -0,0 +1,32 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+////////////////////////////////////////////////////////////////////
+// Concrete instantiation of the generic matrix arithmetic tests  //
+// for the CUDA architecture.                                     //
+////////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TestMatrixArithmetic.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+   std::cout << "Testing CUDA Matrix Arithmetic:" << std::endl;
+
+   Double_t error = testMultiplication<TCuda>(10);
+   std::cout << "Multiplication: " << "Max. rel. error: " << error << std::endl;
+
+   error = testSumColumns<TCuda>(1);
+   std::cout << "Column Sum: " << "Max. rel. error: " << error << std::endl;
+
+   return 0;
+}
diff --git a/tmva/tmva/test/DNN/TestMinimization.cxx b/tmva/tmva/test/DNN/TestMinimization.cxx
new file mode 100644
index 0000000000000..a2e5b36b7a249
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestMinimization.cxx
@@ -0,0 +1,29 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+////////////////////////////////////////////////////////////
+// Test the neural network training using the reference   //
+// implementation.                                        //
+//                                                        //
+// Calls the generic testMinimization function defined in //
+// TestMinimization.h for the reference architecture.
//
+////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Reference.h"
+#include "TestMinimization.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+   testMinimization<TReference<Double_t>>();
+   return 0;
+}
diff --git a/tmva/tmva/test/DNN/TestMinimization.h b/tmva/tmva/test/DNN/TestMinimization.h
new file mode 100644
index 0000000000000..202dc0603f34d
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestMinimization.h
@@ -0,0 +1,64 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+/////////////////////////////////////////////////////////////////////
+// Test Standard Minimizer                                         //
+//                                                                 //
+// This test trains a linear neural network on a linear function  //
+// F(x) = W * x and computes the relative error between the       //
+// matrix W' representing the linear function learned by the net  //
+// and the original matrix W.                                      //
+/////////////////////////////////////////////////////////////////////
+
+#include "TMatrix.h"
+#include "TMVA/DNN/Minimizers.h"
+#include "TMVA/DNN/Net.h"
+#include "Utility.h"
+
+using namespace TMVA::DNN;
+
+template <typename Architecture>
+auto testMinimization()
+    -> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   using Net_t    = TNet<Architecture>;
+
+   size_t nSamples  = 1000000;
+   size_t nFeatures = 20;
+   size_t batchSize = 1024;
+
+   TMatrixT<Double_t> XTrain(nSamples, nFeatures), YTrain(nSamples, 1),
+       XTest(batchSize, nFeatures), YTest(batchSize, 1), W(1, nFeatures);
+
+   // Generate the random linear mapping W and the corresponding targets.
+   randomMatrix(W);
+   randomMatrix(XTrain);
+   randomMatrix(XTest);
+   YTrain.MultT(XTrain, W);
+   YTest.MultT(XTest, W);
+
+   Net_t net(batchSize, nFeatures, ELossFunction::MEANSQUAREDERROR);
+   net.AddLayer(512, EActivationFunction::TANH);
+   net.AddLayer(512, EActivationFunction::TANH);
+   net.AddLayer(512, EActivationFunction::TANH);
+   net.AddLayer(512, EActivationFunction::TANH);
+   net.AddLayer(512, EActivationFunction::TANH);
+   net.AddLayer(1, EActivationFunction::IDENTITY);
+   net.Initialize(EInitialization::GAUSS);
+
+   TGradientDescent<Architecture> minimizer(0.000001, 1, 10);
+   MatrixInput_t trainingData(XTrain, YTrain);
+   MatrixInput_t testData(XTest, YTest);
+   minimizer.Train(trainingData, nSamples, testData, batchSize, net, 8);
+
+   return 0.0;
+}
diff --git a/tmva/tmva/test/DNN/TestMinimizationCuda.cxx b/tmva/tmva/test/DNN/TestMinimizationCuda.cxx
new file mode 100644
index 0000000000000..7fddad1cdd329
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestMinimizationCuda.cxx
@@ -0,0 +1,29 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+/////////////////////////////////////////////////////////////////////
+// Use the generic tests defined in TestMinimization.h to test the //
+// training of neural networks for CUDA architectures.
//
+/////////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Reference.h"
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/Minimizers.h"
+#include "TestMinimization.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+   testMinimization<TCuda>();
+   return 0;
+}
diff --git a/tmva/tmva/test/DNN/Utility.h b/tmva/tmva/test/DNN/Utility.h
new file mode 100644
index 0000000000000..0f66b7839955b
--- /dev/null
+++ b/tmva/tmva/test/DNN/Utility.h
@@ -0,0 +1,247 @@
+#ifndef TMVA_TEST_DNN_UTILITY
+#define TMVA_TEST_DNN_UTILITY
+
+#include <algorithm>
+#include <cmath>
+#include <ctime>
+#include <sstream>
+#include <string>
+#include <vector>
+#include "stdlib.h"
+#include "TRandom.h"
+#include "TMVA/DNN/Architectures/Reference.h"
+#include "TMVA/DNN/Functions.h"
+#include "TMVA/DNN/Net.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+/*! Construct a random linear neural network with up to five layers. */
+template <typename Architecture>
+void constructRandomLinearNet(TNet<Architecture> & net)
+{
+   int nlayers = rand() % 5 + 1;
+
+   std::vector<EActivationFunction> ActivationFunctions
+       = {EActivationFunction::IDENTITY};
+
+   for (int i = 0; i < nlayers; i++) {
+      int width = rand() % 10 + 1;
+      EActivationFunction f =
+          ActivationFunctions[rand() % ActivationFunctions.size()];
+      net.AddLayer(width, f);
+   }
+}
+
+/*! Set the matrix to the identity matrix. */
+template <typename MatrixType>
+void identityMatrix(MatrixType &X)
+{
+   size_t m, n;
+   m = X.GetNrows();
+   n = X.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         X(i,j) = 0.0;
+      }
+      if (i < n) {
+         X(i,i) = 1.0;
+      }
+   }
+}
+
+/*! Fill the matrix with random, Gaussian-distributed values. */
+template <typename MatrixType>
+void randomMatrix(MatrixType &X)
+{
+   size_t m, n;
+   m = X.GetNrows();
+   n = X.GetNcols();
+
+   TRandom rand(clock());
+   Double_t sigma = sqrt(10.0);
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         X(i,j) = rand.Gaus(0.0, sigma);
+      }
+   }
+}
+
+/*! Generate a random batch as input for a neural net. */
+template <typename MatrixType>
+void randomBatch(MatrixType &X)
+{
+   randomMatrix(X);
+}
+
+/*! Copy the matrix Y into the matrix X, element by element. */
+template <typename MatrixType>
+void copyMatrix(MatrixType &X, const MatrixType &Y)
+{
+   size_t m, n;
+   m = X.GetNrows();
+   n = X.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         X(i,j) = Y(i,j);
+      }
+   }
+}
+
+/*! Apply the given function to each element in the matrix. */
+template <typename MatrixType, typename F>
+void applyMatrix(MatrixType &X, F f)
+{
+   size_t m, n;
+   m = X.GetNrows();
+   n = X.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         X(i,j) = f(X(i,j));
+      }
+   }
+}
+
+/*! Combine elements of two given matrices into a single matrix using
+ * the given function f. */
+template <typename MatrixType, typename F>
+void zipWithMatrix(MatrixType &Z,
+                   F f,
+                   const MatrixType &X,
+                   const MatrixType &Y)
+{
+   size_t m, n;
+   m = X.GetNrows();
+   n = X.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Z(i,j) = f(X(i,j), Y(i,j));
+      }
+   }
+}
+
+/*! Fold the given function f over the elements of X, starting from the
+ * value start. */
+template <typename F, typename RealType, typename MatrixType>
+RealType reduce(F f, RealType start, const MatrixType &X)
+{
+   size_t m, n;
+   m = X.GetNrows();
+   n = X.GetNcols();
+
+   RealType result = start;
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         result = f(result, X(i,j));
+      }
+   }
+   return result;
+}
+
+/*! Fold the given function f over the elements of X as in reduce and
+ * divide the result by the number of elements.
+ */
+template <typename F, typename RealType, typename MatrixType>
+RealType reduceMean(F f, RealType start, const MatrixType &X)
+{
+   size_t m, n;
+   m = X.GetNrows();
+   n = X.GetNcols();
+
+   RealType result = start;
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         result = f(result, X(i,j));
+      }
+   }
+   return result / (RealType) (m * n);
+}
+
+/*! Compute the relative error of x and y normalized by y. Protected against
+ * division by zero. */
+template <typename RealType>
+inline auto relativeError(const RealType &x,
+                          const RealType &y)
+    -> RealType
+{
+   if (y != 0.0)
+      return std::fabs((x - y) / y);
+   else
+      return std::fabs(x - y);
+}
+
+/*! Compute the maximum, element-wise relative error of the matrices
+ * X and Y normalized by the elements of Y. Protected against division
+ * by zero. */
+template <typename MatrixType>
+auto maximumRelativeError(const MatrixType &X,
+                          const MatrixType &Y)
+    -> decltype(X(0,0))
+{
+   using RealType = decltype(X(0,0));
+
+   size_t m, n;
+   m = X.GetNrows();
+   n = X.GetNcols();
+
+   RealType maximumError = 0.0;
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         RealType error = relativeError(X(i,j), Y(i,j));
+         maximumError = std::max(error, maximumError);
+      }
+   }
+   return maximumError;
+}
+
+/*! Evaluate the finite difference f(dx) - f(-dx) of the functional f.
+ * The caller divides by 2 * dx to obtain the central-difference
+ * approximation of the derivative. */
+template <typename F, typename RealType>
+inline RealType finiteDifference(F f, RealType dx)
+{
+   return f(dx) - f(0.0 - dx);
+}
+
+/*! Color code the error: red above 1e-5, yellow above 1e-9, green below. */
+template <typename RealType>
+std::string print_error(RealType &e)
+{
+   std::ostringstream out{};
+
+   out << ("\e[");
+
+   if (e > 1e-5)
+      out << "31m";
+   else if (e > 1e-9)
+      out << "33m";
+   else
+      out << "32m";
+
+   out << e;
+   out << "\e[39m";
+
+   return out.str();
+}
+
+} // namespace DNN
+} // namespace TMVA
+
+#endif
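Note that the finiteDifference helper above deliberately returns f(dx) - f(-dx) without dividing by 2 * dx; the caller applies the normalization. A minimal, self-contained sketch of the intended usage for a numerical derivative check follows (not part of the patch; the test function f and the step size are illustrative assumptions only):

#include <cmath>
#include <iostream>

// Same shape as the finiteDifference helper in Utility.h:
// returns f(dx) - f(-dx), without the 1 / (2 * dx) factor.
template <typename F, typename RealType>
inline RealType finiteDifference(F f, RealType dx)
{
   return f(dx) - f(0.0 - dx);
}

int main()
{
   // Hypothetical example function; its derivative at 0 is cos(1).
   auto f = [](double x) { return std::sin(1.0 + x); };

   double dx = 1e-6;
   // Central-difference approximation: (f(dx) - f(-dx)) / (2 * dx).
   double numerical = finiteDifference(f, dx) / (2.0 * dx);
   double analytic  = std::cos(1.0);

   std::cout << "numerical: " << numerical
             << "  analytic: " << analytic << std::endl;
   return 0;
}

This is the pattern a derivative test would follow when comparing analytic gradients, such as those checked by the loss function tests above, against numerical approximations.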