diff --git a/tmva/tmva/CMakeLists.txt b/tmva/tmva/CMakeLists.txt index 449e9080999ad..9220b5fef07d2 100644 --- a/tmva/tmva/CMakeLists.txt +++ b/tmva/tmva/CMakeLists.txt @@ -35,6 +35,20 @@ set(headers4 TNeuron.h TSynapse.h TActivationChooser.h TActivation.h TActivation TNeuronInputSqSum.h TNeuronInputAbs.h Types.h Ranking.h RuleFit.h RuleFitAPI.h IMethod.h MsgLogger.h VariableTransformBase.h VariableIdentityTransform.h VariableDecorrTransform.h VariablePCATransform.h VariableGaussTransform.h VariableNormalizeTransform.h VariableRearrangeTransform.h VariableTransform.h ROCCalc.h ROCCurve.h) +set(dnn_files src/DNN/Architectures/Reference.cxx + src/DNN/Architectures/Reference/DataLoader.cxx) +set(dnn_cuda_files src/DNN/Architectures/Cuda/ActivationFunctions.cu + src/DNN/Architectures/Cuda/Arithmetic.cu + src/DNN/Architectures/Cuda/Buffers.cxx + src/DNN/Architectures/Cuda/CudaMatrix.cu + src/DNN/Architectures/Cuda/DataLoader.cu + src/DNN/Architectures/Cuda/Dropout.cu + src/DNN/Architectures/Cuda/Initialization.cu + src/DNN/Architectures/Cuda/Kernels.cu + src/DNN/Architectures/Cuda/LossFunctions.cu + src/DNN/Architectures/Cuda/OutputFunctions.cu + src/DNN/Architectures/Cuda/Propagation.cu + src/DNN/Architectures/Cuda/Regularization.cu) #---Need to suffix each header name by TMVA/ ----------------- foreach(hs headers1 headers2 headers3 headers4) @@ -45,7 +59,18 @@ endforeach() ROOT_GENERATE_DICTIONARY(G__TMVA ${theaders1} ${theaders2} ${theaders3} ${theaders4} MODULE TMVA LINKDEF LinkDef.h OPTIONS "-writeEmptyRootPCM") -ROOT_LINKER_LIBRARY(TMVA *.cxx G__TMVA.cxx LIBRARIES Core +#---Handle CUDA dependent code. ----------------- +find_package(CUDA) +if (CUDA_FOUND) + cuda_add_library(dnn_cuda ${dnn_cuda_files}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDNNCUDA") + set(cuda_libraries dnn_cuda ${CUDA_CUBLAS_LIBRARIES}) +else (CUDA_FOUND) + set(cuda_libraries) +endif(CUDA_FOUND) + +root_linker_library(TMVA *.cxx G__TMVA.cxx ${dnn_files} + LIBRARIES Core ${cuda_libraries} DEPENDENCIES RIO Hist Tree TreePlayer MLP Minuit XMLIO) install(DIRECTORY inc/TMVA/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/TMVA @@ -63,7 +88,7 @@ if(NOT gnuinstall) PATTERN "data" EXCLUDE) endif() -#ROOT_ADD_TEST_SUBDIRECTORY(test) +ROOT_ADD_TEST_SUBDIRECTORY(test/DNN) diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda.h new file mode 100644 index 0000000000000..a11f50d615cf4 --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda.h @@ -0,0 +1,296 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 05/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////////// +// Definition of the TCuda architecture, which provides an // +// implementation of the low-level functionality for neural // +// networks for the CUDA computing architectures. 
// +/////////////////////////////////////////////////////////////// + +#ifndef TMVA_DNN_ARCHITECTURES_CUDA +#define TMVA_DNN_ARCHITECTURES_CUDA + +#include + +#include "cuda.h" + +#include "Cuda/Types.h" +#include "Cuda/Kernels.h" +#include "Cuda/Buffers.h" +#include "Cuda/DataLoader.h" +#include "Cuda/CudaMatrix.h" +#include "TMVA/DNN/DataLoader.h" + +namespace TMVA +{ +namespace DNN +{ + +/** The TCuda architecture class. + * + * Low-level interface class for CUDA computing architecture. Contains as + * public types the declaration of the scalar, matrix and data loader types + * for this architecture as well as the remaining functions in the low-level + * interface in the form of static members. + */ +class TCuda +{ + +public: + + using Scalar_t = CudaDouble_t; + using Matrix_t = TCudaMatrix; + using DeviceBuffer_t = TCudaDeviceBuffer; + using HostBuffer_t = TCudaHostBuffer; + template + using DataLoader_t = TCudaDataLoader; + + //____________________________________________________________________________ + // + // Propagation + //____________________________________________________________________________ + + /** @name Forward Propagation + * Low-level functions required for the forward propagation of activations + * through the network. + */ + ///@{ + /** Matrix-multiply \p input with the transpose of \pweights and + * write the results into \p output. */ + static void MultiplyTranspose(TCudaMatrix &output, + const TCudaMatrix &input, + const TCudaMatrix &weights); + /** Add the vectors biases row-wise to the matrix output */ + static void AddRowWise(TCudaMatrix &output, + const TCudaMatrix &biases); + ///@} + + /** @name Backward Propagation + * Low-level functions required for the forward propagation of activations + * through the network. + */ + ///@{ + /** Perform the complete backward propagation step. If the provided + * \p activationGradientsBackward matrix is not empty, compute the + * gradients of the objective function with respect to the activations + * of the previous layer (backward direction). + * Also compute the weight and the bias gradients. Modifies the values + * in \p df and thus produces only a valid result, if it is applied the + * first time after the corresponding forward propagation has been per- + * formed. */ + static void Backward(TCudaMatrix & activationGradientsBackward, + TCudaMatrix & weightGradients, + TCudaMatrix & biasGradients, + TCudaMatrix & df, + const TCudaMatrix & activationGradients, + const TCudaMatrix & weights, + const TCudaMatrix & activationBackward); + /** Adds a the elements in matrix B scaled by c to the elements in + * the matrix A. This is required for the weight update in the gradient + * descent step.*/ + static void ScaleAdd(TCudaMatrix & A, + const TCudaMatrix & B, + Scalar_t beta = 1.0); + + static void Copy(TCudaMatrix & B, + const TCudaMatrix & A); + ///@} + + //____________________________________________________________________________ + // + // Activation Functions + //____________________________________________________________________________ + + /** @name Activation Functions + * For each activation function, the low-level interface contains two routines. + * One that applies the acitvation function to a matrix and one that evaluate + * the derivatives of the activation function at the elements of a given matrix + * and writes the results into the result matrix. 
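+ *
+ * As a minimal usage sketch (matrix names and dimensions m, n are assumed
+ * here, not part of the interface):
+ * \code
+ * TCudaMatrix X(m, n);            // linear activations of a layer
+ * TCudaMatrix dX(m, n);           // receives the derivatives
+ * TCuda::ReluDerivative(dX, X);   // evaluate f' at the elements of X
+ * TCuda::Relu(X);                 // then apply f in place
+ * \endcode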
+ */ + ///@{ + static void Identity(TCudaMatrix & B); + static void IdentityDerivative(TCudaMatrix & B, + const TCudaMatrix & A); + + static void Relu(TCudaMatrix & B); + static void ReluDerivative(TCudaMatrix & B, + const TCudaMatrix & A); + + static void Sigmoid(TCudaMatrix & B); + static void SigmoidDerivative(TCudaMatrix & B, + const TCudaMatrix & A); + + static void Tanh(TCudaMatrix & B); + static void TanhDerivative(TCudaMatrix & B, + const TCudaMatrix & A); + + static void SymmetricRelu(TCudaMatrix & B); + static void SymmetricReluDerivative(TCudaMatrix & B, + const TCudaMatrix & A); + + static void SoftSign(TCudaMatrix & B); + static void SoftSignDerivative(TCudaMatrix & B, + const TCudaMatrix & A); + + static void Gauss(TCudaMatrix & B); + static void GaussDerivative(TCudaMatrix & B, + const TCudaMatrix & A); + ///@} + + //____________________________________________________________________________ + // + // Loss Functions + //____________________________________________________________________________ + + /** @name Loss Functions + * Loss functions compute a scalar value given the \p output of the network + * for a given training input and the expected network prediction \p Y that + * quantifies the quality of the prediction. For each function also a routing + * that computes the gradients (suffixed by Gradients) must be provided for + * the starting of the backpropagation algorithm. + */ + ///@{ + + static CudaDouble_t MeanSquaredError(const TCudaMatrix &Y, + const TCudaMatrix &output); + static void MeanSquaredErrorGradients(TCudaMatrix & dY, + const TCudaMatrix &Y, + const TCudaMatrix &output); + + /** Sigmoid transformation is implicitly applied, thus \p output should + * hold the linear activations of the last layer in the net. */ + static CudaDouble_t CrossEntropy(const TCudaMatrix &Y, + const TCudaMatrix &output); + + static void CrossEntropyGradients(TCudaMatrix & dY, + const TCudaMatrix & Y, + const TCudaMatrix & output); + ///@} + + //____________________________________________________________________________ + // + // Output Functions + //____________________________________________________________________________ + + /** @name Output Functions + * Output functions transform the activations \p output of the + * output layer in the network to a valid prediction \p YHat for + * the desired usage of the network, e.g. the identity function + * for regression or the sigmoid transformation for two-class + * classification. + */ + ///@{ + static void Sigmoid(TCudaMatrix &YHat, + const TCudaMatrix & ); + ///@} + + //____________________________________________________________________________ + // + // Regularization + //____________________________________________________________________________ + + /** @name Regularization + * For each regularization type two functions are required, one named + * Regularization that evaluates the corresponding + * regularization functional for a given weight matrix and the + * AddRegularizationGradients, that adds the regularization + * component in the gradients to the provided matrix. 
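+ *
+ * A hedged sketch of the intended use in the weight update (names assumed):
+ * \code
+ * CudaDouble_t r = TCuda::L2Regularization(W);              // loss contribution
+ * TCuda::AddL2RegularizationGradients(dW, W, weightDecay);  // gradient term
+ * \endcode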
+ */
+ ///@{
+
+   static CudaDouble_t L1Regularization(const TCudaMatrix & W);
+   static void AddL1RegularizationGradients(TCudaMatrix & A,
+                                            const TCudaMatrix & W,
+                                            CudaDouble_t weightDecay);
+
+   static CudaDouble_t L2Regularization(const TCudaMatrix & W);
+   static void AddL2RegularizationGradients(TCudaMatrix & A,
+                                            const TCudaMatrix & W,
+                                            CudaDouble_t weightDecay);
+   ///@}
+
+   //____________________________________________________________________________
+   //
+   // Initialization
+   //____________________________________________________________________________
+
+   /** @name Initialization
+    * For each initialization method, one function in the low-level interface
+    * is provided. The naming scheme is Initialize<Type>
for a given + * initialization method Type. + */ + ///@{ + + static void InitializeGauss(TCudaMatrix & A); + static void InitializeUniform(TCudaMatrix & A); + static void InitializeIdentity(TCudaMatrix & A); + static void InitializeZero(TCudaMatrix & A); + + ///@} + + //____________________________________________________________________________ + // + // Dropout + //____________________________________________________________________________ + + /** @name Dropout + */ + ///@{ + + /** Apply dropout with activation probability \p p to the given + * matrix \p A and scale the result by reciprocal of \p p. */ + static void Dropout(TCudaMatrix & A, CudaDouble_t p); + + ///@} + + //____________________________________________________________________________ + // + // Additional Arithmetic Functions + //____________________________________________________________________________ + + /** @name Additional Arithmetic Functions + * + * Additional arithmetic on CUDA matrices used to implement the low-level + * interface. + */ + ///@{ + + /** Standard multiplication of two matrices \p A and \p B with the result being + * written into C. + */ + static void Multiply(TCudaMatrix &C, + const TCudaMatrix &A, + const TCudaMatrix &B); + /** Matrix multiplication of two matrices \p A and \p B^T (transposed) with the + * result being written into C. + */ + static void TransposeMultiply(TCudaMatrix &output, + const TCudaMatrix &input, + const TCudaMatrix &Weights); + /** In-place Hadamard (element-wise) product of matrices \p A and \p B + * with the result being written into \p A. + */ + static void Hadamard(TCudaMatrix &A, + const TCudaMatrix &B); + + /** Sum columns of (m x n) matrixx \p A and write the results into the first + * m elements in \p A. + */ + static void SumColumns(TCudaMatrix &B, const TCudaMatrix &A); + + /** Compute the sum of all elements in \p A */ + static CudaDouble_t Sum(const TCudaMatrix &A); +}; + +} // namespace DNN +} // namespace TMVA + +#endif diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Buffers.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Buffers.h new file mode 100644 index 0000000000000..14be9534039b5 --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Buffers.h @@ -0,0 +1,135 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 07/08/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +//////////////////////////////////////////////////// +// Device and host buffer for CUDA architectures. // +//////////////////////////////////////////////////// + +#ifndef TMVA_DNN_ARCHITECTURES_CUDA_BUFFERS +#define TMVA_DNN_ARCHITECTURES_CUDA_BUFFERS + +#include "cuda.h" +#include "cuda_runtime.h" +#include +#include "Types.h" + +namespace TMVA { +namespace DNN { + +class TCudaDeviceBuffer; + +/** Wrapper class for pinned memory double-precision buffers on the host. + * Uses std::shared_pointer with custom destructor to ensure consistent + * memory management and allow for easy copying/moving. Copying from device + * will set the corresponding data stream which can then be used to synchronize + * on the results. 
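+ *
+ * A short sketch of the intended usage (buffer size n is assumed):
+ * \code
+ * TCudaHostBuffer host(n);                       // pinned host memory
+ * for (size_t i = 0; i < n; i++) host[i] = 0.0;
+ * TCudaDeviceBuffer device(n);
+ * device.CopyFrom(host);                         // asynchronous host-to-device copy
+ * \endcode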
+ */ +class TCudaHostBuffer +{ +private: + + size_t fOffset; + mutable cudaStream_t fComputeStream; + std::shared_ptr fDevicePointer; + + struct TDestructor + { + TDestructor() = default; + TDestructor(const TDestructor &) = default; + TDestructor( TDestructor &&) = default; + TDestructor & operator=(const TDestructor &) = default; + TDestructor & operator=( TDestructor &&) = default; + void operator()(CudaDouble_t ** devicePointer); + friend TCudaDeviceBuffer; + } fDestructor; + + friend TCudaDeviceBuffer; + +public: + + TCudaHostBuffer(size_t size); + TCudaHostBuffer(CudaDouble_t *); + TCudaHostBuffer() = default; + TCudaHostBuffer(const TCudaHostBuffer &) = default; + TCudaHostBuffer( TCudaHostBuffer &&) = default; + TCudaHostBuffer & operator=(const TCudaHostBuffer &) = default; + TCudaHostBuffer & operator=( TCudaHostBuffer &&) = default; + + /** Return sub-buffer of the current buffer. */ + TCudaHostBuffer GetSubBuffer(size_t offset, size_t size); + + /** Convert to raw device data pointer.*/ + operator CudaDouble_t * () const; + CudaDouble_t & operator[](size_t index) + { + return (*fDevicePointer + fOffset)[index]; + } + CudaDouble_t operator[](size_t index) const + { + return (*fDevicePointer + fOffset)[index]; + } + +}; + +/** Wrapper class for on-device memory double-precision buffers. Uses + * std::shared_pointer with custom destructor to ensure consistent + * memory management and allow for easy copying/moving. A device + * buffer has an associated CUDA compute stream , which is used for + * implicit synchronization of data transfers. + */ +class TCudaDeviceBuffer +{ +private: + + size_t fOffset; + size_t fSize; + cudaStream_t fComputeStream; + std::shared_ptr fDevicePointer; + + struct TDestructor + { + TDestructor() = default; + TDestructor(const TDestructor &) = default; + TDestructor( TDestructor &&) = default; + TDestructor & operator=(const TDestructor &) = default; + TDestructor & operator=( TDestructor &&) = default; + void operator()(CudaDouble_t ** devicePointer); + friend TCudaDeviceBuffer; + } fDestructor; + +public: + + TCudaDeviceBuffer(size_t size); + TCudaDeviceBuffer(size_t size, cudaStream_t stream); + TCudaDeviceBuffer(CudaDouble_t *, size_t size, cudaStream_t stream); + TCudaDeviceBuffer() = default; + TCudaDeviceBuffer(const TCudaDeviceBuffer &) = default; + TCudaDeviceBuffer( TCudaDeviceBuffer &&) = default; + TCudaDeviceBuffer & operator=(const TCudaDeviceBuffer &) = default; + TCudaDeviceBuffer & operator=( TCudaDeviceBuffer &&) = default; + + /** Return sub-buffer of the current buffer. */ + TCudaDeviceBuffer GetSubBuffer(size_t offset, size_t size); + /** Convert to raw device data pointer.*/ + operator CudaDouble_t * () const; + + void CopyFrom(const TCudaHostBuffer &) const; + void CopyTo(const TCudaHostBuffer &) const; + + cudaStream_t GetComputeStream() const {return fComputeStream;} + void SetComputeStream(cudaStream_t stream) {fComputeStream = stream;} + +}; + + +} // namespace DNN +} // namespace TMVA +#endif diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/CudaMatrix.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/CudaMatrix.h new file mode 100644 index 0000000000000..ce5279e77f05c --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/CudaMatrix.h @@ -0,0 +1,258 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 13/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. 
* + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +////////////////////////////////////////////////////////////////////// +// Contains the TCudaMatrix class for the representation of matrices // +// on CUDA devices as well as the TCudaDeviceReference class which // +// is a helper class to emulate lvalue references to floating point // +// values on the device. // +////////////////////////////////////////////////////////////////////// + +#ifndef TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX +#define TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX + +#include + +#include "cuda.h" +#include "cuda_runtime.h" +#include "cublas_v2.h" +#include "curand_kernel.h" + +#include "TMatrixT.h" + +#include "Types.h" +#include "Buffers.h" + +#define CUDACHECK(ans) {cudaError((ans), __FILE__, __LINE__); } + +namespace TMVA { +namespace DNN { + +/** Function to check cuda return code. Taken from + * http://stackoverflow.com/questions/14038589/ + */ +inline void cudaError(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +//____________________________________________________________________________ +// +// Cuda Device Reference +//____________________________________________________________________________ + +/** TCudaDeviceReference + * + * Helper class emulating lvalue references for CudaDouble_t values that are + * physically on the device. Allows for example to assign to matrix elements. + * Not that device access through CudaDeviceReferences enforces synchronization + * with all streams and thus qualifies as performance killer. Only used for + * testing. + */ +class TCudaDeviceReference +{ +private: + + CudaDouble_t * fDevicePointer; + +public: + +TCudaDeviceReference(CudaDouble_t * devicePointer) + : fDevicePointer(devicePointer) + { + // Nothing to do here. + } + + operator CudaDouble_t() + { + CudaDouble_t buffer; + cudaMemcpy(& buffer, fDevicePointer, sizeof(CudaDouble_t), + cudaMemcpyDeviceToHost); + return buffer; + } + + void operator=(const TCudaDeviceReference &other) + { + cudaMemcpy(fDevicePointer, other.fDevicePointer, sizeof(CudaDouble_t), + cudaMemcpyDeviceToDevice); + } + + void operator=(CudaDouble_t value) + { + CudaDouble_t buffer = value; + cudaMemcpy(fDevicePointer, & buffer, sizeof(CudaDouble_t), + cudaMemcpyHostToDevice); + } + + void operator+=(CudaDouble_t value) + { + CudaDouble_t buffer; + cudaMemcpy(& buffer, fDevicePointer, sizeof(CudaDouble_t), + cudaMemcpyDeviceToHost); + buffer += value; + cudaMemcpy(fDevicePointer, & buffer, sizeof(CudaDouble_t), + cudaMemcpyHostToDevice); + } + + void operator-=(CudaDouble_t value) + { + CudaDouble_t buffer; + cudaMemcpy(& buffer, fDevicePointer, sizeof(CudaDouble_t), + cudaMemcpyDeviceToHost); + buffer -= value; + cudaMemcpy(fDevicePointer, & buffer, sizeof(CudaDouble_t), + cudaMemcpyHostToDevice); + } +}; + +//____________________________________________________________________________ +// +// Cuda Matrix +//____________________________________________________________________________ + +/** TCudaMatrix Class + * + * The TCudaMatrix class represents matrices on a CUDA device. The class takes + * care of allocating and freeing device memory. 
It also holds static handles
+ * to device resources such as the cuBLAS handle and the compute stream.
+ *
+ * Computations on all matrix instances are performed in a single,
+ * non-standard stream. That means that they are guaranteed to be
+ * executed in order, but may not wait for asynchronous data transfers
+ * in other streams.
+ *
+ * Each matrix also has an associated data stream, which defaults to the
+ * default (0) stream if not explicitly provided.
+ */
+class TCudaMatrix
+{
+private:
+
+   static size_t          fInstances;     ///< Number of existing TCudaMatrix instances.
+   static cublasHandle_t  fCublasHandle;
+   static CudaDouble_t  * fDeviceReturn;  ///< Buffer for kernel return values.
+   static curandState_t * fCurandStates;
+   static size_t          fNCurandStates;
+
+   size_t fNRows;
+   size_t fNCols;
+   TCudaDeviceBuffer fElementBuffer;
+
+public:
+
+   TCudaMatrix();
+   TCudaMatrix(size_t i, size_t j);
+   TCudaMatrix(const TMatrixT<CudaDouble_t> &);
+   TCudaMatrix(TCudaDeviceBuffer buffer, size_t m, size_t n);
+
+   TCudaMatrix(const TCudaMatrix  &) = default;
+   TCudaMatrix(      TCudaMatrix &&) = default;
+   TCudaMatrix & operator=(const TCudaMatrix  &) = default;
+   TCudaMatrix & operator=(      TCudaMatrix &&) = default;
+   ~TCudaMatrix() = default;
+
+   /** Return the compute stream associated with this matrix. */
+   inline cudaStream_t GetComputeStream() const;
+   inline void SetComputeStream(cudaStream_t stream);
+   /** Blocking synchronization with the associated compute stream, if it's
+    *  not the default stream. */
+   inline void Synchronize(const TCudaMatrix &) const;
+   /** Set the return buffer on the device to the specified value. This is
+    *  required for example for reductions in order to initialize the
+    *  accumulator. */
+   inline static void ResetDeviceReturn(CudaDouble_t value = 0.0);
+   /** Transfer the value in the device return buffer to the host. */
+   inline static CudaDouble_t GetDeviceReturn();
+
+   /** Return device pointer to the device return buffer. */
+   inline static CudaDouble_t  * GetDeviceReturnPointer() {return fDeviceReturn;}
+   inline static curandState_t * GetCurandStatesPointer() {return fCurandStates;}
+
+   /** Convert the CUDA matrix to ROOT's TMatrixT. */
+   operator TMatrixT<CudaDouble_t>() const;
+
+   void CopyToDevice(CudaDouble_t *source);
+   void CopyFromDevice(CudaDouble_t *dest);
+
+   size_t GetNrows()      const {return fNRows;}
+   size_t GetNcols()      const {return fNCols;}
+   size_t GetNoElements() const {return fNRows * fNCols;}
+   const CudaDouble_t *   GetDataPointer() const {return fElementBuffer;}
+   CudaDouble_t *         GetDataPointer()       {return fElementBuffer;}
+   const cublasHandle_t & GetCublasHandle() const {return fCublasHandle;}
+
+   /** Access to elements of device matrices provided through the
+    *  TCudaDeviceReference class. Note that access is synchronous and enforces
+    *  device synchronization on all streams. Only used for testing. */
+   TCudaDeviceReference operator()(size_t i, size_t j) const
+   {
+      CudaDouble_t * elementPointer = fElementBuffer;
+      elementPointer += j * fNRows + i;
+      return TCudaDeviceReference(elementPointer);
+   }
+
+private:
+
+   void InitializeCuda();
+   void InitializeCurandStates();
+
+};
+
+//
+// Inline Functions.
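+
+// A short usage sketch for TCudaMatrix (hedged; element access through
+// TCudaDeviceReference synchronizes the device and is meant for testing):
+//
+//   TMatrixT<CudaDouble_t> host(2, 2);
+//   TCudaMatrix device(host);             // copy the matrix to the device
+//   device(0, 0) = 1.0;                   // assign through TCudaDeviceReference
+//   TMatrixT<CudaDouble_t> back(device);  // convert back to a host matrix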
+//______________________________________________________________________________ +inline cudaStream_t TCudaMatrix::GetComputeStream() const +{ + return fElementBuffer.GetComputeStream(); +} + +//______________________________________________________________________________ +inline void TCudaMatrix::SetComputeStream(cudaStream_t stream) +{ + return fElementBuffer.SetComputeStream(stream); +} + +//______________________________________________________________________________ +inline void TCudaMatrix::Synchronize(const TCudaMatrix &A) const { + cudaEvent_t event; + cudaEventCreateWithFlags(&event, cudaEventDisableTiming); + cudaEventRecord(event, A.GetComputeStream()); + cudaStreamWaitEvent(fElementBuffer.GetComputeStream(), event, 0); + cudaEventDestroy(event); +} + +//______________________________________________________________________________ +inline void TCudaMatrix::ResetDeviceReturn(CudaDouble_t value) +{ + CudaDouble_t buffer = value; + cudaMemcpy(fDeviceReturn, & buffer, sizeof(CudaDouble_t), + cudaMemcpyHostToDevice); +} + +//______________________________________________________________________________ +inline CudaDouble_t TCudaMatrix::GetDeviceReturn() +{ + CudaDouble_t buffer; + cudaMemcpy(& buffer, fDeviceReturn, sizeof(CudaDouble_t), + cudaMemcpyDeviceToHost); + return buffer; +} + +} // namespace DNN +} // namespace TMVA + +#endif diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/DataLoader.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/DataLoader.h new file mode 100644 index 0000000000000..09a2e9c25e3b8 --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/DataLoader.h @@ -0,0 +1,220 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh 13/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////////////////////// +// Declaration of the TCudaDataLoader class, which implements a // +// prefetching data loader for CUDA architecture. Also contains // +// the TCudaBatch class and the TCudaBatchIterator representing batches // +// and an iterator over batches in a data set for CUDA architectures // +/////////////////////////////////////////////////////////////////////////// + +#ifndef TMVA_DNN_ARCHITECTURES_CUDA_DATA +#define TMVA_DNN_ARCHITECTURES_CUDA_DATA + +#include "CudaMatrix.h" +#include "Buffers.h" +#include "TMVA/Event.h" +#include +#include + +namespace TMVA { +namespace DNN { + +// Input Data Types +using MatrixInput_t = std::pair &, + const TMatrixT &>; +using TMVAInput_t = std::vector; + +using IndexIterator_t = typename std::vector::iterator; + +/** TCudaBatch class. + * + * Lightweight representation of a batch of data on the device. Holds + * pointer to the input (samples) and output data (labels) as well as + * to the data stream in which this batch is transferred. + * + * Provides GetInput() and GetOutput() member functions that return + * TCudaMatrix representations of the input and output data in the + * batch. + */ +class TCudaBatch +{ +private: + cudaStream_t fDataStream; ///< Cuda stream in which the data is transferred + CudaDouble_t * fInputMatrixData; ///< Pointer to the input data buffer. + CudaDouble_t * fOutputMatrixData; ///< Pointer to the ouput data buffer. 
+ + size_t fNinputFeatures; ///< Number of input features. + size_t fNoutputFeatures; ///< Number of output features. + size_t fBatchSize; ///< Size of the batch. +public: + + TCudaBatch(size_t batchSize, + size_t ninputFeatures, + size_t noutputFeatures, + CudaDouble_t * inputMatrixData, + CudaDouble_t * outputMatrixData, + cudaStream_t dataStream) + : fDataStream(dataStream), fInputMatrixData(inputMatrixData), + fOutputMatrixData(outputMatrixData), fNinputFeatures(ninputFeatures), + fNoutputFeatures(noutputFeatures), fBatchSize(batchSize) + { + // Nothing to do here. + } + + /** Return the batch input data as a TCudaMatrix. The TCudaMatrix is passed + * the data stream in which the async. data transfer to the corresponding + * buffer is performed, so that operations on the matrix can synchronize + * with it. */ + TCudaMatrix GetInput() + { + size_t size = fBatchSize * fNinputFeatures; + return TCudaMatrix(TCudaDeviceBuffer(fInputMatrixData, size, fDataStream), + fBatchSize, fNinputFeatures); + } + + /** Return the outpur data as a TCudaMatrix. Also forwards the data stream in + * which the async. data transfer is performed to the matrix. See above. + */ + TCudaMatrix GetOutput() + { + size_t size = fBatchSize * fNoutputFeatures; + return TCudaMatrix(TCudaDeviceBuffer(fOutputMatrixData, size, fDataStream), + fBatchSize, fNoutputFeatures); + } +}; + +template +class TCudaDataLoader; + +/** TCudaBatchIterator Class + * + * Class that implements an iterator over data sets on a CUDA device. The + * batch iterator has to take care of the preloading of the data which is + * why a special implementation is required. + */ +template +class TCudaBatchIterator +{ +private: + using SampleIndexIterator_t = typename std::vector::iterator; + + TCudaDataLoader & fDataLoader; ///< Dataloader managing data transfer. + IndexIterator_t fSampleIndexIterator; ///< Sample indices in this batch. + IndexIterator_t fSampleIndexIteratorEnd; ///< End of this batch. + + const size_t fNbatchesInEpoch; ///< Total number of batches in the data set. + const size_t fBatchSize; ///< Size of this batch. + const size_t fTransferBatchSize; ///< No. of. batches in a data transfer batch. + size_t fBatchIndex; ///< Index of this batch in the current epoch. + +public: + + TCudaBatchIterator(TCudaDataLoader &dataLoader, + IndexIterator_t sampleIndexIterator, + IndexIterator_t sampleIndexIteratorEnd); + + /** Preloads the number of transfer batches as specified by the + * corresponding TCudaDataLoader object. */ + void PrepareStream(); + /** Advance to the next batch and check if data should be preloaded. */ + TCudaBatchIterator & operator++(); + /** Return TCudaBatch object corresponding to the current iterator position. */ + TCudaBatch operator*(); + + bool operator==(const TCudaBatchIterator & other); + bool operator!=(const TCudaBatchIterator & other); +}; + +/** The TCudaDataLoader Class + * + * The TCudaDataLoader class takes care of transferring training and test data + * from the host to the device. The data transfer is performed asynchronously + * and multiple data set batches can be transferred combined into transfer batches, + * which contain a fixed number of data set batches. The range of the preloading + * is defined in multiples of transfer batches. */ +template +class TCudaDataLoader +{ +private: + + const Data_t & fInputData; + + size_t fNsamples; ///< No. of samples in the data set. + size_t fNinputFeatures; ///< No. of features in input sample. + size_t fNoutputFeatures; ///< No. 
of features in output sample (truth). + size_t fBatchSize; ///< No. of samples in a (mini-)batch + size_t fTransferBatchSize; ///< No. of batches combined in a single data transfer. + size_t fPreloadOffset; ///< How many batch-batches data is loaded in advance. + + size_t fNbatchesInEpoch; ///< No. of batches in one epoch. + size_t fInputMatrixSize; ///< No. of elements in input matrix. + size_t fOutputMatrixSize;///< No. of elements in output matrix. + size_t fTransferSize; ///< Total size of one data transfer in bytes; + + size_t fStreamIndex; ///< Transfer stream index. + + CudaDouble_t **fHostData; ///< Host-side buffer array. + CudaDouble_t **fDeviceData; ///< Device-side buffer array. + cudaStream_t * fDataStreams; ///< Data stream array. + + std::vector fSampleIndices; ///< Shuffled sample indices. + +public: + + TCudaDataLoader(const Data_t & inputData, + size_t nsamples, + size_t batchSize, + size_t ninputFeatures, + size_t noutputFeatures, + size_t batchBatchSize = 5, + size_t preloadOffset = 2); + + ~TCudaDataLoader(); + + /** Return iterator to batches in the training set. Samples in batches are + * are sampled randomly from the data set without replacement. */ + TCudaBatchIterator begin(); + + TCudaBatchIterator end(); + + /** Called by the iterator to indicate that the current buffer containing a + * transfer batch of batchs has been consumed. */ + void NextBuffer() {fStreamIndex = (fStreamIndex + 1) % (fTransferBatchSize + 1);} + /** Invoke transfer of the current buffer in the buffer cycle. */ + void InvokeTransfer(); + /** Return the batch object corresponding to the given batchIndex in the current + * epoch. */ + TCudaBatch GetCurrentBatch(size_t batchIndex); + + /** Copy data from the input data object to the pinned transfer buffer on + * the host. Must be specialized for any type of input data that is used. */ + inline static void CopyBatches(Data_t data, + IndexIterator_t sampleIndexIteratorBegin, + IndexIterator_t sampleIndexIteratorEnd, + size_t batchSize, + size_t batchBatchSize, + CudaDouble_t * inputBuffer, + CudaDouble_t * outputBuffer); + + size_t GetNBatchesInEpoch() const {return fNbatchesInEpoch;} + size_t GetPreloadOffset() const {return fPreloadOffset;} + size_t GetBatchSize() const {return fBatchSize;} + size_t GetBatchBatchSize() const {return fTransferBatchSize;} + const Data_t & GetInputData() const {return fInputData;} + inline CudaDouble_t * GetInputTransferBuffer() const; + inline CudaDouble_t * GetOutputTransferBuffer() const; +}; + +} // namespace TMVA +} // namespace DNN + +#endif diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Device.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Device.h new file mode 100644 index 0000000000000..2048a921f02c2 --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Device.h @@ -0,0 +1,69 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 13/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +////////////////////////////////////////////////////////////////////////// +// Declares the DeviceSettings struct which holdsx device specific // +// information to be defined at compile time. 
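+//                                                                      //
+// A typical launch sketch using these settings (usage assumed):        //
+//                                                                      //
+//   dim3 blocks = TDevice::BlockDims();                                //
+//   dim3 grid   = TDevice::GridDims(A);                                //
+//   Cuda::Relu<<<grid, blocks>>>(A.GetDataPointer(),                   //
+//                                (int) A.GetNrows(),                   //
+//                                (int) A.GetNcols());                  //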
// +////////////////////////////////////////////////////////////////////////// + +#ifndef TMVA_DNN_ARCHITECTURES_CUDA_DEVICE +#define TMVA_DNN_ARCHITECTURES_CUDA_DEVICE + +#include "cuda.h" +#include "vector_types.h" // definition of dim3 +#include "CudaMatrix.h" + +namespace TMVA +{ +namespace DNN +{ + +class TDevice +{ +public: + static constexpr int BlockDimX = 1; + static constexpr int BlockDimY = 32; + static constexpr int BlockSize = BlockDimX * BlockDimY; + + static dim3 BlockDims() + { + return dim3(BlockDimX, BlockDimY); + } + + static dim3 GridDims(const TCudaMatrix &A) + { + int gridDimX = A.GetNcols() / TDevice::BlockDimX; + if ((A.GetNcols() % TDevice::BlockDimX) != 0) + gridDimX += 1; + int gridDimY = A.GetNrows() / TDevice::BlockDimY; + if ((A.GetNrows() % TDevice::BlockDimY) != 0) + gridDimY += 1; + return dim3(gridDimX, gridDimY); + } + + static int NThreads(const TCudaMatrix &A) + { + int gridDimX = A.GetNcols() / TDevice::BlockDimX; + if ((A.GetNcols() % TDevice::BlockDimX) != 0) + gridDimX += 1; + int gridDimY = A.GetNrows() / TDevice::BlockDimY; + if ((A.GetNrows() % TDevice::BlockDimY) != 0) + gridDimY += 1; + return gridDimX * gridDimY * TDevice::BlockDimX * TDevice::BlockDimY; + } + +}; + + +} // namespace DNN +} // namespace TMVA + +#endif diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Kernels.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Kernels.h new file mode 100644 index 0000000000000..23c6814c0260e --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Kernels.h @@ -0,0 +1,236 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 13/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +////////////////////////////////////////////////////////////////////// +// Declaration of the device kernels for the CUDA implementation of // +// the low-level interface. // +////////////////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Cuda.h" +#include "cuda.h" +#include "curand_kernel.h" + +namespace TMVA { +namespace DNN { +namespace Cuda { + + +/** @name Device Kernels + * Utility device kernels that can only be called from the device. + */ +///@{ +/** Atomic addition of doubles, which is at the moment natively supported only + for floats. */ +//____________________________________________________________________________ +__device__ CudaDouble_t AtomicAdd(CudaDouble_t* address, CudaDouble_t val); + +/** Sum elements in columns of the 2D shared data array and accumulated the + * results in \p result. + * + * \param result Pointer to the result where the reduction results of each block + * will be accumulated using atomicAdd. The results of the jth column in this + * 2D block will be accumulated in the jth element. + * \param sdata 2D shared data array with the same dimensions as the current + * compute block. Contains the elements that will be row-wise accumulated. 
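+ *
+ * Sketch of a call from a kernel (the shared-memory layout is assumed to
+ * match the block dimensions defined in Device.h):
+ * \code
+ * __shared__ CudaDouble_t sdata[TDevice::BlockDimX * TDevice::BlockDimY];
+ * // ... fill sdata with this block's elements ...
+ * ReduceSumVertical(result, sdata);
+ * \endcode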
+ */ +//____________________________________________________________________________ +__device__ void ReduceSumVertical(CudaDouble_t *result, CudaDouble_t * sdata); + +/** Sum up all elements in the current compute block and accumulate the results + * into \p result using atomicAdd */ +//____________________________________________________________________________ +__device__ void ReduceSum(CudaDouble_t *result, CudaDouble_t * sdata); +///@} + +/** @name Forward and Backward Propagation + */ +///@{ +/** Add the \p n -element vector \p theta row-wise to the \p m x \p n matrix + * in W */ +//____________________________________________________________________________ +__global__ void AddRowWise(CudaDouble_t * W, const CudaDouble_t * theta, + int m, int n); +/** Compute the Hadamard product of the \p m x \p n matrix B and A and write + * results into B. */ +//____________________________________________________________________________ +__global__ void Hadamard(CudaDouble_t * B, + const CudaDouble_t * A, + int m, int n); +///@} + +/** @name Activation Functions + */ +///@{ +//____________________________________________________________________________ +__global__ void IdentityDerivative(CudaDouble_t * A, + int m, int n); +//____________________________________________________________________________ +__global__ void Relu(CudaDouble_t * A, + int m, int n); + +//____________________________________________________________________________ +__global__ void ReluDerivative(CudaDouble_t * B, + const CudaDouble_t * A, int m, int n); + +//____________________________________________________________________________ +__global__ void Sigmoid(CudaDouble_t * A, + int m, int n); +//____________________________________________________________________________ +__global__ void SigmoidDerivative(CudaDouble_t * B, + const CudaDouble_t * A, + int m, int n); + +//____________________________________________________________________________ +__global__ void Tanh(CudaDouble_t * A, + int m, int n); +//____________________________________________________________________________ +__global__ void TanhDerivative(CudaDouble_t * B, + const CudaDouble_t * A, + int m, int n); + +//____________________________________________________________________________ +__global__ void SymmetricRelu(CudaDouble_t * A, + int m, int n); +//____________________________________________________________________________ +__global__ void SymmetricReluDerivative(CudaDouble_t * B, + const CudaDouble_t * A, + int m, int n); + +//____________________________________________________________________________ +__global__ void SoftSign(CudaDouble_t * A, + int m, int n); +//____________________________________________________________________________ +__global__ void SoftSignDerivative(CudaDouble_t * B, + const CudaDouble_t * A, + int m, int n); + +//____________________________________________________________________________ +__global__ void Gauss(CudaDouble_t * A, + int m, int n); +//____________________________________________________________________________ +__global__ void GaussDerivative(CudaDouble_t * B, + const CudaDouble_t * A, + int m, int n); + +//____________________________________________________________________________ +///@} + +/** @name Loss Functions + */ +///@{ + +//____________________________________________________________________________ +__global__ void MeanSquaredError(CudaDouble_t * result, + const CudaDouble_t * Y, + const CudaDouble_t * output, + int m, int n); + 
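+
+// A hedged sketch of how such a reduction kernel can be driven from the
+// host side, using the static device return buffer from CudaMatrix.h
+// (usage assumed):
+//
+//   TCudaMatrix::ResetDeviceReturn(0.0);
+//   MeanSquaredError<<<TDevice::GridDims(Y), TDevice::BlockDims()>>>(
+//       TCudaMatrix::GetDeviceReturnPointer(),
+//       Y.GetDataPointer(), output.GetDataPointer(), m, n);
+//   CudaDouble_t result = TCudaMatrix::GetDeviceReturn();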
+//____________________________________________________________________________ +__global__ void MeanSquaredErrorGradients(CudaDouble_t * dY, + const CudaDouble_t * Y, + const CudaDouble_t * output, + int m, int n); +//____________________________________________________________________________ +__global__ void CrossEntropy(CudaDouble_t * result, + const CudaDouble_t * Y, + const CudaDouble_t * output, + int m, int n); + +//____________________________________________________________________________ +__global__ void CrossEntropyGradients(CudaDouble_t * dY, + const CudaDouble_t * Y, + const CudaDouble_t * output, + int m, int n); +///@} + +/** @name Regularization + */ +///@{ + +/** Compute the sum of the absolute values of the elements in the \p m x \p n + * matrix \p A and write the result into \p result. This is used to compute + * L1 regularization for weights matrices. */ +//____________________________________________________________________________ +__global__ void AbsoluteSum(CudaDouble_t * result, + const CudaDouble_t * A, + int m, int n); + +/** Compute the squared sum of the \p m x \p n matrix \p A and write the result + * into \p result. This is used to compute L2 regularization. */ +//____________________________________________________________________________ +__global__ void SquaredSum(CudaDouble_t * result, + const CudaDouble_t * A, + int m, int n); + +/** Add the gradients of L1 regularizatoin applied to the \p m x \p n matrix + * in \p A to the \p m x \p n matrix in B. \p weightDecay is the weight assigned + * to the L1 regularization term. */ +//____________________________________________________________________________ +__global__ void AddL1RegularizationGradients(CudaDouble_t * B, + const CudaDouble_t * A, + CudaDouble_t weightDecay, + int m, int n); + +/** Add the gradients of L2 regularizatoin applied to the \p m x \p n matrix + * in \p A to the \p m x \p n matrix in B. \p weightDecay is the weight assigned + * to the L1 regularization term. 
*/ +//____________________________________________________________________________ +__global__ void AddL2RegularizationGradients(CudaDouble_t * A, + const CudaDouble_t * B, + CudaDouble_t weightDecay, + int m, int n); + +///@} + +///@{ +/** @name Dropout + */ +//____________________________________________________________________________ +__global__ void InitializeCurandStates(unsigned long long seed, + curandState_t *states); + +//____________________________________________________________________________ +__global__ void Dropout(CudaDouble_t *A, + int m, int n, + CudaDouble_t dropoutProbability, + curandState_t *states); + +///@} + +///@{ +/** @name Output Functions + */ +//____________________________________________________________________________ +__global__ void Sigmoid(CudaDouble_t *A, + const CudaDouble_t *B, + int m, int n); + +///@} + +///@{ +/** @name Miscellaneous + */ +///@{ + +//____________________________________________________________________________ +__global__ void ReduceMatrix(CudaDouble_t *result, + const CudaDouble_t *A, + int m, int n); + +//____________________________________________________________________________ +__global__ void SumColumns(CudaDouble_t *B, + const CudaDouble_t *A, + int m, int n); +///@} + +} // namespace CudaKernels +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Types.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Types.h new file mode 100644 index 0000000000000..22a38cc8f8465 --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Types.h @@ -0,0 +1,22 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 13/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////////// +// Contains type definitions for types used in the CUDA // +// implementation of the neural network low-level interface. // +/////////////////////////////////////////////////////////////// + +#ifndef TMVA_DNN_ARCHITECTURES_CUDA_TYPES +#define TMVA_DNN_ARCHITECTURES_CUDA_TYPES + +typedef double CudaDouble_t; + +#endif diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Reference.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Reference.h new file mode 100644 index 0000000000000..6b1772a02c93d --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Reference.h @@ -0,0 +1,250 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 20/06/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////////////////// +// Declaration of the TReference architecture, which provides a // +// reference implementation of the low-level interface for the DNN // +// implementation based on ROOT's TMatrixT matrix type. 
// +/////////////////////////////////////////////////////////////////////// + +#ifndef TMVA_DNN_ARCHITECTURES_REFERENCE +#define TMVA_DNN_ARCHITECTURES_REFERENCE + +#include "TMatrix.h" +#include "Reference/DataLoader.h" + +namespace TMVA +{ +namespace DNN +{ + +/*! The reference architecture class. +* +* Class template that contains the reference implementation of the low-level +* interface for the DNN implementation. The reference implementation uses the +* TMatrixT class template to represent matrices. +* +* \tparam Real_t The floating point type used to represent scalars. +*/ +template +class TReference +{ +public: + + using Scalar_t = Real_t; + using Matrix_t = TMatrixT; + template + using DataLoader_t = TReferenceDataLoader; + + //____________________________________________________________________________ + // + // Propagation + //____________________________________________________________________________ + + /** @name Forward Propagation + * Low-level functions required for the forward propagation of activations + * through the network. + */ + ///@{ + /** Matrix-multiply \p input with the transpose of \pweights and + * write the results into \p output. */ + static void MultiplyTranspose(TMatrixT &output, + const TMatrixT &input, + const TMatrixT &weights); + /** Add the vectors biases row-wise to the matrix output */ + static void AddRowWise(TMatrixT &output, + const TMatrixT &biases); + ///@} + + /** @name Backward Propagation + * Low-level functions required for the forward propagation of activations + * through the network. + */ + ///@{ + /** Perform the complete backward propagation step. If the provided + * \p activationGradientsBackward matrix is not empty, compute the + * gradients of the objective function with respect to the activations + * of the previous layer (backward direction). + * Also compute the weight and the bias gradients. Modifies the values + * in \p df and thus produces only a valid result, if it is applied the + * first time after the corresponding forward propagation has been per- + * formed. */ + static void Backward(TMatrixT & activationGradientsBackward, + TMatrixT & weightGradients, + TMatrixT & biasGradients, + TMatrixT & df, + const TMatrixT & activationGradients, + const TMatrixT & weights, + const TMatrixT & activationBackward); + /** Adds a the elements in matrix B scaled by c to the elements in + * the matrix A. This is required for the weight update in the gradient + * descent step.*/ + static void ScaleAdd(TMatrixT & A, + const TMatrixT & B, + Scalar_t beta = 1.0); + ///@} + + //____________________________________________________________________________ + // + // Activation Functions + //____________________________________________________________________________ + + /** @name Activation Functions + * For each activation function, the low-level interface contains two routines. + * One that applies the acitvation function to a matrix and one that evaluate + * the derivatives of the activation function at the elements of a given matrix + * and writes the results into the result matrix. 
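+ *
+ * A minimal usage sketch (matrix dimensions m, n are assumed):
+ * \code
+ * TMatrixT<Double_t> X(m, n);                  // linear activations
+ * TMatrixT<Double_t> dX(m, n);                 // receives the derivatives
+ * TReference<Double_t>::TanhDerivative(dX, X); // evaluate f' at the elements of X
+ * TReference<Double_t>::Tanh(X);               // apply tanh in place
+ * \endcode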
+ */ + ///@{ + static void Identity(TMatrixT & B); + static void IdentityDerivative(TMatrixT & B, + const TMatrixT & A); + + static void Relu(TMatrixT & B); + static void ReluDerivative(TMatrixT & B, + const TMatrixT & A); + + static void Sigmoid(TMatrixT & B); + static void SigmoidDerivative(TMatrixT & B, + const TMatrixT & A); + + static void Tanh(TMatrixT & B); + static void TanhDerivative(TMatrixT & B, + const TMatrixT & A); + + static void SymmetricRelu(TMatrixT & B); + static void SymmetricReluDerivative(TMatrixT & B, + const TMatrixT & A); + + static void SoftSign(TMatrixT & B); + static void SoftSignDerivative(TMatrixT & B, + const TMatrixT & A); + + static void Gauss(TMatrixT & B); + static void GaussDerivative(TMatrixT & B, + const TMatrixT & A); + + ///@} + + //____________________________________________________________________________ + // + // Loss Functions + //____________________________________________________________________________ + + /** @name Loss Functions + * Loss functions compute a scalar value given the \p output of the network + * for a given training input and the expected network prediction \p Y that + * quantifies the quality of the prediction. For each function also a routing + * that computes the gradients (suffixed by Gradients) must be provided for + * the starting of the backpropagation algorithm. + */ + ///@{ + + static Real_t MeanSquaredError(const TMatrixT &Y, + const TMatrixT &output); + static void MeanSquaredErrorGradients(TMatrixT & dY, + const TMatrixT &Y, + const TMatrixT &output); + + /** Sigmoid transformation is implicitly applied, thus \p output should + * hold the linear activations of the last layer in the net. */ + static Real_t CrossEntropy(const TMatrixT &Y, + const TMatrixT &output); + + static void CrossEntropyGradients(TMatrixT & dY, + const TMatrixT & Y, + const TMatrixT & output); + ///@} + + //____________________________________________________________________________ + // + // Output Functions + //____________________________________________________________________________ + + /** @name Output Functions + * Output functions transform the activations \p output of the + * output layer in the network to a valid prediction \p YHat for + * the desired usage of the network, e.g. the identity function + * for regression or the sigmoid transformation for two-class + * classification. + */ + ///@{ + static void Sigmoid(TMatrixT &YHat, + const TMatrixT & ); + ///@} + + //____________________________________________________________________________ + // + // Regularization + //____________________________________________________________________________ + + /** @name Regularization + * For each regularization type two functions are required, one named + * Regularization that evaluates the corresponding + * regularization functional for a given weight matrix and the + * AddRegularizationGradients, that adds the regularization + * component in the gradients to the provided matrix. 
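+ *
+ * For example (a sketch; whether the scalar term is weighted by
+ * weightDecay internally is not fixed here):
+ * \code
+ * Double_t loss = TReference<Double_t>::MeanSquaredError(Y, output)
+ *               + weightDecay * TReference<Double_t>::L2Regularization(W);
+ * TReference<Double_t>::AddL2RegularizationGradients(dW, W, weightDecay);
+ * \endcode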
+ */
+ ///@{
+
+   static Real_t L1Regularization(const TMatrixT<Real_t> & W);
+   static void AddL1RegularizationGradients(TMatrixT<Real_t> & A,
+                                            const TMatrixT<Real_t> & W,
+                                            Real_t weightDecay);
+
+   static Real_t L2Regularization(const TMatrixT<Real_t> & W);
+   static void AddL2RegularizationGradients(TMatrixT<Real_t> & A,
+                                            const TMatrixT<Real_t> & W,
+                                            Real_t weightDecay);
+   ///@}
+
+   //____________________________________________________________________________
+   //
+   // Initialization
+   //____________________________________________________________________________
+
+   /** @name Initialization
+    * For each initialization method, one function in the low-level interface
+    * is provided. The naming scheme is Initialize<Type>
for a given + * initialization method Type. + */ + ///@{ + + static void InitializeGauss(TMatrixT & A); + + static void InitializeUniform(TMatrixT & A); + + static void InitializeIdentity(TMatrixT & A); + + static void InitializeZero(TMatrixT & A); + + ///@} + + //____________________________________________________________________________ + // + // Dropout + //____________________________________________________________________________ + + /** @name Dropout + */ + ///@{ + + /** Apply dropout with activation probability \p p to the given + * matrix \p A and scale the result by reciprocal of \p p. */ + static void Dropout(TMatrixT & A, Real_t dropoutProbability); + + ///@} +}; + +} // namespace DNN +} // namespace TMVA + +#endif diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Reference/DataLoader.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Reference/DataLoader.h new file mode 100644 index 0000000000000..c2c25dace4108 --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Reference/DataLoader.h @@ -0,0 +1,164 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 12/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +//////////////////////////////////////////////////////////////////// +// This file contains the declaration of the Batch, BatchIterator // +// and DataLoader classes for the reference implementation. // +//////////////////////////////////////////////////////////////////// + +#include +#include "TMatrix.h" +#include "TMVA/Event.h" + +#ifndef TMVA_DNN_ARCHITECTURES_REFERENCE_DATALOADER +#define TMVA_DNN_ARCHITECTURES_REFERENCE_DATALOADER + +namespace TMVA +{ +namespace DNN +{ + +// Input Data Types +using MatrixInput_t = std::pair &, + const TMatrixT &>; +using TMVAInput_t = std::vector; + +/** Reference Batch Class + * + * The Batch class for the reference implementation. Provides the required + * GetInput() and GetOutput() method which returns the input and output + * corresponding to this batch in matrix form. + * + * \tparam Real_t The floating point type to represent floating point numbers. + */ +template +class TReferenceBatch +{ + using Matrix_t = TMatrixT; + + const Matrix_t & fInput; + const Matrix_t & fOutput; + +public: + + /** Create a batch containing the provided input and output in matrix + * form. + * + * \param input Reference to the input data in matrix form. + * \param output Reference to the expected output (truth) in matrix form. + */ + TReferenceBatch(Matrix_t & input, Matrix_t & output) + : fInput(input), fOutput(output) + { + // Nothing to do here. + } + + Matrix_t GetInput() {return fInput;} + Matrix_t GetOutput() {return fOutput;} + +}; + +/** Reference DataLoader Class + * + * DataLoader class for the reference implementation. The + * TReferenceDataLoader class takes a reference to the input data and + * prepares the batches for the training and evaluation of the neural + * network. + * + * \tparam Data_t Defines the input data type, i.e. the class that + * provides access to the training and test data. To instantiate this + * class with a given input data type, the CopyBatch function + * template must be specialized for this type. + * + * \tparam Real_t The floating point type used to represent scalars. 
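+ *
+ * A usage sketch (assuming a MatrixInput_t data set; variable names are
+ * illustrative):
+ * \code
+ * TReferenceDataLoader<MatrixInput_t, Double_t> loader(input, nSamples,
+ *                                                      batchSize, nInputs,
+ *                                                      nOutputs);
+ * for (auto & batch : loader) {
+ *    auto X = batch.GetInput();
+ *    auto Y = batch.GetOutput();
+ * }
+ * \endcode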
+ */ +template +class TReferenceDataLoader +{ + using Scalar_t = Real_t; + using Matrix_t = TMatrixT; + using BatchIterator_t = typename std::vector>::iterator; + using IndexIterator_t = typename std::vector::iterator; + + const Data_t & fInput; + + size_t fNSamples; + size_t fBatchSize; + size_t fNInputFeatures; + size_t fNOutputFeatures; + size_t fNBatches; + + std::vector> fInputMatrices; + std::vector> fOutputMatrices; + std::vector> fBatches; + std::vector fSampleIndices; + +public: + + /** Create data loader from the given input data \p input. The data set + * shoule consists of \p nsamples samples. Will create a vector of + * batches that are sample randomly from the input data. + * + * \param nsamples Number of samples in the training set. + * \param batchSize The batch size. + * \param ninputFeatures Number of input features + * \param noutputFeatures Number of output features. + */ + TReferenceDataLoader(const Data_t &input, + size_t nsamples, + size_t batchSize, + size_t ninputFeatures, + size_t noutputFeatures); + + /** This function copies a batch from the input data into the the matrices + * used as input for the neural network. + * + * \param input Reference to the matrix to write the input samples into. + * \param output Reference to the matrix to write the output labels into. + * \param begin Index iterator containing the sample indices which to include + * in the current batch. + * \param end Iterator to the index of the last sample to include in thie + * batch. + */ + void CopyBatch(Matrix_t &inputMatrix, + Matrix_t &outputMatrix, + const Data_t & input, + IndexIterator_t begin, + IndexIterator_t end); + + /** Return iterator over batches in one epoch the data set. Shuffles + * the training sample before returning the iterator. */ + BatchIterator_t begin(); + /** Iterator pointing to the end of the epoch, required for C++ range + * loops */ + BatchIterator_t end(); + + size_t GetBatchSize() const {return fBatchSize;} + size_t GetNBatchesInEpoch() const {return fNBatches;} +}; + +template <> +void TReferenceDataLoader::CopyBatch(Matrix_t &inputMatrix, + Matrix_t &outputMatrix, + const MatrixInput_t & input, + IndexIterator_t begin, + IndexIterator_t end); + +template <> +void TReferenceDataLoader::CopyBatch(Matrix_t &inputMatrix, + Matrix_t &outputMatrix, + const TMVAInput_t & input, + IndexIterator_t begin, + IndexIterator_t end); +} // namespace TMVA +} // namespace DNN + +#endif diff --git a/tmva/tmva/inc/TMVA/DNN/DataLoader.h b/tmva/tmva/inc/TMVA/DNN/DataLoader.h new file mode 100644 index 0000000000000..7ff3d61591a93 --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/DataLoader.h @@ -0,0 +1,222 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 08/08/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +//////////////////////////////////////////////////////// +// Generic data loader for neural network input data. 
+////////////////////////////////////////////////////////
+
+#ifndef TMVA_DNN_DATALOADER
+#define TMVA_DNN_DATALOADER
+
+#include "TMatrix.h"
+#include <vector>
+
+#include "TMVA/Event.h"
+
+namespace TMVA {
+namespace DNN {
+
+//
+// Input Data Types
+//______________________________________________________________________________
+using MatrixInput_t = std::pair<const TMatrixT<Double_t> &,
+                                const TMatrixT<Double_t> &>;
+using TMVAInput_t   = std::vector<Event *>;
+
+//
+// TBatch Class.
+//______________________________________________________________________________
+template <typename Architecture_t>
+class TBatch
+{
+private:
+
+   using Matrix_t = typename Architecture_t::Matrix_t;
+
+   Matrix_t fInputMatrix;
+   Matrix_t fOutputMatrix;
+
+public:
+
+   TBatch(Matrix_t &&, Matrix_t &&);
+   TBatch(const TBatch  &) = default;
+   TBatch(      TBatch &&) = default;
+   TBatch & operator=(const TBatch  &) = default;
+   TBatch & operator=(      TBatch &&) = default;
+
+   Matrix_t & GetInput()  {return fInputMatrix;}
+   Matrix_t & GetOutput() {return fOutputMatrix;}
+};
+
+//
+// TBatchIterator Class.
+//______________________________________________________________________________
+
+template <typename Data_t, typename Architecture_t> class TDataLoader;
+
+template <typename Data_t, typename Architecture_t>
+class TBatchIterator
+{
+private:
+
+   TDataLoader<Data_t, Architecture_t> & fDataLoader;
+   size_t fBatchIndex;
+
+public:
+
+   TBatchIterator(TDataLoader<Data_t, Architecture_t> & dataLoader, size_t index = 0)
+   : fDataLoader(dataLoader), fBatchIndex(index)
+   {
+      // Nothing to do here.
+   }
+
+   TBatch<Architecture_t> operator*() {return fDataLoader.GetBatch();}
+   TBatchIterator operator++() {fBatchIndex++; return *this;}
+   bool operator!=(const TBatchIterator & other) {
+      return fBatchIndex != other.fBatchIndex;
+   }
+};
+
+//
+// TDataLoader Class.
+//______________________________________________________________________________
+template <typename Data_t, typename Architecture_t>
+class TDataLoader
+{
+private:
+
+   using HostBuffer_t    = typename Architecture_t::HostBuffer_t;
+   using DeviceBuffer_t  = typename Architecture_t::DeviceBuffer_t;
+   using IndexIterator_t = typename std::vector<size_t>::iterator;
+   using Matrix_t        = typename Architecture_t::Matrix_t;
+   using BatchIterator_t = TBatchIterator<Data_t, Architecture_t>;
+
+   const Data_t & fData;
+
+   size_t fNSamples;
+   size_t fBatchSize;
+   size_t fNInputFeatures;
+   size_t fNOutputFeatures;
+   size_t fBatchIndex;
+
+   size_t fNStreams;
+   std::vector<DeviceBuffer_t> fDeviceBuffers;
+   std::vector<HostBuffer_t>   fHostBuffers;
+
+   std::vector<size_t> fSampleIndices;
+
+public:
+
+   TDataLoader(const Data_t & data, size_t nSamples, size_t batchSize,
+               size_t nInputFeatures, size_t nOutputFeatures, size_t nStreams = 4);
+   TDataLoader(const TDataLoader  &) = default;
+   TDataLoader(      TDataLoader &&) = default;
+   TDataLoader & operator=(const TDataLoader  &) = default;
+   TDataLoader & operator=(      TDataLoader &&) = default;
+
+   /** Copy input matrix into the given host buffer. Function to be specialized by
+    * the architecture-specific backend. */
+   void CopyInput(HostBuffer_t &buffer, IndexIterator_t begin, size_t batchSize);
+   /** Copy output matrix into the given host buffer. Function to be specialized
+    * by the architecture-specific backend. */
+   void CopyOutput(HostBuffer_t &buffer, IndexIterator_t begin, size_t batchSize);
+
+   BatchIterator_t begin() {return TBatchIterator<Data_t, Architecture_t>(*this);}
+   BatchIterator_t end()
+   {
+      return TBatchIterator<Data_t, Architecture_t>(*this, (fNSamples / fBatchSize) + 1);
+   }
+
+   void Shuffle();
+   TBatch<Architecture_t> GetBatch();
+
+};
+
+//
+// TBatch Class.
+//______________________________________________________________________________
+template <typename Architecture_t>
+TBatch<Architecture_t>::TBatch(Matrix_t && inputMatrix, Matrix_t && outputMatrix)
+    : fInputMatrix(std::move(inputMatrix)), fOutputMatrix(std::move(outputMatrix))
+{
+   // Nothing to do here.
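+   // (The member matrices are initialized by moving the rvalue arguments,
+   // where the architecture's matrix type supports move construction.)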
+}
+
+//
+// TDataLoader Class.
+//______________________________________________________________________________
+template <typename Data_t, typename Architecture_t>
+TDataLoader<Data_t, Architecture_t>::TDataLoader(
+    const Data_t & data, size_t nSamples, size_t batchSize,
+    size_t nInputFeatures, size_t nOutputFeatures, size_t nStreams)
+    : fData(data), fNSamples(nSamples), fBatchSize(batchSize),
+      fNInputFeatures(nInputFeatures), fNOutputFeatures(nOutputFeatures),
+      fBatchIndex(0), fNStreams(nStreams), fDeviceBuffers(), fHostBuffers(),
+      fSampleIndices()
+{
+   size_t inputMatrixSize  = fBatchSize * fNInputFeatures;
+   size_t outputMatrixSize = fBatchSize * fNOutputFeatures;
+
+   for (size_t i = 0; i < fNStreams; i++)
+   {
+      fHostBuffers.push_back(HostBuffer_t(inputMatrixSize + outputMatrixSize));
+      fDeviceBuffers.push_back(DeviceBuffer_t(inputMatrixSize + outputMatrixSize));
+   }
+
+   fSampleIndices.reserve(fNSamples);
+   for (size_t i = 0; i < fNSamples; i++) {
+      fSampleIndices.push_back(i);
+   }
+}
+
+template <typename Data_t, typename Architecture_t>
+TBatch<Architecture_t> TDataLoader<Data_t, Architecture_t>::GetBatch()
+{
+   fBatchIndex %= (fNSamples / fBatchSize); // Cycle through samples.
+
+   size_t inputMatrixSize  = fBatchSize * fNInputFeatures;
+   size_t outputMatrixSize = fBatchSize * fNOutputFeatures;
+
+   size_t streamIndex = fBatchIndex % fNStreams;
+   HostBuffer_t   & hostBuffer   = fHostBuffers[streamIndex];
+   DeviceBuffer_t & deviceBuffer = fDeviceBuffers[streamIndex];
+
+   HostBuffer_t inputHostBuffer  = hostBuffer.GetSubBuffer(0, inputMatrixSize);
+   HostBuffer_t outputHostBuffer = hostBuffer.GetSubBuffer(inputMatrixSize,
+                                                           outputMatrixSize);
+
+   DeviceBuffer_t inputDeviceBuffer  = deviceBuffer.GetSubBuffer(0, inputMatrixSize);
+   DeviceBuffer_t outputDeviceBuffer = deviceBuffer.GetSubBuffer(inputMatrixSize,
+                                                                 outputMatrixSize);
+   size_t sampleIndex = fBatchIndex * fBatchSize;
+   IndexIterator_t sampleIndexIterator = fSampleIndices.begin() + sampleIndex;
+
+   CopyInput(inputHostBuffer, sampleIndexIterator, fBatchSize);
+   CopyOutput(outputHostBuffer, sampleIndexIterator, fBatchSize);
+
+   deviceBuffer.CopyFrom(hostBuffer);
+   Matrix_t  inputMatrix(inputDeviceBuffer,  fBatchSize, fNInputFeatures);
+   Matrix_t outputMatrix(outputDeviceBuffer, fBatchSize, fNOutputFeatures);
+
+   fBatchIndex++;
+   return TBatch<Architecture_t>(std::move(inputMatrix), std::move(outputMatrix));
+}
+
+template <typename Data_t, typename Architecture_t>
+void TDataLoader<Data_t, Architecture_t>::Shuffle()
+{
+   std::random_shuffle(fSampleIndices.begin(), fSampleIndices.end());
+}
+
+}
+}
+#endif
diff --git a/tmva/tmva/inc/TMVA/DNN/Functions.h b/tmva/tmva/inc/TMVA/DNN/Functions.h
new file mode 100644
index 0000000000000..72ac34f1f69d7
--- /dev/null
+++ b/tmva/tmva/inc/TMVA/DNN/Functions.h
@@ -0,0 +1,265 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 20/06/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+/////////////////////////////////////////////////////////////////////
+// Contains function enums for activation and output functions, as //
+// well as generic evaluation functions that delegate the call to  //
+// the corresponding evaluation kernel.                            //
+/////////////////////////////////////////////////////////////////////
+
+#ifndef TMVA_DNN_FUNCTIONS
+#define TMVA_DNN_FUNCTIONS
+
+namespace TMVA
+{
+namespace DNN
+{
+//______________________________________________________________________________
+//
+// Enum Definitions
+//______________________________________________________________________________
+
+/*! Enum that represents layer activation functions. */
+enum class EActivationFunction
+{
+   IDENTITY = 0,
+   RELU     = 1,
+   SIGMOID  = 2,
+   TANH     = 3,
+   SYMMRELU = 4,
+   SOFTSIGN = 5,
+   GAUSS    = 6
+};
+
+/*! Enum that represents output functions */
+enum class EOutputFunction
+{
+   IDENTITY = 'I',
+   SIGMOID  = 'S'
+};
+
+/*! Enum that represents objective functions for the net, i.e. functions
+* that take the output from the last layer in the net together with the
+* truth labels and return the value of the objective function that is to be
+* minimized in the training process. */
+enum class ELossFunction
+{
+   CROSSENTROPY     = 'C',
+   MEANSQUAREDERROR = 'R'
+};
+
+/*! Enum representing the regularization type applied for a given layer */
+enum class ERegularization
+{
+   NONE = '0',
+   L1   = '1',
+   L2   = '2'
+};
+
+/* Enum representing the initialization method used for this layer. */
+enum class EInitialization {
+   GAUSS    = 'G',
+   UNIFORM  = 'U',
+   IDENTITY = 'I',
+   ZERO     = 'Z'
+};
+
+//______________________________________________________________________________
+//
+// Activation Functions
+//______________________________________________________________________________
+
+/*! Apply the given activation function to each value in the given
+* matrix A. */
+template<typename Architecture_t>
+inline void evaluate(typename Architecture_t::Matrix_t &A,
+                     EActivationFunction f)
+{
+   switch(f)
+   {
+   case EActivationFunction::IDENTITY : break;
+   case EActivationFunction::RELU     : Architecture_t::Relu(A);
+      break;
+   case EActivationFunction::SIGMOID  : Architecture_t::Sigmoid(A);
+      break;
+   case EActivationFunction::TANH     : Architecture_t::Tanh(A);
+      break;
+   case EActivationFunction::SYMMRELU : Architecture_t::SymmetricRelu(A);
+      break;
+   case EActivationFunction::SOFTSIGN : Architecture_t::SoftSign(A);
+      break;
+   case EActivationFunction::GAUSS    : Architecture_t::Gauss(A);
+      break;
+   }
+}
+
+
+/*! Compute the first partial derivative of the activation function for
+* the values given in matrix A and write the results into B. */
+//______________________________________________________________________________
+template<typename Architecture_t>
+inline void evaluateDerivative(typename Architecture_t::Matrix_t & B,
+                               EActivationFunction f,
+                               const typename Architecture_t::Matrix_t & A)
+{
+   switch(f)
+   {
+   case EActivationFunction::IDENTITY : Architecture_t::IdentityDerivative(B, A);
+      break;
+   case EActivationFunction::RELU     : Architecture_t::ReluDerivative(B, A);
+      break;
+   case EActivationFunction::SIGMOID  : Architecture_t::SigmoidDerivative(B, A);
+      break;
+   case EActivationFunction::TANH     : Architecture_t::TanhDerivative(B, A);
+      break;
+   case EActivationFunction::SYMMRELU : Architecture_t::SymmetricReluDerivative(B, A);
+      break;
+   case EActivationFunction::SOFTSIGN : Architecture_t::SoftSignDerivative(B, A);
+      break;
+   case EActivationFunction::GAUSS    : Architecture_t::GaussDerivative(B, A);
+      break;
+   }
+}
+
+//______________________________________________________________________________
+//
+// Output Functions
+//______________________________________________________________________________
+
+/*! Apply the given output function to each value in the given
+* matrix A.
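+* For example (illustrative): evaluate<Architecture_t>(A, EOutputFunction::SIGMOID, X)
+* computes A = sigmoid(X) elementwise, leaving X unchanged.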
+*/
+template<typename Architecture_t>
+inline void evaluate(typename Architecture_t::Matrix_t &A,
+                     EOutputFunction f,
+                     const typename Architecture_t::Matrix_t &X)
+{
+   switch(f)
+   {
+   case EOutputFunction::IDENTITY : break;
+   case EOutputFunction::SIGMOID  : Architecture_t::Sigmoid(A, X);
+      break;
+   }
+}
+
+//______________________________________________________________________________
+//
+// Loss Functions
+//______________________________________________________________________________
+
+/*! Compute the value of the objective function f for given activations
+* of the output layer and the truth Y. */
+template<typename Architecture_t>
+inline auto evaluate(ELossFunction f,
+                     const typename Architecture_t::Matrix_t & Y,
+                     const typename Architecture_t::Matrix_t & output)
+-> decltype(Architecture_t::CrossEntropy(Y,output))
+{
+   switch(f)
+   {
+   case ELossFunction::CROSSENTROPY :
+      return Architecture_t::CrossEntropy(Y, output);
+   case ELossFunction::MEANSQUAREDERROR :
+      return Architecture_t::MeanSquaredError(Y, output);
+   }
+   return 0.0;
+}
+
+/*! Compute the gradient of the given loss function f for the activations
+* \p output of the output layer and the truth \p Y, and write the results
+* into dY. */
+//______________________________________________________________________________
+template<typename Architecture_t>
+inline void evaluateGradients(typename Architecture_t::Matrix_t & dY,
+                              ELossFunction f,
+                              const typename Architecture_t::Matrix_t &Y,
+                              const typename Architecture_t::Matrix_t &output)
+{
+   switch(f)
+   {
+   case ELossFunction::CROSSENTROPY :
+      Architecture_t::CrossEntropyGradients(dY, Y, output);
+      break;
+   case ELossFunction::MEANSQUAREDERROR :
+      Architecture_t::MeanSquaredErrorGradients(dY, Y, output);
+      break;
+   }
+}
+
+
+//______________________________________________________________________________
+//
+// Regularization
+//______________________________________________________________________________
+
+/*! Evaluate the regularization functional for a given weight matrix. */
+template<typename Architecture_t>
+inline auto regularization(const typename Architecture_t::Matrix_t &A,
+                           ERegularization R)
+-> decltype(Architecture_t::L1Regularization(A))
+{
+   switch(R)
+   {
+   case ERegularization::NONE :
+      return 0.0;
+   case ERegularization::L1 :
+      return Architecture_t::L1Regularization(A);
+   case ERegularization::L2 :
+      return Architecture_t::L2Regularization(A);
+   }
+   return 0.0;
+}
+
+/*! Add the regularization gradient corresponding to weight matrix W to
+* the matrix A.
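+* For L2 regularization, for instance, this amounts to A += 2 * weightDecay * W
+* (up to the factor convention used by the backend); for L1 it adds
+* weightDecay * sign(W).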
+*/
+//______________________________________________________________________________
+template<typename Architecture_t>
+inline void addRegularizationGradients(typename Architecture_t::Matrix_t &A,
+                                       const typename Architecture_t::Matrix_t &W,
+                                       typename Architecture_t::Scalar_t weightDecay,
+                                       ERegularization R)
+{
+   switch(R)
+   {
+   case ERegularization::NONE :
+      break;
+   case ERegularization::L1 :
+      Architecture_t::AddL1RegularizationGradients(A, W, weightDecay);
+      break;
+   case ERegularization::L2 :
+      Architecture_t::AddL2RegularizationGradients(A, W, weightDecay);
+      break;
+   }
+}
+
+//______________________________________________________________________________
+//
+// Initialization
+//______________________________________________________________________________
+
+template<typename Architecture_t>
+inline void initialize(typename Architecture_t::Matrix_t & A,
+                       EInitialization m)
+{
+   switch(m) {
+   case EInitialization::GAUSS    : Architecture_t::InitializeGauss(A);
+      break;
+   case EInitialization::UNIFORM  : Architecture_t::InitializeUniform(A);
+      break;
+   case EInitialization::IDENTITY : Architecture_t::InitializeIdentity(A);
+      break;
+   case EInitialization::ZERO     : Architecture_t::InitializeZero(A);
+      break;
+   }
+}
+
+} // namespace DNN
+} // namespace TMVA
+
+#endif
diff --git a/tmva/tmva/inc/TMVA/DNN/Layer.h b/tmva/tmva/inc/TMVA/DNN/Layer.h
new file mode 100644
index 0000000000000..da53fb37df246
--- /dev/null
+++ b/tmva/tmva/inc/TMVA/DNN/Layer.h
@@ -0,0 +1,399 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 20/06/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////////
+// Contains Layer and SharedLayer classes that represent layers in  //
+// neural networks.                                                 //
+//////////////////////////////////////////////////////////////////////
+
+#ifndef TMVA_DNN_LAYER
+#define TMVA_DNN_LAYER
+
+#include <iostream>
+
+#include "TMatrix.h"
+#include "Functions.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//______________________________________________________________________________
+//
+// The Layer Class
+//______________________________________________________________________________
+
+/** \class TLayer
+
+    Generic layer class.
+
+    This generic layer class represents a layer of a neural network with
+    a given width n and activation function f. The activations of each
+    layer are given by \f$\mathbf{u} = f(\mathbf{W}\mathbf{x} +
+    \boldsymbol{\theta})\f$.
+
+    In addition to the weight and bias matrices, each layer allocates memory
+    for its activations and the corresponding first partial derivatives of
+    the activation function, as well as the gradients of the weights and biases.
+
+    The layer provides member functions for the forward propagation of
+    activations through the given layer.
+*/
+template<typename Architecture_t>
+   class TLayer
+{
+
+public:
+   using Scalar_t = typename Architecture_t::Scalar_t;
+   using Matrix_t = typename Architecture_t::Matrix_t;
+
+private:
+
+   size_t fBatchSize;  ///< Batch size used for training and evaluation.
+   size_t fInputWidth; ///< Number of neurons of the previous layer.
+   size_t fWidth;      ///< Number of neurons of this layer.
+
+   Scalar_t fDropoutProbability; ///< Probability that an input is active.
+
+   Matrix_t fWeights;             ///< The weights of this layer.
+   Matrix_t fBiases;              ///< The bias values of this layer.
+   Matrix_t fOutput;              ///< Activations of this layer.
+   Matrix_t fDerivatives;         ///< First derivatives of the activations of this layer.
+   Matrix_t fWeightGradients;     ///< Gradients w.r.t. the weights of this layer.
+   Matrix_t fBiasGradients;       ///< Gradients w.r.t. the bias values of this layer.
+   Matrix_t fActivationGradients; ///< Gradients w.r.t. the activations of this layer.
+
+   EActivationFunction fF; ///< Activation function of the layer.
+
+public:
+
+   TLayer(size_t batchSize,
+          size_t inputWidth,
+          size_t width,
+          EActivationFunction f,
+          Scalar_t dropoutProbability);
+   TLayer(const TLayer &);
+
+   /*! Initialize fWeights according to the given initialization
+    *  method. */
+   void Initialize(EInitialization m);
+   /*! Compute activation of the layer for the given input. The input
+    * must be in matrix form with the different rows corresponding to
+    * different events in the batch. Computes activations as well as
+    * the first partial derivative of the activation function at those
+    * activations. */
+   void inline Forward(Matrix_t & input, bool applyDropout = false);
+   /*! Compute weight, bias and activation gradients. Uses the precomputed
+    *  first partial derivatives of the activation function computed during
+    *  forward propagation and modifies them. Must only be called directly
+    *  after the corresponding call to Forward(...). */
+   void inline Backward(Matrix_t & gradients_backward,
+                        const Matrix_t & activations_backward,
+                        ERegularization r,
+                        Scalar_t weightDecay);
+
+   void Print() const;
+
+   size_t GetBatchSize() const           {return fBatchSize;}
+   size_t GetInputWidth() const          {return fInputWidth;}
+   size_t GetWidth() const               {return fWidth;}
+   Scalar_t GetDropoutProbability() const {return fDropoutProbability;}
+
+   void SetDropoutProbability(Scalar_t p) {fDropoutProbability = p;}
+
+   EActivationFunction GetActivationFunction() const {return fF;}
+
+   Matrix_t       & GetOutput()              {return fOutput;}
+   const Matrix_t & GetOutput() const        {return fOutput;}
+   Matrix_t       & GetWeights()             {return fWeights;}
+   const Matrix_t & GetWeights() const       {return fWeights;}
+   Matrix_t       & GetBiases()              {return fBiases;}
+   const Matrix_t & GetBiases() const        {return fBiases;}
+   Matrix_t       & GetActivationGradients()       {return fActivationGradients;}
+   const Matrix_t & GetActivationGradients() const {return fActivationGradients;}
+   Matrix_t       & GetBiasGradients()       {return fBiasGradients;}
+   const Matrix_t & GetBiasGradients() const {return fBiasGradients;}
+   Matrix_t       & GetWeightGradients()       {return fWeightGradients;}
+   const Matrix_t & GetWeightGradients() const {return fWeightGradients;}
+
+};
+
+//______________________________________________________________________________
+//
+// The Shared Layer Class
+//______________________________________________________________________________
+
+/** \class TSharedLayer
+
+    Layer class with shared weight and bias matrices.
+
+    Like the Layer class, except that the weight matrices are shared between
+    different instances of the net, which can be used to implement
+    multithreading 'Hogwild' style.
+*/
+
+template<typename Architecture_t>
+class TSharedLayer
+{
+
+public:
+
+   using Scalar_t = typename Architecture_t::Scalar_t;
+   using Matrix_t = typename Architecture_t::Matrix_t;
+
+private:
+
+   size_t fBatchSize;  ///< Batch size used for training and evaluation.
+   size_t fInputWidth; ///< Number of neurons of the previous layer.
+   size_t fWidth;      ///< Number of neurons of this layer.
+
+   Scalar_t fDropoutProbability; ///< Probability that an input is active.
+
+   Matrix_t & fWeights;           ///< Reference to the weight matrix of this layer.
+   Matrix_t & fBiases;            ///< Reference to the bias vectors of this layer.
+   Matrix_t fOutput;              ///< Activations of this layer.
+   Matrix_t fDerivatives;         ///< First derivatives of the activations of this layer.
+   Matrix_t fWeightGradients;     ///< Gradients w.r.t. the weights of this layer.
+   Matrix_t fBiasGradients;       ///< Gradients w.r.t. the bias values of this layer.
+   Matrix_t fActivationGradients; ///< Gradients w.r.t. the activations of this layer.
+
+   EActivationFunction fF; ///< Activation function of the layer.
+
+public:
+
+   TSharedLayer(size_t batchSize,
+                TLayer<Architecture_t> & layer);
+   TSharedLayer(const TSharedLayer & layer);
+
+   /*! Compute activation of the layer for the given input. The input
+    * must be in matrix form with the different rows corresponding to
+    * different events in the batch. Computes activations as well as
+    * the first partial derivative of the activation function at those
+    * activations. */
+   void inline Forward(Matrix_t & input, bool applyDropout = false);
+   /*! Compute weight, bias and activation gradients. Uses the precomputed
+    *  first partial derivatives of the activation function computed during
+    *  forward propagation and modifies them. Must only be called directly
+    *  after the corresponding call to Forward(...). */
+   void inline Backward(Matrix_t & gradients_backward,
+                        const Matrix_t & activations_backward,
+                        ERegularization r,
+                        Scalar_t weightDecay);
+
+   void Print() const;
+
+   size_t GetBatchSize() const            {return fBatchSize;}
+   size_t GetInputWidth() const           {return fInputWidth;}
+   size_t GetWidth() const                {return fWidth;}
+   Scalar_t GetDropoutProbability() const {return fDropoutProbability;}
+
+   void SetDropoutProbability(Scalar_t p) {fDropoutProbability = p;}
+
+   EActivationFunction GetActivationFunction() const {return fF;}
+
+   Matrix_t       & GetOutput()        {return fOutput;}
+   const Matrix_t & GetOutput() const  {return fOutput;}
+   Matrix_t       & GetWeights() const {return fWeights;}
+   Matrix_t       & GetBiases()        {return fBiases;}
+   const Matrix_t & GetBiases() const  {return fBiases;}
+   Matrix_t       & GetActivationGradients()       {return fActivationGradients;}
+   const Matrix_t & GetActivationGradients() const {return fActivationGradients;}
+   Matrix_t       & GetBiasGradients()       {return fBiasGradients;}
+   const Matrix_t & GetBiasGradients() const {return fBiasGradients;}
+   Matrix_t       & GetWeightGradients()       {return fWeightGradients;}
+   const Matrix_t & GetWeightGradients() const {return fWeightGradients;}
+
+};
+
+//______________________________________________________________________________
+//
+// The Layer Class - Implementation
+//______________________________________________________________________________
+
+template<typename Architecture_t>
+   TLayer<Architecture_t>::TLayer(size_t batchSize,
+                                  size_t inputWidth,
+                                  size_t width,
+                                  EActivationFunction f,
+                                  Scalar_t dropoutProbability)
+   : fBatchSize(batchSize), fInputWidth(inputWidth), fWidth(width),
+   fDropoutProbability(dropoutProbability), fWeights(width, fInputWidth),
+   fBiases(width, 1), fOutput(fBatchSize, width), fDerivatives(fBatchSize, width),
+   fWeightGradients(width, fInputWidth), fBiasGradients(width, 1),
+   fActivationGradients(fBatchSize, width), fF(f)
+{
+   // Nothing to do here.
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+TLayer<Architecture_t>::TLayer(const TLayer &layer)
+   : fBatchSize(layer.fBatchSize),
+   fInputWidth(layer.fInputWidth), fWidth(layer.fWidth),
+   fDropoutProbability(layer.fDropoutProbability),
+   fWeights(layer.fWidth, layer.fInputWidth), fBiases(layer.fWidth, 1),
+   fOutput(layer.fBatchSize, layer.fWidth),
+   fDerivatives(layer.fBatchSize, layer.fWidth),
+   fWeightGradients(layer.fWidth, layer.fInputWidth),
+   fBiasGradients(layer.fWidth, 1),
+   fActivationGradients(layer.fBatchSize, layer.fWidth),
+   fF(layer.fF)
+{
+   // Nothing to do here.
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+auto TLayer<Architecture_t>::Initialize(EInitialization m)
+-> void
+{
+   initialize<Architecture_t>(fWeights, m);
+   initialize<Architecture_t>(fBiases, EInitialization::ZERO);
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+auto inline TLayer<Architecture_t>::Forward(Matrix_t & input,
+                                            bool applyDropout)
+-> void
+{
+   if (applyDropout && (fDropoutProbability != 1.0)) {
+      Architecture_t::Dropout(input, fDropoutProbability);
+   }
+   Architecture_t::MultiplyTranspose(fOutput, input, fWeights);
+   Architecture_t::AddRowWise(fOutput, fBiases);
+   evaluateDerivative<Architecture_t>(fDerivatives, fF, fOutput);
+   evaluate<Architecture_t>(fOutput, fF);
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+auto TLayer<Architecture_t>::Backward(Matrix_t & gradients_backward,
+                                      const Matrix_t & activations_backward,
+                                      ERegularization r,
+                                      Scalar_t weightDecay)
+-> void
+{
+   Architecture_t::Backward(gradients_backward,
+                            fWeightGradients,
+                            fBiasGradients,
+                            fDerivatives,
+                            fActivationGradients,
+                            fWeights,
+                            activations_backward);
+   addRegularizationGradients<Architecture_t>(fWeightGradients,
+                                              fWeights,
+                                              weightDecay, r);
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+   void TLayer<Architecture_t>::Print() const
+{
+   std::cout << "Width: " << fWeights.GetNrows();
+   std::cout << ", activation function: ";
+   std::cout << static_cast<int>(fF) << std::endl;
+   std::cout << "Weights: " << std::endl;
+   ((TMatrixT<Scalar_t>) fWeights).Print();
+   std::cout << "Biases: " << std::endl;
+   ((TMatrixT<Scalar_t>) fBiases).Print();
+   std::cout << "Output: " << std::endl;
+   ((TMatrixT<Scalar_t>) fOutput).Print();
+   std::cout << "Weight gradients: " << std::endl;
+   ((TMatrixT<Scalar_t>) fWeightGradients).Print();
+   std::cout << "Bias gradients: " << std::endl;
+   ((TMatrixT<Scalar_t>) fBiasGradients).Print();
+}
+
+//______________________________________________________________________________
+//
+// The Shared Layer Class - Implementation
+//______________________________________________________________________________
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+TSharedLayer<Architecture_t>::TSharedLayer(size_t batchSize,
+                                           TLayer<Architecture_t> &layer)
+: fBatchSize(batchSize),
+fInputWidth(layer.GetInputWidth()), fWidth(layer.GetWidth()),
+fDropoutProbability(layer.GetDropoutProbability()),
+fWeights(layer.GetWeights()), fBiases(layer.GetBiases()),
+fOutput(fBatchSize, fWidth), fDerivatives(fBatchSize, fWidth),
+fWeightGradients(fWidth, fInputWidth), fBiasGradients(fWidth, 1),
+fActivationGradients(fBatchSize, fWidth), fF(layer.GetActivationFunction())
+{
+   // Nothing to do here.
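+   // (The weight and bias matrices are shared by reference with the given
+   // layer; only activations and gradients are allocated per instance.)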
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+TSharedLayer<Architecture_t>::TSharedLayer(const TSharedLayer &layer)
+   : fBatchSize(layer.fBatchSize),
+   fInputWidth(layer.GetInputWidth()), fWidth(layer.GetWidth()),
+   fDropoutProbability(layer.fDropoutProbability),
+   fWeights(layer.fWeights), fBiases(layer.fBiases),
+   fOutput(layer.fBatchSize, fWidth), fDerivatives(layer.fBatchSize, fWidth),
+   fWeightGradients(fWidth, fInputWidth), fBiasGradients(fWidth, 1),
+   fActivationGradients(layer.fBatchSize, fWidth),
+   fF(layer.fF)
+{
+   // Nothing to do here.
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+auto inline TSharedLayer<Architecture_t>::Forward(Matrix_t & input,
+                                                  bool applyDropout)
+-> void
+{
+   if (applyDropout && (fDropoutProbability != 1.0)) {
+      Architecture_t::Dropout(input, fDropoutProbability);
+   }
+   Architecture_t::MultiplyTranspose(fOutput, input, fWeights);
+   Architecture_t::AddRowWise(fOutput, fBiases);
+   evaluateDerivative<Architecture_t>(fDerivatives, fF, fOutput);
+   evaluate<Architecture_t>(fOutput, fF);
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+auto inline TSharedLayer<Architecture_t>::Backward(Matrix_t & gradients_backward,
+                                                   const Matrix_t & activations_backward,
+                                                   ERegularization r,
+                                                   Scalar_t weightDecay)
+-> void
+{
+   Architecture_t::Backward(gradients_backward,
+                            fWeightGradients,
+                            fBiasGradients,
+                            fDerivatives,
+                            fActivationGradients,
+                            fWeights,
+                            activations_backward);
+   addRegularizationGradients<Architecture_t>(fWeightGradients,
+                                              fWeights,
+                                              weightDecay, r);
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+void TSharedLayer<Architecture_t>::Print() const
+{
+   std::cout << "Width: " << fWeights.GetNrows();
+   std::cout << ", activation function: ";
+   std::cout << static_cast<int>(fF) << std::endl;
+}
+
+} // namespace DNN
+} // namespace TMVA
+
+#endif
diff --git a/tmva/tmva/inc/TMVA/DNN/Minimizers.h b/tmva/tmva/inc/TMVA/DNN/Minimizers.h
new file mode 100644
index 0000000000000..e54dd3a78a483
--- /dev/null
+++ b/tmva/tmva/inc/TMVA/DNN/Minimizers.h
@@ -0,0 +1,407 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh 21/06/16
+
+/*************************************************************************
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+#ifndef TMVA_DNN_MINIMIZERS
+#define TMVA_DNN_MINIMIZERS
+
+#include "DataLoader.h"
+#include "Functions.h"
+#include <chrono>
+
+namespace TMVA {
+namespace DNN {
+
+//______________________________________________________________________________
+//
+// Generic Gradient Descent Class
+//______________________________________________________________________________
+//
+
+/** \class TGradientDescent
+
+    Generic implementation of gradient descent minimization.
+
+    The TGradientDescent class implements an architecture- and input-data-
+    independent implementation of the gradient descent minimization algorithm.
+
+    The Train(...) method trains a given neural network using the provided training
+    and test (validation) data. The interface between input data and the matrix
+    representation required by the net is given by the prepare_batches(...) and
+    prepare_test_data(...) methods as well as the TBatch class of the
+    architecture-specific back end.
+
+    The prepare_batches(...) method is expected to generate an iterable of batches.
+    On each of these batches, the Step(...) routine of the minimizer is called,
+    which for the gradient descent method just adds the corresponding gradients
+    scaled by \f$-\alpha\f$ to the weights and biases of each layer. Here
+    \f$\alpha\f$ is the learning rate of the gradient descent method.
+
+    The prepare_test_data(...) routine should return a batch representing
+    the test data, which is used to evaluate the performance of the net
+    every testInterval steps.
+
+    \tparam Architecture_t Type representing which implementation of the low-level
+    interface to use.
+ */
+template<typename Architecture_t>
+class TGradientDescent
+{
+public:
+   using Scalar_t = typename Architecture_t::Scalar_t;
+   using Matrix_t = typename Architecture_t::Matrix_t;
+
+private:
+   size_t fBatchSize;        ///< Batch size to use for the training.
+   size_t fStepCount;        ///< Number of steps performed in the current training session.
+   size_t fConvergenceSteps; ///< Number of training epochs without considerable decrease in the test error for convergence.
+   size_t fConvergenceCount; ///< Current number of training epochs without considerable decrease in the test error.
+   size_t fTestInterval;     ///< Interval for the computation of the test error.
+   Scalar_t fTrainingError;  ///< Holds the most recently computed training loss.
+   Scalar_t fTestError;      ///< Holds the most recently computed test loss.
+   Scalar_t fLearningRate;   ///< Learning rate \f$\alpha\f$
+   Scalar_t fMinimumError;   ///< The minimum loss achieved on the training set during the current training session.
+
+public:
+   TGradientDescent();
+   TGradientDescent(Scalar_t learningRate,
+                    size_t convergenceSteps,
+                    size_t testInterval);
+   /*! Reset minimizer object to initial state. Does nothing for this minimizer. */
+   void Reset() {}
+   /*! Train the given net using the given training input data (events), training
+     output data (labels), test input data (events), test output data (labels). */
+   template <typename Data_t, typename Net_t>
+   Scalar_t Train(const Data_t & TrainingDataIn, size_t nTrainingSamples,
+                  const Data_t & TestDataIn, size_t nTestSamples,
+                  Net_t & net, size_t nThreads = 1);
+   /*! Perform a single optimization step on a given batch. Propagates the input
+     matrix forward through the net, evaluates the loss and propagates the gradients
+     backward through the net. The computed gradients are scaled by the learning
+     rate \f$\alpha\f$ and subtracted from the weights and bias values of each
+     layer. */
+   template <typename Net_t>
+   void Step(Net_t &net, Matrix_t &input, const Matrix_t &output);
+   template <typename Net_t>
+   void Step(Net_t &master,
+             std::vector<Net_t> &nets,
+             std::vector<TBatch<Architecture_t>> &batches);
+   /** Does not evaluate the loss and therefore does not trigger a possible
+    *  synchronization with the device. Trains the weights of each layer, but
+    *  only the bias terms of the first layer for compatibility with the
+    *  previous implementation. */
+   template <typename Net_t>
+   void StepReducedWeights(Net_t &net,
+                           Matrix_t &input,
+                           const Matrix_t &output);
+   /** Similar to StepReducedWeights(...) but also evaluates the loss. May trigger
+    *  synchronization with the device.
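+    *  For example (illustrative): Scalar_t loss =
+    *  minimizer.StepReducedWeightsLoss(net, batch.GetInput(), batch.GetOutput());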
+    */
+   template <typename Net_t>
+   Scalar_t StepReducedWeightsLoss(Net_t &net,
+                                   Matrix_t &input,
+                                   const Matrix_t &output);
+   template <typename Net_t>
+   inline void TestError(Net_t &net,
+                         Matrix_t &input,
+                         const Matrix_t &output);
+   bool HasConverged();
+
+   size_t   GetConvergenceCount() const {return fConvergenceCount;}
+   size_t   GetConvergenceSteps() const {return fConvergenceSteps;}
+   Scalar_t GetTrainingError() const    {return fTrainingError;}
+   Scalar_t GetTestError() const        {return fTestError;}
+   size_t   GetTestInterval() const     {return fTestInterval;}
+
+   void SetConvergenceSteps(size_t steps)  {fConvergenceSteps = steps;}
+   void SetTestInterval(size_t interval)   {fTestInterval = interval;}
+   void SetLearningRate(Scalar_t rate)     {fLearningRate = rate;}
+};
+
+//______________________________________________________________________________
+//
+// Implementation
+//______________________________________________________________________________
+template<typename Architecture_t>
+   TGradientDescent<Architecture_t>::TGradientDescent()
+   : fBatchSize(0), fStepCount(0), fConvergenceSteps(0),
+     fConvergenceCount(0), fTestInterval(0), fLearningRate(0),
+     fMinimumError(1e100)
+{
+   // Nothing to do here.
+}
+//______________________________________________________________________________
+template<typename Architecture_t>
+TGradientDescent<Architecture_t>::TGradientDescent(Scalar_t learningRate,
+                                                   size_t convergenceSteps,
+                                                   size_t testInterval)
+   : fBatchSize(0), fStepCount(0), fConvergenceSteps(convergenceSteps),
+     fConvergenceCount(0), fTestInterval(testInterval), fLearningRate(learningRate),
+     fMinimumError(1e100)
+{
+   // Nothing to do here.
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+template <typename Data_t, typename Net_t>
+   auto TGradientDescent<Architecture_t>::Train(const Data_t & trainingData,
+                                                size_t nTrainingSamples,
+                                                const Data_t & testData,
+                                                size_t nTestSamples,
+                                                Net_t & net,
+                                                size_t nThreads)
+   -> Scalar_t
+{
+   // Reset iteration state.
+   fMinimumError     = 1e100;
+   fConvergenceCount = 0;
+   fStepCount        = 0;
+
+   // Prepare training data.
+   bool converged = false;
+
+   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
+                                                   net.GetBatchSize(),
+                                                   net.GetInputWidth(),
+                                                   net.GetOutputWidth(), nThreads);
+   auto testNet = net.CreateClone(nTestSamples);
+   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
+                                                  testNet.GetBatchSize(),
+                                                  testNet.GetInputWidth(),
+                                                  net.GetOutputWidth());
+   std::vector<Net_t> nets{};
+   nets.reserve(nThreads);
+   for (size_t i = 0; i < nThreads; i++) {
+      nets.push_back(net);
+      for (size_t j = 0; j < net.GetDepth(); j++)
+      {
+         auto &masterLayer = net.GetLayer(j);
+         auto &layer       = nets.back().GetLayer(j);
+         Architecture_t::Copy(layer.GetWeights(),
+                              masterLayer.GetWeights());
+         Architecture_t::Copy(layer.GetBiases(),
+                              masterLayer.GetBiases());
+      }
+   }
+
+   std::chrono::time_point<std::chrono::system_clock> start, end;
+   start = std::chrono::system_clock::now();
+
+   while (!converged)
+   {
+      fStepCount++;
+
+      std::vector<TBatch<Architecture_t>> batches{};
+      batches.reserve(nThreads);
+      for (size_t i = 0; i < nTrainingSamples / net.GetBatchSize(); i += nThreads) {
+         batches.clear();
+         for (size_t j = 0; j < nThreads; j++) {
+            batches.push_back(trainLoader.GetBatch());
+         }
+         Step(net, nets, batches);
+      }
+
+      // Compute test error.
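+      // The block below also reports the training throughput, estimated from
+      // the flop count of the net and the wall-clock time per test interval.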
+      if ((fStepCount % fTestInterval) == 0) {
+         end = std::chrono::system_clock::now();
+         std::chrono::duration<double> elapsed_seconds = end - start;
+         start = std::chrono::system_clock::now();
+         double seconds = elapsed_seconds.count();
+         double nFlops  = (double) (fTestInterval * (nTrainingSamples / net.GetBatchSize()));
+         nFlops *= net.GetNFlops();
+         std::cout << "Elapsed time for " << fTestInterval << " Epochs: "
+                   << seconds << " [s] => " << nFlops * 1e-9 / seconds
+                   << " GFlop/s" << std::endl;
+         auto b = *testLoader.begin();
+         auto inputMatrix  = b.GetInput();
+         auto outputMatrix = b.GetOutput();
+
+         Scalar_t loss = testNet.Loss(inputMatrix, outputMatrix);
+         std::cout << fStepCount << ": " << loss << std::endl;
+         converged = HasConverged();
+      }
+
+   }
+   return fMinimumError;
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+   template <typename Net_t>
+   void inline TGradientDescent<Architecture_t>::Step(Net_t & net,
+                                                      Matrix_t &input,
+                                                      const Matrix_t &output)
+{
+   //Scalar_t loss = net.Loss(input, output);
+   //fTrainingError = loss;
+   net.Forward(input);
+   net.Backward(input, output);
+
+   for (size_t i = 0; i < net.GetDepth(); i++)
+   {
+      auto &layer = net.GetLayer(i);
+      Architecture_t::ScaleAdd(layer.GetWeights(),
+                               layer.GetWeightGradients(),
+                               -fLearningRate);
+      Architecture_t::ScaleAdd(layer.GetBiases(),
+                               layer.GetBiasGradients(),
+                               -fLearningRate);
+   }
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+   template <typename Net_t>
+   void inline TGradientDescent<Architecture_t>::Step(
+       Net_t & master,
+       std::vector<Net_t> & nets,
+       std::vector<TBatch<Architecture_t>> & batches)
+{
+   typename Architecture_t::Matrix_t dummy(0,0);
+   size_t depth = master.GetDepth();
+
+   // Forward
+   for (size_t j = 0; j < nets.size(); j++) {
+      nets[j].GetLayer(0).Forward(batches[j].GetInput());
+   }
+
+   for (size_t i = 1; i < depth; i++)
+   {
+      for (size_t j = 0; j < nets.size(); j++) {
+         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput());
+      }
+   }
+   // Gradients
+   for (size_t j = 0; j < nets.size(); j++) {
+      evaluateGradients<Architecture_t>(
+          nets[j].GetLayer(depth-1).GetActivationGradients(),
+          nets[j].GetLossFunction(),
+          batches[j].GetOutput(),
+          nets[j].GetLayer(depth-1).GetOutput());
+   }
+   // Backward
+   for (size_t i = depth - 1; i > 0; i--)
+   {
+      for (size_t j = 0; j < nets.size(); j++) {
+         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
+                                      nets[j].GetLayer(i-1).GetOutput(),
+                                      nets[j].GetRegularization(),
+                                      nets[j].GetWeightDecay());
+      }
+   }
+   for (size_t j = 0; j < nets.size(); j++) {
+      nets[j].GetLayer(0).Backward(dummy,
+                                   batches[j].GetInput(),
+                                   nets[j].GetRegularization(),
+                                   nets[j].GetWeightDecay());
+   }
+
+   for (size_t j = 0; j < nets.size(); j++) {
+      for (size_t i = 0; i < depth; i++)
+      {
+         auto &masterLayer = master.GetLayer(i);
+         auto &layer       = nets[j].GetLayer(i);
+         Architecture_t::ScaleAdd(masterLayer.GetWeights(),
+                                  layer.GetWeightGradients(),
+                                  -fLearningRate);
+         Architecture_t::Copy(layer.GetWeights(),
+                              masterLayer.GetWeights());
+         Architecture_t::ScaleAdd(masterLayer.GetBiases(),
+                                  layer.GetBiasGradients(),
+                                  -fLearningRate);
+         Architecture_t::Copy(layer.GetBiases(),
+                              masterLayer.GetBiases());
+      }
+   }
+}
+
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+template <typename Net_t>
+void inline TGradientDescent<Architecture_t>::StepReducedWeights(
+    Net_t & net,
+    Matrix_t &input,
+    const Matrix_t &output)
+{
+   net.Forward(input);
+   net.Backward(input, output);
+
+   for (size_t i = 0; i < net.GetDepth(); i++)
+   {
+      auto &layer = net.GetLayer(i);
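+      // Update the weights of every layer but, for compatibility with the
+      // previous implementation, the biases of the first layer only.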
+      Architecture_t::ScaleAdd(layer.GetWeights(),
+                               layer.GetWeightGradients(),
+                               -fLearningRate);
+      if (i == 0) {
+         Architecture_t::ScaleAdd(layer.GetBiases(),
+                                  layer.GetBiasGradients(),
+                                  -fLearningRate);
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+   template <typename Net_t>
+   auto inline TGradientDescent<Architecture_t>::StepReducedWeightsLoss(
+       Net_t & net,
+       Matrix_t &input,
+       const Matrix_t &output)
+   -> Scalar_t
+{
+   Scalar_t loss = net.Loss(input, output);
+   fTrainingError = loss;
+   net.Backward(input, output);
+
+   for (size_t i = 0; i < net.GetDepth(); i++)
+   {
+      auto &layer = net.GetLayer(i);
+      Architecture_t::ScaleAdd(layer.GetWeights(),
+                               layer.GetWeightGradients(),
+                               -fLearningRate);
+      if (i == 0) {
+         Architecture_t::ScaleAdd(layer.GetBiases(),
+                                  layer.GetBiasGradients(),
+                                  -fLearningRate);
+      }
+   }
+   return loss;
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+   template <typename Net_t>
+   inline void TGradientDescent<Architecture_t>::TestError(Net_t & net,
+                                                           Matrix_t &input,
+                                                           const Matrix_t &output)
+{
+   fTestError = net.Loss(input, output, false);
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+bool inline TGradientDescent<Architecture_t>::HasConverged()
+{
+   if (fTestError < fMinimumError * 0.999) {
+      fConvergenceCount = 0;
+      fMinimumError     = fTestError;
+   } else {
+      fConvergenceCount++;
+   }
+
+   return (fConvergenceCount >= fConvergenceSteps);
+}
+
+} // namespace DNN
+} // namespace TMVA
+
+#endif
diff --git a/tmva/tmva/inc/TMVA/DNN/Net.h b/tmva/tmva/inc/TMVA/DNN/Net.h
new file mode 100644
index 0000000000000..4a4ae9957ef72
--- /dev/null
+++ b/tmva/tmva/inc/TMVA/DNN/Net.h
@@ -0,0 +1,368 @@
+// @(#)root/tmva: $Id$
+// Author: Simon Pfreundschuh 20/06/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+#ifndef TMVA_DNN_NET
+#define TMVA_DNN_NET
+
+#include <vector>
+#include <iostream>
+
+#include "Layer.h"
+
+namespace TMVA {
+namespace DNN {
+
+/** \class TNet
+
+    Generic neural network class.
+
+    This generic neural network class represents a concrete neural
+    network through a vector of layers and coordinates the forward
+    and backward propagation through the net.
+
+    The net takes as input a batch from the training data given in
+    matrix form, with each row corresponding to a certain training
+    event.
+
+    On construction, the neural network allocates all the memory
+    required for the training of the neural net and keeps it until
+    its destruction.
+
+    The Architecture type argument simply holds the
+    architecture-specific data types, which are the matrix type
+    Matrix_t and the scalar type Scalar_t.
+
+    \tparam Architecture_t The architecture type, which holds the
+    architecture-specific data types for the network.
+    \tparam Layer_t The type used for the layers; can be either
+    TLayer or TSharedLayer.
+*/
+template<typename Architecture_t,
+         typename Layer_t = TLayer<Architecture_t>>
+   class TNet {
+
+public:
+   using Matrix_t        = typename Architecture_t::Matrix_t;
+   using Scalar_t        = typename Architecture_t::Scalar_t;
+   using LayerIterator_t = typename std::vector<Layer_t>::iterator;
+
+private:
+   size_t fBatchSize;  ///< Batch size for training and evaluation of the Network.
+   size_t fInputWidth; ///< Number of features in a single input event.
+
+   std::vector<Layer_t> fLayers; ///< Layers in the network.
+
+   Matrix_t fDummy;         ///< Empty matrix for last step in back propagation.
+   ELossFunction fJ;        ///< The loss function of the network.
+   ERegularization fR;      ///< The regularization used for the network.
+   Scalar_t fWeightDecay;   ///< The weight decay factor.
+
+public:
+   TNet();
+   TNet(const TNet & other);
+   /*! Construct a neural net for a given batch size and input width, with
+    *  the given loss function and regularization. */
+   TNet(size_t batchSize,
+        size_t inputWidth,
+        ELossFunction J,
+        ERegularization R = ERegularization::NONE,
+        Scalar_t weightDecay = 0.0);
+   /*! Create a clone that uses the same weight and bias matrices but
+    *  potentially a different batch size. */
+   TNet<Architecture_t, TSharedLayer<Architecture_t>> CreateClone(size_t batchSize);
+
+   /*! Add a layer of the given size to the neural net. */
+   void AddLayer(size_t width, EActivationFunction f,
+                 Scalar_t dropoutProbability = 1.0);
+
+   /*! Remove all layers from the network. */
+   void Clear();
+
+   /*! Add a layer which shares its weights with another TNet instance. */
+   template <typename SharedLayer>
+   void AddLayer(SharedLayer & layer);
+
+   /*! Iterator to the first layer of the net. */
+   LayerIterator_t LayersBegin() {return fLayers.begin();}
+
+   /*! Iterator to the last layer of the net. */
+   LayerIterator_t LayersEnd() {return fLayers.end();}
+
+   /*! Initialize the weights in the net with the given
+    *  initialization method. */
+   inline void Initialize(EInitialization m);
+
+   /*! Forward a given input through the neural net. Computes
+    *  all layer activations up to the output layer. */
+   inline void Forward(Matrix_t& X, bool applyDropout = false);
+
+   /*! Compute the weight gradients in the net from the given training
+    *  samples X and training labels Y. */
+   inline void Backward(const Matrix_t &X, const Matrix_t &Y);
+
+   /*! Evaluate the loss function of the net using the activations
+    *  that are currently stored in the output layer. */
+   inline Scalar_t Loss(const Matrix_t &Y) const;
+
+   /*! Propagate the input batch X through the net and evaluate the
+    *  error function for the resulting activations of the output
+    *  layer. */
+   inline Scalar_t Loss(Matrix_t &X, const Matrix_t &Y, bool applyDropout = false);
+
+   /*! Compute the neural network prediction obtained from forwarding the
+    *  batch X through the neural network and applying the output function
+    *  f to the activation of the last layer in the network. */
+   inline void Prediction(Matrix_t &Y_hat, Matrix_t &X, EOutputFunction f);
+
+   /*! Compute the neural network prediction obtained from applying the output
+    *  function f to the activation of the last layer in the network.
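+    *  For example (illustrative): net.Prediction(Yhat, EOutputFunction::SIGMOID)
+    *  maps the output-layer activations through a sigmoid, e.g. to obtain
+    *  class probabilities.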
+    */
+   inline void Prediction(Matrix_t &Y_hat, EOutputFunction f) const;
+
+   Scalar_t GetNFlops();
+
+   size_t GetDepth() const          {return fLayers.size();}
+   size_t GetBatchSize() const      {return fBatchSize;}
+   Layer_t & GetLayer(size_t i)     {return fLayers[i];}
+   const Layer_t & GetLayer(size_t i) const {return fLayers[i];}
+   ELossFunction GetLossFunction() const    {return fJ;}
+   Matrix_t & GetOutput()           {return fLayers.back().GetOutput();}
+   size_t GetInputWidth() const     {return fInputWidth;}
+   size_t GetOutputWidth() const    {return fLayers.back().GetWidth();}
+   ERegularization GetRegularization() {return fR;}
+   Scalar_t GetWeightDecay()           {return fWeightDecay;}
+
+   void SetInputWidth(size_t InputWidth)   {fInputWidth = InputWidth;}
+   void SetRegularization(ERegularization R) {fR = R;}
+   void SetLossFunction(ELossFunction J)     {fJ = J;}
+   void SetWeightDecay(Scalar_t weightDecay) {fWeightDecay = weightDecay;}
+
+   void Print();
+};
+
+//______________________________________________________________________________
+
+template<typename Architecture_t, typename Layer_t>
+   TNet<Architecture_t, Layer_t>::TNet()
+   : fBatchSize(0), fInputWidth(0), fDummy(0,0),
+   fJ(ELossFunction::MEANSQUAREDERROR), fR(ERegularization::NONE),
+   fWeightDecay(0.0)
+{
+   // Nothing to do here.
+}
+
+//______________________________________________________________________________
+
+template<typename Architecture_t, typename Layer_t>
+   TNet<Architecture_t, Layer_t>::TNet(const TNet & other)
+   : fBatchSize(other.fBatchSize), fInputWidth(other.fInputWidth),
+   fLayers(other.fLayers), fDummy(0,0), fJ(other.fJ), fR(other.fR),
+   fWeightDecay(other.fWeightDecay)
+{
+   // Nothing to do here.
+}
+
+//______________________________________________________________________________
+
+template<typename Architecture_t, typename Layer_t>
+   TNet<Architecture_t, Layer_t>::TNet(size_t batchSize,
+                                       size_t inputWidth,
+                                       ELossFunction J,
+                                       ERegularization R,
+                                       Scalar_t weightDecay)
+   : fBatchSize(batchSize), fInputWidth(inputWidth), fDummy(0,0),
+   fJ(J), fR(R), fWeightDecay(weightDecay)
+{
+   // Nothing to do here.
+}
+
+//______________________________________________________________________________
+
+template<typename Architecture_t, typename Layer_t>
+   auto TNet<Architecture_t, Layer_t>::CreateClone(size_t batchSize)
+   -> TNet<Architecture_t, TSharedLayer<Architecture_t>>
+{
+   TNet<Architecture_t, TSharedLayer<Architecture_t>> other(batchSize,
+                                                            fInputWidth,
+                                                            fJ, fR);
+   for (auto &l : fLayers) {
+      other.AddLayer(l);
+   }
+   return other;
+}
+
+//______________________________________________________________________________
+
+template<typename Architecture_t, typename Layer_t>
+   void TNet<Architecture_t, Layer_t>::AddLayer(size_t width,
+                                                EActivationFunction f,
+                                                Scalar_t dropoutProbability)
+{
+   if (fLayers.size() == 0) {
+      fLayers.emplace_back(fBatchSize, fInputWidth, width, f, dropoutProbability);
+   } else {
+      size_t prevWidth = fLayers.back().GetWidth();
+      fLayers.emplace_back(fBatchSize, prevWidth, width, f, dropoutProbability);
+   }
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+   void TNet<Architecture_t, Layer_t>::Clear()
+{
+   fLayers.clear();
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+   template <typename SharedLayer_t>
+   inline void TNet<Architecture_t, Layer_t>::AddLayer(SharedLayer_t & layer)
+{
+   fLayers.emplace_back(fBatchSize, layer);
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+   inline void TNet<Architecture_t, Layer_t>::Initialize(EInitialization m)
+{
+   for (auto &l : fLayers) {
+      l.Initialize(m);
+   }
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+inline void TNet<Architecture_t, Layer_t>::Forward(Matrix_t &input,
+                                                   bool applyDropout)
+{
+   fLayers.front().Forward(input, applyDropout);
+
+   for (size_t i = 1; i < fLayers.size(); i++) {
+      fLayers[i].Forward(fLayers[i-1].GetOutput(), applyDropout);
+   }
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+   inline void TNet<Architecture_t, Layer_t>::Backward(const Matrix_t &X,
+                                                       const Matrix_t &Y)
+{
+
+   evaluateGradients<Architecture_t>(fLayers.back().GetActivationGradients(),
+                                     fJ, Y, fLayers.back().GetOutput());
+
+   for (size_t i = fLayers.size()-1; i > 0; i--) {
+      auto & activation_gradient_backward
+         = fLayers[i-1].GetActivationGradients();
+      auto & activations_backward
+         = fLayers[i-1].GetOutput();
+      fLayers[i].Backward(activation_gradient_backward,
+                          activations_backward, fR, fWeightDecay);
+   }
+   fLayers[0].Backward(fDummy, X, fR, fWeightDecay);
+
+}
+
+//______________________________________________________________________________
+
+template<typename Architecture_t, typename Layer_t>
+   inline auto TNet<Architecture_t, Layer_t>::Loss(const Matrix_t &Y) const
+   -> Scalar_t
+{
+   auto loss = evaluate<Architecture_t>(fJ, Y, fLayers.back().GetOutput());
+   for (auto &l : fLayers) {
+      loss += fWeightDecay * regularization<Architecture_t>(l.GetWeights(), fR);
+   }
+   return loss;
+}
+
+//______________________________________________________________________________
+
+template<typename Architecture_t, typename Layer_t>
+   inline auto TNet<Architecture_t, Layer_t>::Loss(Matrix_t &X,
+                                                   const Matrix_t &Y,
+                                                   bool applyDropout)
+   -> Scalar_t
+{
+   Forward(X, applyDropout);
+   return Loss(Y);
+}
+
+//______________________________________________________________________________
+
+template<typename Architecture_t, typename Layer_t>
+   inline void TNet<Architecture_t, Layer_t>::Prediction(Matrix_t &Yhat,
+                                                         Matrix_t &X,
+                                                         EOutputFunction f)
+{
+   Forward(X, false);
+   evaluate<Architecture_t>(Yhat, f, fLayers.back().GetOutput());
+}
+
+//______________________________________________________________________________
+
+template<typename Architecture_t, typename Layer_t>
+   inline void TNet<Architecture_t, Layer_t>::Prediction(Matrix_t &Y_hat,
+                                                         EOutputFunction f) const
+{
+   evaluate<Architecture_t>(Y_hat, f, fLayers.back().GetOutput());
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+auto TNet<Architecture_t, Layer_t>::GetNFlops()
+   -> Scalar_t
+{
+   Scalar_t flops = 0;
+
+   Scalar_t nb  = (Scalar_t) fBatchSize;
+   Scalar_t nlp = (Scalar_t) fInputWidth;
+
+   for(size_t i = 0; i < fLayers.size(); i++) {
+      Layer_t & layer = fLayers[i];
+      Scalar_t nl = (Scalar_t) layer.GetWidth();
+
+      // Forward propagation.
+      flops += nb * nl * (2.0 * nlp - 1); // Matrix mult.
+      flops += nb * nl;                   // Add bias values.
+      flops += 2 * nb * nl;               // Apply activation function and compute
+                                          // derivative.
+      // Backward propagation.
+      flops += nb * nl;                       // Hadamard
+      flops += nl * nlp * (2.0 * nb  - 1.0);  // Weight gradients
+      flops += nl * (nb - 1);                 // Bias gradients
+      if (i > 0) {
+         flops += nlp * nb * (2.0 * nl - 1.0); // Previous layer gradients.
+      }
+      nlp = nl;
+   }
+   return flops;
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+   void TNet<Architecture_t, Layer_t>::Print()
+{
+   std::cout << "DEEP NEURAL NETWORK:";
+   std::cout << " Loss function: " << static_cast<char>(fJ);
+   std::cout << ", size: " << fLayers.size() << std::endl;
+
+   size_t i = 1;
+   for (auto & l : fLayers) {
+      std::cout << "DNN Layer " << i << ":" << std::endl;
+      l.Print();
+      i++;
+   }
+
+}
+
+//______________________________________________________________________________
+
+} // namespace DNN
+} // namespace TMVA
+
+#endif
diff --git a/tmva/tmva/inc/TMVA/MethodDNN.h b/tmva/tmva/inc/TMVA/MethodDNN.h
index 3b6bdc13971bb..0325459a4b706 100644
--- a/tmva/tmva/inc/TMVA/MethodDNN.h
+++ b/tmva/tmva/inc/TMVA/MethodDNN.h
@@ -57,6 +57,12 @@
 #include "TMVA/NeuralNet.h"
 #endif
 
+#include "TMVA/DNN/Net.h"
+#include "TMVA/DNN/Minimizers.h"
+
+#ifdef DNNCUDA
+#include "TMVA/DNN/Architectures/Cuda.h"
+#endif
 
 namespace TMVA {
 
@@ -82,6 +88,7 @@
       std::vector<std::map<TString, TString>> ParseKeyValueString(TString parseString,
                                                                   TString blockDelim,
                                                                   TString tokenDelim);
       void Train();
+      void TrainGPU();
 
       virtual Double_t GetMvaValue( Double_t* err=0, Double_t* errUpper=0 );
       virtual const std::vector<Float_t>& GetRegressionValues();
@@ -142,6 +149,8 @@
       bool fResume;
       TString fWeightInitializationStrategyString;
      TMVA::DNN::WeightInitializationStrategy fWeightInitializationStrategy;
+      TString fGPUString;
+      bool fGPU;
 
       std::vector<std::map<TString, TString>> fSettings;
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/ActivationFunctions.cu b/tmva/tmva/src/DNN/Architectures/Cuda/ActivationFunctions.cu
new file mode 100644
index 0000000000000..dad198013880c
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/ActivationFunctions.cu
@@ -0,0 +1,194 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 13/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+ //////////////////////////////////////////////////////////////////
+ // Implementation of the activation functions for the TCuda     //
+ // implementation of the low-level interface.                   //
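+ // Each function launches a CUDA kernel on the compute stream   //
+ // of its input matrix, with launch dimensions from TDevice.    //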
+ //////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/Architectures/Cuda/Device.h"
+#include "TMVA/DNN/Architectures/Cuda/Kernels.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//______________________________________________________________________________
+void TCuda::IdentityDerivative(TCudaMatrix & B, const TCudaMatrix & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(B);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::IdentityDerivative<<<gridDims, blockDims, 0, s>>>(
+       B.GetDataPointer(),
+       (int) B.GetNrows(),
+       (int) B.GetNcols());
+   B.SetComputeStream(s);
+}
+
+//______________________________________________________________________________
+void TCuda::Relu(TCudaMatrix & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(A);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::Relu<<<gridDims, blockDims, 0, s>>>(A.GetDataPointer(),
+                                                          (int) A.GetNrows(),
+                                                          (int) A.GetNcols());
+}
+
+//______________________________________________________________________________
+void TCuda::ReluDerivative(TCudaMatrix & B, const TCudaMatrix & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(B);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::ReluDerivative<<<gridDims, blockDims, 0, s>>>(
+       B.GetDataPointer(),
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+   B.SetComputeStream(s);
+}
+
+//______________________________________________________________________________
+void TCuda::Sigmoid(TCudaMatrix & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(A);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::Sigmoid<<<gridDims, blockDims, 0, s>>>(
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+}
+
+//______________________________________________________________________________
+void TCuda::SigmoidDerivative(TCudaMatrix & B, const TCudaMatrix & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(B);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::SigmoidDerivative<<<gridDims, blockDims, 0, s>>>(
+       B.GetDataPointer(),
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+   B.SetComputeStream(s);
+}
+
+//______________________________________________________________________________
+void TCuda::Tanh(TCudaMatrix & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(A);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::Tanh<<<gridDims, blockDims, 0, s>>>(
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+}
+
+//______________________________________________________________________________
+void TCuda::TanhDerivative(TCudaMatrix & B, const TCudaMatrix & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(B);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::TanhDerivative<<<gridDims, blockDims, 0, s>>>(
+       B.GetDataPointer(),
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+   B.SetComputeStream(s);
+}
+
+//______________________________________________________________________________
+void TCuda::SymmetricRelu(TCudaMatrix & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(A);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::SymmetricRelu<<<gridDims, blockDims, 0, s>>>(A.GetDataPointer(),
+                                                                   (int) A.GetNrows(),
+                                                                   (int) A.GetNcols());
+}
+
+//______________________________________________________________________________
+void TCuda::SymmetricReluDerivative(TCudaMatrix & B, const TCudaMatrix & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(B); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::SymmetricReluDerivative<<>>( + B.GetDataPointer(), + A.GetDataPointer(), + (int) A.GetNrows(), + (int) A.GetNcols()); + B.SetComputeStream(s); +} + +//______________________________________________________________________________ +void TCuda::SoftSign(TCudaMatrix & A) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(A); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::SoftSign<<>>( + A.GetDataPointer(), + (int) A.GetNrows(), + (int) A.GetNcols()); +} + +//______________________________________________________________________________ +void TCuda::SoftSignDerivative(TCudaMatrix & B, const TCudaMatrix & A) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(B); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::SoftSignDerivative<<>>( + B.GetDataPointer(), + A.GetDataPointer(), + (int) A.GetNrows(), + (int) A.GetNcols()); + B.SetComputeStream(s); +} + +//______________________________________________________________________________ +void TCuda::Gauss(TCudaMatrix & A) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(A); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::Gauss<<>>( + A.GetDataPointer(), + (int) A.GetNrows(), + (int) A.GetNcols()); +} + +//______________________________________________________________________________ +void TCuda::GaussDerivative(TCudaMatrix & B, const TCudaMatrix & A) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(B); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::GaussDerivative<<>>( + B.GetDataPointer(), + A.GetDataPointer(), + (int) A.GetNrows(), + (int) A.GetNcols()); + B.SetComputeStream(s); +} + +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu new file mode 100644 index 0000000000000..d7a366ec18ce6 --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu @@ -0,0 +1,134 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 13/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////////////// +// Contains additional arithmetic functions required by the CUDA // +// neural network implementation. 
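All of these wrappers follow the same pattern: query the launch geometry from TDevice, fetch the matrix's compute stream, and enqueue the element-wise kernel on that stream. A usage sketch (not part of the patch; sizes and values are illustrative) shows the intended call order, in particular that the derivative must be evaluated before the in-place activation overwrites its input, which is exactly the order the backward pass relies on:

   // Hypothetical usage sketch for the activation interface.
   TMatrixT<Double_t> host(32, 16);     // ... filled with pre-activations ...
   TCudaMatrix A(host);                 // copy to the device
   TCudaMatrix B(32, 16);               // receives df/dx evaluated at A
   TCuda::ReluDerivative(B, A);         // B(i,j) = 0 or 1, element-wise
   TCuda::Relu(A);                      // now apply ReLU to A in place
   TMatrixT<Double_t> derivatives = B;  // copy back for inspection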
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu
new file mode 100644
index 0000000000000..d7a366ec18ce6
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu
@@ -0,0 +1,134 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 13/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+///////////////////////////////////////////////////////////////////
+// Contains additional arithmetic functions required by the CUDA //
+// neural network implementation.                                //
+///////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/Architectures/Cuda/Kernels.h"
+#include "TMVA/DNN/Architectures/Cuda/Device.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//____________________________________________________________________________
+void TCuda::Multiply(TCudaMatrix &C,
+                     const TCudaMatrix &A,
+                     const TCudaMatrix &B)
+{
+   int m, n, k;
+   m = A.GetNrows();
+   k = A.GetNcols();
+   n = B.GetNcols();
+   CudaDouble_t alpha = 1.0, beta = 0.0;
+
+   cudaStream_t s = A.GetComputeStream();
+   cublasSetStream(A.GetCublasHandle(), s);
+
+   // Compute C = beta * C + alpha * (A * B)
+   cublasDgemm(A.GetCublasHandle(),
+               CUBLAS_OP_N, CUBLAS_OP_N,
+               m, n, k, &alpha,
+               A.GetDataPointer(), m,   // *A, lda
+               B.GetDataPointer(), k,   // *B, ldb
+               &beta,                   // beta
+               C.GetDataPointer(), m);  // *C, ldc
+
+   C.SetComputeStream(s);
+}
+
+//____________________________________________________________________________
+void TCuda::TransposeMultiply(TCudaMatrix & C,
+                              const TCudaMatrix & A,
+                              const TCudaMatrix & B)
+{
+   int m, n, k;
+   k = A.GetNrows();
+   m = A.GetNcols();
+   n = B.GetNcols();
+   CudaDouble_t alpha = 1.0, beta = 0.0;
+
+   cudaStream_t s = A.GetComputeStream();
+   cublasSetStream(A.GetCublasHandle(), s);
+
+   // Compute C = beta * C + alpha * (A^T * B)
+   cublasDgemm(A.GetCublasHandle(),
+               CUBLAS_OP_T, CUBLAS_OP_N,
+               m, n, k, &alpha,
+               A.GetDataPointer(), k,   // *A, lda
+               B.GetDataPointer(), k,   // *B, ldb
+               &beta,                   // beta
+               C.GetDataPointer(), m);  // *C, ldc
+
+   C.SetComputeStream(s);
+}
+
+//____________________________________________________________________________
+void TCuda::Hadamard(TCudaMatrix &B,
+                     const TCudaMatrix &A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(B);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::Hadamard<<<gridDims, blockDims, 0, s>>>(
+       B.GetDataPointer(),
+       A.GetDataPointer(),
+       A.GetNrows(),
+       A.GetNcols());
+   B.SetComputeStream(s);
+}
+
+//____________________________________________________________________________
+CudaDouble_t TCuda::Sum(const TCudaMatrix &A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(A);
+   cudaStream_t s = A.GetComputeStream();
+
+   TCudaMatrix::ResetDeviceReturn();
+   ::TMVA::DNN::Cuda::ReduceMatrix<<<gridDims, blockDims, 0, s>>>(
+       TCudaMatrix::GetDeviceReturnPointer(),
+       A.GetDataPointer(),
+       A.GetNrows(),
+       A.GetNcols());
+   return TCudaMatrix::GetDeviceReturn();
+}
+
+//____________________________________________________________________________
+void TCuda::SumColumns(TCudaMatrix &B, const TCudaMatrix &A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(A);
+
+   cudaStream_t s = A.GetComputeStream();
+   cudaMemsetAsync(B.GetDataPointer(), 0, A.GetNcols() * sizeof(CudaDouble_t), s);
+   ::TMVA::DNN::Cuda::SumColumns<<<gridDims, blockDims, 0, s>>>(
+       B.GetDataPointer(),
+       A.GetDataPointer(),
+       A.GetNrows(),
+       A.GetNcols());
+   B.SetComputeStream(s);
+}
+
+//____________________________________________________________________________
+void TCuda::ScaleAdd(TCudaMatrix &B, const TCudaMatrix &A, CudaDouble_t alpha)
+{
+   cudaStream_t s = 0; //A.GetComputeStream();
+   cublasDaxpy(A.GetCublasHandle(), A.GetNoElements(), &alpha,
+               A.GetDataPointer(), 1,
+               B.GetDataPointer(), 1);
+   //B.SetComputeStream(s);
+}
+
+} // DNN
+} // TMVA
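cublasDgemm interprets every buffer as column-major, which is why TCudaMatrix packs its elements column by column and why the row counts are passed as leading dimensions above. A self-contained sketch of the same C = A * B call, with hypothetical 2x3 and 3x2 matrices, makes the convention explicit:

   // Standalone cuBLAS sketch (illustrative sizes, not from the patch).
   #include <cublas_v2.h>
   #include <cuda_runtime.h>

   void GemmSketch()
   {
      const int m = 2, k = 3, n = 2;
      double hA[m * k] = {1, 4, 2, 5, 3, 6};  // column-major 2x3: [[1,2,3],[4,5,6]]
      double hB[k * n] = {1, 0, 0, 0, 1, 0};  // column-major 3x2: [[1,0],[0,1],[0,0]]
      double hC[m * n];
      double *dA, *dB, *dC;
      cudaMalloc(&dA, sizeof(hA));
      cudaMalloc(&dB, sizeof(hB));
      cudaMalloc(&dC, sizeof(hC));
      cudaMemcpy(dA, hA, sizeof(hA), cudaMemcpyHostToDevice);
      cudaMemcpy(dB, hB, sizeof(hB), cudaMemcpyHostToDevice);

      cublasHandle_t handle;
      cublasCreate(&handle);
      double alpha = 1.0, beta = 0.0;
      // Leading dimensions are the row counts, exactly as in TCuda::Multiply.
      cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                  &alpha, dA, m, dB, k, &beta, dC, m);
      cudaMemcpy(hC, dC, sizeof(hC), cudaMemcpyDeviceToHost); // hC = [[1,2],[4,5]]

      cublasDestroy(handle);
      cudaFree(dA); cudaFree(dB); cudaFree(dC);
   }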
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Buffers.cxx b/tmva/tmva/src/DNN/Architectures/Cuda/Buffers.cxx
new file mode 100644
index 0000000000000..cc8044934b80c
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/Buffers.cxx
@@ -0,0 +1,126 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 07/08/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+////////////////////////////////////////////////////////////////////////
+// Implementation of device and host buffers for CUDA architectures.  //
+////////////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cuda/Buffers.h"
+#include "cuda_runtime.h"
+#include <memory>
+
+namespace TMVA {
+namespace DNN {
+
+//
+// TCudaHostBuffer
+//______________________________________________________________________________
+void TCudaHostBuffer::TDestructor::operator()(CudaDouble_t ** devicePointer)
+{
+   cudaFreeHost(*devicePointer);
+   delete[] devicePointer;
+}
+
+//______________________________________________________________________________
+TCudaHostBuffer::TCudaHostBuffer(size_t size)
+    : fOffset(0), fComputeStream(0), fDestructor()
+{
+   CudaDouble_t ** pointer = new CudaDouble_t * [1];
+   cudaMallocHost(pointer, size * sizeof(CudaDouble_t));
+   fDevicePointer = std::shared_ptr<CudaDouble_t *>(pointer, fDestructor);
+}
+
+//______________________________________________________________________________
+TCudaHostBuffer::operator CudaDouble_t * () const
+{
+   return *fDevicePointer + fOffset;
+}
+
+//______________________________________________________________________________
+TCudaHostBuffer TCudaHostBuffer::GetSubBuffer(size_t offset, size_t /*size*/)
+{
+   TCudaHostBuffer buffer = *this;
+   buffer.fOffset = offset;
+   return buffer;
+}
+
+//
+// TCudaDeviceBuffer
+//______________________________________________________________________________
+void TCudaDeviceBuffer::TDestructor::operator()(CudaDouble_t ** devicePointer)
+{
+   cudaFree(*devicePointer);
+   delete[] devicePointer;
+}
+
+//______________________________________________________________________________
+TCudaDeviceBuffer::TCudaDeviceBuffer(size_t size)
+    : fOffset(0), fSize(size), fDestructor()
+{
+   CudaDouble_t ** pointer = new CudaDouble_t * [1];
+   cudaMalloc(pointer, size * sizeof(CudaDouble_t));
+   fDevicePointer = std::shared_ptr<CudaDouble_t *>(pointer, fDestructor);
+   cudaStreamCreate(&fComputeStream);
+}
+
+//______________________________________________________________________________
+TCudaDeviceBuffer::TCudaDeviceBuffer(size_t size, cudaStream_t stream)
+    : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
+{
+   CudaDouble_t ** pointer = new CudaDouble_t * [1];
+   cudaMalloc(pointer, size * sizeof(CudaDouble_t));
+   fDevicePointer = std::shared_ptr<CudaDouble_t *>(pointer, fDestructor);
+}
+
+//______________________________________________________________________________
+TCudaDeviceBuffer::TCudaDeviceBuffer(CudaDouble_t * devicePointer,
+                                     size_t size,
+                                     cudaStream_t stream)
+    : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
+{
+   CudaDouble_t ** pointer = new CudaDouble_t * [1];
+   *pointer = devicePointer;
+   fDevicePointer = std::shared_ptr<CudaDouble_t *>(pointer, fDestructor);
+}
+
+//______________________________________________________________________________
+TCudaDeviceBuffer TCudaDeviceBuffer::GetSubBuffer(size_t offset, size_t size)
+{
+   TCudaDeviceBuffer buffer = *this;
+   buffer.fOffset = offset;
+   buffer.fSize   = size;
+   return buffer;
+}
+
+//______________________________________________________________________________
+TCudaDeviceBuffer::operator CudaDouble_t * () const
+{
+   return *fDevicePointer + fOffset;
+}
+
+//______________________________________________________________________________
+void TCudaDeviceBuffer::CopyFrom(const TCudaHostBuffer &buffer) const
+{
+   cudaStreamSynchronize(fComputeStream);
+   cudaMemcpyAsync(*this, buffer, fSize * sizeof(CudaDouble_t),
+                   cudaMemcpyHostToDevice, fComputeStream);
+}
+
+//______________________________________________________________________________
+void TCudaDeviceBuffer::CopyTo(const TCudaHostBuffer &buffer) const
+{
+   // Destination is the host buffer, source is this device buffer.
+   cudaMemcpyAsync(buffer, *this, fSize * sizeof(CudaDouble_t),
+                   cudaMemcpyDeviceToHost, fComputeStream);
+   buffer.fComputeStream = fComputeStream;
+}
+
+} // namespace DNN
+} // namespace TMVA
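The buffers wrap the raw allocation in a std::shared_ptr with a custom deleter, so sub-buffers obtained from GetSubBuffer share ownership and the memory is released exactly once, when the last copy disappears. Reduced to a minimal sketch (the pointer-to-pointer indirection used above, which lets sub-buffers carry an offset, is omitted here):

   // Ownership pattern behind TCudaDeviceBuffer, as a sketch: copies of
   // the returned shared_ptr share the allocation; cudaFree runs once,
   // when the last copy goes out of scope.
   #include <memory>
   #include <cuda_runtime.h>

   std::shared_ptr<double> MakeDeviceBuffer(size_t size)
   {
      double *raw = nullptr;
      cudaMalloc(&raw, size * sizeof(double));
      return std::shared_ptr<double>(raw, [](double *p) { cudaFree(p); });
   }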
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/CudaMatrix.cu b/tmva/tmva/src/DNN/Architectures/Cuda/CudaMatrix.cu
new file mode 100644
index 0000000000000..e68db8150d199
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/CudaMatrix.cu
@@ -0,0 +1,127 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 13/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////
+// Implementation of the TCudaMatrix class.     //
+//////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cuda/CudaMatrix.h"
+#include "TMVA/DNN/Architectures/Cuda/Device.h"
+#include "TMVA/DNN/Architectures/Cuda/Kernels.h"
+
+namespace TMVA {
+namespace DNN {
+
+// Static members.
+//____________________________________________________________________________
+size_t          TCudaMatrix::fInstances     = 0;
+cublasHandle_t  TCudaMatrix::fCublasHandle  = nullptr;
+CudaDouble_t *  TCudaMatrix::fDeviceReturn  = nullptr;
+curandState_t * TCudaMatrix::fCurandStates  = nullptr;
+size_t          TCudaMatrix::fNCurandStates = 0;
+
+// Constructors.
+//____________________________________________________________________________
+TCudaMatrix::TCudaMatrix()
+    : fNRows(0), fNCols(0), fElementBuffer()
+{
+   InitializeCuda();
+}
+
+//____________________________________________________________________________
+TCudaMatrix::TCudaMatrix(size_t m, size_t n)
+    : fNRows(m), fNCols(n), fElementBuffer(m * n, 0)
+{
+   InitializeCuda();
+}
+
+//____________________________________________________________________________
+TCudaMatrix::TCudaMatrix(const TMatrixT<Double_t> & Host)
+    : fNRows(Host.GetNrows()), fNCols(Host.GetNcols()),
+      fElementBuffer(Host.GetNoElements(), 0)
+{
+   InitializeCuda();
+
+   CudaDouble_t * buffer = new CudaDouble_t[fNRows * fNCols];
+   size_t index = 0;
+   for (size_t j = 0; j < fNCols; j++) {
+      for (size_t i = 0; i < fNRows; i++) {
+         buffer[index] = Host(i, j);
+         index++;
+      }
+   }
+
+   cudaMemcpy(fElementBuffer, buffer, fNRows * fNCols * sizeof(CudaDouble_t),
+              cudaMemcpyHostToDevice);
+   delete[] buffer;
+}
+
+//____________________________________________________________________________
+TCudaMatrix::TCudaMatrix(TCudaDeviceBuffer buffer,
+                         size_t m, size_t n)
+    : fNRows(m), fNCols(n), fElementBuffer(buffer)
+{
+   InitializeCuda();
+}
+
+//____________________________________________________________________________
+inline void TCudaMatrix::InitializeCuda()
+{
+   if (fInstances == 0) {
+      cublasCreate(&fCublasHandle);
+      CUDACHECK(cudaMalloc(&fDeviceReturn, sizeof(CudaDouble_t)));
+      CUDACHECK(cudaMalloc(&fCurandStates,
+                           TDevice::NThreads(*this) * sizeof(curandState_t)));
+   }
+   if (TDevice::NThreads(*this) > (int) fNCurandStates) {
+      fNCurandStates = TDevice::NThreads(*this);
+      if (fCurandStates) {
+         cudaFree(fCurandStates);
+      }
+      cudaMalloc(&fCurandStates, TDevice::NThreads(*this) * sizeof(curandState_t));
+      InitializeCurandStates();
+   }
+   fInstances++;
+}
+
+//____________________________________________________________________________
+void TCudaMatrix::InitializeCurandStates()
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(*this);
+   ::TMVA::DNN::Cuda::InitializeCurandStates<<<gridDims, blockDims>>>(
+       time(nullptr), fCurandStates);
+}
+
+
+// Conversion to TMatrixT.
+//____________________________________________________________________________
+TCudaMatrix::operator TMatrixT<Double_t>() const
+{
+   TMatrixT<Double_t> hostMatrix(GetNrows(), GetNcols());
+
+   CudaDouble_t * buffer = new CudaDouble_t[fNRows * fNCols];
+   cudaMemcpy(buffer, fElementBuffer, fNRows * fNCols * sizeof(CudaDouble_t),
+              cudaMemcpyDeviceToHost);
+
+   size_t index = 0;
+   for (size_t j = 0; j < fNCols; j++) {
+      for (size_t i = 0; i < fNRows; i++) {
+         hostMatrix(i, j) = buffer[index];
+         index++;
+      }
+   }
+
+   delete[] buffer;
+   return hostMatrix;
+}
+
+} // namespace DNN
+} // namespace TMVA
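Because the element buffer is packed column-major, a TMatrixT can be round-tripped through the device and compared element by element; a usage sketch (sizes illustrative):

   // Round-trip sketch for TCudaMatrix.
   TMatrixT<Double_t> host(2, 3);
   for (Int_t i = 0; i < 2; i++)
      for (Int_t j = 0; j < 3; j++)
         host(i, j) = i + 0.1 * j;

   TCudaMatrix device(host);          // packs column-major, copies to the GPU
   TMatrixT<Double_t> back = device;  // conversion operator copies back
   // back(i, j) == host(i, j) for all i, j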
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/DataLoader.cu b/tmva/tmva/src/DNN/Architectures/Cuda/DataLoader.cu
new file mode 100644
index 0000000000000..05dc5be445559
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/DataLoader.cu
@@ -0,0 +1,377 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 13/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+///////////////////////////////////////////////////////////////
+// Implementation of the data loader for Cuda architectures. //
+///////////////////////////////////////////////////////////////
+
+#include <algorithm>
+
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/DataLoader.h"
+
+namespace TMVA {
+namespace DNN {
+
+
+// Inline Function Bodies
+//____________________________________________________________________________
+
+
+
+// TCudaBatchIterator
+//____________________________________________________________________________
+template <typename Data_t>
+TCudaBatchIterator<Data_t>::TCudaBatchIterator(TCudaDataLoader<Data_t> &dataLoader,
+                                               IndexIterator_t sampleIndexIterator,
+                                               IndexIterator_t sampleIndexIteratorEnd)
+    : fDataLoader(dataLoader),
+      fSampleIndexIterator(sampleIndexIterator),
+      fSampleIndexIteratorEnd(sampleIndexIteratorEnd),
+      fNbatchesInEpoch(dataLoader.GetNBatchesInEpoch()),
+      fBatchSize(dataLoader.GetBatchSize()),
+      fTransferBatchSize(dataLoader.GetBatchBatchSize())
+{
+   size_t distance = std::distance(fSampleIndexIterator,
+                                   fSampleIndexIteratorEnd);
+   fBatchIndex = fNbatchesInEpoch - (distance / fBatchSize);
+}
+
+//____________________________________________________________________________
+template <typename Data_t>
+inline void TCudaBatchIterator<Data_t>::PrepareStream()
+{
+   for (size_t i = 0; i < fDataLoader.GetPreloadOffset() + 1; i++) {
+      TCudaDataLoader<Data_t>::CopyBatches(fDataLoader.GetInputData(),
+                                           fSampleIndexIterator,
+                                           fSampleIndexIteratorEnd,
+                                           fBatchSize,
+                                           fTransferBatchSize,
+                                           fDataLoader.GetInputTransferBuffer(),
+                                           fDataLoader.GetOutputTransferBuffer());
+      fDataLoader.InvokeTransfer();
+      fSampleIndexIterator += fTransferBatchSize * fBatchSize;
+   }
+}
+
+//____________________________________________________________________________
+template <typename Data_t>
+inline TCudaBatchIterator<Data_t> & TCudaBatchIterator<Data_t>::operator++()
+{
+   fBatchIndex++;
+   if ((fBatchIndex % fTransferBatchSize) == 0) {
+      if (fBatchIndex < fNbatchesInEpoch) {
+         TCudaDataLoader<Data_t>::CopyBatches(fDataLoader.GetInputData(),
+                                              fSampleIndexIterator,
+                                              fSampleIndexIteratorEnd,
+                                              fBatchSize,
+                                              fTransferBatchSize,
+                                              fDataLoader.GetInputTransferBuffer(),
+                                              fDataLoader.GetOutputTransferBuffer());
+         fDataLoader.InvokeTransfer();
+         fSampleIndexIterator += fTransferBatchSize * fBatchSize;
+      }
+   }
+   return *this;
+}
+
+//____________________________________________________________________________
+template <typename Data_t>
+inline TCudaBatch TCudaBatchIterator<Data_t>::operator*()
+{
+   return fDataLoader.GetCurrentBatch(fBatchIndex);
+}
+
+//____________________________________________________________________________
+template <typename Data_t>
+inline bool TCudaBatchIterator<Data_t>::operator==(const TCudaBatchIterator & other)
+{
+   return fBatchIndex == other.fBatchIndex;
+}
+
+//____________________________________________________________________________
+template <typename Data_t>
+inline bool TCudaBatchIterator<Data_t>::operator!=(const TCudaBatchIterator & other)
+{
+   return fBatchIndex != other.fBatchIndex;
+}
+
+// TCudaDataLoader
+//____________________________________________________________________________
+template <typename Data_t>
+TCudaDataLoader<Data_t>::TCudaDataLoader(const Data_t & inputData,
+                                         size_t nsamples,
+                                         size_t batchSize,
+                                         size_t ninputFeatures,
+                                         size_t noutputFeatures,
+                                         size_t batchBatchSize,
+                                         size_t preloadOffset)
+    : fInputData(inputData), fNsamples(nsamples), fNinputFeatures(ninputFeatures),
+      fNoutputFeatures(noutputFeatures), fBatchSize(batchSize),
+      fTransferBatchSize(batchBatchSize), fPreloadOffset(preloadOffset),
+      fStreamIndex(0), fSampleIndices(nsamples)
+{
+   fNbatchesInEpoch = fNsamples / fBatchSize;
+
+   fInputMatrixSize  = fNinputFeatures * fBatchSize;
+   fOutputMatrixSize = fNoutputFeatures * fBatchSize;
+   fTransferSize  = fTransferBatchSize * (fInputMatrixSize + fOutputMatrixSize);
+   fTransferSize *= sizeof(CudaDouble_t);
+
+   fHostData    = new CudaDouble_t * [fPreloadOffset + 1];
+   fDeviceData  = new CudaDouble_t * [fPreloadOffset + 1];
+   fDataStreams = new cudaStream_t  [fPreloadOffset + 1];
+
+   for (size_t i = 0; i < fPreloadOffset + 1; i++) {
+      CUDACHECK(cudaMallocHost(fHostData + i, fTransferSize));
+      CUDACHECK(cudaMalloc(fDeviceData + i, fTransferSize));
+      cudaStreamCreate(fDataStreams + i);
+   }
+
+   for (size_t i = 0; i < fNsamples; i++)
+      fSampleIndices[i] = i;
+}
+
+//____________________________________________________________________________
+template <typename Data_t>
+TCudaDataLoader<Data_t>::~TCudaDataLoader()
+{
+   // Release the pinned staging buffers, device buffers and streams.
+   for (size_t i = 0; i < fPreloadOffset + 1; i++) {
+      cudaFreeHost(fHostData[i]);
+      cudaFree(fDeviceData[i]);
+      cudaStreamDestroy(fDataStreams[i]);
+   }
+   delete[] fHostData;
+   delete[] fDeviceData;
+   delete[] fDataStreams;
+}
+
+//____________________________________________________________________________
+template <typename Data_t>
+inline TCudaBatchIterator<Data_t> TCudaDataLoader<Data_t>::begin() {
+   std::random_shuffle(fSampleIndices.begin(), fSampleIndices.end());
+   TCudaBatchIterator<Data_t> iterator(*this, fSampleIndices.begin(),
+                                       fSampleIndices.end());
+   iterator.PrepareStream();
+   return iterator;
+}
+
+//____________________________________________________________________________
+template <typename Data_t>
+inline TCudaBatchIterator<Data_t> TCudaDataLoader<Data_t>::end() {
+   return TCudaBatchIterator<Data_t>(*this, fSampleIndices.end(),
+                                     fSampleIndices.end());
+}
+
+
+//____________________________________________________________________________
+template <typename Data_t>
+inline CudaDouble_t * TCudaDataLoader<Data_t>::GetInputTransferBuffer() const
+{
+   return fHostData[fStreamIndex];
+}
+
+//____________________________________________________________________________
+template <typename Data_t>
+inline CudaDouble_t * TCudaDataLoader<Data_t>::GetOutputTransferBuffer() const
+{
+   return fHostData[fStreamIndex] + fTransferBatchSize * fInputMatrixSize;
+}
+
+//____________________________________________________________________________
+template <typename Data_t>
+void TCudaDataLoader<Data_t>::InvokeTransfer()
+{
+   cudaMemcpyAsync(fDeviceData[fStreamIndex],
+                   fHostData[fStreamIndex],
+                   fTransferSize,
+                   cudaMemcpyHostToDevice,
+                   fDataStreams[fStreamIndex]);
+
+   // Cycle through buffers and data streams.
+   fStreamIndex = (fStreamIndex + 1) % (fPreloadOffset + 1);
+}
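InvokeTransfer cycles through fPreloadOffset + 1 pinned staging buffers and streams, so the host-to-device copy of the next transfer batch can overlap computation on the current one; asynchronous copies only overlap when the host memory is pinned, which is why the constructor uses cudaMallocHost. The generic pattern, stripped of the TMVA classes, looks roughly like this:

   // Generic two-buffer overlap sketch (illustrative, not from the patch):
   // while the GPU consumes chunk c-1, the host uploads chunk c.
   #include <cuda_runtime.h>

   void PipelinedTransfers(double *hostChunks[2], double *deviceChunks[2],
                           size_t chunkBytes, int nChunks)
   {
      cudaStream_t streams[2];
      cudaStreamCreate(&streams[0]);
      cudaStreamCreate(&streams[1]);

      for (int c = 0; c < nChunks; c++) {
         int s = c % 2;
         // Upload chunk c on its stream; kernels enqueued on streams[s]
         // after this call see the data without blocking the host.
         cudaMemcpyAsync(deviceChunks[s], hostChunks[s], chunkBytes,
                         cudaMemcpyHostToDevice, streams[s]);
         // ... enqueue compute work on streams[s] here ...
      }
      cudaStreamSynchronize(streams[0]);
      cudaStreamSynchronize(streams[1]);
      cudaStreamDestroy(streams[0]);
      cudaStreamDestroy(streams[1]);
   }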
+
+//____________________________________________________________________________
+template <typename Data_t>
+auto TCudaDataLoader<Data_t>::GetCurrentBatch(size_t batchIndex)
+    -> TCudaBatch
+{
+   size_t bufferIndex     = batchIndex % fTransferBatchSize;
+   size_t nextStreamIndex = fStreamIndex;
+
+   CudaDouble_t * inputDataPointer = fDeviceData[nextStreamIndex];
+   inputDataPointer += bufferIndex * fInputMatrixSize;
+   CudaDouble_t * outputDataPointer = fDeviceData[nextStreamIndex];
+   outputDataPointer += fTransferBatchSize * fInputMatrixSize;
+   outputDataPointer += bufferIndex * fOutputMatrixSize;
+
+   cudaStreamSynchronize(fDataStreams[nextStreamIndex]);
+   return TCudaBatch(fBatchSize, fNinputFeatures, fNoutputFeatures,
+                     inputDataPointer, outputDataPointer,
+                     fDataStreams[nextStreamIndex]);
+}
+
+//____________________________________________________________________________
+template <>
+inline void TCudaDataLoader<MatrixInput_t>::CopyBatches(
+    MatrixInput_t data,
+    IndexIterator_t sampleIndexIteratorBegin,
+    IndexIterator_t sampleIndexIteratorEnd,
+    size_t batchSize,
+    size_t batchBatchSize,
+    CudaDouble_t * inputBuffer,
+    CudaDouble_t * outputBuffer)
+{
+
+   const TMatrixT<Double_t> &inputData  = std::get<0>(data);
+   const TMatrixT<Double_t> &outputData = std::get<1>(data);
+
+   size_t n_input  = inputData.GetNcols();
+   size_t n_output = outputData.GetNcols();
+
+   // Copy input matrices.
+
+   auto sampleIndexIterator = sampleIndexIteratorBegin;
+   size_t bufferOffset = 0;
+   for (size_t b = 0; b < batchBatchSize; b++) {
+      for (size_t i = 0; i < batchSize; i++) {
+         if (sampleIndexIterator < sampleIndexIteratorEnd) {
+            size_t sampleIndex = *sampleIndexIterator;
+            for (size_t j = 0; j < n_input; j++) {
+               size_t bufferIndex = bufferOffset + j * batchSize + i;
+               inputBuffer[bufferIndex] = inputData(sampleIndex, j);
+            }
+            sampleIndexIterator++;
+         }
+      }
+      bufferOffset += batchSize * n_input;
+   }
+
+   // Copy output matrices.
+
+   sampleIndexIterator = sampleIndexIteratorBegin;
+   bufferOffset = 0;
+   for (size_t b = 0; b < batchBatchSize; b++) {
+      for (size_t i = 0; i < batchSize; i++) {
+         if (sampleIndexIterator < sampleIndexIteratorEnd) {
+            size_t sampleIndex = *sampleIndexIterator;
+            for (size_t j = 0; j < n_output; j++) {
+               size_t bufferIndex = bufferOffset + j * batchSize + i;
+               outputBuffer[bufferIndex] = outputData(sampleIndex, j);
+            }
+            sampleIndexIterator++;
+         }
+      }
+      bufferOffset += batchSize * n_output;
+   }
+}
+
+//____________________________________________________________________________
+template <>
+inline void TCudaDataLoader<TMVAInput_t>::CopyBatches(
+    TMVAInput_t data,
+    IndexIterator_t sampleIndexIteratorBegin,
+    IndexIterator_t sampleIndexIteratorEnd,
+    size_t batchSize,
+    size_t batchBatchSize,
+    CudaDouble_t * inputBuffer,
+    CudaDouble_t * outputBuffer)
+{
+
+   Event * event = data.front();
+   size_t nInput  = event->GetNVariables();
+   size_t nOutput = (event->GetNTargets() == 0) ? 1 : event->GetNTargets();
+
+   // Copy input matrices.
+
+   auto sampleIndexIterator = sampleIndexIteratorBegin;
+   size_t bufferOffset = 0;
+   for (size_t b = 0; b < batchBatchSize; b++) {
+      for (size_t i = 0; i < batchSize; i++) {
+         if (sampleIndexIterator < sampleIndexIteratorEnd) {
+            size_t sampleIndex = *sampleIndexIterator;
+            event = data[sampleIndex];
+            for (size_t j = 0; j < nInput; j++) {
+               size_t bufferIndex = bufferOffset + j * batchSize + i;
+               inputBuffer[bufferIndex] = event->GetValue(j);
+            }
+            sampleIndexIterator++;
+         }
+      }
+      bufferOffset += batchSize * nInput;
+   }
+
+   // Copy output matrices.
+
+   sampleIndexIterator = sampleIndexIteratorBegin;
+   bufferOffset = 0;
+   for (size_t b = 0; b < batchBatchSize; b++) {
+      for (size_t i = 0; i < batchSize; i++) {
+         if (sampleIndexIterator < sampleIndexIteratorEnd) {
+            size_t sampleIndex = *sampleIndexIterator;
+            event = data[sampleIndex];
+            for (size_t j = 0; j < nOutput; j++) {
+               size_t bufferIndex = bufferOffset + j * batchSize + i;
+               if (event->GetNTargets() == 0) {
+                  outputBuffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0;
+               } else {
+                  outputBuffer[bufferIndex] = event->GetTarget(j);
+               }
+            }
+            sampleIndexIterator++;
+         }
+      }
+      bufferOffset += batchSize * nOutput;
+   }
+}
+
+// Explicit instantiation.
+template class TCudaBatchIterator<MatrixInput_t>;
+template class TCudaBatchIterator<TMVAInput_t>;
+template class TCudaDataLoader<MatrixInput_t>;
+template class TCudaDataLoader<TMVAInput_t>;
+
+template<>
+void TDataLoader<MatrixInput_t, TCuda>::CopyInput(TCudaHostBuffer & buffer,
+                                                  IndexIterator_t sampleIterator,
+                                                  size_t batchSize)
+{
+   const TMatrixT<Double_t> &inputMatrix = std::get<0>(fData);
+   size_t n = inputMatrix.GetNcols();
+
+   for (size_t i = 0; i < batchSize; i++) {
+      size_t sampleIndex = *sampleIterator;
+      for (size_t j = 0; j < n; j++) {
+         size_t bufferIndex = j * batchSize + i;
+         buffer[bufferIndex] = inputMatrix(sampleIndex, j);
+      }
+      sampleIterator++;
+   }
+}
+
+template<>
+void TDataLoader<MatrixInput_t, TCuda>::CopyOutput(TCudaHostBuffer & buffer,
+                                                   IndexIterator_t sampleIterator,
+                                                   size_t batchSize)
+{
+   const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
+   size_t n = outputMatrix.GetNcols();
+
+   for (size_t i = 0; i < batchSize; i++) {
+      size_t sampleIndex = *sampleIterator;
+      for (size_t j = 0; j < n; j++) {
+         size_t bufferIndex = j * batchSize + i;
+         buffer[bufferIndex] = outputMatrix(sampleIndex, j);
+      }
+      sampleIterator++;
+   }
+}
+
+template class TDataLoader<MatrixInput_t, TCuda>;
+
+} // namespace DNN
+} // namespace TMVA
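A hypothetical epoch loop over the loader (argument order follows the constructor above; the concrete sizes, transfer-batch size and preload offset are illustrative):

   // Usage sketch for the matrix-backed data loader.
   TMatrixT<Double_t> X(1000, 20), Y(1000, 1);   // ... filled with training data ...
   MatrixInput_t input(X, Y);
   TCudaDataLoader<MatrixInput_t> loader(input, 1000, /*batchSize=*/50,
                                         /*ninputFeatures=*/20, /*noutputFeatures=*/1,
                                         /*batchBatchSize=*/5, /*preloadOffset=*/1);
   for (auto it = loader.begin(); it != loader.end(); ++it) {
      TCudaBatch batch = *it;   // device-resident input and output matrices
      // ... forward/backward pass on batch ...
   }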
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Dropout.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Dropout.cu
new file mode 100644
index 0000000000000..76aa842b8f560
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/Dropout.cu
@@ -0,0 +1,37 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 14/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/Architectures/Cuda/Device.h"
+
+/////////////////////////////////////////////////////////////////////
+// Implementation of the Dropout function for TCuda architectures. //
+/////////////////////////////////////////////////////////////////////
+
+namespace TMVA {
+namespace DNN {
+
+//____________________________________________________________________________
+void TCuda::Dropout(TCudaMatrix &A, CudaDouble_t dropoutProbability)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(A);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::Dropout<<<gridDims, blockDims, 0, s>>>(
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols(),
+       dropoutProbability,
+       TCudaMatrix::GetCurandStatesPointer());
+}
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Initialization.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Initialization.cu
new file mode 100644
index 0000000000000..d66a05048d6bc
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/Initialization.cu
@@ -0,0 +1,103 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 14/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+ /////////////////////////////////////////////////////////////
+ // Implementation of the initialization functions for CUDA //
+ // architectures.                                          //
+ /////////////////////////////////////////////////////////////
+
+#include "TRandom.h"
+#include "TMatrix.h"
+#include "TMVA/DNN/Architectures/Cuda.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//______________________________________________________________________________
+void TCuda::InitializeGauss(TCudaMatrix & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   TRandom rand(time(nullptr));
+   TMatrixT<Double_t> B(m, n);
+
+   Real_t sigma = sqrt(2.0 / ((Real_t) n));
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         B(i,j) = rand.Gaus(0.0, sigma);
+      }
+   }
+   A = B;
+}
+
+//______________________________________________________________________________
+void TCuda::InitializeUniform(TCudaMatrix & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   TRandom rand(time(nullptr));
+   TMatrixT<Double_t> B(m, n);
+
+   Real_t range = sqrt(2.0 / ((Real_t) n));
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         B(i,j) = rand.Uniform(-range, range);
+      }
+   }
+   A = B;
+}
+
+//______________________________________________________________________________
+void TCuda::InitializeIdentity(TCudaMatrix & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+   TMatrixT<Double_t> B(m, n);
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n ; j++) {
+         B(i,j) = 0.0;
+      }
+
+      if (i < n) {
+         B(i,i) = 1.0;
+      }
+   }
+   A = B;
+}
+
+//______________________________________________________________________________
+void TCuda::InitializeZero(TCudaMatrix & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+   TMatrixT<Double_t> B(m, n);
+
+   // Iterate over rows and columns, not over m * n rows.
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n ; j++) {
+         B(i,j) = 0.0;
+      }
+   }
+   A = B;
+}
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Kernels.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Kernels.cu
new file mode 100644
index 0000000000000..1c75e7be4d4d4
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/Kernels.cu
@@ -0,0 +1,638 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 13/07/16
+
+/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +///////////////////////////////////////////////////////////////////////// +// Implementation of the device kernels for the CUDA implementation of // +// the low-level interface. // +///////////////////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Cuda.h" +#include "TMVA/DNN/Architectures/Cuda/Device.h" +#include "cuda.h" +#include "math.h" + +namespace TMVA { +namespace DNN { +namespace Cuda { + +//____________________________________________________________________________ +__device__ CudaDouble_t AtomicAdd(CudaDouble_t* address, CudaDouble_t val) +{ + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + } while (assumed != old); + return __longlong_as_double(old); +} + +//____________________________________________________________________________ +__device__ void ReduceSumVertical(CudaDouble_t *result, + CudaDouble_t * sdata, + int n) +{ + // i,j are block row and column indices. + int i = threadIdx.y; + int j = threadIdx.x; + int index = i * blockDim.x + j; + + __syncthreads(); + if ((blockDim.y > 512) && (i < 512)) { + if ((i + 512) < blockDim.y) { + sdata[index] += sdata[index + 512 * blockDim.x]; + } + } + + __syncthreads(); + if ((blockDim.y > 256) && (i < 256)) { + if ((i + 256) < blockDim.y) { + sdata[index] += sdata[index + 256 * blockDim.x]; + } + } + __syncthreads(); + if ((blockDim.y > 128) && (i < 128)) { + if ((i + 128) < blockDim.y) { + sdata[index] += sdata[index + 128 * blockDim.x]; + } + } + __syncthreads(); + if ((blockDim.y > 64) && (i < 64)) { + if ((i + 64) < blockDim.y) { + sdata[index] += sdata[index + 64 * blockDim.x]; + } + } + __syncthreads(); + if ((blockDim.y > 32) && (i < 32)) { + if ((i + 32) < blockDim.y) { + sdata[index] += sdata[index + 32 * blockDim.x]; + } + } + __syncthreads(); + if ((blockDim.y > 16) && (i < 16)) { + if ((i + 16) < blockDim.y) { + sdata[index] += sdata[index + 16 * blockDim.x]; + } + } + __syncthreads(); + if ((blockDim.y > 8) && (i < 8)) { + if ((i + 8) < blockDim.y) { + sdata[index] += sdata[index + 8 * blockDim.x]; + } + } + __syncthreads(); + if ((blockDim.y > 4) && (i < 4)) { + if ((i + 4) < blockDim.y) { + sdata[index] += sdata[index + 4 * blockDim.x]; + } + } + __syncthreads(); + if ((blockDim.y > 2) && (i < 2)) { + if ((i + 2) < blockDim.y) { + sdata[index] += sdata[index + 2 * blockDim.x]; + } + } + __syncthreads(); + if ((blockDim.y > 1) && (i < 1)) { + if ((i + 1) < blockDim.y) { + sdata[index] += sdata[index + 1 * blockDim.x]; + } + } + __syncthreads(); + if ((i == 0) && ((blockIdx.x * blockDim.x + threadIdx.x) < n)) { + AtomicAdd(result + j, sdata[index]); + } + __syncthreads(); +} + +//____________________________________________________________________________ +__device__ void ReduceSum(CudaDouble_t *result, CudaDouble_t * sdata) +{ + int tid = threadIdx.x + threadIdx.y * blockDim.x; + + __syncthreads(); + if ((TDevice::BlockSize > 512) && (tid < 512)) { + if ((tid + 512) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 
512]; + } + } + + __syncthreads(); + if ((TDevice::BlockSize > 256) && (tid < 256)) { + if ((tid + 256) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 256]; + } + } + __syncthreads(); + if ((TDevice::BlockSize > 128) && (tid < 128)) { + if ((tid + 128) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 128]; + } + } + __syncthreads(); + if ((TDevice::BlockSize > 64) && (tid < 64)) { + if ((tid + 64) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 64]; + } + } + __syncthreads(); + if ((TDevice::BlockSize > 32) && (tid < 32)) { + if ((tid + 32) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 32]; + } + } + __syncthreads(); + if ((TDevice::BlockSize > 16) && (tid < 16)) { + if ((tid + 16) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 16]; + } + } + __syncthreads(); + if ((TDevice::BlockSize > 8) && (tid < 8)) { + if ((tid + 8) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 8]; + } + } + __syncthreads(); + if ((TDevice::BlockSize > 4) && (tid < 4)) { + if ((tid + 4) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 4]; + } + } + __syncthreads(); + if ((TDevice::BlockSize > 2) && (tid < 2)) { + if ((tid + 2) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 2]; + } + } + __syncthreads(); + if ((TDevice::BlockSize > 1) && (tid < 1)) { + if ((tid + 1) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 1]; + } + } + if (tid == 0) { + AtomicAdd(result, sdata[0]); + } + + __syncthreads(); +} + +//____________________________________________________________________________ +__global__ void AddRowWise(CudaDouble_t * W, + const CudaDouble_t * theta, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) + W[index] += theta[j]; +} + +//____________________________________________________________________________ +__global__ void Hadamard(CudaDouble_t * B, + const CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) + B[index] *= A[index]; +} + +//____________________________________________________________________________ +__global__ void IdentityDerivative(CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) + A[index] = 1.0; +} + +//____________________________________________________________________________ +__global__ void Relu(CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + CudaDouble_t x = A[index]; + A[index] = (x < 0.0) ? 0.0 : x; + } +} + +//____________________________________________________________________________ +__global__ void ReluDerivative(CudaDouble_t * B, + const CudaDouble_t * A, int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + CudaDouble_t x = A[index]; + B[index] = (x < 0.0) ? 
0.0 : 1.0; + } +} + +//____________________________________________________________________________ +__global__ void Sigmoid(CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + CudaDouble_t sig = 1.0 / (1.0 + exp(-A[index])); + A[index] = sig; + } +} + +//____________________________________________________________________________ +__global__ void Sigmoid(CudaDouble_t * B, + const CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + CudaDouble_t sig = 1.0 / (1.0 + exp(-A[index])); + B[index] = sig; + } +} +//____________________________________________________________________________ +__global__ void SigmoidDerivative(CudaDouble_t * B, + const CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + CudaDouble_t sig = 1.0 / (1.0 + exp(-A[index])); + B[index] = sig * (1.0 - sig); + } +} + +//____________________________________________________________________________ +__global__ void Tanh(CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + CudaDouble_t t = ::tanh(A[index]); + A[index] = t; + } +} + +//____________________________________________________________________________ +__global__ void TanhDerivative(CudaDouble_t * B, + const CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + CudaDouble_t t = ::tanh(A[index]); + B[index] = 1 - t*t; + } +} + +//____________________________________________________________________________ +__global__ void SymmetricRelu(CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + A[index] = abs(A[index]); + } +} + +//____________________________________________________________________________ +__global__ void SymmetricReluDerivative(CudaDouble_t * B, + const CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + B[index] = (A[index] < 0.0) ? 
-1.0 : 1.0; + } +} + +//____________________________________________________________________________ +__global__ void SoftSign(CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + CudaDouble_t x = A[index]; + A[index] = x / (1.0 + abs(x)); + } +} + +//____________________________________________________________________________ +__global__ void SoftSignDerivative(CudaDouble_t * B, + const CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + CudaDouble_t x = 1.0 + fabs(A[index]); + B[index] = 1 / (x * x); + } +} + +//____________________________________________________________________________ +__global__ void Gauss(CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + CudaDouble_t x = A[index]; + A[index] = exp(- x * x); + } +} + +//____________________________________________________________________________ +__global__ void GaussDerivative(CudaDouble_t * B, + const CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + CudaDouble_t x = A[index]; + B[index] = - 2.0 * x * exp(- x * x); + } +} + +//____________________________________________________________________________ +__global__ void MeanSquaredError(CudaDouble_t * result, + const CudaDouble_t * Y, + const CudaDouble_t * output, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int tid = blockDim.x * threadIdx.y + threadIdx.x; + int index = j * m + i; + + __shared__ CudaDouble_t sdata[TDevice::BlockSize]; + + if ((i < m) && (j < n)) { + CudaDouble_t norm = 1 / ((CudaDouble_t) (m * n)); + CudaDouble_t e = Y[index] - output[index]; + sdata[tid] = norm * e * e; + } else { + sdata[tid] = 0.0; + } + ReduceSum(result, sdata); +} + +//____________________________________________________________________________ +__global__ void SquaredSum(CudaDouble_t * result, + const CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int tid = blockDim.x * threadIdx.y + threadIdx.x; + int index = j * m + i; + + __shared__ CudaDouble_t sdata[TDevice::BlockSize]; + + if ((i < m) && (j < n)) { + CudaDouble_t e = A[index]; + sdata[tid] = e * e; + } else { + sdata[tid] = 0.0; + } + ReduceSum(result, sdata); +} + +//____________________________________________________________________________ +__global__ void AbsoluteSum(CudaDouble_t * result, + const CudaDouble_t * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int tid = blockDim.x * threadIdx.y + threadIdx.x; + int index = j * m + i; + + __shared__ CudaDouble_t sdata[TDevice::BlockSize]; + + if ((i < m) && (j < n)) { + sdata[tid] = abs(A[index]); + } else { + sdata[tid] = 0.0; + } + ReduceSum(result, sdata); +} + +//____________________________________________________________________________ +__global__ void MeanSquaredErrorGradients(CudaDouble_t * dY, + const CudaDouble_t * Y, + const CudaDouble_t * output, + int m, int n) +{ + int i = 
blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) + dY[index] = 2.0 / ((CudaDouble_t) (m * n)) * (output[index] - Y[index]); +} + +//____________________________________________________________________________ +__global__ void AddL1RegularizationGradients(CudaDouble_t * A, + const CudaDouble_t * B, + CudaDouble_t weightDecay, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + CudaDouble_t sign = (B[index] < 0.0) ? -1.0 : 1.0; + A[index] += sign * weightDecay; + } +} + +//____________________________________________________________________________ +__global__ void AddL2RegularizationGradients(CudaDouble_t * A, + const CudaDouble_t * B, + CudaDouble_t weightDecay, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + A[index] += 2.0 * weightDecay * B[index]; + } +} + +//____________________________________________________________________________ +__global__ void CrossEntropy(CudaDouble_t * result, + const CudaDouble_t * Y, + const CudaDouble_t * output, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int tid = blockDim.x * threadIdx.y + threadIdx.x; + int index = j * m + i; + + __shared__ CudaDouble_t sdata[TDevice::BlockSize]; + + if ((i < m) && (j < n)) { + CudaDouble_t norm = 1 / ((CudaDouble_t) (m * n)); + CudaDouble_t sig = 1.0 / (1.0 + exp(-output[index])); + CudaDouble_t ce = Y[index] * log(sig) + (1.0 - Y[index]) * log(1.0 - sig); + sdata[tid] = - norm * ce; + } else { + sdata[tid] = 0.0; + } + + ReduceSum(result, sdata); +} + +//____________________________________________________________________________ +__global__ void CrossEntropyGradients(CudaDouble_t * dY, + const CudaDouble_t * Y, + const CudaDouble_t * output, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + CudaDouble_t norm = 1 / ((CudaDouble_t) (m * n)); + CudaDouble_t y = Y[index]; + CudaDouble_t sig = 1.0 / (1.0 + exp(-output[index])); + dY[index] = norm * (sig - y); + } +} + +//____________________________________________________________________________ +__global__ void ReduceMatrix(CudaDouble_t *result, + const CudaDouble_t *A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int tid = threadIdx.y * blockDim.x + threadIdx.x; + int index = j * m + i; + + __shared__ CudaDouble_t smem[TDevice::BlockSize]; + if ((i < m) && (j < n)) + smem[tid] = A[index]; + else + smem[tid] = 0.0; + + ReduceSum(result, smem); +} + +//____________________________________________________________________________ +__global__ void SumColumns(CudaDouble_t *B, + const CudaDouble_t *A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int matrixIndex = j * m + i; + int blockIndex = blockDim.x * threadIdx.y + threadIdx.x; + + + __shared__ CudaDouble_t smem[TDevice::BlockSize]; + + if ((i < m) && (j < n)) { + smem[blockIndex] = A[matrixIndex]; + } else { + smem[blockIndex] = 0.0; + } + + ReduceSumVertical(B + blockDim.x * blockIdx.x, smem, n); +} + 
+//____________________________________________________________________________ +__global__ void InitializeCurandStates(unsigned long long seed, + curandState_t *state) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int tid = i * gridDim.x + j; + curand_init(seed, 0, tid, state + tid); +} + +//____________________________________________________________________________ +__global__ void Dropout(CudaDouble_t *A, + int m, int n, + CudaDouble_t dropoutProbability, + curandState_t *state) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int tid = i * gridDim.x + j; + if ((i < m) && (j < n)) { + float r = curand_uniform(state + tid); + if (r > dropoutProbability) { + A[j * m + i] = 0.0; + } else { + A[j * m + i] /= dropoutProbability; + } + } +} + +} // namespace Cuda +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/LossFunctions.cu b/tmva/tmva/src/DNN/Architectures/Cuda/LossFunctions.cu new file mode 100644 index 0000000000000..1f5479fce75b6 --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Cuda/LossFunctions.cu @@ -0,0 +1,95 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 13/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +////////////////////////////////////////////////////////////////////// +// Implementation of the loss functions for the TCuda implementation // +// of the low-level interface. 
//
+//////////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/Architectures/Cuda/Device.h"
+#include "TMVA/DNN/Architectures/Cuda/Kernels.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//____________________________________________________________________________
+CudaDouble_t TCuda::MeanSquaredError(const TCudaMatrix & Y,
+                                     const TCudaMatrix & output)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(Y);
+   cudaStream_t s = Y.GetComputeStream();
+   TCudaMatrix::ResetDeviceReturn();
+   ::TMVA::DNN::Cuda::MeanSquaredError<<<gridDims, blockDims, 0, s>>>(
+       TCudaMatrix::GetDeviceReturnPointer(),
+       Y.GetDataPointer(),
+       output.GetDataPointer(),
+       (int) Y.GetNrows(),
+       (int) Y.GetNcols());
+   return TCudaMatrix::GetDeviceReturn();
+}
+
+//____________________________________________________________________________
+void TCuda::MeanSquaredErrorGradients(TCudaMatrix & dY,
+                                      const TCudaMatrix & Y,
+                                      const TCudaMatrix & output)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(Y);
+   cudaStream_t s = output.GetComputeStream();
+   ::TMVA::DNN::Cuda::MeanSquaredErrorGradients<<<gridDims, blockDims, 0, s>>>(
+       dY.GetDataPointer(),
+       Y.GetDataPointer(),
+       output.GetDataPointer(),
+       (int) Y.GetNrows(),
+       (int) Y.GetNcols());
+   dY.SetComputeStream(s);
+}
+
+//____________________________________________________________________________
+CudaDouble_t TCuda::CrossEntropy(const TCudaMatrix & Y,
+                                 const TCudaMatrix & output)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(Y);
+   TCudaMatrix::ResetDeviceReturn();
+   cudaStream_t s = Y.GetComputeStream();
+   ::TMVA::DNN::Cuda::CrossEntropy<<<gridDims, blockDims, 0, s>>>(
+       TCudaMatrix::GetDeviceReturnPointer(),
+       Y.GetDataPointer(),
+       output.GetDataPointer(),
+       (int) Y.GetNrows(),
+       (int) Y.GetNcols());
+   return TCudaMatrix::GetDeviceReturn();
+}
+
+//____________________________________________________________________________
+void TCuda::CrossEntropyGradients(TCudaMatrix & dY,
+                                  const TCudaMatrix & Y,
+                                  const TCudaMatrix & output)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(Y);
+   cudaStream_t s = output.GetComputeStream();
+   ::TMVA::DNN::Cuda::CrossEntropyGradients<<<gridDims, blockDims, 0, s>>>(
+       dY.GetDataPointer(),
+       Y.GetDataPointer(),
+       output.GetDataPointer(),
+       (int) Y.GetNrows(),
+       (int) Y.GetNcols());
+   dY.SetComputeStream(s);
+}
+
+} // namespace DNN
+} // namespace TMVA
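Both loss functions reduce into a single device-side scalar via ResetDeviceReturn / GetDeviceReturn. A host-side reference implementation of the mean squared error, mirroring the 1/(m*n) normalization of the kernel, makes a convenient sanity check (a sketch, not part of the patch):

   // Host-side reference for the device MSE result.
   Double_t MeanSquaredErrorHost(const TMatrixT<Double_t> &Y,
                                 const TMatrixT<Double_t> &output)
   {
      Double_t sum = 0.0;
      Int_t m = Y.GetNrows(), n = Y.GetNcols();
      for (Int_t i = 0; i < m; i++) {
         for (Int_t j = 0; j < n; j++) {
            Double_t e = Y(i, j) - output(i, j);
            sum += e * e;
         }
      }
      return sum / (m * n);
   }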
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/OutputFunctions.cu b/tmva/tmva/src/DNN/Architectures/Cuda/OutputFunctions.cu
new file mode 100644
index 0000000000000..09dc46a3e8d1b
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/OutputFunctions.cu
@@ -0,0 +1,39 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 11/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+////////////////////////////////////////////////////////////////
+// Implementation of the output functions for the TCuda       //
+// architecture.                                              //
+////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/Architectures/Cuda/Device.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+void TCuda::Sigmoid(TCudaMatrix & B,
+                    const TCudaMatrix & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(B);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::Sigmoid<<<gridDims, blockDims, 0, s>>>(
+       B.GetDataPointer(),
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+   B.SetComputeStream(s);
+}
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Propagation.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Propagation.cu
new file mode 100644
index 0000000000000..f8e2b3bf7c8a3
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/Propagation.cu
@@ -0,0 +1,96 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 13/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+ //////////////////////////////////////////////////////////////////
+ // Implementation of the functions required for the forward and //
+ // backward propagation of activations through a neural network //
+ // for CUDA architectures.                                      //
+ //////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/Architectures/Cuda/Device.h"
+#include "TMVA/DNN/Architectures/Cuda/Kernels.h"
+
+namespace TMVA {
+namespace DNN {
+
+
+void TCuda::MultiplyTranspose(TCudaMatrix &output,
+                              const TCudaMatrix &input,
+                              const TCudaMatrix &Weights)
+{
+   int m, n, k;
+   k = input.GetNcols();
+   m = input.GetNrows();
+   n = Weights.GetNrows();
+   CudaDouble_t alpha = 1.0, beta = 0.0;
+
+   // Compute C = beta * C + alpha * (A * B^T)
+   cudaStream_t s = input.GetComputeStream();
+   cublasSetStream(input.GetCublasHandle(), s);
+   cublasDgemm(input.GetCublasHandle(),
+               CUBLAS_OP_N, CUBLAS_OP_T,
+               m, n, k, &alpha,
+               input.GetDataPointer(), m,     // *A, lda
+               Weights.GetDataPointer(), n,   // *B, ldb
+               &beta,                         // beta
+               output.GetDataPointer(), m);   // *C, ldc
+   output.SetComputeStream(s);
+}
+
+void TCuda::AddRowWise(TCudaMatrix &Weights,
+                       const TCudaMatrix &theta)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(Weights);
+   cudaStream_t s = Weights.GetComputeStream();
+   ::TMVA::DNN::Cuda::AddRowWise<<<gridDims, blockDims, 0, s>>>(
+       Weights.GetDataPointer(),
+       theta.GetDataPointer(),
+       Weights.GetNrows(),
+       Weights.GetNcols());
+}
+
+void TCuda::Backward(TCudaMatrix & activation_gradients_backward,
+                     TCudaMatrix & weight_gradients,
+                     TCudaMatrix & bias_gradients,
+                     TCudaMatrix & df,
+                     const TCudaMatrix & activation_gradients,
+                     const TCudaMatrix & weights,
+                     const TCudaMatrix & activation_backward)
+{
+   // Compute element-wise product.
+   TCuda::Hadamard(df, activation_gradients);
+
+   // Activation gradients.
+   if (activation_gradients_backward.GetNoElements() > 0)
+      TCuda::Multiply(activation_gradients_backward, df, weights);
+
+   // Weight gradients.
+   if (weight_gradients.GetNoElements() > 0)
+      TCuda::TransposeMultiply(weight_gradients, df, activation_backward);
+
+   // Bias gradients.
+   if (bias_gradients.GetNoElements() > 0)
+      TCuda::SumColumns(bias_gradients, df);
+
+}
+
+void TCuda::Copy(TCudaMatrix & B, const TCudaMatrix & A)
+{
+   size_t m = B.GetNrows();
+   size_t n = B.GetNcols();
+   cudaMemcpyAsync(B.GetDataPointer(), A.GetDataPointer(),
+                   m * n * sizeof(CudaDouble_t), cudaMemcpyDeviceToDevice,
+                   0);
+}
+
+} // namespace DNN
+} // namespace TMVA
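With events stored as rows, the forward step computes output = input * W^T plus a row-wise bias. A host-side sketch of the same computation (assuming the biases are stored as a single-column matrix, as the AddRowWise kernel suggests):

   // Host-side reference for MultiplyTranspose + AddRowWise:
   // output(i,k) = sum_j input(i,j) * W(k,j) + theta(k).
   void ForwardHost(TMatrixT<Double_t> &output,
                    const TMatrixT<Double_t> &input,
                    const TMatrixT<Double_t> &W,
                    const TMatrixT<Double_t> &theta)
   {
      for (Int_t i = 0; i < input.GetNrows(); i++) {
         for (Int_t k = 0; k < W.GetNrows(); k++) {
            Double_t sum = 0.0;
            for (Int_t j = 0; j < input.GetNcols(); j++)
               sum += input(i, j) * W(k, j);
            output(i, k) = sum + theta(k, 0);
         }
      }
   }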
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Regularization.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Regularization.cu
new file mode 100644
index 0000000000000..161b7361260fa
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/Regularization.cu
@@ -0,0 +1,88 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 13/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////
+// Contains the definitions of the kernel calling functions for //
+// computation of regularization functionals and gradients      //
+// functions for CUDA architectures.                            //
+//////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/Architectures/Cuda/Device.h"
+#include "TMVA/DNN/Architectures/Cuda/Kernels.h"
+
+namespace TMVA {
+namespace DNN {
+
+//______________________________________________________________________________
+CudaDouble_t TCuda::L1Regularization(const TCudaMatrix & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(A);
+   cudaStream_t s = A.GetComputeStream();
+   TCudaMatrix::ResetDeviceReturn();
+   ::TMVA::DNN::Cuda::AbsoluteSum<<<gridDims, blockDims, 0, s>>>(
+       TCudaMatrix::GetDeviceReturnPointer(),
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+   return TCudaMatrix::GetDeviceReturn();
+}
+
+//______________________________________________________________________________
+void TCuda::AddL1RegularizationGradients(TCudaMatrix & B,
+                                         const TCudaMatrix & A,
+                                         CudaDouble_t weightDecay)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(B);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::AddL1RegularizationGradients<<<gridDims, blockDims, 0, s>>>(
+       B.GetDataPointer(),
+       A.GetDataPointer(),
+       weightDecay,
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+}
+
+//______________________________________________________________________________
+CudaDouble_t TCuda::L2Regularization(const TCudaMatrix & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(A);
+   cudaStream_t s = A.GetComputeStream();
+   TCudaMatrix::ResetDeviceReturn();
+   ::TMVA::DNN::Cuda::SquaredSum<<<gridDims, blockDims, 0, s>>>(
+       TCudaMatrix::GetDeviceReturnPointer(),
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+   return TCudaMatrix::GetDeviceReturn();
+}
+
+//______________________________________________________________________________
+void TCuda::AddL2RegularizationGradients(TCudaMatrix & B,
+                                         const TCudaMatrix & A,
+                                         CudaDouble_t weightDecay)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(B);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::AddL2RegularizationGradients<<<gridDims, blockDims, 0, s>>>(
+       B.GetDataPointer(),
+       A.GetDataPointer(),
+       weightDecay,
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+}
+
+} // namespace DNN
+} // namespace TMVA
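For reference, the quantities these kernels accumulate and differentiate: L1 sums |w| and contributes weightDecay * sign(w) to each gradient entry, L2 sums w^2 and contributes 2 * weightDecay * w, matching the kernels defined earlier. A host-side sketch of the L1 term:

   // Host-side reference for the L1 regularization functional.
   #include <cmath>

   Double_t L1Host(const TMatrixT<Double_t> &W)
   {
      Double_t sum = 0.0;
      for (Int_t i = 0; i < W.GetNrows(); i++)
         for (Int_t j = 0; j < W.GetNcols(); j++)
            sum += std::fabs(W(i, j));
      return sum;
   }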
diff --git a/tmva/tmva/src/DNN/Architectures/Reference.cxx b/tmva/tmva/src/DNN/Architectures/Reference.cxx
new file mode 100644
index 0000000000000..0d8eaf2b422fe
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Reference.cxx
@@ -0,0 +1,32 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 10/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+////////////////////////////////////////////////////////////////
+// Explicit instantiation of the TReference architecture class //
+// template for Double_t scalar types.                         //
+////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Reference.h"
+
+#include "Reference/Propagation.cxx"
+#include "Reference/ActivationFunctions.cxx"
+#include "Reference/OutputFunctions.cxx"
+#include "Reference/LossFunctions.cxx"
+#include "Reference/Regularization.cxx"
+#include "Reference/Initialization.cxx"
+#include "Reference/Dropout.cxx"
+
+namespace TMVA {
+namespace DNN {
+template class TReference<Double_t>;
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Reference/ActivationFunctions.cxx b/tmva/tmva/src/DNN/Architectures/Reference/ActivationFunctions.cxx
new file mode 100644
index 0000000000000..93afd86bbabc8
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Reference/ActivationFunctions.cxx
@@ -0,0 +1,237 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 10/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+ //////////////////////////////////////////////////////////////////
+ // Implementation of the activation functions for the reference //
+ // implementation.                                               //
+ //////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Reference.h"
+#include <cmath>
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//______________________________________________________________________________
+template<typename Real_t>
+void TReference<Real_t>::IdentityDerivative(TMatrixT<Real_t> & B,
+                                            const TMatrixT<Real_t> &A)
+{
+   size_t m,n;
+   m = B.GetNrows();
+   n = B.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         B(i,j) = 1.0;
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+void TReference<Real_t>::Relu(TMatrixT<Real_t> &A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         A(i,j) = std::max((Real_t) 0.0, A(i,j));
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::ReluDerivative(TMatrixT<Real_t> & B,
+                                               const TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++)
+   {
+      for (size_t j = 0; j < n; j++)
+      {
+         B(i,j) = (A(i,j) < 0) ? 0.0 : 1.0;
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+void TReference<Real_t>::Sigmoid(TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t sig = 1.0 / (1.0 + std::exp(-A(i,j)));
+         A(i,j) = sig;
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::SigmoidDerivative(TMatrixT<Real_t> & B,
+                                                  const TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t sig = 1.0 / (1.0 + std::exp(-A(i,j)));
+         B(i,j) = sig * (1.0 - sig);
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::Tanh(TMatrixT<Real_t> & B)
+{
+   size_t m,n;
+   m = B.GetNrows();
+   n = B.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t t = tanh(B(i,j));
+         B(i,j) = t;
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::TanhDerivative(TMatrixT<Real_t> & B,
+                                               const TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t t = tanh(A(i,j));
+         B(i,j) = 1 - t * t;
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::SymmetricRelu(TMatrixT<Real_t> & B)
+{
+   size_t m,n;
+   m = B.GetNrows();
+   n = B.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         B(i,j) = fabs(B(i,j));
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::SymmetricReluDerivative(TMatrixT<Real_t> & B,
+                                                        const TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         B(i,j) = (A(i,j) < 0.0) ? -1.0 : 1.0;
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::SoftSign(TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t x = A(i,j);
+         A(i,j) = x / (1 + fabs(x));
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::SoftSignDerivative(TMatrixT<Real_t> & B,
+                                                   const TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t x = 1.0 + fabs(A(i,j));
+         B(i,j) = 1.0 / (x * x);
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::Gauss(TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t x = A(i,j);
+         A(i,j) = exp(- x * x);
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::GaussDerivative(TMatrixT<Real_t> & B,
+                                                const TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t x = A(i,j);
+         B(i,j) = - 2.0 * x * exp(- x * x);
+      }
+   }
+}
+} // namespace DNN
+} // namespace TMVA
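For quick reference, the three less common activations implemented above, together with the derivatives coded next to them, are

$$f_{\text{symmrelu}}(x) = |x|, \qquad f_{\text{softsign}}(x) = \frac{x}{1+|x|}, \qquad f_{\text{gauss}}(x) = e^{-x^{2}}$$

$$f'_{\text{symmrelu}}(x) = \operatorname{sign}(x), \qquad f'_{\text{softsign}}(x) = \frac{1}{(1+|x|)^{2}}, \qquad f'_{\text{gauss}}(x) = -2x\,e^{-x^{2}}$$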
diff --git a/tmva/tmva/src/DNN/Architectures/Reference/DataLoader.cxx b/tmva/tmva/src/DNN/Architectures/Reference/DataLoader.cxx
new file mode 100644
index 0000000000000..9a287cd9b743c
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Reference/DataLoader.cxx
@@ -0,0 +1,142 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 12/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+/////////////////////////////////////////////////////////
+// Implementation of the data loader for the reference //
+// implementation.                                      //
+/////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Reference/DataLoader.h"
+#include "TMVA/Event.h"
+#include <algorithm>
+
+namespace TMVA
+{
+namespace DNN
+{
+
+template <typename Data_t, typename Real_t>
+TReferenceDataLoader<Data_t, Real_t>::TReferenceDataLoader(const Data_t &input,
+                                                           size_t nsamples,
+                                                           size_t batchSize,
+                                                           size_t ninputFeatures,
+                                                           size_t noutputFeatures)
+    : fInput(input), fNSamples(nsamples), fBatchSize(batchSize),
+      fNInputFeatures(ninputFeatures), fNOutputFeatures(noutputFeatures),
+      fNBatches(nsamples / batchSize), fInputMatrices(), fOutputMatrices(),
+      fBatches(), fSampleIndices()
+{
+   fInputMatrices.reserve(fNBatches);
+   fOutputMatrices.reserve(fNBatches);
+   for (size_t i = 0; i < fNBatches; i++) {
+      fInputMatrices.emplace_back(fBatchSize, fNInputFeatures);
+      fOutputMatrices.emplace_back(fBatchSize, fNOutputFeatures);
+   }
+
+   fBatches.reserve(fNBatches);
+   fSampleIndices.reserve(fNSamples);
+
+   for (size_t i = 0; i < fNSamples; i++) {
+      fSampleIndices.emplace_back(i);
+   }
+}
+
+template <typename Data_t, typename Real_t>
+auto TReferenceDataLoader<Data_t, Real_t>::begin()
+    -> BatchIterator_t
+{
+   std::random_shuffle(fSampleIndices.begin(), fSampleIndices.end());
+   fBatches.clear();
+   fBatches.reserve(fNBatches);
+
+   size_t sampleIndex = 0;
+   for (size_t batchIndex = 0; batchIndex < fNBatches; batchIndex++) {
+
+      CopyBatch(fInputMatrices[batchIndex],
+                fOutputMatrices[batchIndex],
+                fInput,
+                fSampleIndices.begin() + sampleIndex,
+                fSampleIndices.begin() + sampleIndex + fBatchSize);
+      fBatches.emplace_back(fInputMatrices[batchIndex],
+                            fOutputMatrices[batchIndex]);
+      sampleIndex += fBatchSize;
+   }
+   return fBatches.begin();
+}
+
+template <typename Data_t, typename Real_t>
+auto TReferenceDataLoader<Data_t, Real_t>::end()
+    -> BatchIterator_t
+{
+   return fBatches.end();
+}
+
+using MatrixInput_t = std::pair<const TMatrixT<Double_t> &,
+                                const TMatrixT<Double_t> &>;
+
+template <>
+void TReferenceDataLoader<MatrixInput_t, Double_t>::CopyBatch(
+    Matrix_t &inputMatrix,
+    Matrix_t &outputMatrix,
+    const MatrixInput_t &input,
+    IndexIterator_t indexBegin,
+    IndexIterator_t indexEnd)
+{
+   const Matrix_t &in  = std::get<0>(input);
+   const Matrix_t &out = std::get<1>(input);
+
+   size_t batchIndex = 0;
+   for (IndexIterator_t i = indexBegin; i != indexEnd; i++) {
+      size_t index = *i;
+      for (size_t j = 0; j < (size_t) in.GetNcols(); j++) {
+         inputMatrix(batchIndex, j) = in(index, j);
+      }
+      for (size_t j = 0; j < (size_t) out.GetNcols(); j++) {
+         outputMatrix(batchIndex, j) = out(index, j);
+      }
+      batchIndex++;
+   }
+}
+
+using TMVAInput_t = std::vector<Event *>;
+
+template <>
+void TReferenceDataLoader<TMVAInput_t, Double_t>::CopyBatch(
+    Matrix_t &inputMatrix,
+    Matrix_t &outputMatrix,
+    const TMVAInput_t &input,
+    IndexIterator_t indexBegin,
+    IndexIterator_t indexEnd)
+{
+   size_t batchIndex = 0;
+   for (IndexIterator_t i = indexBegin; i != indexEnd; i++) {
+      size_t index = *i;
+      Event *event = input.at(index);
+      for (size_t j = 0; j < event->GetNVariables(); j++) {
+         inputMatrix(batchIndex, j) = event->GetValue(j);
+      }
+      if (event->GetNTargets() > 0) {
+         for (size_t j = 0; j < event->GetNTargets(); j++) {
+            outputMatrix(batchIndex, j) = event->GetTarget(j);
+         }
+      } else {
+         outputMatrix(batchIndex, 0) = (event->GetClass() == 0) ? 1.0 : 0.0;
+      }
+      batchIndex++;
+   }
+}
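+
+// Usage sketch (hedged, not part of the sources): one begin()/end() pass
+// visits each of the fNBatches shuffled batches exactly once, e.g.
+//
+//    TMatrixT<Double_t> X(100, 4), Y(100, 1);
+//    MatrixInput_t input(X, Y);
+//    TReferenceDataLoader<MatrixInput_t, Double_t> loader(input, 100, 20, 4, 1);
+//    for (auto it = loader.begin(); it != loader.end(); ++it) {
+//       // *it refers to a batch holding a 20 x 4 input and a 20 x 1 output
+//       // matrix; the batch accessor names are declared in DataLoader.h.
+//    }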
+
+// Explicit instantiation.
+template class TReferenceDataLoader<MatrixInput_t, Double_t>;
+template class TReferenceDataLoader<TMVAInput_t, Double_t>;
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Reference/Dropout.cxx b/tmva/tmva/src/DNN/Architectures/Reference/Dropout.cxx
new file mode 100644
index 0000000000000..04e14811436c0
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Reference/Dropout.cxx
@@ -0,0 +1,50 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 10/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+ //////////////////////////////////////////////////////////////////
+ // Implementation of the dropout function for the reference     //
+ // implementation.                                               //
+ //////////////////////////////////////////////////////////////////
+
+
+#include "TMVA/DNN/Architectures/Reference.h"
+#include "TRandom.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//______________________________________________________________________________
+
+template<typename Real_t>
+void TReference<Real_t>::Dropout(TMatrixT<Real_t> & B, Real_t dropoutProbability)
+{
+   size_t m,n;
+   m = B.GetNrows();
+   n = B.GetNcols();
+
+   TRandom rand(time(nullptr));
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t r = rand.Uniform();
+         if (r >= dropoutProbability) {
+            B(i,j) = 0.0;
+         } else {
+            B(i,j) /= dropoutProbability;
+         }
+      }
+   }
+}
+
+} // namespace DNN
+} // namespace TMVA
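One subtlety worth spelling out: in the implementation above, dropoutProbability is the probability of *keeping* an element, and surviving entries are rescaled by its inverse (inverted dropout), so activations keep their expectation:

$$B_{ij} \leftarrow \begin{cases} B_{ij}/p & \text{with probability } p \\ 0 & \text{with probability } 1-p \end{cases} \qquad\Longrightarrow\qquad \mathbb{E}\left[B_{ij}\right] = B_{ij}$$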
diff --git a/tmva/tmva/src/DNN/Architectures/Reference/Initialization.cxx b/tmva/tmva/src/DNN/Architectures/Reference/Initialization.cxx
new file mode 100644
index 0000000000000..edd6ecaedb700
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Reference/Initialization.cxx
@@ -0,0 +1,97 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 10/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+ //////////////////////////////////////////////////////////////////////
+ // Implementation of the initialization functions for the reference //
+ // implementation.                                                   //
+ //////////////////////////////////////////////////////////////////////
+
+#include "TRandom.h"
+#include "TMVA/DNN/Architectures/Reference.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//______________________________________________________________________________
+template<typename Real_t>
+void TReference<Real_t>::InitializeGauss(TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   TRandom rand(time(nullptr));
+
+   Real_t sigma = sqrt(2.0 / ((Real_t) n));
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         A(i,j) = rand.Gaus(0.0, sigma);
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+void TReference<Real_t>::InitializeUniform(TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   TRandom rand(time(nullptr));
+
+   Real_t range = sqrt(2.0 / ((Real_t) n));
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         A(i,j) = rand.Uniform(-range, range);
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+void TReference<Real_t>::InitializeIdentity(TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n ; j++) {
+         A(i,j) = 0.0;
+      }
+
+      if (i < n) {
+         A(i,i) = 1.0;
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+void TReference<Real_t>::InitializeZero(TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n ; j++) {
+         A(i,j) = 0.0;
+      }
+   }
+}
+
+} // namespace DNN
+} // namespace TMVA
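Both random initializations above scale with the fan-in $n$ of the layer (the number of columns of the weight matrix), in the spirit of He/Xavier initialization:

$$\sigma = \sqrt{\frac{2}{n}}, \qquad W_{ij} \sim \mathcal{N}(0,\sigma) \quad\text{or}\quad W_{ij} \sim \mathcal{U}(-\sigma,\sigma)$$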
diff --git a/tmva/tmva/src/DNN/Architectures/Reference/LossFunctions.cxx b/tmva/tmva/src/DNN/Architectures/Reference/LossFunctions.cxx
new file mode 100644
index 0000000000000..aa0b144be2053
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Reference/LossFunctions.cxx
@@ -0,0 +1,101 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 10/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+ ////////////////////////////////////////////////////////////
+ // Implementation of the loss functions for the reference //
+ // implementation.                                         //
+ ////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Reference.h"
+#include <cmath>
+
+namespace TMVA
+{
+namespace DNN
+{
+//______________________________________________________________________________
+template<typename Real_t>
+Real_t TReference<Real_t>::MeanSquaredError(const TMatrixT<Real_t> &Y,
+                                            const TMatrixT<Real_t> &output)
+{
+   size_t m,n;
+   m = Y.GetNrows();
+   n = Y.GetNcols();
+   Real_t result = 0.0;
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t dY = (Y(i,j) - output(i,j));
+         result += dY * dY;
+      }
+   }
+   result /= (Real_t) (m * n);
+   return result;
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+void TReference<Real_t>::MeanSquaredErrorGradients(TMatrixT<Real_t> & dY,
+                                                   const TMatrixT<Real_t> & Y,
+                                                   const TMatrixT<Real_t> & output)
+{
+   size_t m,n;
+   m = Y.GetNrows();
+   n = Y.GetNcols();
+
+   dY.Minus(Y, output);
+   dY *= - 2.0 / ((Real_t) (m*n));
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+Real_t TReference<Real_t>::CrossEntropy(const TMatrixT<Real_t> &Y,
+                                        const TMatrixT<Real_t> &output)
+{
+   size_t m,n;
+   m = Y.GetNrows();
+   n = Y.GetNcols();
+   Real_t result = 0.0;
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t sig = 1.0 / (1.0 + std::exp(-output(i,j)));
+         result += Y(i,j) * std::log(sig)
+                   + (1.0 - Y(i,j)) * std::log(1.0 - sig);
+      }
+   }
+   result /= - (Real_t) (m * n);
+   return result;
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+void TReference<Real_t>::CrossEntropyGradients(TMatrixT<Real_t> & dY,
+                                               const TMatrixT<Real_t> & Y,
+                                               const TMatrixT<Real_t> & output)
+{
+   size_t m,n;
+   m = Y.GetNrows();
+   n = Y.GetNcols();
+
+   Real_t norm = 1.0 / ((Real_t) (m * n));
+   for (size_t i = 0; i < m; i++)
+   {
+      for (size_t j = 0; j < n; j++)
+      {
+         Real_t y = Y(i,j);
+         Real_t sig = 1.0 / (1.0 + std::exp(-output(i,j)));
+         dY(i,j) = norm * (sig - y);
+      }
+   }
+}
+
+} // namespace DNN
+} // namespace TMVA
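Written out, with $Y$ the targets and $\hat{Y}$ the raw network output, the two losses above are

$$L_{\mathrm{MSE}} = \frac{1}{mn}\sum_{i,j}\bigl(Y_{ij}-\hat{Y}_{ij}\bigr)^{2}, \qquad \frac{\partial L_{\mathrm{MSE}}}{\partial \hat{Y}_{ij}} = \frac{2}{mn}\bigl(\hat{Y}_{ij}-Y_{ij}\bigr),$$

$$L_{\mathrm{CE}} = -\frac{1}{mn}\sum_{i,j}\Bigl[Y_{ij}\ln\sigma(\hat{Y}_{ij}) + (1-Y_{ij})\ln\bigl(1-\sigma(\hat{Y}_{ij})\bigr)\Bigr], \qquad \frac{\partial L_{\mathrm{CE}}}{\partial \hat{Y}_{ij}} = \frac{\sigma(\hat{Y}_{ij})-Y_{ij}}{mn}.$$

Note that CrossEntropy applies the sigmoid itself, i.e. it expects raw outputs; this is also why the GPU training path further down pairs ELossFunction::CROSSENTROPY with an IDENTITY output layer.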
diff --git a/tmva/tmva/src/DNN/Architectures/Reference/OutputFunctions.cxx b/tmva/tmva/src/DNN/Architectures/Reference/OutputFunctions.cxx
new file mode 100644
index 0000000000000..731c95713d404
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Reference/OutputFunctions.cxx
@@ -0,0 +1,37 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 11/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+////////////////////////////////////////////////////////////////
+// Implementation of the output functions for the reference   //
+// implementation.                                             //
+////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Reference.h"
+#include <cmath>
+
+namespace TMVA {
+namespace DNN {
+
+template<typename Real_t>
+void TReference<Real_t>::Sigmoid(TMatrixT<Real_t> & B,
+                                 const TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t sig = 1.0 / (1.0 + std::exp(-A(i,j)));
+         B(i,j) = sig;
+      }
+   }
+}
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Reference/Propagation.cxx b/tmva/tmva/src/DNN/Architectures/Reference/Propagation.cxx
new file mode 100644
index 0000000000000..1b360810603f3
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Reference/Propagation.cxx
@@ -0,0 +1,95 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 10/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+/////////////////////////////////////////////////////////////////////
+// Implementation of the functions required for the forward and    //
+// backward propagation of activations through a neural network in //
+// the reference implementation.                                   //
+/////////////////////////////////////////////////////////////////////
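+//
+// For a single layer with input activations X (one event per row),
+// weight matrix W and incoming gradients G = dL/dY, Backward() below
+// computes, following the usual backpropagation algebra:
+//
+//    df <- f'(Z) (.) G          element-wise product
+//    dX <- df W                 gradients w.r.t. the previous activations
+//    dW <- df^T X               weight gradients
+//    db <- column sums of df    bias gradients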
+
+#include "TMVA/DNN/Architectures/Reference.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+template<typename Real_t>
+void TReference<Real_t>::MultiplyTranspose(TMatrixT<Real_t> &output,
+                                           const TMatrixT<Real_t> &input,
+                                           const TMatrixT<Real_t> &weights)
+{
+   output.MultT(input, weights);
+}
+
+template<typename Real_t>
+void TReference<Real_t>::AddRowWise(TMatrixT<Real_t> &output,
+                                    const TMatrixT<Real_t> &biases)
+{
+   for (size_t i = 0; i < (size_t) output.GetNrows(); i++) {
+      for (size_t j = 0; j < (size_t) output.GetNcols(); j++) {
+         output(i,j) += biases(j,0);
+      }
+   }
+}
+
+template<typename Real_t>
+void TReference<Real_t>::Backward(TMatrixT<Real_t> & activation_gradients_backward,
+                                  TMatrixT<Real_t> & weight_gradients,
+                                  TMatrixT<Real_t> & bias_gradients,
+                                  TMatrixT<Real_t> & df,
+                                  const TMatrixT<Real_t> & activation_gradients,
+                                  const TMatrixT<Real_t> & weights,
+                                  const TMatrixT<Real_t> & activations_backward)
+{
+
+   // Compute element-wise product.
+   for (size_t i = 0; i < (size_t) df.GetNrows(); i++) {
+      for (size_t j = 0; j < (size_t) df.GetNcols(); j++) {
+         df(i,j) *= activation_gradients(i,j);
+      }
+   }
+
+   // Activation gradients.
+   if (activation_gradients_backward.GetNoElements() > 0) {
+      activation_gradients_backward.Mult(df, weights);
+   }
+
+   // Weights gradients.
+   if (weight_gradients.GetNoElements() > 0) {
+      weight_gradients.TMult(df, activations_backward);
+   }
+
+   // Bias gradients.
+   if (bias_gradients.GetNoElements() > 0) {
+      for (size_t j = 0; j < (size_t) df.GetNcols(); j++) {
+         Scalar_t sum = 0.0;
+         for (size_t i = 0; i < (size_t) df.GetNrows(); i++) {
+            sum += df(i,j);
+         }
+         bias_gradients(j,0) = sum;
+      }
+   }
+}
+
+template<typename Real_t>
+void TReference<Real_t>::ScaleAdd(TMatrixT<Real_t> & A,
+                                  const TMatrixT<Real_t> & B,
+                                  Scalar_t beta)
+{
+   for (size_t i = 0; i < (size_t) A.GetNrows(); i++) {
+      for (size_t j = 0; j < (size_t) A.GetNcols(); j++) {
+         A(i,j) += beta * B(i,j);
+      }
+   }
+}
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Reference/Regularization.cxx b/tmva/tmva/src/DNN/Architectures/Reference/Regularization.cxx
new file mode 100644
index 0000000000000..ffe0f19ec7db1
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Reference/Regularization.cxx
@@ -0,0 +1,98 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 10/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+ //////////////////////////////////////////////////////////////////////
+ // Implementation of the regularization functions for the reference //
+ // implementation.
// + ////////////////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Reference.h" + +namespace TMVA +{ +namespace DNN +{ + +//______________________________________________________________________________ +template +Real_t TReference::L1Regularization(const TMatrixT & W) +{ + size_t m,n; + m = W.GetNrows(); + n = W.GetNcols(); + + Real_t result = 0.0; + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + result += std::abs(W(i,j)); + } + } + return result; +} + +//______________________________________________________________________________ +template +void TReference::AddL1RegularizationGradients(TMatrixT & A, + const TMatrixT & W, + Real_t weightDecay) +{ + size_t m,n; + m = W.GetNrows(); + n = W.GetNcols(); + + Real_t sign = 0.0; + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + sign = (W(i,j) > 0.0) ? 1.0 : -1.0; + A(i,j) += sign * weightDecay; + } + } +} + +//______________________________________________________________________________ +template +Real_t TReference::L2Regularization(const TMatrixT & W) +{ + size_t m,n; + m = W.GetNrows(); + n = W.GetNcols(); + + Real_t result = 0.0; + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + result += W(i,j) * W(i,j); + } + } + return result; +} + +//______________________________________________________________________________ +template +void TReference::AddL2RegularizationGradients(TMatrixT & A, + const TMatrixT & W, + Real_t weightDecay) +{ + size_t m,n; + m = W.GetNrows(); + n = W.GetNcols(); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + A(i,j) += weightDecay * 2.0 * W(i,j); + } + } +} + +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/src/MethodDNN.cxx b/tmva/tmva/src/MethodDNN.cxx index d3f0e547ca349..c5fb4a9ac9722 100644 --- a/tmva/tmva/src/MethodDNN.cxx +++ b/tmva/tmva/src/MethodDNN.cxx @@ -42,6 +42,9 @@ #include "TMVA/Config.h" #include "TMVA/Ranking.h" +#include "TMVA/DNN/Net.h" +#include "TMVA/DNN/Architectures/Reference.h" + #include "TMVA/NeuralNet.h" #include "TMVA/Monitoring.h" @@ -52,7 +55,10 @@ REGISTER_METHOD(DNN) ClassImp(TMVA::MethodDNN) - +using TMVA::DNN::EActivationFunction; +using TMVA::DNN::ELossFunction; +using TMVA::DNN::EInitialization; +using TMVA::DNN::EOutputFunction; namespace TMVA @@ -160,6 +166,7 @@ void TMVA::MethodDNN::DeclareOptions() AddPreDefVal(TString("XAVIERUNIFORM")); AddPreDefVal(TString("LAYERSIZE")); + DeclareOptionRef(fGPUString="True", "GPU", "Use GPU for training."); DeclareOptionRef(fTrainingStrategy="LearningRate=1e-1,Momentum=0.3,Repetitions=3,ConvergenceSteps=50,BatchSize=30,TestRepetitions=7,WeightDecay=0.0,Renormalize=L2,DropConfig=0.0,DropRepetitions=5|LearningRate=1e-4,Momentum=0.3,Repetitions=3,ConvergenceSteps=50,BatchSize=20,TestRepetitions=7,WeightDecay=0.001,Renormalize=L2,DropConfig=0.0+0.5+0.5,DropRepetitions=5,Multithreading=True", "TrainingStrategy", "defines the training strategies"); @@ -388,6 +395,12 @@ void TMVA::MethodDNN::ProcessOptions() else fWeightInitializationStrategy = TMVA::DNN::WeightInitializationStrategy::XAVIER; + fGPUString.ToUpper (); + if (fGPUString.BeginsWith ("T")) + fGPU = true; + else + fGPU = false; + // create settings if (fAnalysisType == Types::kClassification) { @@ -492,160 +505,382 @@ void TMVA::MethodDNN::ProcessOptions() //______________________________________________________________________________ void TMVA::MethodDNN::Train() { - - fMonitoring = NULL; - // if (!fMonitoring) - // { - // 
fMonitoring = make_shared(); - // fMonitoring->Start (); - // } + if (fGPU) { + TrainGPU(); + } else { + + fMonitoring = NULL; + // if (!fMonitoring) + // { + // fMonitoring = make_shared(); + // fMonitoring->Start (); + // } + + // INITIALIZATION + // create pattern + std::vector trainPattern; + std::vector testPattern; + + const std::vector& eventCollectionTraining = GetEventCollection (Types::kTraining); + const std::vector& eventCollectionTesting = GetEventCollection (Types::kTesting); + + for (size_t iEvt = 0, iEvtEnd = eventCollectionTraining.size (); iEvt < iEvtEnd; ++iEvt) + { + const TMVA::Event* event = eventCollectionTraining.at (iEvt); + const std::vector& values = event->GetValues (); + if (fAnalysisType == Types::kClassification) + { + double outputValue = event->GetClass () == 0 ? 0.9 : 0.1; + trainPattern.push_back (Pattern (values.begin (), values.end (), outputValue, event->GetWeight ())); + trainPattern.back ().addInput (1.0); // bias node + } + else + { + const std::vector& targets = event->GetTargets (); + trainPattern.push_back (Pattern (values.begin (), values.end (), targets.begin (), targets.end (), event->GetWeight ())); + trainPattern.back ().addInput (1.0); // bias node + } + } + + for (size_t iEvt = 0, iEvtEnd = eventCollectionTesting.size (); iEvt < iEvtEnd; ++iEvt) + { + const TMVA::Event* event = eventCollectionTesting.at (iEvt); + const std::vector& values = event->GetValues (); + if (fAnalysisType == Types::kClassification) + { + double outputValue = event->GetClass () == 0 ? 0.9 : 0.1; + testPattern.push_back (Pattern (values.begin (), values.end (), outputValue, event->GetWeight ())); + testPattern.back ().addInput (1.0); // bias node + } + else + { + const std::vector& targets = event->GetTargets (); + testPattern.push_back (Pattern (values.begin (), values.end (), targets.begin (), targets.end (), event->GetWeight ())); + testPattern.back ().addInput (1.0); // bias node + } + } + + if (trainPattern.empty () || testPattern.empty ()) + return; + + // create net and weights + fNet.clear (); + fWeights.clear (); + + // if "resume" from saved weights + if (fResume) + { + std::cout << ".. resume" << std::endl; + // std::tie (fNet, fWeights) = ReadWeights (fFileName); + } + else // initialize weights and net + { + size_t inputSize = GetNVariables (); //trainPattern.front ().input ().size (); + size_t outputSize = fAnalysisType == Types::kClassification ? 1 : GetNTargets (); //trainPattern.front ().output ().size (); + fNet.setInputSize (inputSize + 1); // num vars + bias node + fNet.setOutputSize (outputSize); // num vars + bias node + + // configure neural net + auto itLayout = std::begin (fLayout), itLayoutEnd = std::end (fLayout)-1; // all layers except the last one + for ( ; itLayout != itLayoutEnd; ++itLayout) + { + fNet.addLayer (DNN::Layer ((*itLayout).first, (*itLayout).second)); + Log() << kINFO + << "Add Layer with " << (*itLayout).first << " nodes." + << Endl; + } + + DNN::ModeOutputValues eModeOutputValues = DNN::ModeOutputValues::SIGMOID; + if (fAnalysisType == Types::kRegression) + { + eModeOutputValues = DNN::ModeOutputValues::DIRECT; + } + else if ((fAnalysisType == Types::kClassification || + fAnalysisType == Types::kMulticlass) && + fModeErrorFunction == TMVA::DNN::ModeErrorFunction::SUMOFSQUARES) + { + eModeOutputValues = DNN::ModeOutputValues::DIRECT; + } + fNet.addLayer (DNN::Layer (outputSize, (*itLayout).second, eModeOutputValues)); + Log() << kINFO + << "Add Layer with " << outputSize << " nodes." 
+ << Endl << Endl; + fNet.setErrorFunction (fModeErrorFunction); + + size_t numWeights = fNet.numWeights (); + Log() << kINFO + << "Total number of Synapses = " + << numWeights + << Endl; + + // initialize weights + fNet.initializeWeights (fWeightInitializationStrategy, + std::back_inserter (fWeights)); + } + + + // loop through settings + // and create "settings" and minimizer + int idxSetting = 0; + for (auto itSettings = std::begin (fSettings), itSettingsEnd = std::end (fSettings); itSettings != itSettingsEnd; ++itSettings, ++idxSetting) + { + std::shared_ptr ptrSettings = *itSettings; + ptrSettings->setMonitoring (fMonitoring); + Log() << kINFO + << "Training with learning rate = " << ptrSettings->learningRate () + << ", momentum = " << ptrSettings->momentum () + << ", repetitions = " << ptrSettings->repetitions () + << Endl; + + ptrSettings->setProgressLimits ((idxSetting)*100.0/(fSettings.size ()), (idxSetting+1)*100.0/(fSettings.size ())); + + const std::vector& dropConfig = ptrSettings->dropFractions (); + if (!dropConfig.empty ()) + { + Log () << kINFO << "Drop configuration" << Endl + << " drop repetitions = " << ptrSettings->dropRepetitions () << Endl; + } + int idx = 0; + for (auto f : dropConfig) + { + Log () << kINFO << " Layer " << idx << " = " << f << Endl; + ++idx; + } + Log () << kINFO << Endl; + + if (ptrSettings->minimizerType () == TMVA::DNN::MinimizerType::fSteepest) + { + DNN::Steepest minimizer (ptrSettings->learningRate (), ptrSettings->momentum (), ptrSettings->repetitions ()); + /*E =*/fNet.train (fWeights, trainPattern, testPattern, minimizer, *ptrSettings.get ()); + } + ptrSettings.reset (); + Log () << kINFO << Endl; + } + fMonitoring = 0; + } +} - // INITIALIZATION - // create pattern - std::vector trainPattern; - std::vector testPattern; +void TMVA::MethodDNN::TrainGPU() +{ - const std::vector& eventCollectionTraining = GetEventCollection (Types::kTraining); - const std::vector& eventCollectionTesting = GetEventCollection (Types::kTesting); +#ifdef DNNCUDA // Included only if DNNCUDA flag is set. - for (size_t iEvt = 0, iEvtEnd = eventCollectionTraining.size (); iEvt < iEvtEnd; ++iEvt) - { - const TMVA::Event* event = eventCollectionTraining.at (iEvt); - const std::vector& values = event->GetValues (); - if (fAnalysisType == Types::kClassification) - { - double outputValue = event->GetClass () == 0 ? 0.9 : 0.1; - trainPattern.push_back (Pattern (values.begin (), values.end (), outputValue, event->GetWeight ())); - trainPattern.back ().addInput (1.0); // bias node - } - else - { - const std::vector& targets = event->GetTargets (); - trainPattern.push_back (Pattern (values.begin (), values.end (), targets.begin (), targets.end (), event->GetWeight ())); - trainPattern.back ().addInput (1.0); // bias node - } - } + TMVA::DNN::TNet GPUNet{}; - for (size_t iEvt = 0, iEvtEnd = eventCollectionTesting.size (); iEvt < iEvtEnd; ++iEvt) - { - const TMVA::Event* event = eventCollectionTesting.at (iEvt); - const std::vector& values = event->GetValues (); - if (fAnalysisType == Types::kClassification) - { - double outputValue = event->GetClass () == 0 ? 
0.9 : 0.1; - testPattern.push_back (Pattern (values.begin (), values.end (), outputValue, event->GetWeight ())); - testPattern.back ().addInput (1.0); // bias node - } - else - { - const std::vector& targets = event->GetTargets (); - testPattern.push_back (Pattern (values.begin (), values.end (), targets.begin (), targets.end (), event->GetWeight ())); - testPattern.back ().addInput (1.0); // bias node - } - } + size_t inputSize = GetNVariables (); + size_t outputSize = (GetNTargets() == 0) ? 1 : GetNTargets(); - if (trainPattern.empty () || testPattern.empty ()) - return; + GPUNet.SetInputWidth(inputSize); - // create net and weights - fNet.clear (); - fWeights.clear (); + // Also need to set standard net structure. + fNet.setInputSize (inputSize + 1); + fNet.setOutputSize (outputSize); - // if "resume" from saved weights - if (fResume) + // configure neural net + auto itLayout = std::begin (fLayout), itLayoutEnd = std::end (fLayout)-1; // all layers except the last one + for ( ; itLayout != itLayoutEnd; ++itLayout) + { + fNet.addLayer (DNN::Layer ((*itLayout).first, (*itLayout).second)); + TMVA::DNN::EnumFunction f = (*itLayout).second; + switch(f) { - std::cout << ".. resume" << std::endl; - // std::tie (fNet, fWeights) = ReadWeights (fFileName); + case DNN::EnumFunction::RELU : + GPUNet.AddLayer((*itLayout).first, EActivationFunction::RELU); + break; + case DNN::EnumFunction::TANH : + GPUNet.AddLayer((*itLayout).first, EActivationFunction::TANH); + break; + case DNN::EnumFunction::SYMMRELU : + GPUNet.AddLayer((*itLayout).first, EActivationFunction::SYMMRELU); + break; + case DNN::EnumFunction::SOFTSIGN : + GPUNet.AddLayer((*itLayout).first, EActivationFunction::SOFTSIGN); + break; + case DNN::EnumFunction::SIGMOID : + GPUNet.AddLayer((*itLayout).first, EActivationFunction::SIGMOID); + break; + case DNN::EnumFunction::LINEAR : + GPUNet.AddLayer((*itLayout).first, EActivationFunction::IDENTITY); + break; + case DNN::EnumFunction::GAUSS : + GPUNet.AddLayer((*itLayout).first, EActivationFunction::GAUSS); + break; + default : + GPUNet.AddLayer((*itLayout).first, EActivationFunction::IDENTITY); + break; } - else // initialize weights and net - { - size_t inputSize = GetNVariables (); //trainPattern.front ().input ().size (); - size_t outputSize = fAnalysisType == Types::kClassification ? 1 : GetNTargets (); //trainPattern.front ().output ().size (); - fNet.setInputSize (inputSize + 1); // num vars + bias node - fNet.setOutputSize (outputSize); // num vars + bias node - - // configure neural net - auto itLayout = std::begin (fLayout), itLayoutEnd = std::end (fLayout)-1; // all layers except the last one - for ( ; itLayout != itLayoutEnd; ++itLayout) - { - fNet.addLayer (DNN::Layer ((*itLayout).first, (*itLayout).second)); - Log() << kINFO - << "Add Layer with " << (*itLayout).first << " nodes." - << Endl; - } + } - DNN::ModeOutputValues eModeOutputValues = DNN::ModeOutputValues::SIGMOID; - if (fAnalysisType == Types::kRegression) - { - eModeOutputValues = DNN::ModeOutputValues::DIRECT; - } - else if ((fAnalysisType == Types::kClassification || - fAnalysisType == Types::kMulticlass) && - fModeErrorFunction == TMVA::DNN::ModeErrorFunction::SUMOFSQUARES) - { - eModeOutputValues = DNN::ModeOutputValues::DIRECT; - } - fNet.addLayer (DNN::Layer (outputSize, (*itLayout).second, eModeOutputValues)); - Log() << kINFO - << "Add Layer with " << outputSize << " nodes." 
- << Endl << Endl; - fNet.setErrorFunction (fModeErrorFunction); - - size_t numWeights = fNet.numWeights (); - Log() << kINFO - << "Total number of Synapses = " - << numWeights - << Endl; + DNN::ModeOutputValues eModeOutputValues = DNN::ModeOutputValues::SIGMOID; + if (fAnalysisType == Types::kRegression) + { + eModeOutputValues = DNN::ModeOutputValues::DIRECT; + GPUNet.AddLayer(outputSize, EActivationFunction::IDENTITY); + GPUNet.SetLossFunction(ELossFunction::MEANSQUAREDERROR); + } else if ((fAnalysisType == Types::kClassification || + fAnalysisType == Types::kMulticlass) && + fModeErrorFunction == TMVA::DNN::ModeErrorFunction::SUMOFSQUARES) { + GPUNet.AddLayer(outputSize, EActivationFunction::IDENTITY); + GPUNet.SetLossFunction(ELossFunction::MEANSQUAREDERROR); + } else { + eModeOutputValues = DNN::ModeOutputValues::DIRECT; + GPUNet.AddLayer(outputSize, EActivationFunction::IDENTITY); + GPUNet.SetLossFunction(ELossFunction::CROSSENTROPY); + } - // initialize weights - fNet.initializeWeights (fWeightInitializationStrategy, - std::back_inserter (fWeights)); - } + fNet.addLayer (DNN::Layer (outputSize, (*itLayout).second, eModeOutputValues)); + fNet.setErrorFunction (fModeErrorFunction); + switch(fWeightInitializationStrategy) + { + case DNN::WeightInitializationStrategy::XAVIER : + GPUNet.Initialize(EInitialization::GAUSS); + break; + case DNN::WeightInitializationStrategy::XAVIERUNIFORM: + GPUNet.Initialize(EInitialization::UNIFORM); + break; + default : + GPUNet.Initialize(EInitialization::GAUSS); + break; + } + + size_t nTrainingSamples = GetEventCollection(Types::kTraining).size(); + size_t nTestSamples = GetEventCollection(Types::kTesting).size(); - // loop through settings - // and create "settings" and minimizer int idxSetting = 0; - for (auto itSettings = std::begin (fSettings), itSettingsEnd = std::end (fSettings); itSettings != itSettingsEnd; ++itSettings, ++idxSetting) + for (auto itSettings = std::begin (fSettings), itSettingsEnd = std::end (fSettings); + itSettings != itSettingsEnd; ++itSettings, ++idxSetting) { - std::shared_ptr ptrSettings = *itSettings; - ptrSettings->setMonitoring (fMonitoring); + + TMVA::DNN::Settings settings = **itSettings; + settings.setMonitoring (fMonitoring); + Log() << kINFO - << "Training with learning rate = " << ptrSettings->learningRate () - << ", momentum = " << ptrSettings->momentum () - << ", repetitions = " << ptrSettings->repetitions () + << "Training on GPU with learning rate = " + << settings.learningRate () + << ", momentum = " << settings.momentum () + << ", repetitions = " << settings.repetitions () << Endl; - ptrSettings->setProgressLimits ((idxSetting)*100.0/(fSettings.size ()), (idxSetting+1)*100.0/(fSettings.size ())); + settings.setProgressLimits ((idxSetting)*100.0/(fSettings.size ()), + (idxSetting+1)*100.0/(fSettings.size ())); - const std::vector& dropConfig = ptrSettings->dropFractions (); + const std::vector& dropConfig = settings.dropFractions (); if (!dropConfig.empty ()) - { - Log () << kINFO << "Drop configuration" << Endl - << " drop repetitions = " << ptrSettings->dropRepetitions () << Endl; - } + { + Log () << kINFO << "Drop configuration" << Endl + << " drop repetitions = " + << settings.dropRepetitions () << Endl; + } + + auto trainNet = GPUNet.CreateClone(settings.batchSize()); int idx = 0; for (auto f : dropConfig) - { - Log () << kINFO << " Layer " << idx << " = " << f << Endl; - ++idx; - } + { + Log () << kINFO << " Layer " << idx << " = " << f << Endl; + trainNet.GetLayer(idx).SetDropoutProbability(f); + 
++idx; + } Log () << kINFO << Endl; - - if (ptrSettings->minimizerType () == TMVA::DNN::MinimizerType::fSteepest) - { - DNN::Steepest minimizer (ptrSettings->learningRate (), ptrSettings->momentum (), ptrSettings->repetitions ()); - /*E =*/fNet.train (fWeights, trainPattern, testPattern, minimizer, *ptrSettings.get ()); + + using DataLoader_t = typename DNN::TCuda::DataLoader_t; + DataLoader_t trainingData(GetEventCollection(Types::kTraining), + nTrainingSamples, + trainNet.GetBatchSize(), + trainNet.GetInputWidth(), + trainNet.GetOutputWidth()); + + DataLoader_t testData(GetEventCollection(Types::kTesting), + nTestSamples, + nTestSamples, + trainNet.GetInputWidth(), + trainNet.GetOutputWidth()); + auto testNet = GPUNet.CreateClone(testData.GetBatchSize()); + DNN::TGradientDescent minimizer{}; + + minimizer.Reset(); + minimizer.SetLearningRate(settings.learningRate()); + minimizer.SetTestInterval(settings.testRepetitions()); + minimizer.SetConvergenceSteps(settings.convergenceSteps()); + + bool converged = false; + size_t stepCount = 0; + + while (!converged) + { + // Perform minimization steps for a full epoch. + if ((stepCount % minimizer.GetTestInterval()) != 0) { + for (auto batch : trainingData) { + auto inputMatrix = batch.GetInput(); + auto outputMatrix = batch.GetOutput(); + minimizer.StepReducedWeights(trainNet, inputMatrix, outputMatrix); + } + } else { + Double_t trainingError = 0.0; + for (auto batch : trainingData) { + auto inputMatrix = batch.GetInput(); + auto outputMatrix = batch.GetOutput(); + trainingError += minimizer.StepReducedWeightsLoss( + trainNet, + inputMatrix, + outputMatrix); + } + trainingError /= (Double_t) trainingData.GetNBatchesInEpoch(); + + auto testBatch = *testData.begin(); + auto testInput = testBatch.GetInput(); + auto testOutput = testBatch.GetOutput(); + minimizer.TestError(testNet, testInput, testOutput); + + TString convText = Form("(train/test/epo/conv/maxco): %.3g/%.3g/%d/%d", + trainingError, + minimizer.GetTestError(), + (int) stepCount, + (int) minimizer.GetConvergenceCount ()); + Double_t progress = minimizer.GetConvergenceCount() + / settings.convergenceSteps(); + settings.cycle(progress, convText); + converged = minimizer.HasConverged(); } - ptrSettings.reset (); - Log () << kINFO << Endl; + stepCount++; + } + fMonitoring = 0; } - fMonitoring = 0; -} + fWeights.clear(); + size_t weightIndex = 0; + size_t prevLayerWidth = GPUNet.GetInputWidth(); + for (size_t l = 0; l < GPUNet.GetDepth(); l++) { + auto &layer = GPUNet.GetLayer(l); + size_t layerWidth = layer.GetWidth(); + size_t layerSize = prevLayerWidth * layerWidth; + fWeights.reserve(fWeights.size() + layerSize); + TMatrixT Weights(layer.GetWeights()); + for (size_t j = 0; j < (size_t) Weights.GetNcols(); j++) { + for (size_t i = 0; i < (size_t) Weights.GetNrows(); i++) { + fWeights.push_back(Weights(i,j)); + } + } + if (l == 0) { + fWeights.reserve(fWeights.size() + layerWidth); + TMatrixT theta(layer.GetBiases()); + for (size_t i = 0; i < layerWidth; i++) { + fWeights.push_back(theta(i,0)); + } + } + prevLayerWidth = layerWidth; + } + +#else // DNNCUDA flag not set. + + Log() << kFATAL << "CUDA backend not enabled. Please make sure " + "you have CUDA installed and it was successfully " + "detected by CMAKE." 
<< Endl; +#endif // DNNCUDA +} //_______________________________________________________________________ Double_t TMVA::MethodDNN::GetMvaValue( Double_t* /*errLower*/, Double_t* /*errUpper*/ ) diff --git a/tmva/tmva/test/CMakeLists.txt b/tmva/tmva/test/CMakeLists.txt new file mode 100644 index 0000000000000..fc2a3e633d359 --- /dev/null +++ b/tmva/tmva/test/CMakeLists.txt @@ -0,0 +1,9 @@ +############################################################################ +# CMakeLists.txt file for building ROOT TMVA tests. +# @author Simon Pfreundschuh +############################################################################ + +project(tmva-tests) +find_package(ROOT REQUIRED) + +ROOT_ADD_TEST_SUBDIRECTORY(DNN) diff --git a/tmva/tmva/test/DNN/CMakeLists.txt b/tmva/tmva/test/DNN/CMakeLists.txt new file mode 100644 index 0000000000000..d9c36a272c93a --- /dev/null +++ b/tmva/tmva/test/DNN/CMakeLists.txt @@ -0,0 +1,89 @@ +############################################################################ +# CMakeLists.txt file for building TMVA/DNN tests. +# @author Simon Pfreundschuh +############################################################################ + +project(tmva-tests) +find_package(ROOT REQUIRED) + +set(Libraries Core MathCore Matrix TMVA) +include_directories(${ROOT_INCLUDE_DIRS}) + +# DNN - Backpropagation +ROOT_EXECUTABLE(testBackpropagation TestBackpropagation.cxx + LIBRARIES ${Libraries}) +ROOT_ADD_TEST(TMVA-DNN-Backpropagation COMMAND testBackpropagation) + +# DNN - Activation Functions +ROOT_EXECUTABLE(testActivationFunctions TestActivationFunctions.cxx + LIBRARIES ${Libraries}) +ROOT_ADD_TEST(TMVA-DNN-ActivationFunctions COMMAND testActivationFunctions) + +# DNN - Loss Functions +ROOT_EXECUTABLE(testLossFunctions TestLossFunctions.cxx + LIBRARIES ${Libraries}) +ROOT_ADD_TEST(TMVA-DNN-LossFunctions COMMAND testLossFunctions) + +# DNN - Derivatives +ROOT_EXECUTABLE(testDerivatives TestDerivatives.cxx + LIBRARIES ${Libraries}) +ROOT_ADD_TEST(TMVA-DNN-Derivatives COMMAND testDerivatives) + +# DNN - Minimization +ROOT_EXECUTABLE(testMinimization TestMinimization.cxx + LIBRARIES ${Libraries}) +ROOT_ADD_TEST(TMVA-DNN-Minimization COMMAND testMinimization) + +# DNN - Data Loader +ROOT_EXECUTABLE(testDataLoader TestDataLoader.cxx + LIBRARIES ${Libraries}) +ROOT_ADD_TEST(TMVA-DNN-DataLoader COMMAND testDataLoader) + +find_package(CUDA) +if (CUDA_FOUND) + # DNN - Cuda + CUDA_ADD_EXECUTABLE(testCuda TestCuda.cxx) + TARGET_LINK_LIBRARIES(testCuda ${Libraries} dnn_cuda ${CUDA_CUBLAS_LIBRARIES}) + ROOT_ADD_TEST(TMVA-DNN-CUDA COMMAND testCuda) + + # DNN - Activation Functions Cuda + CUDA_ADD_EXECUTABLE(testActivationFunctionsCuda TestActivationFunctionsCuda.cxx) + TARGET_LINK_LIBRARIES(testActivationFunctionsCuda ${Libraries} dnn_cuda + ${CUDA_CUBLAS_LIBRARIES}) + ROOT_ADD_TEST(TMVA-DNN-ActivationFunctionsCuda COMMAND testActivationFunctionsCuda) + + # DNN - Loss Functions Cuda + CUDA_ADD_EXECUTABLE(testLossFunctionsCuda TestLossFunctionsCuda.cxx) + TARGET_LINK_LIBRARIES(testLossFunctionsCuda ${Libraries} dnn_cuda + ${CUDA_CUBLAS_LIBRARIES}) + ROOT_ADD_TEST(TMVA-DNN-LossFunctionsCuda COMMAND testLossFunctionsCuda) + + # DNN - Derivatives Cuda + CUDA_ADD_EXECUTABLE(testDerivativesCuda TestDerivativesCuda.cxx) + TARGET_LINK_LIBRARIES(testDerivativesCuda ${Libraries} dnn_cuda + ${CUDA_CUBLAS_LIBRARIES} dnn_cuda) + ROOT_ADD_TEST(TMVA-DNN-DerivativesCuda COMMAND testDerivativesCuda) + + # DNN - Backpropagation Cuda + CUDA_ADD_EXECUTABLE(testBackpropagationCuda TestBackpropagationCuda.cxx) + 
TARGET_LINK_LIBRARIES(testBackpropagationCuda ${Libraries} dnn_cuda
+ ${CUDA_CUBLAS_LIBRARIES})
+ ROOT_ADD_TEST(TMVA-DNN-BackpropagationCuda COMMAND testBackpropagationCuda)
+
+ # DNN - Minimization Cuda
+ CUDA_ADD_EXECUTABLE(testMinimizationCuda TestMinimizationCuda.cxx)
+ TARGET_LINK_LIBRARIES(testMinimizationCuda ${Libraries} dnn_cuda
+ ${CUDA_CUBLAS_LIBRARIES})
+ ROOT_ADD_TEST(TMVA-DNN-MinimizationCuda COMMAND testMinimizationCuda)
+
+ # DNN - Arithmetic Cuda
+ CUDA_ADD_EXECUTABLE(testArithmeticCuda TestMatrixArithmeticCuda.cxx)
+ TARGET_LINK_LIBRARIES(testArithmeticCuda ${Libraries} dnn_cuda
+ ${CUDA_CUBLAS_LIBRARIES})
+ ROOT_ADD_TEST(TMVA-DNN-ArithmeticCuda COMMAND testArithmeticCuda)
+
+ # DNN - Data Loader Cuda
+ CUDA_ADD_EXECUTABLE(testDataLoaderCuda TestDataLoaderCuda.cxx)
+ TARGET_LINK_LIBRARIES(testDataLoaderCuda ${Libraries} dnn_cuda
+ ${CUDA_CUBLAS_LIBRARIES})
+endif (CUDA_FOUND)
\ No newline at end of file
diff --git a/tmva/tmva/test/DNN/TestActivationFunctions.cxx b/tmva/tmva/test/DNN/TestActivationFunctions.cxx
new file mode 100644
index 0000000000000..e5b9cc769c33e
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestActivationFunctions.cxx
@@ -0,0 +1,127 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////////
+// Concrete instantiation of the generic activation function test   //
+// for the reference architecture.                                   //
+//////////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TestActivationFunctions.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+    std::cout << "Testing Activation Functions:" << std::endl;
+
+    double error;
+
+    // Identity.
+
+    error = testIdentity<TReference<double>>(10);
+    std::cout << "Testing identity activation: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    error = testIdentityDerivative<TReference<double>>(10);
+    std::cout << "Testing identity activation derivative: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    // ReLU.
+
+    error = testRelu<TReference<double>>(10);
+    std::cout << "Testing ReLU activation: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    error = testReluDerivative<TReference<double>>(10);
+    std::cout << "Testing ReLU activation derivative: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    // Sigmoid.
+
+    error = testSigmoid<TReference<double>>(10);
+    std::cout << "Testing Sigmoid activation: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    error = testSigmoidDerivative<TReference<double>>(10);
+    std::cout << "Testing Sigmoid activation derivative: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
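+    // The tests above and below are templated on the architecture, so the
+    // CUDA test drivers registered in the CMake file reuse the same headers
+    // and only swap the template argument, e.g. testSigmoid<TCuda>(10)
+    // (assuming a build with the DNNCUDA backend enabled).
+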
+    // TanH.
+
+    error = testTanh<TReference<double>>(10);
+    std::cout << "Testing TanH activation: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    error = testTanhDerivative<TReference<double>>(10);
+    std::cout << "Testing TanH activation derivative: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    // Symmetric ReLU.
+
+    error = testSymmetricRelu<TReference<double>>(10);
+    std::cout << "Testing Symm. ReLU activation: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    error = testSymmetricReluDerivative<TReference<double>>(10);
+    std::cout << "Testing Symm. ReLU activation derivative: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    // Soft Sign.
+
+    error = testSoftSign<TReference<double>>(10);
+    std::cout << "Testing Soft Sign activation: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    error = testSoftSignDerivative<TReference<double>>(10);
+    std::cout << "Testing Soft Sign activation derivative: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    // Gauss.
+
+    error = testGauss<TReference<double>>(10);
+    std::cout << "Testing Gauss activation: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    error = testGaussDerivative<TReference<double>>(10);
+    std::cout << "Testing Gauss activation derivative: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    return 0;
+}
diff --git a/tmva/tmva/test/DNN/TestActivationFunctions.h b/tmva/tmva/test/DNN/TestActivationFunctions.h
new file mode 100644
index 0000000000000..b50a5598d8b8f
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestActivationFunctions.h
@@ -0,0 +1,457 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////////
+// Generic tests of the layer activation functions                  //
+//                                                                   //
+// Contains tests for each of the layer activation functions that   //
+// test the evaluation of the function using the evaluate(...)      //
+// method and the computation of the derivatives using              //
+// evaluateDerivative(...) on a randomly generated matrix. Each     //
+// function returns the maximum relative error between the expected //
+// result and the result obtained for the given architecture.       //
+//////////////////////////////////////////////////////////////////////
+
+#ifndef TMVA_TEST_DNN_TEST_ACTIVATION_FUNCTIONS
+#define TMVA_TEST_DNN_TEST_ACTIVATION_FUNCTIONS
+
+#include "TMatrixT.h"
+#include "TMVA/DNN/Architectures/Reference.h"
+#include "TMVA/DNN/Functions.h"
+#include "TMVA/DNN/Net.h"
+#include "Utility.h"
+
+using namespace TMVA::DNN;
+
+//______________________________________________________________________________
+//
+// Identity Activation Function
+//______________________________________________________________________________
+
+/*! Test application of identity function to matrix. */
+//______________________________________________________________________________
+template <typename Architecture>
+auto testIdentity(size_t ntests)
+-> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   Double_t maximumError = 0.0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m = rand() % 100 + 1;
+      size_t n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> ARef(m, n);
+      randomMatrix(ARef);
+      Matrix_t AArch(ARef);
+
+      evaluate<Architecture>(AArch, EActivationFunction::IDENTITY);
+
+      TMatrixT<Double_t> A = AArch;
+      Double_t error = maximumRelativeError(A, ARef);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+
+/*! Test computation of the first derivative of the identity function. */
+//______________________________________________________________________________
+template <typename Architecture>
+auto testIdentityDerivative(size_t ntests)
+    -> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   Double_t maximumError = 0.0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m = rand() % 100 + 1;
+      size_t n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> ARef(m, n), BRef(m, n);
+      randomMatrix(ARef);
+      Matrix_t AArch(ARef), BArch(BRef);
+
+      evaluateDerivative<Architecture>(BArch, EActivationFunction::IDENTITY, AArch);
+      evaluateDerivative<TReference<Double_t>>(BRef, EActivationFunction::IDENTITY,
+                                               ARef);
+
+      TMatrixT<Double_t> B = BArch;
+      Double_t error = maximumRelativeError(B, BRef);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+
+//______________________________________________________________________________
+//
+// ReLU Activation Function
+//______________________________________________________________________________
+
+/*! Test application of ReLU function to matrix. */
+//______________________________________________________________________________
+template <typename Architecture>
+auto testRelu(size_t ntests)
+-> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   Double_t maximumError = 0.0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m = rand() % 100 + 1;
+      size_t n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> ARef(m, n);
+      randomMatrix(ARef);
+      Matrix_t AArch(ARef);
+
+      evaluate<Architecture>(AArch, EActivationFunction::RELU);
+      applyMatrix(ARef, [](double x){return x < 0.0 ? 0.0 : x;});
+
+      TMatrixT<Double_t> A = AArch;
+      Double_t error = maximumRelativeError(A, ARef);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+
+/*! Test computation of the first derivative of the ReLU function. */
+//______________________________________________________________________________
+template <typename Architecture>
+auto testReluDerivative(size_t ntests)
+-> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   Double_t maximumError = 0.0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m = rand() % 100 + 1;
+      size_t n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> ARef(m, n), BRef(m, n);
+      randomMatrix(ARef);
+      Matrix_t AArch(ARef), BArch(BRef);
+
+      evaluateDerivative<Architecture>(BArch, EActivationFunction::RELU, AArch);
+      applyMatrix(ARef, [](double x){return x > 0.0 ? 1.0 : 0.0;});
+
+      TMatrixT<Double_t> B = BArch;
+      Double_t error = maximumRelativeError(B, ARef);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+
+//______________________________________________________________________________
+//
+// Sigmoid Activation Function
+//______________________________________________________________________________
+
+/*! Test application of Sigmoid function to matrix. */
*/ +//______________________________________________________________________________ +template +auto testSigmoid(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT ARef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef); + + evaluate(AArch, EActivationFunction::SIGMOID); + applyMatrix(ARef, [](double x){return 1.0 / (1.0 + std::exp(-x));}); + + TMatrixT A = AArch; + Double_t error = maximumRelativeError(A, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +/*! Test computation of the first derivative of the ReLU function. */ +//______________________________________________________________________________ +template +auto testSigmoidDerivative(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT ARef(m, n), BRef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef), BArch(BRef); + + evaluateDerivative(BArch, EActivationFunction::SIGMOID, AArch); + applyMatrix(ARef, [](Double_t x){ + Double_t sig = 1.0 / (1.0 + std::exp(-x)); + return sig * (1.0 - sig); + }); + + TMatrixT B = BArch; + Double_t error = maximumRelativeError(B, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//______________________________________________________________________________ +// +// Tanh Activation Function +//______________________________________________________________________________ + +/*! Test application of tanh function to matrix. */ +//______________________________________________________________________________ +template +auto testTanh(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT ARef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef); + + evaluate(AArch, EActivationFunction::TANH); + applyMatrix(ARef, [](double x){return tanh(x);}); + + TMatrixT A = AArch; + Double_t error = maximumRelativeError(A, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +/*! Test computation of the first derivative of the tanh function. */ +//______________________________________________________________________________ +template +auto testTanhDerivative(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT ARef(m, n), BRef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef), BArch(BRef); + + evaluateDerivative(BArch, EActivationFunction::TANH, AArch); + applyMatrix(ARef, [](Double_t x){ + Double_t t = tanh(x); + return 1 - t * t; + }); + + TMatrixT B = BArch; + Double_t error = maximumRelativeError(B, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//______________________________________________________________________________ +// +// Symmetric ReLU Activation Function +//______________________________________________________________________________ + +/*! 
Test application of symmetric ReLU function to matrix. */ +//______________________________________________________________________________ +template +auto testSymmetricRelu(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT ARef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef); + + evaluate(AArch, EActivationFunction::SYMMRELU); + applyMatrix(ARef, [](double x){return fabs(x);}); + + TMatrixT A = AArch; + Double_t error = maximumRelativeError(A, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +/*! Test computation of the first derivative of the symmetric ReLU function. */ +//______________________________________________________________________________ +template +auto testSymmetricReluDerivative(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT ARef(m, n), BRef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef), BArch(BRef); + + evaluateDerivative(BArch, EActivationFunction::SYMMRELU, AArch); + applyMatrix(ARef, [](Double_t x){ + return (x < 0) ? -1.0 : 1.0; + }); + + TMatrixT B = BArch; + Double_t error = maximumRelativeError(B, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//______________________________________________________________________________ +// +// Soft Sign Activation Function +//______________________________________________________________________________ + +/*! Test application of symmetric soft sign function to matrix. */ +//______________________________________________________________________________ +template +auto testSoftSign(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT ARef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef); + + evaluate(AArch, EActivationFunction::SOFTSIGN); + applyMatrix(ARef, [](double x){return x / (1 + fabs(x));}); + + TMatrixT A = AArch; + Double_t error = maximumRelativeError(A, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +/*! Test computation of the first derivative of the soft sign function. 
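+ * Analytically, d/dx [ x / (1 + |x|) ] = 1 / (1 + |x|)^2, which is exactly
+ * the expression encoded in the lambda passed to applyMatrix below.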
*/ +//______________________________________________________________________________ +template +auto testSoftSignDerivative(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT ARef(m, n), BRef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef), BArch(BRef); + + evaluateDerivative(BArch, EActivationFunction::SOFTSIGN, AArch); + applyMatrix(ARef, [](Double_t x){ + Double_t y = 1 + fabs(x); + return 1.0 / (y * y); + }); + + TMatrixT B = BArch; + Double_t error = maximumRelativeError(B, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//______________________________________________________________________________ +// +// Gauss Activation Functions +//______________________________________________________________________________ + +/*! Test application of Gauss activation function to matrix. */ +//______________________________________________________________________________ +template +auto testGauss(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT ARef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef); + + evaluate(AArch, EActivationFunction::GAUSS); + applyMatrix(ARef, [](double x){return exp(- x * x);}); + + TMatrixT A = AArch; + Double_t error = maximumRelativeError(A, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +/*! Test computation of the first derivative of the Gauss activation function. */ +//______________________________________________________________________________ +template +auto testGaussDerivative(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT ARef(m, n), BRef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef), BArch(BRef); + + evaluateDerivative(BArch, EActivationFunction::GAUSS, AArch); + applyMatrix(ARef, [](Double_t x){return -2.0 * x * exp(- x * x);}); + + TMatrixT B = BArch; + Double_t error = maximumRelativeError(B, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} +#endif diff --git a/tmva/tmva/test/DNN/TestActivationFunctionsCuda.cxx b/tmva/tmva/test/DNN/TestActivationFunctionsCuda.cxx new file mode 100644 index 0000000000000..14fdea79e7a5a --- /dev/null +++ b/tmva/tmva/test/DNN/TestActivationFunctionsCuda.cxx @@ -0,0 +1,72 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +////////////////////////////////////////////////////////////////////// +// Concrete instantiation of the generic activation function test // +// for the TCuda implementation. 
// +////////////////////////////////////////////////////////////////////// + +#include +#include "TMVA/DNN/Architectures/Cuda.h" +#include "Utility.h" +#include "TestActivationFunctions.h" + +using namespace TMVA::DNN; + +int main() +{ + std::cout << "Testing Activation Functions:" << std::endl; + + double error; + + // Identity. + + error = testIdentity(10); + std::cout << "Testing identity activation: "; + std::cout << "maximum relative error = " << error << std::endl; + if (error > 1e-10) + return 1; + + error = testIdentityDerivative(10); + std::cout << "Testing identity activation derivative: "; + std::cout << "maximum relative error = " << error << std::endl; + if (error > 1e-10) + return 1; + + // ReLU. + + error = testRelu(10); + std::cout << "Testing ReLU activation: "; + std::cout << "maximum relative error = " << error << std::endl; + if (error > 1e-10) + return 1; + + error = testReluDerivative(10); + std::cout << "Testing ReLU activation derivative: "; + std::cout << "maximum relative error = " << error << std::endl; + if (error > 1e-10) + return 1; + + // Sigmoid. + + error = testSigmoid(10); + std::cout << "Testing Sigmoid activation: "; + std::cout << "maximum relative error = " << error << std::endl; + if (error > 1e-10) + return 1; + + error = testSigmoidDerivative(10); + std::cout << "Testing Sigmoid activation derivative: "; + std::cout << "maximum relative error = " << error << std::endl; + if (error > 1e-10) + return 1; + return 0; +} diff --git a/tmva/tmva/test/DNN/TestBackpropagation.cxx b/tmva/tmva/test/DNN/TestBackpropagation.cxx new file mode 100644 index 0000000000000..638c222609329 --- /dev/null +++ b/tmva/tmva/test/DNN/TestBackpropagation.cxx @@ -0,0 +1,50 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +//////////////////////////////////////////////////////////////////// +// Concrete instantiation of the generic backpropagation test for // +// the reference architecture. // +//////////////////////////////////////////////////////////////////// + +#include +#include "TMVA/DNN/Architectures/Reference.h" +#include "TestBackpropagation.h" + +using namespace TMVA::DNN; + +int main() +{ + std::cout << "Testing Backpropagation:" << std::endl; + + double error; + + // + // Test backpropagation for linear net. + // + + error = testBackpropagationWeightsLinear>(1.0); + if (error > 1e-7) + return 1; + + error = testBackpropagationL1Regularization>(1e-2); + if (error > 1e-7) + return 1; + + error = testBackpropagationL2Regularization>(1.0); + if (error > 1e-7) + return 1; + + error = testBackpropagationBiasesLinear>(1.0); + if (error > 1e-7) + return 1; + + return 0; +} diff --git a/tmva/tmva/test/DNN/TestBackpropagation.h b/tmva/tmva/test/DNN/TestBackpropagation.h new file mode 100644 index 0000000000000..9082d3a33e315 --- /dev/null +++ b/tmva/tmva/test/DNN/TestBackpropagation.h @@ -0,0 +1,416 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. 
* + *************************************************************************/ + +//////////////////////////////////////////////////////////////////// +// Generic tests of the backpropagation algorithm. // +// // +// All tests randomly generate a net with identity activation // +// functions, i.e. which is completely linear and then tests the // +// computed gradients for each layer using numerical // +// derivation. The restriction to linear nets is to avoid the // +// required division by the finite difference interval used to // +// approximate the numerical derivatives, which would otherwise // +// cause precision loss. // +//////////////////////////////////////////////////////////////////// + +#include +#include "TMVA/DNN/Functions.h" +#include "TMVA/DNN/Net.h" +#include "Utility.h" + +using namespace TMVA::DNN; + +/*! Compute the loss of the net as a function of the weight at index (i,j) in + * layer l. dx is added as an offset to the current value of the weight. */ +//______________________________________________________________________________ +template +auto evaluate_net_weight(TNet &net, + typename Architecture::Matrix_t &X, + const typename Architecture::Matrix_t &Y, + size_t l, + size_t i, + size_t j, + typename Architecture::Scalar_t dx) + -> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + + net.GetLayer(l).GetWeights().operator()(i,j) += dx; + Scalar_t res = net.Loss(X, Y); + net.GetLayer(l).GetWeights().operator()(i,j) -= dx; + return res; +} + +/*! Compute the loss of the net as a function of the weight at index i in + * layer l. dx is added as an offset to the current value of the weight. */ +//______________________________________________________________________________ +template +auto evaluate_net_bias(TNet &net, + typename Architecture::Matrix_t &X, + const typename Architecture::Matrix_t &Y, + size_t l, + size_t i, + typename Architecture::Scalar_t dx) + -> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + + net.GetLayer(l).GetBiases().operator()(i,0) += dx; + Scalar_t res = net.Loss(X, Y); + net.GetLayer(l).GetBiases().operator()(i,0) -= dx; + return res; +} + +/*! Generate a random net, perform forward and backward propagation and check + * the weight gradients using numerical differentiation. Returns the maximum + * relative gradient error and also prints it to stdout. */ +//______________________________________________________________________________ +template +auto testBackpropagationWeightsLinear(typename Architecture::Scalar_t dx) +-> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + using Matrix_t = typename Architecture::Matrix_t; + using Net_t = TNet; + + + Net_t net(50, 50, ELossFunction::MEANSQUAREDERROR); + + // Random net. + constructRandomLinearNet(net); + net.Initialize(EInitialization::GAUSS); + + // Random training data. + Matrix_t X(50, 50); + randomBatch(X); + + Matrix_t Y(50, net.GetOutputWidth()); + randomMatrix(Y); + + net.Forward(X); + net.Backward(X,Y); + + Scalar_t maximum_error = 0.0; + + // Compute derivatives for all weights using finite differences and + // compare to result obtained from backpropagation. 
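+   // The comparison uses a central difference: with f the loss as a function
+   // of a single weight, dy = (f(w + dx) - f(w - dx)) / (2 * dx) is checked
+   // against the backpropagated gradient W(i,j). For a linear net the loss
+   // is exactly quadratic in each weight, so a large step such as dx = 1.0
+   // introduces no truncation error.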
+ for (size_t l = 0; l < net.GetDepth(); l++) + { + std::cout << "\rTesting weight gradients: layer: " << l << " / " << net.GetDepth(); + std::cout << std::flush; + auto & layer = net.GetLayer(l); + auto & W = layer.GetWeightGradients(); + + for (size_t i = 0; i < layer.GetWidth(); i++) + { + for (size_t j = 0; j < layer.GetInputWidth(); j++) + { + auto f = [& net, & X, &Y, l, i, j](Scalar_t x) + { + return evaluate_net_weight(net, X, Y, l, i, j, x); + }; + Scalar_t dy = finiteDifference(f, dx) / (2.0 * dx); + Scalar_t dy_ref = W(i,j); + + // Compute the relative error if dy != 0. + Scalar_t error; + if (std::fabs(dy_ref) > 1e-15) + { + error = std::fabs((dy - dy_ref) / dy_ref); + } + else + { + error = std::fabs(dy - dy_ref); + } + + maximum_error = std::max(error, maximum_error); + } + } + } + + std::cout << "\rTesting weight gradients: "; + std::cout << "maximum relative error: " << print_error(maximum_error) << std::endl; + return maximum_error; +} + +/*! Generate a random, linear net, perform forward and backward propagation with + * L1 regularization and check the weight gradients using numerical + * differentiation. Returns the maximum relative gradient error and + * also prints it to stdout. */ +//______________________________________________________________________________ +template +auto testBackpropagationL1Regularization(typename Architecture::Scalar_t dx) +-> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + using Matrix_t = typename Architecture::Matrix_t; + using Net_t = TNet; + + Net_t net(50, 50, ELossFunction::MEANSQUAREDERROR, ERegularization::L1, 0.1); + + // Random net. + constructRandomLinearNet(net); + net.Initialize(EInitialization::GAUSS); + + // Random training data. + Matrix_t X(50, 50); + randomBatch(X); + + Matrix_t Y(50, net.GetOutputWidth()); + randomMatrix(Y); + + net.Forward(X); + net.Backward(X,Y); + + Scalar_t maximum_error = 0.0; + + // Compute derivatives for all weights using finite differences and + // compare to result obtained from backpropagation. + for (size_t l = 0; l < net.GetDepth(); l++) + { + std::cout << "\rTesting weight gradients (L1): layer: " << l << " / " << net.GetDepth(); + std::cout << std::flush; + auto & layer = net.GetLayer(l); + auto & W = layer.GetWeights(); + auto & dW = layer.GetWeightGradients(); + + for (size_t i = 0; i < layer.GetWidth(); i++) { + for (size_t j = 0; j < layer.GetInputWidth(); j++) { + // Avoid running into the non-derivable point at 0.0. + if (std::abs(W(i,j)) > dx) { + auto f = [& net, & X, &Y, l, i, j](Scalar_t x) + { + return evaluate_net_weight(net, X, Y, l, i, j, x); + }; + Scalar_t dy = finiteDifference(f, dx) / (2.0 * dx); + Scalar_t dy_ref = dW(i,j); + + // Compute the relative error if dy != 0. + Scalar_t error; + if (std::fabs(dy_ref) > 1e-15) + { + error = std::fabs((dy - dy_ref) / dy_ref); + } + else + { + error = std::fabs(dy - dy_ref); + } + + maximum_error = std::max(error, maximum_error); + } + } + } + } + + std::cout << "\rTesting weight gradients (L1): "; + std::cout << "maximum relative error: " << print_error(maximum_error) << std::endl; + return maximum_error; +} + +/*! Generate a random, linear net, perform forward and backward propagation with + * L2 regularization and check the weight gradients using numerical + * differentiation. Returns the maximum relative gradient error and + * also prints it to stdout. 
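+ * Since the L2 penalty is part of the loss, the numerical derivative also
+ * exercises the regularization contribution to the weight gradients.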
*/ +//______________________________________________________________________________ +template +auto testBackpropagationL2Regularization(typename Architecture::Scalar_t dx) +-> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + using Matrix_t = typename Architecture::Matrix_t; + using Net_t = TNet; + + Net_t net(50, 50, ELossFunction::MEANSQUAREDERROR, ERegularization::L2, 0.1); + + // Random net. + constructRandomLinearNet(net); + net.Initialize(EInitialization::GAUSS); + + // Random training data. + Matrix_t X(50, 50); + randomBatch(X); + + Matrix_t Y(50, net.GetOutputWidth()); + randomMatrix(Y); + + net.Forward(X); + net.Backward(X,Y); + + Scalar_t maximum_error = 0.0; + + // Compute derivatives for all weights using finite differences and + // compare to result obtained from backpropagation. + for (size_t l = 0; l < net.GetDepth(); l++) + { + std::cout << "\rTesting weight gradients (L2): layer: " << l << " / " << net.GetDepth(); + std::cout << std::flush; + auto & layer = net.GetLayer(l); + auto & W = layer.GetWeightGradients(); + + for (size_t i = 0; i < layer.GetWidth(); i++) + { + for (size_t j = 0; j < layer.GetInputWidth(); j++) + { + auto f = [& net, & X, &Y, l, i, j](Scalar_t x) + { + return evaluate_net_weight(net, X, Y, l, i, j, x); + }; + Scalar_t dy = finiteDifference(f, dx) / (2.0 * dx); + Scalar_t dy_ref = W(i,j); + + // Compute the relative error if dy != 0. + Scalar_t error; + if (std::fabs(dy_ref) > 1e-15) + { + error = std::fabs((dy - dy_ref) / dy_ref); + } + else + { + error = std::fabs(dy - dy_ref); + } + + maximum_error = std::max(error, maximum_error); + } + } + } + + std::cout << "\rTesting weight gradients (L2): "; + std::cout << "maximum relative error: " << print_error(maximum_error) << std::endl; + return maximum_error; +} + +/*! Generate a random net, perform forward and backward propagation and check + * the bias gradients using numerical differentiation. Returns the maximum + * relative gradient error and also prints it to stdout. */ +//______________________________________________________________________________ +template +auto testBackpropagationBiasesLinear(typename Architecture::Scalar_t dx) +-> typename Architecture::Scalar_t +{ + using Net_t = TNet; + using Scalar_t = typename Architecture::Scalar_t; + using Matrix_t = typename Architecture::Matrix_t; + + + Net_t net(50, 50, ELossFunction::MEANSQUAREDERROR); + + // Random net. + constructRandomLinearNet(net); + net.Initialize(EInitialization::GAUSS); + + // Random training data. + Matrix_t X(50, 50); + randomBatch(X); + + Matrix_t Y(50, net.GetOutputWidth()); + randomMatrix(Y); + + net.Forward(X); + net.Backward(X,Y); + + Scalar_t maximum_error = 0.0; + + // Compute derivatives for all bias terms using finite differences and + // compare to result obtained from backpropagation. + for (size_t l = 0; l < net.GetDepth(); l++) + { + std::cout << "\rTesting bias gradients: layer: " << l << " / " << net.GetDepth(); + std::cout << std::flush; + auto & layer = net.GetLayer(l); + auto & dtheta = layer.GetBiasGradients(); + + for (size_t i = 0; i < layer.GetWidth(); i++) + { + auto f = [& net, & X, &Y, l, i](Scalar_t x) + { + return evaluate_net_bias(net, X, Y, l, i, x); + }; + Scalar_t dy = finiteDifference(f, dx); + Scalar_t dy_ref = dtheta(i,0) * 2.0 * dx; + + // Compute the relative error if dy != 0. 
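+      // Note: dy is deliberately not divided by 2 * dx here; the factor is
+      // folded into dy_ref instead, which saves a division per bias term.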
+ Scalar_t error; + if (std::fabs(dy_ref) > 1e-10) + { + error = std::fabs((dy - dy_ref) / dy_ref); + } + else + { + error = std::fabs(dy - dy_ref); + } + + maximum_error = std::max(error, maximum_error); + } + } + + std::cout << "\rTesting bias gradients: "; + std::cout << "maximum relative error: " << print_error(maximum_error) << std::endl; + return maximum_error; +} + +/*! Generate a random net, perform forward and backward propagation and check + * the weight gradients using numerical differentiation. Returns the maximum + * relative gradient error and also prints it to stdout. */ +//______________________________________________________________________________ +template +auto testNumerics(TNet & net, + typename Architecture::Matrix_t X, + typename Architecture::Matrix_t Y, + typename Architecture::Scalar_t dx) + -> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + + net.Forward(X); + net.Backward(X,Y); + + Scalar_t maximum_error = 0.0; + + // Compute derivatives for all weight using finite differences and + // compare to result obtained from backpropagation. + for (size_t l = 0; l < net.GetDepth(); l++) + { + std::cout << "\rTesting weight gradients: layer: " << l << " / " << net.GetDepth(); + std::cout << std::flush; + auto & layer = net.GetLayer(l); + auto & W = layer.GetWeightGradients(); + + for (size_t i = 0; i < layer.GetWidth(); i++) + { + for (size_t j = 0; j < layer.GetInputWidth(); j++) + { + auto f = [& net, & X, &Y, l, i, j](Scalar_t x) + { + return evaluate_net_weight(net, X, Y, l, i, j, x); + }; + Scalar_t dy = finiteDifference(f, dx) / (2.0 * dx); + Scalar_t dy_ref = W(i,j); + + // Compute the relative error if dy != 0. + Scalar_t error; + if (std::fabs(dy_ref) > 1e-15) + { + error = std::fabs((dy - dy_ref) / dy_ref); + } + else + { + error = std::fabs(dy - dy_ref); + } + + maximum_error = std::max(error, maximum_error); + } + } + } + + std::cout << "\rTesting weight gradients: "; + std::cout << "maximum relative error: " << print_error(maximum_error) << std::endl; + return maximum_error; +} diff --git a/tmva/tmva/test/DNN/TestBackpropagationCuda.cxx b/tmva/tmva/test/DNN/TestBackpropagationCuda.cxx new file mode 100644 index 0000000000000..d2a875565d918 --- /dev/null +++ b/tmva/tmva/test/DNN/TestBackpropagationCuda.cxx @@ -0,0 +1,47 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +//////////////////////////////////////////////////////////////////// +// Concrete instantiation of the generic backpropagation test for // +// CUDA architectures. 
// +//////////////////////////////////////////////////////////////////// + +#include +#include "TMVA/DNN/Architectures/Cuda.h" +#include "TMatrix.h" +#include "TestBackpropagation.h" + +using namespace TMVA::DNN; + +int main() +{ + std::cout << "Testing Backpropagation:" << std::endl; + + double error; + + error = testBackpropagationWeightsLinear(1.0); + if (error > 1e-7) + return 1; + + error = testBackpropagationL1Regularization(1e-2); + if (error > 1e-7) + return 1; + + error = testBackpropagationL2Regularization(1.0); + if (error > 1e-7) + return 1; + + error = testBackpropagationBiasesLinear(1.0); + if (error > 1e-7) + return 1; + + return 0; +} diff --git a/tmva/tmva/test/DNN/TestCuda.cxx b/tmva/tmva/test/DNN/TestCuda.cxx new file mode 100644 index 0000000000000..140b2d324bdcd --- /dev/null +++ b/tmva/tmva/test/DNN/TestCuda.cxx @@ -0,0 +1,196 @@ +#include "Utility.h" +#include "TMVA/DNN/Architectures/Cuda.h" +#include "TMVA/DNN/Architectures/Reference.h" +#include + +using namespace TMVA::DNN; + +//_________________________________________________________________________________ +Double_t testMultiply() +{ + const size_t ntests = 100; + + Double_t maximumError = 0; + + for (size_t i = 0; i < ntests; i++) { + size_t m, n, k; + m = rand() % 50 + 1; + n = rand() % 50 + 1; + k = rand() % 50 + 1; + + TMatrixT A(m,k), AT(k,m) , B(k,n), BT(n,k), C(m,n); + randomMatrix(A); + randomMatrix(AT); + randomMatrix(B); + randomMatrix(BT); + TCudaMatrix ACuda(A), ATCuda(AT), BCuda(B), BTCuda(BT), CCuda(C); + + TReference::MultiplyTranspose(C, A, BT); + TCuda::MultiplyTranspose(CCuda, ACuda, BTCuda); + TMatrixT CRef(CCuda); + Double_t error = maximumRelativeError(C, CRef); + maximumError = std::max(error, maximumError); + + C.Mult(A,B); + TCuda::Multiply(CCuda, ACuda, BCuda); + CRef = CCuda; + error = maximumRelativeError(C, CRef); + maximumError = std::max(error, maximumError); + + C.TMult(AT,B); + TCuda::TransposeMultiply(CCuda, ATCuda, BCuda); + CRef = CCuda; + error = maximumRelativeError(C, CRef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//_________________________________________________________________________________ +Double_t testAddRowWise() +{ + const size_t ntests = 10; + + Double_t maximumError = 0; + + for (size_t i = 0; i < ntests; i++) { + size_t m, n; + m = rand() % 50 + 1; + n = rand() % 50 + 1; + + TMatrixT A(m,n), B(m,n), theta(n,1); + //randomMatrix(A); + randomMatrix(theta); + TCudaMatrix ACuda(A), BCuda(B), thetaCuda(theta); + + TReference::AddRowWise(A, theta); + TCuda::AddRowWise(ACuda,thetaCuda); + TMatrixT ARef(ACuda); + + Double_t error = maximumRelativeError(A, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//_________________________________________________________________________________ +Double_t testHadamard() +{ + const size_t ntests = 10; + Double_t maximumError = 0; + + for (size_t i = 0; i < ntests; i++) { + size_t m, n; + m = rand() % 10 + 1; + n = rand() % 10 + 1; + + TMatrixT A(m,n), B(m,n); + randomMatrix(A); + randomMatrix(B); + TCudaMatrix ACuda(A), BCuda(B); + + for (size_t j = 0; j < (size_t) A.GetNrows(); j++) { + for (size_t k = 0; k < (size_t) A.GetNcols(); k++) { + A(j,k) *= B(j,k); + } + } + + TCuda::Hadamard(ACuda, BCuda); + TMatrixT ARef(ACuda); + Double_t error = maximumRelativeError(A, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//_________________________________________________________________________________ +Double_t 
testReduction()
+{
+    const size_t ntests = 10;
+    Double_t maximumError = 0;
+
+    for (size_t i = 0; i < ntests; i++) {
+        size_t m, n;
+        m = rand() % 1000 + 1;
+        n = rand() % 1000 + 1;
+
+        TMatrixT<Double_t> A(m,n);
+
+        for (size_t j = 0; j < m; j++) {
+            for (size_t k = 0; k < n; k++) {
+                A(j,k) = 1.0;
+            }
+        }
+        TCudaMatrix ACuda(A);
+
+        TCudaMatrix BCuda(1,n);
+        TCuda::InitializeZero(BCuda);
+        Double_t s = TCuda::Sum(ACuda);
+        TCuda::SumColumns(BCuda, ACuda);
+        TMatrixT<Double_t> B(BCuda);
+
+        Double_t error = std::abs(s - ((Double_t) m * n));
+        maximumError = std::max(error, maximumError);
+
+        for (size_t j = 0; j < n; j++) {
+            //std::cout << B(0,j) << " / " << j * m << std::endl;
+            error = std::abs(B(0,j) - m);
+            maximumError = std::max(error, maximumError);
+        }
+    }
+    return maximumError;
+}
+
+//_________________________________________________________________________________
+Double_t testScaleAdd()
+{
+    const size_t ntests = 10;
+    Double_t maximumError = 0;
+
+    for (size_t i = 0; i < ntests; i++) {
+        size_t m, n;
+        m = rand() % 1000 + 1;
+        n = rand() % 1000 + 1;
+
+        TMatrixT<Double_t> A(m,n), B(m,n);
+
+        randomMatrix(A);
+        randomMatrix(B);
+
+        TCudaMatrix ACuda(A);
+        TCudaMatrix BCuda(B);
+
+        Double_t beta = ((Double_t) rand()) / ((Double_t) RAND_MAX);
+        TReference<Double_t>::ScaleAdd(A, B, beta);
+        TCuda::ScaleAdd(ACuda, BCuda, beta);
+
+        Double_t error = maximumRelativeError(A, (TMatrixT<Double_t>) ACuda);
+        maximumError = std::max(error, maximumError);
+    }
+    return maximumError;
+}
+
+//_________________________________________________________________________________
+int main()
+{
+    Double_t error;
+    error = testReduction();
+    std::cout << "Testing reduction: max. rel. error = ";
+    std::cout << error << std::endl;
+
+    error = testScaleAdd();
+    std::cout << "Testing scale_add: max. rel. error = ";
+    std::cout << error << std::endl;
+
+    error = testHadamard();
+    std::cout << "Testing hadamard: max. rel. error = ";
+    std::cout << error << std::endl;
+
+    error = testMultiply();
+    std::cout << "Testing multiplication: max. rel. error = ";
+    std::cout << error << std::endl;
+
+    error = testAddRowWise();
+    std::cout << "Testing add_row_wise: max. rel. error = ";
+    std::cout << error << std::endl;
+}
diff --git a/tmva/tmva/test/DNN/TestDataLoader.cxx b/tmva/tmva/test/DNN/TestDataLoader.cxx
new file mode 100644
index 0000000000000..b68b383303cfa
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestDataLoader.cxx
@@ -0,0 +1,26 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 12/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+////////////////////////////////////////////////////
+// Test the reference data loader implementation. //
+////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Reference.h"
+#include "TestDataLoader.h"
+
+using namespace TMVA::DNN;
+
+int main ()
+{
+    Double_t error = testDataLoader<TReference<Double_t>>();
+    std::cout << "Testing reference data loader: Max. rel. 
error = " << error; + std::cout << std::endl; +} diff --git a/tmva/tmva/test/DNN/TestDataLoader.h b/tmva/tmva/test/DNN/TestDataLoader.h new file mode 100644 index 0000000000000..e3664a8adb787 --- /dev/null +++ b/tmva/tmva/test/DNN/TestDataLoader.h @@ -0,0 +1,56 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 12/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +////////////////////////////////////////////////// +// Generic test for DataLoader implementations. // +////////////////////////////////////////////////// + +#include "TMVA/DNN/Net.h" +#include "Utility.h" + +namespace TMVA +{ +namespace DNN +{ + +template +auto testDataLoader() + -> typename Architecture_t::Scalar_t +{ + + using Scalar_t = typename Architecture_t::Scalar_t; + using Matrix_t = typename Architecture_t::Matrix_t; + using Net_t = TNet; + + using DataLoader_t = typename Architecture_t::template DataLoader_t; + + Matrix_t X(2000, 100); randomMatrix(X); + MatrixInput_t input(X, X); + DataLoader_t loader(input, 2000, 20, 100, 100); + + Net_t net(20, 100, ELossFunction::MEANSQUAREDERROR); + net.AddLayer(100, EActivationFunction::IDENTITY); + net.AddLayer(100, EActivationFunction::IDENTITY); + net.Initialize(EInitialization::IDENTITY); + + Scalar_t maximumError = 0.0; + for (auto b : loader) { + Matrix_t inputMatrix = b.GetInput(); + Matrix_t outputMatrix = b.GetOutput(); + Scalar_t error = net.Loss(inputMatrix, outputMatrix); + maximumError = std::max(error, maximumError); + } + + return maximumError; +} + +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/test/DNN/TestDataLoaderCuda.cxx b/tmva/tmva/test/DNN/TestDataLoaderCuda.cxx new file mode 100644 index 0000000000000..6c5c78027c881 --- /dev/null +++ b/tmva/tmva/test/DNN/TestDataLoaderCuda.cxx @@ -0,0 +1,79 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 08/08/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////////// +// Test the generic data loader for the CUDA implementation. 
// +/////////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Net.h" +#include "TMVA/DNN/DataLoader.h" +#include "TMVA/DNN/Architectures/Cuda.h" +#include "Utility.h" +#include "TMatrix.h" + +using namespace TMVA::DNN; + +int main() +{ + using Matrix_t = typename TCuda::Matrix_t; + using DataLoader_t = TDataLoader; + + TMatrixT A(20, 5), B(20, 5); + randomMatrix(A); + randomMatrix(B); + + for(size_t i = 0; i < 20; i++) { + for (size_t j = 0; j < 5; j++) { + A(i,j) = i; + B(i,j) = j; + } + } + + MatrixInput_t data(A, B); + DataLoader_t loader(data, 20, 5, 5, 5, 4); + + TNet net(5, 5, ELossFunction::MEANSQUAREDERROR, ERegularization::NONE); + net.AddLayer(5, EActivationFunction::IDENTITY); + net.Initialize(EInitialization::IDENTITY); + + std::vector> nets{}; + + std::vector ms{}; + nets.reserve(4); + ms.reserve(4); + for (size_t i = 0; i < 4; i++) { + nets.push_back(net); + ms.push_back(TCudaMatrix(5,5)); + for (size_t j = 0; j < net.GetDepth(); j++) { + TCuda::Copy(nets[i].GetLayer(j).GetWeights(), net.GetLayer(j).GetWeights()); + TCuda::Copy(nets[i].GetLayer(j).GetBiases(), net.GetLayer(j).GetBiases()); + } + } + + std::cout << "net size: " << nets.size() << std::endl; + for (size_t i = 0; i < 4; i++) { + } + std::cout << "ms size: " << ms.size() << std::endl; + + for (size_t i = 0; i < 4; i++) { + auto b = loader.GetBatch(); + nets[i].Forward(b.GetInput()); + TCuda::Copy(ms[i], nets[i].GetOutput()); + } + for (size_t i = 0; i < 4; i++) { + ((TMatrixT) ms[i]).Print(); + } +} + + + + + diff --git a/tmva/tmva/test/DNN/TestDataStreamCuda.cxx b/tmva/tmva/test/DNN/TestDataStreamCuda.cxx new file mode 100644 index 0000000000000..d67612595c310 --- /dev/null +++ b/tmva/tmva/test/DNN/TestDataStreamCuda.cxx @@ -0,0 +1,50 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////////////// +// Test TCudaDataLoader class for streaming data to CUDA devices. // +/////////////////////////////////////////////////////////////////// + +#include "TMatrix.h" +#include "TMVA/DNN/Architectures/Cuda.h" +#include "Utility.h" + +using namespace TMVA::DNN; + +int main() +{ + TMatrixT X(20,5), Y(20,10); + randomMatrix(X); + randomMatrix(Y); + MatrixInput_t inputData(X, Y); + + TCudaDataLoader dataLoader(inputData, 20, 5, 10, 2, 2, 1); + + CudaDouble_t maximumError = 0.0; + CudaDouble_t error = 0.0; + + for (auto b : dataLoader) { + TMatrixT X2(b.GetInput()); + TMatrixT Y2(b.GetOutput()); + + std::cout << "::: BATCH :::" << std::endl; + + X.Print(); + X2.Print(); + Y.Print(); + Y2.Print(); + maximumError = std::max(error, maximumError); + maximumError = std::max(error, maximumError); + } + + std::cout << "Test data stream: Max. rel. 
error = " << maximumError << std::endl;
+
+}
diff --git a/tmva/tmva/test/DNN/TestDerivatives.cxx b/tmva/tmva/test/DNN/TestDerivatives.cxx
new file mode 100644
index 0000000000000..3a3e742469a69
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestDerivatives.cxx
@@ -0,0 +1,65 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+///////////////////////////////////////////////////////////////////
+// Concrete instantiation of the generic derivative test for the //
+// reference implementation.                                     //
+///////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Reference.h"
+#include "TestDerivatives.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+
+    double error;
+
+    //
+    // Activation Functions
+    //
+
+    std::cout << "Activation Functions:" << std::endl;
+    error = testActivationFunctionDerivatives<TReference<double>>();
+    std::cout << "Total : ";
+    std::cout << "Maximum Relative Error = " << print_error(error);
+    std::cout << std::endl << std::endl;
+    if (error > 1e-5)
+        return 1;
+
+    //
+    // Loss Functions
+    //
+
+    std::cout << "Loss Functions:" << std::endl;
+    error = testLossFunctionGradients<TReference<double>>();
+    std::cout << "Total : ";
+    std::cout << "Maximum Relative Error = " << print_error(error);
+    std::cout << std::endl << std::endl;
+    if (error > 1e-5)
+        return 1;
+
+    //
+    // Regularization Functions
+    //
+
+    std::cout << "Regularization:" << std::endl;
+    error = testRegularizationGradients<TReference<double>>();
+    std::cout << "Total : ";
+    std::cout << "Maximum Relative Error = " << print_error(error);
+    std::cout << std::endl << std::endl;
+    if (error > 1e-5)
+        return 1;
+
+    return 0;
+}
diff --git a/tmva/tmva/test/DNN/TestDerivatives.h b/tmva/tmva/test/DNN/TestDerivatives.h
new file mode 100644
index 0000000000000..07688e668642d
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestDerivatives.h
@@ -0,0 +1,260 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////////
+// Generic tests for the derivatives and gradients of activation,   //
+// loss and regularization functions. Each function generates a     //
+// random 10 x 10 matrix and uses a central finite difference to    //
+// numerically compute the derivative of the function w.r.t. the    //
+// (0,0) element. The result is compared to the result obtained by  //
+// the corresponding analytic derivative implemented by the         //
+// evaluateDerivative(...), evaluateGradients(...),                  //
+// addRegularizationGradients(...) functions.
// +////////////////////////////////////////////////////////////////////// + +#include +#include "TMVA/DNN/Functions.h" +#include "TMVA/DNN/Net.h" +#include "Utility.h" + +using namespace TMVA::DNN; + +//______________________________________________________________________________ +// +// Activation Functions +//______________________________________________________________________________ + +/*! Generic function that numerically computes the derivative of a matrix + * function f and the analytical solution given by df the function signatures + * are assumed to be + * - void f(Matrix_t &X) + * - void df(Matrix_t &Y, const Matrix_t &X) -> derivative of f at X(i,j) is + * The function f is supposed to apply the corresponding mathematical function + * to each element in the provided matrix X. The function df is expected to + * set each element in Y to the derivative of the corresponding mathematical + * function evaluated at the corresponding element in X. + */ +template + auto testDerivatives(F f, dF df, + typename Architecture::Scalar_t dx) + -> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + using Matrix_t = typename Architecture::Matrix_t; + + Scalar_t maximum_error = 0.0; + + for (size_t i = 0; i < 100; i++) + { + Matrix_t X(10,10), Y(10,10); + randomMatrix(Y); + + df(X, Y); + Scalar_t dy = X(0,0); + + copyMatrix(X, Y); + X(0,0) += dx; + f(X); + Scalar_t y1 = X(0,0); + copyMatrix(X, Y); + X(0,0) -= dx; + f(X); + Scalar_t y0 = X(0,0); + Scalar_t dy_num = (y1 - y0) / (2.0 * dx); + + Scalar_t error = 0.0; + if (std::fabs(dy) > 0) + { + error = std::fabs((dy_num - dy) / dy); + } + else + error = dy_num - dy; + + maximum_error = std::max(maximum_error, error); + } + + return maximum_error; +} + +/*! Test derivatives of all activation functions and return the maximum relative + * error. Prints the result for each function to the stdout. */ +//______________________________________________________________________________ +template +auto testActivationFunctionDerivatives() + -> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + using Matrix_t = typename Architecture::Matrix_t; + + std::vector EActivationFunctions + = {EActivationFunction::IDENTITY, + EActivationFunction::SIGMOID, + EActivationFunction::TANH, + EActivationFunction::SOFTSIGN, + EActivationFunction::GAUSS}; + + Scalar_t error, maximum_error; + maximum_error = 0.0; + + for (auto & af : EActivationFunctions) + { + auto f = [& af](Matrix_t &X){ evaluate(X, af);}; + auto df = [& af](Matrix_t &X, const Matrix_t &Y) + { + evaluateDerivative(X, af, Y); + }; + error = testDerivatives(f, df, 5e-6); + + std::cout << "Testing " << static_cast(af) << ": "; + std::cout << "Maximum Relative Error = " << print_error(error) << std::endl; + + maximum_error = std::max(maximum_error, error); + } + + return maximum_error; +} + +//______________________________________________________________________________ +// +// Loss functions. +//______________________________________________________________________________ + +/*! Similar to testDerivatives only that here the mathematical function is + * expected to be a matrix functional, i.e. to be mapping a matrix to a + * scalar value. The scalar value is supposed to be computed by the provided + * function object f, while the function object is just like above. 
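+ * Only the (0,0) element of X is perturbed, so the numerical gradient is
+ * compared against the (0,0) element of the analytic gradient matrix Z.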
*/ +template + auto testGradients(F f, dF df, + typename Architecture::Scalar_t dx) + -> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + using Matrix_t = typename Architecture::Matrix_t; + + Scalar_t maximum_error = 0.0; + + for (size_t i = 0; i < 100; i++) + { + Matrix_t X(10,10), Y(10,10), Z(10,10); + randomMatrix(X); + randomMatrix(Y); + + df(Z, Y, X); + Scalar_t dy = Z(0,0); + + X(0,0) += dx; + Scalar_t y1 = f(Y,X); + X(0,0) -= 2.0 * dx; + Scalar_t y0 = f(Y,X); + Scalar_t dy_num = (y1 - y0) / (2.0 * dx); + + Scalar_t error = 0.0; + if (std::fabs(dy) > 0) + { + error = std::fabs((dy_num - dy) / dy); + } + else + error = dy_num - dy; + + maximum_error = std::max(maximum_error, error); + } + + return maximum_error; +} + +/*! Test gradients of all loss function for the given architecture type and + * return the maximum relative error. Prints results for each function to + * standard out. */ +//______________________________________________________________________________ +template +auto testLossFunctionGradients() + -> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + using Matrix_t = typename Architecture::Matrix_t; + + std::vector LossFunctions + = {ELossFunction::MEANSQUAREDERROR, + ELossFunction::CROSSENTROPY}; + + Scalar_t error, maximum_error; + maximum_error = 0.0; + + for (auto & lf : LossFunctions) + { + auto f = [lf](const Matrix_t &Y, const Matrix_t &Z) + { + return evaluate(lf, Y, Z); + }; + auto df = [& lf](Matrix_t &X, + const Matrix_t &Y, + const Matrix_t &Z) + { + evaluateGradients(X, lf, Y, Z); + }; + + error = testGradients(f, df, 5e-6); + + std::cout << "Testing " << static_cast(lf) << ": "; + std::cout << "Maximum Relative Error = " << print_error(error) << std::endl; + + maximum_error = std::max(maximum_error, error); + } + + return maximum_error; +} + +//______________________________________________________________________________ +// +// Regularization. +//______________________________________________________________________________ + +/*! 
Test the computation of gradients for all differentiable regularization types, + * which is so far only L2 and no regularization and print the results to standard + * out */ +template +auto testRegularizationGradients() + -> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + using Matrix_t = typename Architecture::Matrix_t; + + std::vector Regularizations + = {ERegularization::NONE, + ERegularization::L2}; + + Scalar_t error, maximum_error; + maximum_error = 0.0; + + for (auto & r : Regularizations) + { + auto f = [r](const Matrix_t & , const Matrix_t & Y) + { + return regularization(Y, r); + }; + auto df = [& r](Matrix_t &X, + const Matrix_t & , + const Matrix_t & Y) + { + applyMatrix(X, [](double){return 0.0;}); + addRegularizationGradients(X, Y, (Scalar_t) 1.0, r); + }; + + error = testGradients(f, df, 1.0); + + std::cout << "Testing " << static_cast(r) << ": "; + std::cout << "Maximum Relative Error = " << print_error(error) << std::endl; + + maximum_error = std::max(maximum_error, error); + } + + return maximum_error; +} diff --git a/tmva/tmva/test/DNN/TestDerivativesCuda.cxx b/tmva/tmva/test/DNN/TestDerivativesCuda.cxx new file mode 100644 index 0000000000000..fec8986e689da --- /dev/null +++ b/tmva/tmva/test/DNN/TestDerivativesCuda.cxx @@ -0,0 +1,64 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////////////// +// Concrete instantiation of the generic derivative test for the // +// reference implementation. // +/////////////////////////////////////////////////////////////////// + +#include +#include "TMVA/DNN/Architectures/Cuda.h" +#include "TestDerivatives.h" + +using namespace TMVA::DNN; + +int main() +{ + double error; + + // + // Activation Functions + // + + std::cout << "Activation Functions:" << std::endl; + error = testActivationFunctionDerivatives(); + std::cout << "Total : "; + std::cout << "Maximum Relative Error = " << print_error(error); + std::cout << std::endl << std::endl; + if (error > 1e-5) + return 1; + + // + // Loss Functions + // + + std::cout << "Loss Functions:" << std::endl; + error = testLossFunctionGradients(); + std::cout << "Total : "; + std::cout << "Maximum Relative Error = " << print_error(error); + std::cout << std::endl << std::endl; + if (error > 1e-5) + return 1; + + // + // Regularization Functions + // + + std::cout << "Regularization:" << std::endl; + error = testRegularizationGradients(); + std::cout << "Total : "; + std::cout << "Maximum Relative Error = " << print_error(error); + std::cout << std::endl << std::endl; + if (error > 1e-5) + return 1; + + return 0; +} diff --git a/tmva/tmva/test/DNN/TestLossFunctions.cxx b/tmva/tmva/test/DNN/TestLossFunctions.cxx new file mode 100644 index 0000000000000..831ecf22c9c57 --- /dev/null +++ b/tmva/tmva/test/DNN/TestLossFunctions.cxx @@ -0,0 +1,60 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. 
*
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+///////////////////////////////////////////////////////////////////
+// Test for the loss function reference implementation using the //
+// generic test defined in TestLossFunctions.h.                  //
+///////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Reference.h"
+#include "TestLossFunctions.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+    std::cout << "Testing Loss Functions:" << std::endl << std::endl;
+
+    double error;
+
+    //
+    // Mean Squared Error.
+    //
+
+    error = testMeanSquaredError<TReference<double>>(10);
+    std::cout << "Testing mean squared error loss: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    error = testMeanSquaredErrorGradients<TReference<double>>(10);
+    std::cout << "Testing mean squared error gradient: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    //
+    // Cross Entropy.
+    //
+
+    error = testCrossEntropy<TReference<double>>(10);
+    std::cout << "Testing cross entropy loss: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    error = testCrossEntropyGradients<TReference<double>>(10);
+    std::cout << "Testing cross entropy gradient: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+}
diff --git a/tmva/tmva/test/DNN/TestLossFunctions.h b/tmva/tmva/test/DNN/TestLossFunctions.h
new file mode 100644
index 0000000000000..179c92d2638de
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestLossFunctions.h
@@ -0,0 +1,195 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////////
+// Generic tests of the loss functions                              //
+//                                                                  //
+// Contains generic tests for architecture-specific implementations //
+// of the loss functions. Requires the architecture-specific matrix //
+// type to be constructible and convertible from/to the             //
+// TMatrixT<Double_t> type.
// +////////////////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Reference.h" +#include "TMVA/DNN/Functions.h" +#include "TMVA/DNN/Net.h" +#include "Utility.h" + +using namespace TMVA::DNN; + +//______________________________________________________________________________ +// +// Mean Squared Error +//______________________________________________________________________________ + +template +auto testMeanSquaredError(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + using Scalar_t = typename Architecture::Scalar_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT X(m, n); + TMatrixT Y(m, n); + TMatrixT Z(m, n); + + randomMatrix(X); + randomMatrix(Y); + + Matrix_t XArch(X); + Matrix_t YArch(Y); + + Scalar_t mse = evaluate(ELossFunction::MEANSQUAREDERROR, + YArch, XArch); + zipWithMatrix(Z, [](Scalar_t x, Scalar_t y){return x - y;}, X, Y); + auto squaredSum = [](Scalar_t x, Scalar_t y){return x + y * y;}; + Scalar_t mseReference = reduceMean(squaredSum, 0.0, Z); + + Double_t error; + if (mseReference != 0.0) + error = std::fabs((mse - mseReference) / mseReference); + else + error = std::fabs(mse - mseReference); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//______________________________________________________________________________ +template +auto testMeanSquaredErrorGradients(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + using Scalar_t = typename Architecture::Scalar_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { +/* size_t m = rand() % 100 + 1; */ +/* size_t n = rand() % 100 + 1; */ + size_t m = 8; //rand() % 100 + 1; + size_t n = 8; //rand() % 100 + 1; + + TMatrixT X(m, n); + TMatrixT Y(m, n); + TMatrixT ZRef(m, n); + + randomMatrix(X); + randomMatrix(Y); + + Matrix_t XArch(X); + Matrix_t YArch(Y); + Matrix_t ZArch(Y); + + evaluateGradients(ZArch, ELossFunction::MEANSQUAREDERROR, + XArch, YArch); + auto normedDifference = [m, n](Scalar_t x, Scalar_t y) { + return 2.0 * (y - x) / (m * n); + }; + zipWithMatrix(ZRef, normedDifference, X, Y); + TMatrixT Z(ZArch); + Double_t error = maximumRelativeError(Z, ZRef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//______________________________________________________________________________ +// +// Cross Entropy +//______________________________________________________________________________ + +template +auto testCrossEntropy(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + using Scalar_t = typename Architecture::Scalar_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT X(m, n); + TMatrixT Y(m, n); + TMatrixT Z(m, n); + + randomMatrix(X); + randomMatrix(Y); + + Matrix_t XArch(X); + Matrix_t YArch(Y); + + Scalar_t ce = evaluate(ELossFunction::CROSSENTROPY, + YArch, XArch); + + auto crossCorrelation = [](Scalar_t x, Scalar_t y) { + Scalar_t sig = 1.0 / (1.0 + std::exp(-x)); + return y * std::log(sig) + (1 - y) * std::log(1 - sig); + }; + zipWithMatrix(Z, crossCorrelation, X, Y); + auto sum = [](Scalar_t x, Scalar_t y) {return x + y;}; + Scalar_t ceReference = - reduceMean(sum, 0.0, Z); + + Double_t error; + if (ceReference != 
0.0) + error = std::fabs((ce - ceReference) / ceReference); + else + error = std::fabs(ce - ceReference); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//______________________________________________________________________________ +template +auto testCrossEntropyGradients(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + using Scalar_t = typename Architecture::Scalar_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = 8; //rand() % 100 + 1; + size_t n = 8; //rand() % 100 + 1; + + TMatrixT X(m, n); + TMatrixT Y(m, n); + TMatrixT ZRef(m, n); + + randomMatrix(X); + randomMatrix(Y); + + Matrix_t XArch(X); + Matrix_t YArch(Y); + Matrix_t ZArch(Y); + + evaluateGradients(ZArch, ELossFunction::CROSSENTROPY, + YArch, XArch); + auto crossCorrelationGradient = [m, n](Scalar_t x, Scalar_t y) { + Scalar_t sig = 1.0 / (1.0 + std::exp(-x)); + Scalar_t norm = 1.0 / ((Scalar_t) m * n); + return (sig - y) * norm;}; + zipWithMatrix(ZRef, crossCorrelationGradient, X, Y); + + TMatrixT Z(ZArch); + Double_t error = maximumRelativeError(Z, ZRef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} diff --git a/tmva/tmva/test/DNN/TestLossFunctionsCuda.cxx b/tmva/tmva/test/DNN/TestLossFunctionsCuda.cxx new file mode 100644 index 0000000000000..e4802770966b5 --- /dev/null +++ b/tmva/tmva/test/DNN/TestLossFunctionsCuda.cxx @@ -0,0 +1,60 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////////////// +// Test for the loss function reference implementation using the // +// generic test defined in TestLossFunctions.h. // +/////////////////////////////////////////////////////////////////// + +#include +#include "TMVA/DNN/Architectures/Cuda.h" +#include "TestLossFunctions.h" + +using namespace TMVA::DNN; + +int main() +{ + std::cout << "Testing Loss Functions:" << std::endl << std::endl; + + double error; + + // + // Mean Squared Error. + // + + error = testMeanSquaredError(10); + std::cout << "Testing mean squared error loss: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-10) + return 1; + + error = testMeanSquaredErrorGradients(10); + std::cout << "Testing mean squared error gradient: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-10) + return 1; + + // + // Cross Entropy. 
+   //
+
+   error = testCrossEntropy<TCuda>(10);
+   std::cout << "Testing cross entropy loss:          ";
+   std::cout << "maximum relative error = " << print_error(error) << std::endl;
+   if (error > 1e-10)
+      return 1;
+
+   error = testCrossEntropyGradients<TCuda>(10);
+   std::cout << "Testing cross entropy gradient:      ";
+   std::cout << "maximum relative error = " << print_error(error) << std::endl;
+   if (error > 1e-10)
+      return 1;
+
+   return 0;
+}
diff --git a/tmva/tmva/test/DNN/TestMatrixArithmetic.h b/tmva/tmva/test/DNN/TestMatrixArithmetic.h
new file mode 100644
index 0000000000000..7bd351682b4f9
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestMatrixArithmetic.h
@@ -0,0 +1,119 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 20/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+///////////////////////////////////////////////////////////////////
+// Test arithmetic functions defined on matrices and compare the //
+// results to the reference implementation.                      //
+///////////////////////////////////////////////////////////////////
+
+#include "TMatrix.h"
+#include "Utility.h"
+#include "TMVA/DNN/Architectures/Reference.h"
+
+/** Test multiplication (standard, transposed, Hadamard) operations on
+ * architecture-specific matrix types and compare the results with those
+ * obtained with TMatrixT.
+ */
+//______________________________________________________________________________
+template <typename Architecture_t>
+auto testMultiplication(size_t ntests)
+    -> typename Architecture_t::Scalar_t
+{
+   using Scalar_t = typename Architecture_t::Scalar_t;
+   using Matrix_t = typename Architecture_t::Matrix_t;
+
+   Scalar_t maximumError = 0.0;
+   for (size_t t = 0; t < ntests; t++) {
+
+      size_t m, n, k;
+      m = rand() % 100 + 1;
+      n = rand() % 100 + 1;
+      k = rand() % 100 + 1;
+
+      TMatrixT<Double_t> ARef(m,k), A2Ref(m,k), ATRef(k,m), BRef(k,n),
+          BTRef(n,k), CRef(m,n);
+      TMVA::DNN::randomMatrix(ARef);
+      TMVA::DNN::randomMatrix(A2Ref);
+      TMVA::DNN::randomMatrix(ATRef);
+      TMVA::DNN::randomMatrix(BRef);
+      TMVA::DNN::randomMatrix(BTRef);
+      Matrix_t A(ARef), A2(A2Ref), AT(ATRef), B(BRef), BT(BTRef), C(CRef);
+
+      // A * B
+      CRef.Mult(ARef, BRef);
+      Architecture_t::Multiply(C, A, B);
+      Scalar_t error = TMVA::DNN::maximumRelativeError((TMatrixT<Double_t>) C, CRef);
+      maximumError = std::max(error, maximumError);
+
+      // A^T * B
+      CRef.TMult(ATRef, BRef);
+      Architecture_t::TransposeMultiply(C, AT, B);
+      error = TMVA::DNN::maximumRelativeError((TMatrixT<Double_t>) C, CRef);
+      maximumError = std::max(error, maximumError);
+
+      // A * B^T
+      CRef.MultT(ARef, BTRef);
+      Architecture_t::MultiplyTranspose(C, A, BT);
+      error = TMVA::DNN::maximumRelativeError((TMatrixT<Double_t>) C, CRef);
+      maximumError = std::max(error, maximumError);
+
+      // A .* B (element-wise Hadamard product)
+      for (size_t i = 0; i < (size_t) ARef.GetNrows(); i++) {
+         for (size_t j = 0; j < (size_t) ARef.GetNcols(); j++) {
+            ARef(i,j) *= A2Ref(i,j);
+         }
+      }
+      Architecture_t::Hadamard(A, A2);
+      error = TMVA::DNN::maximumRelativeError((TMatrixT<Double_t>) A, ARef);
+      maximumError = std::max(error, maximumError);
+   }
+
+   return maximumError;
+}
+
+/** Test the summing over columns by comparing the computed column sums
+ * with those expected for a matrix whose elements are the column indices.
+ */
+//______________________________________________________________________________
+template <typename Architecture_t>
+auto testSumColumns(size_t ntests)
+    -> typename Architecture_t::Scalar_t
+{
+   using Scalar_t = typename Architecture_t::Scalar_t;
+   using Matrix_t = typename Architecture_t::Matrix_t;
+
+   Scalar_t maximumError = 0.0;
+   for (size_t t = 0; t < ntests; t++) {
+
+      Scalar_t error;
+
+      size_t m, n;
+      m = rand() % 100 + 1;
+      n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> ARef(m,n), BRef(n,1);
+
+      // Fill column j with the value j; the sum over column j is then m * j.
+      for (size_t i = 0; i < (size_t) ARef.GetNrows(); i++) {
+         for (size_t j = 0; j < (size_t) ARef.GetNcols(); j++) {
+            ARef(i,j) = j;
+            if (i == 0) BRef(j, 0) = m * j;
+         }
+      }
+
+      Matrix_t A(ARef), B(n, 1);
+      Architecture_t::SumColumns(B, A);
+
+      error = TMVA::DNN::maximumRelativeError((TMatrixT<Double_t>) B, BRef);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
diff --git a/tmva/tmva/test/DNN/TestMatrixArithmeticCuda.cxx b/tmva/tmva/test/DNN/TestMatrixArithmeticCuda.cxx
new file mode 100644
index 0000000000000..58617a28c66ca
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestMatrixArithmeticCuda.cxx
@@ -0,0 +1,32 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+////////////////////////////////////////////////////////////////////
+// Concrete instantiation of the generic matrix arithmetic tests  //
+// for the CUDA architecture.                                     //
+////////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TestMatrixArithmetic.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+   std::cout << "Testing CUDA Matrix Arithmetic:" << std::endl;
+
+   Double_t error = testMultiplication<TCuda>(10);
+   std::cout << "Multiplication: " << "Max. rel. error: " << error << std::endl;
+
+   error = testSumColumns<TCuda>(1);
+   std::cout << "Column Sum: " << "Max. rel. error: " << error << std::endl;
+
+   return 0;
+}
diff --git a/tmva/tmva/test/DNN/TestMinimization.cxx b/tmva/tmva/test/DNN/TestMinimization.cxx
new file mode 100644
index 0000000000000..a2e5b36b7a249
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestMinimization.cxx
@@ -0,0 +1,29 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+////////////////////////////////////////////////////////////
+// Test the neural network training using the reference   //
+// implementation.                                        //
+//                                                        //
+// Calls the generic testMinimization function defined in //
+// TestMinimization.h for the reference architecture.
//
+////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Reference.h"
+#include "TestMinimization.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+   testMinimization<TReference<Double_t>>();
+   return 0;
+}
diff --git a/tmva/tmva/test/DNN/TestMinimization.h b/tmva/tmva/test/DNN/TestMinimization.h
new file mode 100644
index 0000000000000..202dc0603f34d
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestMinimization.h
@@ -0,0 +1,64 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+/////////////////////////////////////////////////////////////////////
+// Test Standard Minimizer                                         //
+//                                                                 //
+// This test trains a linear neural network on a linear function  //
+// F(x) = W * x and computes the relative error between the       //
+// matrix W' representing the linear function learned by the net  //
+// and the original matrix W.                                      //
+/////////////////////////////////////////////////////////////////////
+
+#include "TMatrix.h"
+#include "TMVA/DNN/Minimizers.h"
+#include "TMVA/DNN/Net.h"
+#include "Utility.h"
+
+using namespace TMVA::DNN;
+
+template <typename Architecture>
+auto testMinimization()
+    -> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   using Net_t    = TNet<Architecture>;
+
+   size_t nSamples  = 1000000;
+   size_t nFeatures = 20;
+   size_t batchSize = 1024;
+
+   TMatrixT<Double_t> XTrain(nSamples, nFeatures), YTrain(nSamples, 1),
+       XTest(batchSize, nFeatures), YTest(batchSize, 1), W(1, nFeatures);
+
+   // Generate the random linear mapping W and the corresponding targets.
+   randomMatrix(W);
+   randomMatrix(XTrain);
+   randomMatrix(XTest);
+   YTrain.MultT(XTrain, W);
+   YTest.MultT(XTest, W);
+
+   Net_t net(batchSize, nFeatures, ELossFunction::MEANSQUAREDERROR);
+   net.AddLayer(512, EActivationFunction::TANH);
+   net.AddLayer(512, EActivationFunction::TANH);
+   net.AddLayer(512, EActivationFunction::TANH);
+   net.AddLayer(512, EActivationFunction::TANH);
+   net.AddLayer(512, EActivationFunction::TANH);
+   net.AddLayer(1, EActivationFunction::IDENTITY);
+   net.Initialize(EInitialization::GAUSS);
+
+   TGradientDescent<Architecture> minimizer(0.000001, 1, 10);
+   MatrixInput_t trainingData(XTrain, YTrain);
+   MatrixInput_t testData(XTest, YTest);
+   minimizer.Train(trainingData, nSamples, testData, batchSize, net, 8);
+
+   return 0.0;
+}
diff --git a/tmva/tmva/test/DNN/TestMinimizationCuda.cxx b/tmva/tmva/test/DNN/TestMinimizationCuda.cxx
new file mode 100644
index 0000000000000..7fddad1cdd329
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestMinimizationCuda.cxx
@@ -0,0 +1,29 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+/////////////////////////////////////////////////////////////////////
+// Use the generic tests defined in TestMinimization.h to test the //
+// training of neural networks for CUDA architectures.
//
+/////////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Reference.h"
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/Minimizers.h"
+#include "TestMinimization.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+   testMinimization<TCuda>();
+   return 0;
+}
diff --git a/tmva/tmva/test/DNN/Utility.h b/tmva/tmva/test/DNN/Utility.h
new file mode 100644
index 0000000000000..0f66b7839955b
--- /dev/null
+++ b/tmva/tmva/test/DNN/Utility.h
@@ -0,0 +1,247 @@
+#ifndef TMVA_TEST_DNN_UTILITY
+#define TMVA_TEST_DNN_UTILITY
+
+#include <algorithm>
+#include <cmath>
+#include <ctime>
+#include <sstream>
+#include <string>
+#include <vector>
+#include "stdlib.h"
+#include "TRandom.h"
+#include "TMVA/DNN/Architectures/Reference.h"
+#include "TMVA/DNN/Functions.h"
+#include "TMVA/DNN/Net.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+/*! Construct a random linear neural network with up to five layers. */
+template <typename Architecture>
+void constructRandomLinearNet(TNet<Architecture> & net)
+{
+   int nlayers = rand() % 5 + 1;
+
+   std::vector<EActivationFunction> ActivationFunctions
+       = {EActivationFunction::IDENTITY};
+
+   for (int i = 0; i < nlayers; i++) {
+      int width = rand() % 10 + 1;
+      EActivationFunction f =
+          ActivationFunctions[rand() % ActivationFunctions.size()];
+      net.AddLayer(width, f);
+   }
+}
+
+/*! Set the matrix to the identity matrix. */
+template <typename MatrixType>
+void identityMatrix(MatrixType &X)
+{
+   size_t m, n;
+   m = X.GetNrows();
+   n = X.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         X(i,j) = 0.0;
+      }
+      if (i < n) {
+         X(i,i) = 1.0;
+      }
+   }
+}
+
+/*! Fill the matrix with random, Gaussian-distributed values. */
+template <typename MatrixType>
+void randomMatrix(MatrixType &X)
+{
+   size_t m, n;
+   m = X.GetNrows();
+   n = X.GetNcols();
+
+   TRandom rand(clock());
+   Double_t sigma = sqrt(10.0);
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         X(i,j) = rand.Gaus(0.0, sigma);
+      }
+   }
+}
+
+/*! Generate a random batch as input for a neural net. */
+template <typename MatrixType>
+void randomBatch(MatrixType &X)
+{
+   randomMatrix(X);
+}
+
+/*! Copy the matrix Y into the matrix X, element by element. */
+template <typename MatrixType>
+void copyMatrix(MatrixType &X, const MatrixType &Y)
+{
+   size_t m, n;
+   m = X.GetNrows();
+   n = X.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         X(i,j) = Y(i,j);
+      }
+   }
+}
+
+/*! Apply the given function to each element in the matrix. */
+template <typename MatrixType, typename F>
+void applyMatrix(MatrixType &X, F f)
+{
+   size_t m, n;
+   m = X.GetNrows();
+   n = X.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         X(i,j) = f(X(i,j));
+      }
+   }
+}
+
+/*! Combine elements of two given matrices into a single matrix using
+ * the given function f. */
+template <typename MatrixType, typename F>
+void zipWithMatrix(MatrixType &Z,
+                   F f,
+                   const MatrixType &X,
+                   const MatrixType &Y)
+{
+   size_t m, n;
+   m = X.GetNrows();
+   n = X.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Z(i,j) = f(X(i,j), Y(i,j));
+      }
+   }
+}
+
+/*! Fold the given function f over the elements of X, starting from the
+ * value start. */
+template <typename F, typename RealType, typename MatrixType>
+RealType reduce(F f, RealType start, const MatrixType &X)
+{
+   size_t m, n;
+   m = X.GetNrows();
+   n = X.GetNcols();
+
+   RealType result = start;
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         result = f(result, X(i,j));
+      }
+   }
+   return result;
+}
+
+/*! Fold the given function f over the elements of X as in reduce and
+ * divide the result by the number of elements.
+ */
+template <typename F, typename RealType, typename MatrixType>
+RealType reduceMean(F f, RealType start, const MatrixType &X)
+{
+   size_t m, n;
+   m = X.GetNrows();
+   n = X.GetNcols();
+
+   RealType result = start;
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         result = f(result, X(i,j));
+      }
+   }
+   return result / (RealType) (m * n);
+}
+
+/*! Compute the relative error of x and y normalized by y. Protected against
+ * division by zero. */
+template <typename RealType>
+inline auto relativeError(const RealType &x,
+                          const RealType &y)
+    -> RealType
+{
+   if (y != 0.0)
+      return std::fabs((x - y) / y);
+   else
+      return std::fabs(x - y);
+}
+
+/*! Compute the maximum, element-wise relative error of the matrices
+ * X and Y normalized by the elements of Y. Protected against division
+ * by zero. */
+template <typename MatrixType>
+auto maximumRelativeError(const MatrixType &X,
+                          const MatrixType &Y)
+    -> decltype(X(0,0))
+{
+   using RealType = decltype(X(0,0));
+
+   size_t m, n;
+   m = X.GetNrows();
+   n = X.GetNcols();
+
+   RealType maximumError = 0.0;
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         RealType error = relativeError(X(i,j), Y(i,j));
+         maximumError = std::max(error, maximumError);
+      }
+   }
+   return maximumError;
+}
+
+/*! Evaluate the finite difference f(dx) - f(-dx) of the functional f.
+ * The caller divides by 2 * dx to obtain the central-difference
+ * approximation of the derivative. */
+template <typename F, typename RealType>
+inline RealType finiteDifference(F f, RealType dx)
+{
+   return f(dx) - f(0.0 - dx);
+}
+
+/*! Color code the error: red above 1e-5, yellow above 1e-9, green below. */
+template <typename RealType>
+std::string print_error(RealType &e)
+{
+   std::ostringstream out{};
+
+   out << ("\e[");
+
+   if (e > 1e-5)
+      out << "31m";
+   else if (e > 1e-9)
+      out << "33m";
+   else
+      out << "32m";
+
+   out << e;
+   out << "\e[39m";
+
+   return out.str();
+}
+
+} // namespace DNN
+} // namespace TMVA
+
+#endif
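Note that the finiteDifference helper above deliberately returns f(dx) - f(-dx) without dividing by 2 * dx; the caller applies the normalization. A minimal, self-contained sketch of the intended usage for a numerical derivative check follows (not part of the patch; the test function f and the step size are illustrative assumptions only):

#include <cmath>
#include <iostream>

// Same shape as the finiteDifference helper in Utility.h:
// returns f(dx) - f(-dx), without the 1 / (2 * dx) factor.
template <typename F, typename RealType>
inline RealType finiteDifference(F f, RealType dx)
{
   return f(dx) - f(0.0 - dx);
}

int main()
{
   // Hypothetical example function; its derivative at 0 is cos(1).
   auto f = [](double x) { return std::sin(1.0 + x); };

   double dx = 1e-6;
   // Central-difference approximation: (f(dx) - f(-dx)) / (2 * dx).
   double numerical = finiteDifference(f, dx) / (2.0 * dx);
   double analytic  = std::cos(1.0);

   std::cout << "numerical: " << numerical
             << "  analytic: " << analytic << std::endl;
   return 0;
}

This is the pattern a derivative test would follow when comparing analytic gradients, such as those checked by the loss function tests above, against numerical approximations.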