Closed

Commits (33)
7101abb (Jul 17, 2016): Added GPU implementation of DNNs.
49906d9 (Jul 17, 2016): Removed space before variable.
d4b6866 (Jul 17, 2016): Removed Print() statement on weight matrices after training.
5690a9c (Jul 17, 2016): Removed output.
578e317 (Jul 18, 2016): Removed explicit setting of the backend compiler.
a09f2fa (Jul 18, 2016): Added include guards for Cuda architecture header.
0146c55 (Jul 18, 2016): Added missing test file.
f8d2317 (Jul 18, 2016): Added missing file.
fb77aa8 (Jul 19, 2016): Removed profiling switch in TestDerivativesCuda.
7f559f7 (Jul 19, 2016): Fixed naming of Cuda kernels.
66587d0 (Jul 21, 2016): Some optimizations in the training routine.
9d162bf (Jul 21, 2016): Applied stash.
ab09b0d (Jul 25, 2016): Fixed out of bounds memory access in vertical reduction kernel.
56d0579 (Jul 25, 2016): Fixed out of bounds memory access in vertical reduction kernel.
92be232 (Jul 25, 2016): Merge branch 'tmva_gpu' of https://github.com/simonpf/root into tmva_gpu
9eb6a50 (Jul 26, 2016): Minor cosmetics.
c6ae8ed (Jul 26, 2016): Some more cosmetics.
ab85e89 (Jul 28, 2016): Cleaned up output.
4a5822a (Jul 28, 2016): Merge branch 'tmva_gpu' of https://github.com/simonpf/root into tmva_gpu
6c8abba (Jul 28, 2016): Fixed minimization test.
e594dda (Jul 28, 2016): Fixed formatting in CudaMatrix.h
0c7667f (Jul 28, 2016): Enlarged batch size in minimization test.
f9b95e9 (Jul 28, 2016): Merge branch 'tmva_gpu' of github.com:simonpf/root into tmva_gpu
46ac988 (Aug 9, 2016): Generic data loader.
8e1edd2 (Aug 9, 2016): Added TestDataLoaderCuda.cxx.
dfe0f09 (Aug 9, 2016): Made copy async.
b9528e5 (Aug 9, 2016): Smaller fixes.
0a3ae51 (Aug 9, 2016): Added flop counter.
7cef87b (Aug 9, 2016): Merge branch 'tmva_gpu' of github.com:simonpf/root into tmva_gpu
3cb26ba (Aug 9, 2016): Fixed flop rate computation.
0b23d06 (Aug 11, 2016): Testing different curand initialization.
df80c5e (Aug 11, 2016): Testing different parallelization scheme.
dcbf1c6 (Aug 13, 2016): Minor fixes and modifications.
2 changes: 1 addition & 1 deletion tmva/tmva/inc/TMVA/DNN/Layer.h

@@ -282,7 +282,7 @@ template<typename Architecture_t>
 {
    std::cout << "Width: " << fWeights.GetNrows();
    std::cout << ", activation function: ";
-   std::cout << static_cast<char>(fF) << std::endl;
+   std::cout << static_cast<int>(fF) << std::endl;
 }

 //______________________________________________________________________________
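An aside on the cast change above (an illustration, not part of the PR): fF is an activation-function enumerator, so streaming it as a char writes the raw byte of its numeric value, which for small values is an unprintable control character, whereas an int cast prints the number itself. A minimal sketch with a stand-in enum, since the real EActivationFunction definition is not shown in this diff:

#include <iostream>

// Stand-in for TMVA's activation-function enum; the enumerator names match
// those used elsewhere in this PR, but the underlying type and values are
// illustrative only.
enum class EActivationFunction : char { IDENTITY = 0, RELU = 1, TANH = 2 };

int main()
{
   EActivationFunction fF = EActivationFunction::TANH;
   std::cout << static_cast<char>(fF) << std::endl; // writes byte 0x02, not readable
   std::cout << static_cast<int>(fF)  << std::endl; // prints "2"
}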
15 changes: 7 additions & 8 deletions tmva/tmva/inc/TMVA/DNN/Minimizers.h

@@ -87,9 +87,9 @@ class TGradientDescent
       rate \f$\alpha\f$ and subtracted from the weights and bias values of each
       layer. */
    template <typename Net_t>
-   Scalar_t Step(Net_t &net,
-                 Matrix_t &input,
-                 const Matrix_t &output);
+   void Step(Net_t &net,
+             Matrix_t &input,
+             const Matrix_t &output);
    /** Does not evaluate the loss and therefore does not trigger a possible synchronization
     * with the device. Trains the weights of each layer, but only the bias terms of
     * the first layer for compatibility with the previous implementation. */

@@ -197,13 +197,13 @@ template <typename Data_t, typename Net_t>
 //______________________________________________________________________________
 template<typename Architecture_t>
 template <typename Net_t>
-auto inline TGradientDescent<Architecture_t>::Step(Net_t & net,
+void inline TGradientDescent<Architecture_t>::Step(Net_t & net,
                                                     Matrix_t &input,
                                                     const Matrix_t &output)
--> Scalar_t
 {
-   Scalar_t loss = net.Loss(input, output);
-   fTrainingError = loss;
+   //Scalar_t loss = net.Loss(input, output);
+   //fTrainingError = loss;
+   net.Forward(input);
    net.Backward(input, output);

    for (size_t i = 0; i < net.GetDepth(); i++)

@@ -216,7 +216,6 @@ template<typename Architecture_t>
                             layer.GetBiasGradients(),
                             -fLearningRate);
    }
-   return loss;
 }

 //______________________________________________________________________________
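Since Step no longer computes or returns the loss, a caller that wants to monitor the training error has to evaluate it explicitly and accept the device synchronization that this implies. A rough usage sketch under that reading of the diff; the variables net, input, output, nSteps and reportEvery are placeholders, and only Step, Loss and the TGradientDescent constructor arguments are taken from this PR:

// Sketch only, not code from the PR.
TGradientDescent<TCuda> minimizer(0.001, 20, 20);
for (size_t step = 1; step <= nSteps; ++step) {
   // Forward pass, backward pass and weight updates only; no loss value is
   // returned, so nothing has to be read back from the device here.
   minimizer.Step(net, input, output);

   if (step % reportEvery == 0) {
      // Explicit loss evaluation when the training error is actually needed;
      // this is where the synchronization that Step() now avoids can occur.
      std::cout << "step " << step << ": loss = "
                << net.Loss(input, output) << std::endl;
   }
}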
3 changes: 3 additions & 0 deletions tmva/tmva/src/MethodDNN.cxx

@@ -687,6 +687,9 @@ void TMVA::MethodDNN::TrainGPU()
       case DNN::EnumFunction::RELU :
         GPUNet.AddLayer((*itLayout).first, EActivationFunction::RELU);
         break;
+      case DNN::EnumFunction::TANH :
+        GPUNet.AddLayer((*itLayout).first, EActivationFunction::TANH);
+        break;
       case DNN::EnumFunction::SYMMRELU :
         GPUNet.AddLayer((*itLayout).first, EActivationFunction::SYMMRELU);
         break;
24 changes: 10 additions & 14 deletions tmva/tmva/test/DNN/TestMinimizationCuda.cxx

@@ -27,17 +27,10 @@ int main()
    using Matrix_t = TMatrixT<Double_t>;
    using Net_t = TNet<TCuda>;

-   Matrix_t XTrain(4000,20), YTrain(4000,20), XTest(20,20), YTest(20,20), W(20, 20);
+   Matrix_t XTrain(100000,20), YTrain(100000,20), XTest(20,20), YTest(20,20), W(20, 20);

    randomMatrix(W);

-   for (size_t i = 0; i < 4000; i++) {
-      for (size_t j = 0; j < 20; j++) {
-         XTrain(i,j) = i;
-         YTrain(i,j) = i;
-      }
-   }
-
    randomMatrix(XTrain);
    randomMatrix(XTest);

@@ -47,23 +40,26 @@ int main()
    MatrixInput_t trainData(XTrain, YTrain);
    MatrixInput_t testData(XTest, YTest);

-   Net_t net(20, 20, ELossFunction::MEANSQUAREDERROR);
-   net.AddLayer(100, EActivationFunction::IDENTITY);
-   net.AddLayer(100, EActivationFunction::IDENTITY);
+   Net_t net(1000, 20, ELossFunction::MEANSQUAREDERROR);
+
+   net.AddLayer(200, EActivationFunction::IDENTITY);
+   net.AddLayer(200, EActivationFunction::IDENTITY);
+   net.AddLayer(200, EActivationFunction::IDENTITY);
    net.AddLayer(20, EActivationFunction::IDENTITY);
    net.Initialize(EInitialization::GAUSS);
+   auto testnet = net.CreateClone(20);

    TGradientDescent<TCuda> minimizer(0.001, 20, 20);
-   minimizer.Train(trainData, 4000, testData, 20, net);
+   minimizer.Train(trainData, 100000, testData, 20, net);

    TMatrixT<Double_t> I(20,20); identityMatrix(I);
    TCudaMatrix ICuda(I);

-   net.Forward(ICuda);
+   testnet.Forward(ICuda);

    TMatrixT<Double_t> WT(20, 20);
    WT.Transpose(W);

-   auto error = maximumRelativeError((TMatrixT<Double_t>) net.GetOutput(), WT);
+   auto error = maximumRelativeError((TMatrixT<Double_t>) testnet.GetOutput(), WT);
    std::cout << "Maximum relative error: " << error << std::endl;
 }
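A reading of the testnet change (an inference, not something stated in the diff): the net is now constructed with a batch size of 1000, so the 20-row identity probe can no longer be forwarded through it directly; net.CreateClone(20) appears to create a second net with batch size 20 over the same trained weights, and it is that clone whose output is compared against the transposed weight matrix.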