33 commits
7101abb
Added GPU implementation of DNNs.
Jul 17, 2016
49906d9
Removed space before variable.
Jul 17, 2016
d4b6866
Removed Print() statement on weight matrices after training.
Jul 17, 2016
5690a9c
Removed output.
Jul 17, 2016
578e317
Removed explicit setting of the backend compiler.
Jul 18, 2016
a09f2fa
Added include guards for Cuda architecture header.
Jul 18, 2016
0146c55
Added missing test file.
Jul 18, 2016
f8d2317
Added missing file.
Jul 18, 2016
fb77aa8
Removed profiling switch in TestDerivativesCuda.
Jul 19, 2016
7f559f7
Fixed naming of Cuda kernels.
Jul 19, 2016
66587d0
Some optimizations in the training routine.
Jul 21, 2016
9d162bf
Applied stash.
Jul 21, 2016
ab09b0d
Fixed out of bounds memory access in vertical reduction kernel.
Jul 25, 2016
56d0579
Fixed out of bounds memory access in vertical reduction kernel.
Jul 25, 2016
92be232
Merge branch 'tmva_gpu' of https://github.com/simonpf/root into tmva_gpu
Jul 25, 2016
9eb6a50
Minor cosmetics.
Jul 26, 2016
c6ae8ed
Some more cosmetics.
Jul 26, 2016
ab85e89
Cleaned up output.
Jul 28, 2016
4a5822a
Merge branch 'tmva_gpu' of https://github.com/simonpf/root into tmva_gpu
Jul 28, 2016
6c8abba
Fixed minimization test.
Jul 28, 2016
e594dda
Fixed formatting in CudaMatrix.h
Jul 28, 2016
0c7667f
Enlarged batch size in minimization test.
Jul 28, 2016
f9b95e9
Merge branch 'tmva_gpu' of github.com:simonpf/root into tmva_gpu
Jul 28, 2016
46ac988
Generic data loader.
Aug 9, 2016
8e1edd2
Added TestDataLoaderCuda.cxx.
Aug 9, 2016
dfe0f09
Made copy async.
Aug 9, 2016
b9528e5
Smaller fixes.
Aug 9, 2016
0a3ae51
Added flop counter.
Aug 9, 2016
7cef87b
Merge branch 'tmva_gpu' of github.com:simonpf/root into tmva_gpu
Aug 9, 2016
3cb26ba
Fixed flop rate computation.
Aug 9, 2016
0b23d06
Testing different curand initialization.
Aug 11, 2016
df80c5e
Testing different parallelization scheme.
Aug 11, 2016
dcbf1c6
Minor fixes and modifications.
Aug 13, 2016
Testing different parallelization scheme.
Simon Pfreundschuh committed Aug 11, 2016
commit df80c5e79cbfefd2d3fc371c72f59f37e35bbd74
103 changes: 73 additions & 30 deletions tmva/tmva/inc/TMVA/DNN/Minimizers.h
@@ -12,6 +12,7 @@
#define TMVA_DNN_MINIMIZERS

#include "DataLoader.h"
#include "Functions.h"
#include <chrono>

namespace TMVA {
@@ -88,7 +89,9 @@ class TGradientDescent
template <typename Net_t>
void Step(Net_t &net, Matrix_t &input, const Matrix_t &output);
template <typename Net_t>
void Step(Net_t &master, Net_t &net, Matrix_t &input, const Matrix_t &output);
void Step(Net_t &master,
          std::vector<Net_t> &nets,
          std::vector<TBatch<Architecture_t>> &batches);
/** Does not evaluate the loss and therefore does not trigger a possible synchronization
* with the device. Trains the weights of each layer, but only the bias terms of
* the first layer for compatibility with the previous implementation. */
@@ -190,17 +193,21 @@ template <typename Data_t, typename Net_t>
std::chrono::time_point<std::chrono::system_clock> start, end;
start = std::chrono::system_clock::now();


while (!converged)
{
fStepCount++;

size_t netIndex = 0;
for (auto b : trainLoader) {
// Perform minimization step.
auto inputMatrix = b.GetInput();
auto outputMatrix = b.GetOutput();
Step(net, nets[netIndex % nThreads], inputMatrix, outputMatrix);
netIndex++;
std::vector<TBatch<Architecture_t>> batches{};
for (size_t i = 0; i < nTrainingSamples / net.GetBatchSize(); i += nThreads) {
batches.clear();
for (size_t j = 0; j < nThreads; j++) {
batches.reserve(nThreads);
batches.push_back(trainLoader.GetBatch());
}
Step(net, nets, batches);
std::cout << "epoch." << std::endl;
}

// Compute test error.
@@ -254,31 +261,67 @@ template<typename Architecture_t>
//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::Step(Net_t & master,
Net_t & net,
Matrix_t &input,
const Matrix_t &output)
void inline TGradientDescent<Architecture_t>::Step(
   Net_t & master,
   std::vector<Net_t> & nets,
   std::vector<TBatch<Architecture_t>> & batches)
{
//Scalar_t loss = net.Loss(input, output);
//fTrainingError = loss;
net.Forward(input);
net.Backward(input, output);
typename Architecture_t::Matrix_t dummy(0,0);
size_t depth = master.GetDepth();

for (size_t i = 0; i < net.GetDepth(); i++)
{
auto &masterLayer = master.GetLayer(i);
auto &layer = net.GetLayer(i);
Architecture_t::ScaleAdd(masterLayer.GetWeights(),
layer.GetWeightGradients(),
-fLearningRate);
Architecture_t::Copy(layer.GetWeights(),
masterLayer.GetWeights());
Architecture_t::ScaleAdd(masterLayer.GetBiases(),
layer.GetBiasGradients(),
-fLearningRate);
Architecture_t::Copy(layer.GetBiases(),
masterLayer.GetBiases());
}
// Forward
for (size_t j = 0; j < nets.size(); j++) {
nets[j].GetLayer(0).Forward(batches[j].GetInput());
}

for (size_t i = 1; i < depth; i++)
{
for (size_t j = 0; j < nets.size(); j++) {
nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput());
}
}
// Gradients
for (size_t j = 0; j < nets.size(); j++) {
evaluateGradients<Architecture_t>(
nets[j].GetLayer(depth-1).GetActivationGradients(),
nets[j].GetLossFunction(),
batches[j].GetOutput(),
nets[j].GetLayer(depth-1).GetOutput());
}
// Backward
for (size_t i = depth - 1; i > 0; i--)
{
for (size_t j = 0; j < nets.size(); j++) {
nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
nets[j].GetLayer(i-1).GetOutput(),
nets[j].GetRegularization(),
nets[j].GetWeightDecay());
}
}
for (size_t j = 0; j < nets.size(); j++) {
nets[j].GetLayer(0).Backward(dummy,
batches[j].GetInput(),
nets[j].GetRegularization(),
nets[j].GetWeightDecay());
}

for (size_t j = 0; j < nets.size(); j++) {
for (size_t i = 0; i < depth; i++)
{
auto &masterLayer = master.GetLayer(i);
auto &layer = nets[j].GetLayer(i);
Architecture_t::ScaleAdd(masterLayer.GetWeights(),
layer.GetWeightGradients(),
-fLearningRate);
Architecture_t::Copy(layer.GetWeights(),
masterLayer.GetWeights());
Architecture_t::ScaleAdd(masterLayer.GetBiases(),
layer.GetBiasGradients(),
-fLearningRate);
Architecture_t::Copy(layer.GetBiases(),
masterLayer.GetBiases());
}
}
}


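For orientation, here is a minimal sketch of how the restructured Train loop drives the new multi-net Step overload. Because each worker's gradients are scale-added with -fLearningRate, one combined step amounts to W ← W − η · Σ_j ∇W L(batch_j) on the master weights, which are then copied back to every worker. The sketch only rearranges what the hunks above show (TNet, TBatch, the data loader and Step are taken from the diff); the worker-net cloning and the hoisting of reserve() out of the inner loop are illustrative assumptions, not part of this commit.

```cpp
// Sketch only: data-parallel gradient step as driven by the new Train loop.
// One worker net per thread/stream; the master net holds the shared weights.
size_t nThreads = 4;
std::vector<Net_t> nets{};
nets.reserve(nThreads);
for (size_t i = 0; i < nThreads; i++) {
   nets.push_back(net);                          // clone of the master architecture (assumption)
}

std::vector<TBatch<Architecture_t>> batches{};
batches.reserve(nThreads);                       // hoisted out of the inner loop

for (size_t i = 0; i < nTrainingSamples / net.GetBatchSize(); i += nThreads) {
   batches.clear();
   for (size_t j = 0; j < nThreads; j++) {
      batches.push_back(trainLoader.GetBatch()); // one batch per worker net
   }
   // Forward + backward on every worker, then fold each worker's gradients
   // into the master weights and copy the updated weights back (see Step above).
   minimizer.Step(net, nets, batches);
}
```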
2 changes: 2 additions & 0 deletions tmva/tmva/inc/TMVA/DNN/Net.h
@@ -135,6 +135,8 @@ template<typename Architecture_t, typename Layer_t = TLayer<Architecture_t>>
Matrix_t & GetOutput() {return fLayers.back().GetOutput();}
size_t GetInputWidth() const {return fInputWidth;}
size_t GetOutputWidth() const {return fLayers.back().GetWidth();}
ERegularization GetRegularization() {return fR;}
Scalar_t GetWeightDecay() {return fWeightDecay;}

void SetInputWidth(size_t InputWidth) {fInputWidth = InputWidth;}
void SetRegularization(ERegularization R) {fR = R;}
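The two accessors exist so that the parallel Step can hand each worker net's regularization settings straight to the layer-level Backward call. A short sketch of the consuming side, mirroring the Minimizers.h hunk above (the layer indices are placeholders):

```cpp
// Sketch: the new getters feed the per-layer backward pass.
auto &layer     = net.GetLayer(i);
auto &prevLayer = net.GetLayer(i - 1);
layer.Backward(prevLayer.GetActivationGradients(),
               prevLayer.GetOutput(),
               net.GetRegularization(),   // ERegularization flag of this net
               net.GetWeightDecay());     // weight-decay strength (Scalar_t)
```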
4 changes: 2 additions & 2 deletions tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu
@@ -123,11 +123,11 @@ void TCuda::SumColumns(TCudaMatrix &B, const TCudaMatrix &A)
//____________________________________________________________________________
void TCuda::ScaleAdd(TCudaMatrix &B, const TCudaMatrix &A, CudaDouble_t alpha)
{
cudaStream_t s = A.GetComputeStream();
cudaStream_t s = 0; //A.GetComputeStream();
cublasDaxpy(A.GetCublasHandle(), A.GetNoElements(), &alpha,
A.GetDataPointer(), 1,
B.GetDataPointer(), 1);
B.SetComputeStream(s);
//B.SetComputeStream(s);
}

} // DNN
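This hunk pins ScaleAdd to the default stream. For comparison, a hedged sketch of what keeping the per-matrix compute stream would require: cuBLAS only queues work on a non-default stream after cublasSetStream, which the commented-out code never called. The TCudaMatrix accessors are the ones already used in the diff; the free-standing function name is illustrative.

```cpp
#include <cublas_v2.h>

// Sketch only: axpy (B += alpha * A) issued on A's compute stream.
void ScaleAddOnStream(TCudaMatrix &B, const TCudaMatrix &A, CudaDouble_t alpha)
{
   cudaStream_t s = A.GetComputeStream();
   // Bind the stream to the handle so the following call is queued on it.
   cublasSetStream(A.GetCublasHandle(), s);
   cublasDaxpy(A.GetCublasHandle(), A.GetNoElements(), &alpha,
               A.GetDataPointer(), 1,
               B.GetDataPointer(), 1);
   B.SetComputeStream(s);                 // record the stream on the result
}
```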
2 changes: 1 addition & 1 deletion tmva/tmva/src/DNN/Architectures/Cuda/Propagation.cu
@@ -89,7 +89,7 @@ void TCuda::Copy(TCudaMatrix & B, const TCudaMatrix & A)
size_t n = B.GetNcols();
cudaMemcpyAsync(B.GetDataPointer(), A.GetDataPointer(),
m * n * sizeof(CudaDouble_t), cudaMemcpyDeviceToDevice,
A.GetComputeStream());
0);
}

} // namespace DNN
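The copy now targets the legacy default stream (0), which implicitly synchronizes with every other stream. A brief sketch of the explicit-stream variant that the removed argument implied, together with the synchronization that then becomes the caller's job (the matrix accessors follow the diff; the surrounding stream handling is an assumption):

```cpp
// Sketch only: device-to-device copy queued on A's compute stream.
size_t m = B.GetNrows();
size_t n = B.GetNcols();
cudaStream_t stream = A.GetComputeStream();
cudaMemcpyAsync(B.GetDataPointer(), A.GetDataPointer(),
                m * n * sizeof(CudaDouble_t), cudaMemcpyDeviceToDevice,
                stream);
// On a non-default stream nothing waits for this copy automatically; a
// consumer on another stream (or the host) must synchronize explicitly.
cudaStreamSynchronize(stream);
```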
6 changes: 3 additions & 3 deletions tmva/tmva/test/DNN/TestMinimization.h
@@ -32,9 +32,9 @@ template <typename Architecture>
using Matrix_t = typename Architecture::Matrix_t;
using Net_t = TNet<Architecture>;

size_t nSamples = 100000;
size_t nSamples = 1000;
size_t nFeatures = 20;
size_t batchSize = 1000;
size_t batchSize = 100;

TMatrixT<Double_t> XTrain(nSamples, nFeatures), YTrain(nSamples, 1),
XTest(batchSize, nFeatures), YTest(batchSize, 1), W(1, nFeatures);
@@ -52,7 +52,7 @@ template <typename Architecture>
net.AddLayer(1, EActivationFunction::IDENTITY);
net.Initialize(EInitialization::GAUSS);

TGradientDescent<Architecture> minimizer(0.000001, 1, 10);
TGradientDescent<Architecture> minimizer(0.000001, 1, 1);
MatrixInput_t trainingData(XTrain, YTrain);
MatrixInput_t testData(XTest, YTest);
minimizer.Train(trainingData, nSamples, testData, batchSize, net, 4);
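To make the shrunk test configuration easier to read, here is the same minimizer setup with the constructor arguments spelled out. The parameter names and their roles are assumptions inferred from how TGradientDescent is used in this test, not taken from the diff.

```cpp
// Sketch: the minimizer settings used by the reduced test (names are assumptions).
Double_t learningRate     = 0.000001;  // step size applied in the ScaleAdd updates
size_t   convergenceSteps = 1;         // test intervals without improvement before stopping
size_t   testInterval     = 1;         // epochs between test-error evaluations

TGradientDescent<Architecture> minimizer(learningRate, convergenceSteps, testInterval);
minimizer.Train(trainingData, nSamples, testData, batchSize, net, 4 /* worker nets / threads */);
```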