Changes from 1 commit
Commits (33)
7101abb
Added GPU implementation of DNNs.
Jul 17, 2016
49906d9
Removed space before variable.
Jul 17, 2016
d4b6866
Removed Print() statement on weight matrices after training.
Jul 17, 2016
5690a9c
Removed output.
Jul 17, 2016
578e317
Removed explicit setting of the backend compiler.
Jul 18, 2016
a09f2fa
Added include guards for Cuda architecture header.
Jul 18, 2016
0146c55
Added missing test file.
Jul 18, 2016
f8d2317
Added missing file.
Jul 18, 2016
fb77aa8
Removed profiling switch in TestDerivativesCuda.
Jul 19, 2016
7f559f7
Fixed naming of Cuda kernels.
Jul 19, 2016
66587d0
Some optimizations in the training routine.
Jul 21, 2016
9d162bf
Applied stash.
Jul 21, 2016
ab09b0d
Fixed out of bounds memory access in vertical reduction kernel.
Jul 25, 2016
56d0579
Fixed out of bounds memory access in vertical reduction kernel.
Jul 25, 2016
92be232
Merge branch 'tmva_gpu' of https://github.com/simonpf/root into tmva_gpu
Jul 25, 2016
9eb6a50
Minor cosmetics.
Jul 26, 2016
c6ae8ed
Some more cosmetics.
Jul 26, 2016
ab85e89
Cleaned up output.
Jul 28, 2016
4a5822a
Merge branch 'tmva_gpu' of https://github.com/simonpf/root into tmva_gpu
Jul 28, 2016
6c8abba
Fixed minimization test.
Jul 28, 2016
e594dda
Fixed formatting in CudaMatrix.h
Jul 28, 2016
0c7667f
Enlarged batch size in minimization test.
Jul 28, 2016
f9b95e9
Merge branch 'tmva_gpu' of github.com:simonpf/root into tmva_gpu
Jul 28, 2016
46ac988
Generic data loader.
Aug 9, 2016
8e1edd2
Added TestDataLoaderCuda.cxx.
Aug 9, 2016
dfe0f09
Made copy async.
Aug 9, 2016
b9528e5
Smaller fixes.
Aug 9, 2016
0a3ae51
Added flop counter.
Aug 9, 2016
7cef87b
Merge branch 'tmva_gpu' of github.com:simonpf/root into tmva_gpu
Aug 9, 2016
3cb26ba
Fixed flop rate computation.
Aug 9, 2016
0b23d06
Testing different curand initialization.
Aug 11, 2016
df80c5e
Testing different parallelization scheme.
Aug 11, 2016
dcbf1c6
Minor fixes and modifications.
Aug 13, 2016
Added flop counter.
Simon Pfreundschuh committed Aug 9, 2016
commit 0a3ae5103a3f539ee45e4fb7cfbfcc56aae01551
15 changes: 14 additions & 1 deletion tmva/tmva/inc/TMVA/DNN/Minimizers.h
@@ -12,6 +12,7 @@
 #define TMVA_DNN_MINIMIZERS
 
 #include "DataLoader.h"
+#include <chrono>
 
 namespace TMVA {
 namespace DNN {
@@ -186,8 +187,13 @@ template <typename Data_t, typename Net_t>
       }
    }
 
+   std::chrono::time_point<std::chrono::system_clock> start, end;
+   start = std::chrono::system_clock::now();
+
    while (!converged)
    {
+      fStepCount++;
+
       size_t netIndex = 0;
       for (auto b : trainLoader) {
          // Perform minimization step.
@@ -199,6 +205,13 @@
 
       // Compute test error.
       if ((fStepCount % fTestInterval) == 0) {
+         end = std::chrono::system_clock::now();
+         std::chrono::duration<double> elapsed_seconds = end - start;
+         start = std::chrono::system_clock::now();
+         double seconds = elapsed_seconds.count();
+         std::cout << "Elapsed time for " << fTestInterval << " Epochs: "
+                   << seconds << " [s] => " << net.GetNFlops() * 1e-6 / seconds
+                   << " GFlop/s" << std::endl;
          auto b = *testLoader.begin();
          auto inputMatrix = b.GetInput();
          auto outputMatrix = b.GetOutput();
@@ -207,7 +220,7 @@ template <typename Data_t, typename Net_t>
          std::cout << fStepCount << ": " << loss << std::endl;
          converged = HasConverged();
       }
-      fStepCount++;
+
    }
    return fMinimumError;
 }
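The timing added above follows the usual steady-state throughput pattern: take a time point, run a fixed number of steps, and divide the work performed by the elapsed wall-clock time. Below is a minimal, self-contained sketch of that pattern; `RunTrainingStep` and `kFlopsPerStep` are hypothetical stand-ins for the net's training step and its `GetNFlops()` result, not TMVA API. Note that the work done in one report interval is the per-step flop count times the number of steps; the later commit 3cb26ba ("Fixed flop rate computation") in the list above suggests the conversion printed in this commit was still being adjusted.

```cpp
// Minimal sketch of the interval-timing pattern in the diff above.
// RunTrainingStep and kFlopsPerStep are hypothetical stand-ins.
#include <chrono>
#include <iostream>

static void RunTrainingStep() { /* one gradient-descent step */ }

int main()
{
   const int    testInterval  = 10;     // Steps between throughput reports.
   const double kFlopsPerStep = 1.0e9;  // Assumed flop count of one step.

   auto start = std::chrono::system_clock::now();
   for (int step = 1; step <= 100; step++) {
      RunTrainingStep();
      if ((step % testInterval) == 0) {
         auto end = std::chrono::system_clock::now();
         std::chrono::duration<double> elapsed = end - start;
         double seconds = elapsed.count();
         // Work in the interval = flops per step * number of steps;
         // the 1e-9 factor converts flop/s to GFlop/s.
         double gflops = kFlopsPerStep * testInterval / seconds * 1e-9;
         std::cout << step << ": " << seconds << " s => "
                   << gflops << " GFlop/s" << std::endl;
         start = std::chrono::system_clock::now();
      }
   }
   return 0;
}
```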
32 changes: 32 additions & 0 deletions tmva/tmva/inc/TMVA/DNN/Net.h
@@ -125,6 +125,8 @@ template<typename Architecture_t, typename Layer_t = TLayer<Architecture_t>>
     * function f to the activation of the last layer in the network. */
    inline void Prediction(Matrix_t &Y_hat, EOutputFunction f) const;
 
+   Scalar_t GetNFlops();
+
    size_t GetDepth() const     {return fLayers.size();}
    size_t GetBatchSize() const {return fBatchSize;}
    Layer_t & GetLayer(size_t i) {return fLayers[i];}
@@ -309,7 +311,37 @@ template<typename Architecture_t, typename Layer_t>
 }
 
 //______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+auto TNet<Architecture_t, Layer_t>::GetNFlops()
+   -> Scalar_t
+{
+   Scalar_t flops = 0;
+
+   Scalar_t nb  = (Scalar_t) fBatchSize;
+   Scalar_t nlp = (Scalar_t) fInputWidth;
+
+   for(size_t i = 0; i < fLayers.size(); i++) {
+      Layer_t & layer = fLayers[i];
+      Scalar_t nl = (Scalar_t) layer.GetWidth();
+
+      // Forward propagation.
+      flops += nb * nl * (2.0 * nlp - 1); // Matrix mult.
+      flops += nb * nl;                   // Add bias values.
+      flops += 2 * nb * nl;               // Apply activation function and compute
+                                          // derivative.
+      // Backward propagation.
+      flops += nb * nl;                      // Hadamard
+      flops += nlp * nb * (2.0 * nlp - 1.0); // Weight gradients
+      flops += nl * (nb - 1);                // Bias gradients
+      if (i > 0) {
+         flops += nlp * nb * (2.0 * nl - 1.0); // Previous layer gradients.
+      }
+      nlp = nl;
+   }
+   return flops;
+}
+
 //______________________________________________________________________________
 template<typename Architecture_t, typename Layer_t>
 void TNet<Architecture_t, Layer_t>::Print()
 {
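All the counts in `GetNFlops` come down to the standard rule that multiplying an m x k matrix by a k x n matrix costs m*n*(2k - 1) floating-point operations: each of the m*n output entries takes k multiplications and k - 1 additions. Below is a minimal sketch of the forward-pass portion of that accounting; `MatMulFlops` is an illustrative helper with made-up layer widths, not part of TMVA.

```cpp
#include <cstdio>
#include <vector>

// Flop count for multiplying an (m x k) matrix by a (k x n) matrix:
// each of the m*n output entries takes k multiplications and k-1 additions.
static double MatMulFlops(double m, double k, double n)
{
   return m * n * (2.0 * k - 1.0);
}

int main()
{
   // Hypothetical network: batch size 32, input width 20, two layers.
   double nb  = 32.0;
   double nlp = 20.0;                       // Width of the previous layer.
   std::vector<double> widths = {50.0, 10.0};

   double flops = 0.0;
   for (double nl : widths) {
      flops += MatMulFlops(nb, nlp, nl);    // Linear transform: (nb x nlp) * (nlp x nl).
      flops += nb * nl;                     // Add bias values.
      flops += 2.0 * nb * nl;               // Activation function and its derivative.
      nlp = nl;
   }
   std::printf("Forward pass: %.0f flops per batch\n", flops);
   return 0;
}
```

By the same rule, the weight-gradient product of an (nlp x nb) transposed input with an (nb x nl) gradient matrix would cost nlp*nl*(2*nb - 1) flops rather than the nlp*nb*(2*nlp - 1) counted in the diff; whether the later "Fixed flop rate computation" commit (3cb26ba) revised this term or only the printed rate is not visible here.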