33 commits
7101abb
Added GPU implementation of DNNs.
Jul 17, 2016
49906d9
Removed space before variable.
Jul 17, 2016
d4b6866
Removed Print() statement on weight matrices after training.
Jul 17, 2016
5690a9c
Removed output.
Jul 17, 2016
578e317
Removed explicit setting of the backend compiler.
Jul 18, 2016
a09f2fa
Added include guards for Cuda architecture header.
Jul 18, 2016
0146c55
Added missing test file.
Jul 18, 2016
f8d2317
Added missing file.
Jul 18, 2016
fb77aa8
Removed profiling switch in TestDerivativesCuda.
Jul 19, 2016
7f559f7
Fixed naming of Cuda kernels.
Jul 19, 2016
66587d0
Some optimizations in the training routine.
Jul 21, 2016
9d162bf
Applied stash.
Jul 21, 2016
ab09b0d
Fixed out of bounds memory access in vertical reduction kernel.
Jul 25, 2016
56d0579
Fixed out of bounds memory access in vertical reduction kernel.
Jul 25, 2016
92be232
Merge branch 'tmva_gpu' of https://github.com/simonpf/root into tmva_gpu
Jul 25, 2016
9eb6a50
Minor cosmetics.
Jul 26, 2016
c6ae8ed
Some more cosmetics.
Jul 26, 2016
ab85e89
Cleaned up output.
Jul 28, 2016
4a5822a
Merge branch 'tmva_gpu' of https://github.com/simonpf/root into tmva_gpu
Jul 28, 2016
6c8abba
Fixed minimization test.
Jul 28, 2016
e594dda
Fixed formatting in CudaMatrix.h
Jul 28, 2016
0c7667f
Enlarged batch size in minimization test.
Jul 28, 2016
f9b95e9
Merge branch 'tmva_gpu' of github.com:simonpf/root into tmva_gpu
Jul 28, 2016
46ac988
Generic data loader.
Aug 9, 2016
8e1edd2
Added TestDataLoaderCuda.cxx.
Aug 9, 2016
dfe0f09
Made copy async.
Aug 9, 2016
b9528e5
Smaller fixes.
Aug 9, 2016
0a3ae51
Added flop counter.
Aug 9, 2016
7cef87b
Merge branch 'tmva_gpu' of github.com:simonpf/root into tmva_gpu
Aug 9, 2016
3cb26ba
Fixed flop rate computation.
Aug 9, 2016
0b23d06
Testing different curand initialization.
Aug 11, 2016
df80c5e
Testing different parallelization scheme.
Aug 11, 2016
dcbf1c6
Minor fixes and modifications.
Aug 13, 2016
Testing different parallelization scheme.
Simon Pfreundschuh committed Aug 11, 2016
commit df80c5e79cbfefd2d3fc371c72f59f37e35bbd74
103 changes: 73 additions & 30 deletions tmva/tmva/inc/TMVA/DNN/Minimizers.h
@@ -12,6 +12,7 @@
#define TMVA_DNN_MINIMIZERS

#include "DataLoader.h"
#include "Functions.h"
#include <chrono>

namespace TMVA {
@@ -88,7 +89,9 @@ class TGradientDescent
template <typename Net_t>
void Step(Net_t &net, Matrix_t &input, const Matrix_t &output);
template <typename Net_t>
void Step(Net_t &master, Net_t &net, Matrix_t &input, const Matrix_t &output);
void Step(Net_t &master,
          std::vector<Net_t> &nets,
          std::vector<TBatch<Architecture_t>> &batches);
/** Does not evaluate the loss and therefore does not trigger a possible synchronization
* with the device. Trains the weights of each layer, but only the bias terms of
* the first layer for compatibility with the previous implementation. */
@@ -190,17 +193,21 @@ template <typename Data_t, typename Net_t>
std::chrono::time_point<std::chrono::system_clock> start, end;
start = std::chrono::system_clock::now();


while (!converged)
{
fStepCount++;

size_t netIndex = 0;
for (auto b : trainLoader) {
// Perform minimization step.
auto inputMatrix = b.GetInput();
auto outputMatrix = b.GetOutput();
Step(net, nets[netIndex % nThreads], inputMatrix, outputMatrix);
netIndex++;
std::vector<TBatch<Architecture_t>> batches{};
for (size_t i = 0; i < nTrainingSamples / net.GetBatchSize(); i += nThreads) {
batches.clear();
for (size_t j = 0; j < nThreads; j++) {
batches.reserve(nThreads);
batches.push_back(trainLoader.GetBatch());
}
Step(net, nets, batches);
std::cout << "epoch." << std::endl;
}

// Compute test error.
@@ -254,31 +261,67 @@ template<typename Architecture_t>
//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::Step(Net_t & master,
Net_t & net,
Matrix_t &input,
const Matrix_t &output)
void inline TGradientDescent<Architecture_t>::Step(
   Net_t & master,
   std::vector<Net_t> & nets,
   std::vector<TBatch<Architecture_t>> & batches)
{
//Scalar_t loss = net.Loss(input, output);
//fTrainingError = loss;
net.Forward(input);
net.Backward(input, output);
typename Architecture_t::Matrix_t dummy(0,0);
size_t depth = master.GetDepth();

for (size_t i = 0; i < net.GetDepth(); i++)
{
auto &masterLayer = master.GetLayer(i);
auto &layer = net.GetLayer(i);
Architecture_t::ScaleAdd(masterLayer.GetWeights(),
layer.GetWeightGradients(),
-fLearningRate);
Architecture_t::Copy(layer.GetWeights(),
masterLayer.GetWeights());
Architecture_t::ScaleAdd(masterLayer.GetBiases(),
layer.GetBiasGradients(),
-fLearningRate);
Architecture_t::Copy(layer.GetBiases(),
masterLayer.GetBiases());
}
// Forward
for (size_t j = 0; j < nets.size(); j++) {
nets[j].GetLayer(0).Forward(batches[j].GetInput());
}

for (size_t i = 1; i < depth; i++)
{
for (size_t j = 0; j < nets.size(); j++) {
nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput());
}
}
// Gradients
for (size_t j = 0; j < nets.size(); j++) {
evaluateGradients<Architecture_t>(
nets[j].GetLayer(depth-1).GetActivationGradients(),
nets[j].GetLossFunction(),
batches[j].GetOutput(),
nets[j].GetLayer(depth-1).GetOutput());
}
// Backward
for (size_t i = depth - 1; i > 0; i--)
{
for (size_t j = 0; j < nets.size(); j++) {
nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
nets[j].GetLayer(i-1).GetOutput(),
nets[j].GetRegularization(),
nets[j].GetWeightDecay());
}
}
for (size_t j = 0; j < nets.size(); j++) {
nets[j].GetLayer(0).Backward(dummy,
batches[j].GetInput(),
nets[j].GetRegularization(),
nets[j].GetWeightDecay());
}

for (size_t j = 0; j < nets.size(); j++) {
for (size_t i = 0; i < depth; i++)
{
auto &masterLayer = master.GetLayer(i);
auto &layer = nets[j].GetLayer(i);
Architecture_t::ScaleAdd(masterLayer.GetWeights(),
layer.GetWeightGradients(),
-fLearningRate);
Architecture_t::Copy(layer.GetWeights(),
masterLayer.GetWeights());
Architecture_t::ScaleAdd(masterLayer.GetBiases(),
layer.GetBiasGradients(),
-fLearningRate);
Architecture_t::Copy(layer.GetBiases(),
masterLayer.GetBiases());
}
}
}


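For orientation, here is a minimal sketch of how the restructured Train loop drives the new multi-net Step overload. Because each worker's gradients are scale-added with -fLearningRate, one combined step amounts to W ← W − η · Σ_j ∇W L(batch_j) on the master weights, which are then copied back to every worker. The sketch only rearranges what the hunks above show (TNet, TBatch, the data loader and Step are taken from the diff); the worker-net cloning and the hoisting of reserve() out of the inner loop are illustrative assumptions, not part of this commit.

```cpp
// Sketch only: data-parallel gradient step as driven by the new Train loop.
// One worker net per thread/stream; the master net holds the shared weights.
size_t nThreads = 4;
std::vector<Net_t> nets{};
nets.reserve(nThreads);
for (size_t i = 0; i < nThreads; i++) {
   nets.push_back(net);                          // clone of the master architecture (assumption)
}

std::vector<TBatch<Architecture_t>> batches{};
batches.reserve(nThreads);                       // hoisted out of the inner loop

for (size_t i = 0; i < nTrainingSamples / net.GetBatchSize(); i += nThreads) {
   batches.clear();
   for (size_t j = 0; j < nThreads; j++) {
      batches.push_back(trainLoader.GetBatch()); // one batch per worker net
   }
   // Forward + backward on every worker, then fold each worker's gradients
   // into the master weights and copy the updated weights back (see Step above).
   minimizer.Step(net, nets, batches);
}
```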
2 changes: 2 additions & 0 deletions tmva/tmva/inc/TMVA/DNN/Net.h
@@ -135,6 +135,8 @@ template<typename Architecture_t, typename Layer_t = TLayer<Architecture_t>>
Matrix_t & GetOutput() {return fLayers.back().GetOutput();}
size_t GetInputWidth() const {return fInputWidth;}
size_t GetOutputWidth() const {return fLayers.back().GetWidth();}
ERegularization GetRegularization() {return fR;}
Scalar_t GetWeightDecay() {return fWeightDecay;}

void SetInputWidth(size_t InputWidth) {fInputWidth = InputWidth;}
void SetRegularization(ERegularization R) {fR = R;}
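The two accessors exist so that the parallel Step can hand each worker net's regularization settings straight to the layer-level Backward call. A short sketch of the consuming side, mirroring the Minimizers.h hunk above (the layer indices are placeholders):

```cpp
// Sketch: the new getters feed the per-layer backward pass.
auto &layer     = net.GetLayer(i);
auto &prevLayer = net.GetLayer(i - 1);
layer.Backward(prevLayer.GetActivationGradients(),
               prevLayer.GetOutput(),
               net.GetRegularization(),   // ERegularization flag of this net
               net.GetWeightDecay());     // weight-decay strength (Scalar_t)
```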
4 changes: 2 additions & 2 deletions tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu
@@ -123,11 +123,11 @@ void TCuda::SumColumns(TCudaMatrix &B, const TCudaMatrix &A)
//____________________________________________________________________________
void TCuda::ScaleAdd(TCudaMatrix &B, const TCudaMatrix &A, CudaDouble_t alpha)
{
cudaStream_t s = A.GetComputeStream();
cudaStream_t s = 0; //A.GetComputeStream();
cublasDaxpy(A.GetCublasHandle(), A.GetNoElements(), &alpha,
A.GetDataPointer(), 1,
B.GetDataPointer(), 1);
B.SetComputeStream(s);
//B.SetComputeStream(s);
}

} // DNN
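This hunk pins ScaleAdd to the default stream. For comparison, a hedged sketch of what keeping the per-matrix compute stream would require: cuBLAS only queues work on a non-default stream after cublasSetStream, which the commented-out code never called. The TCudaMatrix accessors are the ones already used in the diff; the free-standing function name is illustrative.

```cpp
#include <cublas_v2.h>

// Sketch only: axpy (B += alpha * A) issued on A's compute stream.
void ScaleAddOnStream(TCudaMatrix &B, const TCudaMatrix &A, CudaDouble_t alpha)
{
   cudaStream_t s = A.GetComputeStream();
   // Bind the stream to the handle so the following call is queued on it.
   cublasSetStream(A.GetCublasHandle(), s);
   cublasDaxpy(A.GetCublasHandle(), A.GetNoElements(), &alpha,
               A.GetDataPointer(), 1,
               B.GetDataPointer(), 1);
   B.SetComputeStream(s);                 // record the stream on the result
}
```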
2 changes: 1 addition & 1 deletion tmva/tmva/src/DNN/Architectures/Cuda/Propagation.cu
@@ -89,7 +89,7 @@ void TCuda::Copy(TCudaMatrix & B, const TCudaMatrix & A)
size_t n = B.GetNcols();
cudaMemcpyAsync(B.GetDataPointer(), A.GetDataPointer(),
m * n * sizeof(CudaDouble_t), cudaMemcpyDeviceToDevice,
A.GetComputeStream());
0);
}

} // namespace DNN
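The copy now targets the legacy default stream (0), which implicitly synchronizes with every other stream. A brief sketch of the explicit-stream variant that the removed argument implied, together with the synchronization that then becomes the caller's job (the matrix accessors follow the diff; the surrounding stream handling is an assumption):

```cpp
// Sketch only: device-to-device copy queued on A's compute stream.
size_t m = B.GetNrows();
size_t n = B.GetNcols();
cudaStream_t stream = A.GetComputeStream();
cudaMemcpyAsync(B.GetDataPointer(), A.GetDataPointer(),
                m * n * sizeof(CudaDouble_t), cudaMemcpyDeviceToDevice,
                stream);
// On a non-default stream nothing waits for this copy automatically; a
// consumer on another stream (or the host) must synchronize explicitly.
cudaStreamSynchronize(stream);
```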
6 changes: 3 additions & 3 deletions tmva/tmva/test/DNN/TestMinimization.h
@@ -32,9 +32,9 @@ template <typename Architecture>
using Matrix_t = typename Architecture::Matrix_t;
using Net_t = TNet<Architecture>;

size_t nSamples = 100000;
size_t nSamples = 1000;
size_t nFeatures = 20;
size_t batchSize = 1000;
size_t batchSize = 100;

TMatrixT<Double_t> XTrain(nSamples, nFeatures), YTrain(nSamples, 1),
XTest(batchSize, nFeatures), YTest(batchSize, 1), W(1, nFeatures);
@@ -52,7 +52,7 @@ template <typename Architecture>
net.AddLayer(1, EActivationFunction::IDENTITY);
net.Initialize(EInitialization::GAUSS);

TGradientDescent<Architecture> minimizer(0.000001, 1, 10);
TGradientDescent<Architecture> minimizer(0.000001, 1, 1);
MatrixInput_t trainingData(XTrain, YTrain);
MatrixInput_t testData(XTest, YTest);
minimizer.Train(trainingData, nSamples, testData, batchSize, net, 4);
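To make the shrunk test configuration easier to read, here is the same minimizer setup with the constructor arguments spelled out. The parameter names and their roles are assumptions inferred from how TGradientDescent is used in this test, not taken from the diff.

```cpp
// Sketch: the minimizer settings used by the reduced test (names are assumptions).
Double_t learningRate     = 0.000001;  // step size applied in the ScaleAdd updates
size_t   convergenceSteps = 1;         // test intervals without improvement before stopping
size_t   testInterval     = 1;         // epochs between test-error evaluations

TGradientDescent<Architecture> minimizer(learningRate, convergenceSteps, testInterval);
minimizer.Train(trainingData, nSamples, testData, batchSize, net, 4 /* worker nets / threads */);
```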