zh4ngx
diff --git a/‎tensorflow/core/common_runtime/executor.cc‎
Lines changed: 7 additions & 2 deletions b/‎tensorflow/core/common_runtime/executor.cc‎
Lines changed: 7 additions & 2 deletions
diff --git a/‎tensorflow/core/common_runtime/gpu/gpu_device.cc‎
Lines changed: 41 additions & 23 deletions b/‎tensorflow/core/common_runtime/gpu/gpu_device.cc‎
Lines changed: 41 additions & 23 deletions
diff --git a/‎tensorflow/core/common_runtime/gpu/gpu_device.h‎
Lines changed: 6 additions & 5 deletions b/‎tensorflow/core/common_runtime/gpu/gpu_device.h‎
Lines changed: 6 additions & 5 deletions
diff --git a/‎tensorflow/core/framework/device_base.h‎
Lines changed: 13 additions & 9 deletions b/‎tensorflow/core/framework/device_base.h‎
Lines changed: 13 additions & 9 deletions
@@ -823,6 +823,9 @@ namespace {
 OpKernelContext::Params* CopyParams(const OpKernelContext::Params& p) {
   OpKernelContext::Params* ret = new OpKernelContext::Params;
   *ret = p;
+  // Ensure the copy of Params will make a new eigen GPU device if
+  // necessary.
+  ret->eigen_gpu_device = nullptr;
   ret->inputs = new TensorValueVec(*p.inputs);
   ret->input_device_contexts = new DeviceContextVec(*p.input_device_contexts);
   ret->input_alloc_attrs = new AllocatorAttributeVec(*p.input_alloc_attrs);
@@ -831,6 +834,8 @@ OpKernelContext::Params* CopyParams(const OpKernelContext::Params& p) {
 
 // Helpers to delete 'p' and copies made by CopyParams.
 void DeleteParams(OpKernelContext::Params* p) {
+  // No need to delete p->eigen_gpu_device since that is deleted in
+  // p's destructor
   delete p->inputs;
   delete p->input_device_contexts;
   delete p->input_alloc_attrs;
@@ -929,7 +934,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) {
       if (async) {
         // Asynchronous computes.
         auto pcopy = CopyParams(params);
-        auto ctx = new OpKernelContext(*pcopy);
+        auto ctx = new OpKernelContext(pcopy);
         auto done = [this, tagged_node, item, first_input, ctx, stats, pcopy,
                      device]() {
           VLOG(2) << this << " Async kernel done: "
@@ -967,7 +972,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) {
         device->ComputeAsync(async, ctx, done);
       } else {
         // Synchronous computes.
-        OpKernelContext ctx(params);
+        OpKernelContext ctx(&params);
         if (stats_collector_) nodestats::SetOpStart(stats);
         device->Compute(CHECK_NOTNULL(op_kernel), &ctx);
         if (stats_collector_) nodestats::SetOpEnd(stats);
 
@@ -73,9 +73,14 @@ namespace tensorflow {
 #if defined(__GCUDACC__) || defined(__GCUDACC_HOST__)
 class EigenAllocator : public ::Eigen::Allocator {
  public:
-  explicit EigenAllocator(gpu::Stream* stream, ::tensorflow::Allocator* alloc,
-                          EventMgr* em)
-      : stream_(stream), allocator_(alloc), em_(em) {}
+  EigenAllocator() {}  // Assigns no members itself; Reinitialize() sets stream_, allocator_ and em_.
+
+  void Reinitialize(gpu::Stream* stream, ::tensorflow::Allocator* alloc,
+                    EventMgr* em) {  // Rebinds this object to the given stream, allocator and EventMgr.
+    stream_ = stream;  // Pointers are stored as given; nothing is copied here.
+    allocator_ = alloc;  // allocate() calls AllocateRaw on this allocator.
+    em_ = em;
+  }
 
   void* allocate(size_t num_bytes) const override {
     void* ret = allocator_->AllocateRaw(32 /* alignment */, num_bytes);
@@ -103,10 +108,12 @@ class EigenAllocator : public ::Eigen::Allocator {
 #else
 class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
  public:
-  EigenCudaStreamDevice(const cudaStream_t* cuda_stream, int gpu_id,
-                        ::tensorflow::Allocator* alloc)
-      : stream_(cuda_stream), allocator_(alloc) {
-    Eigen::initializeDeviceProp();
+  EigenCudaStreamDevice() { Eigen::initializeDeviceProp(); }  // Only calls Eigen's device-property init; stream_, allocator_ and device_prop_ are assigned in Reinitialize().
+
+  void Reinitialize(const cudaStream_t* cuda_stream, int gpu_id,
+                    ::tensorflow::Allocator* alloc) {  // Rebinds this object to a stream, an allocator and the properties entry for gpu_id.
+    stream_ = cuda_stream;  // Stored as a pointer to the caller's cudaStream_t.
+    allocator_ = alloc;  // NOTE(review): gpu_id indexes Eigen::m_deviceProperties below with no range check here - presumably validated by the caller; confirm.
     device_prop_ = &Eigen::m_deviceProperties[gpu_id];
   }
 
@@ -391,10 +398,11 @@ namespace {
 #if defined(__GCUDACC__) || defined(__GCUDACC_HOST__)
 class ConcretePerOpGpuDevice : public PerOpGpuDevice {
  public:
-  explicit ConcretePerOpGpuDevice(gpu::Stream* stream,
-                                  Allocator* base_allocator,
-                                  ::tensorflow::EventMgr* em)
-      : allocator_(stream, base_allocator, em), device_(stream, &allocator_) {}
+  void Reinitialize(gpu::Stream* stream, Allocator* base_allocator,
+                    ::tensorflow::EventMgr* em) {  // Rebinds the member allocator_ and then device_ to `stream`.
+    allocator_.Reinitialize(stream, base_allocator, em);
+    device_.Reinitialize(stream, &allocator_);  // device_ receives the member allocator_, not base_allocator directly.
+  }
 
   const Eigen::GpuDevice& device() const override { return device_; }
 
@@ -405,10 +413,12 @@ class ConcretePerOpGpuDevice : public PerOpGpuDevice {
 #else
 class ConcretePerOpGpuDevice : public PerOpGpuDevice {
  public:
-  explicit ConcretePerOpGpuDevice(const cudaStream_t* cuda_stream, int gpu_id,
-                                  Allocator* base_allocator)
-      : stream_device_(cuda_stream, gpu_id, base_allocator),
-        device_(&stream_device_) {}
+  ConcretePerOpGpuDevice() : device_(&stream_device_) {}  // device_ is built from the address of the member stream_device_; Reinitialize() configures stream_device_ afterwards.
+
+  void Reinitialize(const cudaStream_t* cuda_stream, int gpu_id,
+                    Allocator* base_allocator) {  // Forwards all arguments to stream_device_; device_ is not modified here.
+    stream_device_.Reinitialize(cuda_stream, gpu_id, base_allocator);
+  }
 
   const Eigen::GpuDevice& device() const override { return device_; }
 
@@ -419,28 +429,36 @@ class ConcretePerOpGpuDevice : public PerOpGpuDevice {
 #endif
 }  // namespace
 
-const PerOpGpuDevice* BaseGPUDevice::NewDevice(int stream_id,
-                                               Allocator* allocator) {
+void BaseGPUDevice::ReinitializeDevice(PerOpGpuDevice* device, int stream_id,
+                                       Allocator* allocator) {  // Rebinds `device` to streams_[stream_id] and `allocator`.
+  ConcretePerOpGpuDevice* concrete_device =
+      dynamic_cast<ConcretePerOpGpuDevice*>(device);  // Yields null when `device` is not a ConcretePerOpGpuDevice.
+  DCHECK(concrete_device);  // NOTE(review): concrete_device is dereferenced below; if DCHECK is compiled out in optimized builds a null result is not caught - confirm callers only pass objects from MakeGpuDevice().
 #if defined(__GCUDACC__) || defined(__GCUDACC_HOST__)
-  return new ConcretePerOpGpuDevice(streams_[stream_id], allocator, em_.get());
+  concrete_device->Reinitialize(streams_[stream_id], allocator, em_.get());  // This variant also passes the EventMgr.
 #else
   const cudaStream_t* cuda_stream = reinterpret_cast<const cudaStream_t*>(
       streams_[stream_id]->implementation()->CudaStreamMemberHack());
-  return new ConcretePerOpGpuDevice(cuda_stream, gpu_id_, allocator);
+  concrete_device->Reinitialize(cuda_stream, gpu_id_, allocator);  // This variant passes the raw CUDA stream pointer and gpu_id_.
 #endif
 }
 
-const PerOpGpuDevice* BaseGPUDevice::MakeGpuDevice(DeviceContext* dc,
-                                                   Allocator* allocator) {
+PerOpGpuDevice* BaseGPUDevice::MakeGpuDevice() {  // Allocates a default-constructed device; no stream or allocator is passed in here.
+  return new ConcretePerOpGpuDevice();  // Created with `new`; this function does not delete it.
+}
+
+void BaseGPUDevice::ReinitializeGpuDevice(PerOpGpuDevice* device,
+                                          DeviceContext* dc,
+                                          Allocator* allocator) {  // Picks a stream id from `dc` (0 when dc is null) and rebinds `device` to it.
   if (dc) {
     const GPUDeviceContext* gpu_dc = static_cast<GPUDeviceContext*>(dc);
     const int stream_id = gpu_dc->stream_id();
     VLOG(1) << "  eigen_gpu_device(" << dc << ") => stream[" << stream_id
             << "]";
     CHECK_LT(stream_id, streams_.size());
-    return NewDevice(stream_id, allocator);
+    ReinitializeDevice(device, stream_id, allocator);  // Stream id comes from the device context and was range-checked above.
   } else {
-    return NewDevice(0, allocator);
+    ReinitializeDevice(device, 0, allocator);  // No device context supplied: use stream 0.
   }
 }
 
 
@@ -38,8 +38,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-class EigenAllocator;
-
 class BaseGPUDevice : public LocalDevice {
  public:
   BaseGPUDevice(const SessionOptions& options, const string& name,
@@ -74,8 +72,10 @@ class BaseGPUDevice : public LocalDevice {
                              Tensor* tensor) override;
 
   // The caller owns the returned device.
-  const PerOpGpuDevice* MakeGpuDevice(DeviceContext* dc,
-                                      Allocator* allocator) override;
+  PerOpGpuDevice* MakeGpuDevice() override;
+
+  void ReinitializeGpuDevice(PerOpGpuDevice* device, DeviceContext* dc,
+                             Allocator* allocator) override;
 
  protected:
   Allocator* gpu_allocator_;  // not owned
@@ -90,7 +90,8 @@ class BaseGPUDevice : public LocalDevice {
   const bool sync_every_op_ = false;
   std::unique_ptr<EventMgr> em_;
 
-  const PerOpGpuDevice* NewDevice(int stream_id, Allocator* allocator);
+  void ReinitializeDevice(PerOpGpuDevice* device, int stream_id,
+                          Allocator* allocator);
 };
 
 class BaseGPUDeviceFactory : public DeviceFactory {
 
@@ -48,7 +48,9 @@ namespace thread {
 class ThreadPool;
 }
 
-// A wrapper for an Eigen Gpu Device that includes per-op state
+// A wrapper for an Eigen Gpu Device that includes per-op state. The
+// class is defined even for non-GPU devices since the
+// OpKernelContext::Params structure wants to fill it in.
 class PerOpGpuDevice {
  public:
   virtual ~PerOpGpuDevice() {}
@@ -161,14 +163,16 @@ class DeviceBase {
     return eigen_cpu_device_;
   }
 
-  // The caller owns the returned device and must free it by calling
-  // DisposeGpuDevice below
-  virtual const PerOpGpuDevice* MakeGpuDevice(DeviceContext* /*dc*/,
-                                              Allocator* /*allocator*/) {
-    // The OpKernelContext calls this even for devices that do not
-    // implement an eigen_gpu_device
-    return nullptr;
-  }
+  // Returns a new per-op GPU device owned by the caller, or nullptr here:
+  // OpKernelContext calls this even for devices that do not implement an
+  // eigen_gpu_device. GPU devices override it to return a derived type.
+  virtual PerOpGpuDevice* MakeGpuDevice() { return nullptr; }
+
+  // No-op in this base class. GPU devices override it to reinitialize
+  // the derived object returned by MakeGpuDevice.
+  virtual void ReinitializeGpuDevice(PerOpGpuDevice* /*device*/,
+                                     DeviceContext* /*dc*/,
+                                     Allocator* /*allocator*/) {}
 
   virtual const DeviceAttributes& attributes() const {
     LOG(FATAL) << "Device does not implement attributes()";