Skip to content

Commit ff8522d

Browse files
A. Unique TensorFlower authored and Vijay Vasudevan committed
Eliminate per-op allocation of gpu device wrapper. The PerOpGpuDevice is allocated once in the OpKernelContext::Params struct, then re-used every time a new OpKernelContext uses the Params. Thus, in the executor, as long as there is more work to do, the PerOpGpuDevice is not freed.
Change: 112909215
1 parent 62e7dec commit ff8522d

File tree

12 files changed

+215
-146
lines changed

12 files changed

+215
-146
lines changed

tensorflow/core/common_runtime/executor.cc

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -823,6 +823,9 @@ namespace {
823823
OpKernelContext::Params* CopyParams(const OpKernelContext::Params& p) {
824824
OpKernelContext::Params* ret = new OpKernelContext::Params;
825825
*ret = p;
826+
// Ensure the copy of Params will make a new eigen GPU device if
827+
// necessary.
828+
ret->eigen_gpu_device = nullptr;
826829
ret->inputs = new TensorValueVec(*p.inputs);
827830
ret->input_device_contexts = new DeviceContextVec(*p.input_device_contexts);
828831
ret->input_alloc_attrs = new AllocatorAttributeVec(*p.input_alloc_attrs);
@@ -831,6 +834,8 @@ OpKernelContext::Params* CopyParams(const OpKernelContext::Params& p) {
831834

832835
// Helpers to delete 'p' and copies made by CopyParams.
833836
void DeleteParams(OpKernelContext::Params* p) {
837+
// No need to delete p->eigen_gpu_device since that is deleted in
838+
// p's destructor
834839
delete p->inputs;
835840
delete p->input_device_contexts;
836841
delete p->input_alloc_attrs;
@@ -929,7 +934,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) {
929934
if (async) {
930935
// Asynchronous computes.
931936
auto pcopy = CopyParams(params);
932-
auto ctx = new OpKernelContext(*pcopy);
937+
auto ctx = new OpKernelContext(pcopy);
933938
auto done = [this, tagged_node, item, first_input, ctx, stats, pcopy,
934939
device]() {
935940
VLOG(2) << this << " Async kernel done: "
@@ -967,7 +972,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) {
967972
device->ComputeAsync(async, ctx, done);
968973
} else {
969974
// Synchronous computes.
970-
OpKernelContext ctx(params);
975+
OpKernelContext ctx(&params);
971976
if (stats_collector_) nodestats::SetOpStart(stats);
972977
device->Compute(CHECK_NOTNULL(op_kernel), &ctx);
973978
if (stats_collector_) nodestats::SetOpEnd(stats);

tensorflow/core/common_runtime/gpu/gpu_device.cc

Lines changed: 41 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -73,9 +73,14 @@ namespace tensorflow {
7373
#if defined(__GCUDACC__) || defined(__GCUDACC_HOST__)
7474
class EigenAllocator : public ::Eigen::Allocator {
7575
public:
76-
explicit EigenAllocator(gpu::Stream* stream, ::tensorflow::Allocator* alloc,
77-
EventMgr* em)
78-
: stream_(stream), allocator_(alloc), em_(em) {}
76+
EigenAllocator() {}
77+
78+
void Reinitialize(gpu::Stream* stream, ::tensorflow::Allocator* alloc,
79+
EventMgr* em) {
80+
stream_ = stream;
81+
allocator_ = alloc;
82+
em_ = em;
83+
}
7984

8085
void* allocate(size_t num_bytes) const override {
8186
void* ret = allocator_->AllocateRaw(32 /* alignment */, num_bytes);
@@ -103,10 +108,12 @@ class EigenAllocator : public ::Eigen::Allocator {
103108
#else
104109
class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
105110
public:
106-
EigenCudaStreamDevice(const cudaStream_t* cuda_stream, int gpu_id,
107-
::tensorflow::Allocator* alloc)
108-
: stream_(cuda_stream), allocator_(alloc) {
109-
Eigen::initializeDeviceProp();
111+
EigenCudaStreamDevice() { Eigen::initializeDeviceProp(); }
112+
113+
void Reinitialize(const cudaStream_t* cuda_stream, int gpu_id,
114+
::tensorflow::Allocator* alloc) {
115+
stream_ = cuda_stream;
116+
allocator_ = alloc;
110117
device_prop_ = &Eigen::m_deviceProperties[gpu_id];
111118
}
112119

@@ -391,10 +398,11 @@ namespace {
391398
#if defined(__GCUDACC__) || defined(__GCUDACC_HOST__)
392399
class ConcretePerOpGpuDevice : public PerOpGpuDevice {
393400
public:
394-
explicit ConcretePerOpGpuDevice(gpu::Stream* stream,
395-
Allocator* base_allocator,
396-
::tensorflow::EventMgr* em)
397-
: allocator_(stream, base_allocator, em), device_(stream, &allocator_) {}
401+
void Reinitialize(gpu::Stream* stream, Allocator* base_allocator,
402+
::tensorflow::EventMgr* em) {
403+
allocator_.Reinitialize(stream, base_allocator, em);
404+
device_.Reinitialize(stream, &allocator_);
405+
}
398406

399407
const Eigen::GpuDevice& device() const override { return device_; }
400408

@@ -405,10 +413,12 @@ class ConcretePerOpGpuDevice : public PerOpGpuDevice {
405413
#else
406414
class ConcretePerOpGpuDevice : public PerOpGpuDevice {
407415
public:
408-
explicit ConcretePerOpGpuDevice(const cudaStream_t* cuda_stream, int gpu_id,
409-
Allocator* base_allocator)
410-
: stream_device_(cuda_stream, gpu_id, base_allocator),
411-
device_(&stream_device_) {}
416+
ConcretePerOpGpuDevice() : device_(&stream_device_) {}
417+
418+
void Reinitialize(const cudaStream_t* cuda_stream, int gpu_id,
419+
Allocator* base_allocator) {
420+
stream_device_.Reinitialize(cuda_stream, gpu_id, base_allocator);
421+
}
412422

413423
const Eigen::GpuDevice& device() const override { return device_; }
414424

@@ -419,28 +429,36 @@ class ConcretePerOpGpuDevice : public PerOpGpuDevice {
419429
#endif
420430
} // namespace
421431

422-
const PerOpGpuDevice* BaseGPUDevice::NewDevice(int stream_id,
423-
Allocator* allocator) {
432+
void BaseGPUDevice::ReinitializeDevice(PerOpGpuDevice* device, int stream_id,
433+
Allocator* allocator) {
434+
ConcretePerOpGpuDevice* concrete_device =
435+
dynamic_cast<ConcretePerOpGpuDevice*>(device);
436+
DCHECK(concrete_device);
424437
#if defined(__GCUDACC__) || defined(__GCUDACC_HOST__)
425-
return new ConcretePerOpGpuDevice(streams_[stream_id], allocator, em_.get());
438+
concrete_device->Reinitialize(streams_[stream_id], allocator, em_.get());
426439
#else
427440
const cudaStream_t* cuda_stream = reinterpret_cast<const cudaStream_t*>(
428441
streams_[stream_id]->implementation()->CudaStreamMemberHack());
429-
return new ConcretePerOpGpuDevice(cuda_stream, gpu_id_, allocator);
442+
concrete_device->Reinitialize(cuda_stream, gpu_id_, allocator);
430443
#endif
431444
}
432445

433-
const PerOpGpuDevice* BaseGPUDevice::MakeGpuDevice(DeviceContext* dc,
434-
Allocator* allocator) {
446+
PerOpGpuDevice* BaseGPUDevice::MakeGpuDevice() {
447+
return new ConcretePerOpGpuDevice();
448+
}
449+
450+
void BaseGPUDevice::ReinitializeGpuDevice(PerOpGpuDevice* device,
451+
DeviceContext* dc,
452+
Allocator* allocator) {
435453
if (dc) {
436454
const GPUDeviceContext* gpu_dc = static_cast<GPUDeviceContext*>(dc);
437455
const int stream_id = gpu_dc->stream_id();
438456
VLOG(1) << " eigen_gpu_device(" << dc << ") => stream[" << stream_id
439457
<< "]";
440458
CHECK_LT(stream_id, streams_.size());
441-
return NewDevice(stream_id, allocator);
459+
ReinitializeDevice(device, stream_id, allocator);
442460
} else {
443-
return NewDevice(0, allocator);
461+
ReinitializeDevice(device, 0, allocator);
444462
}
445463
}
446464

tensorflow/core/common_runtime/gpu/gpu_device.h

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -38,8 +38,6 @@ limitations under the License.
3838

3939
namespace tensorflow {
4040

41-
class EigenAllocator;
42-
4341
class BaseGPUDevice : public LocalDevice {
4442
public:
4543
BaseGPUDevice(const SessionOptions& options, const string& name,
@@ -74,8 +72,10 @@ class BaseGPUDevice : public LocalDevice {
7472
Tensor* tensor) override;
7573

7674
// The caller owns the returned device.
77-
const PerOpGpuDevice* MakeGpuDevice(DeviceContext* dc,
78-
Allocator* allocator) override;
75+
PerOpGpuDevice* MakeGpuDevice() override;
76+
77+
void ReinitializeGpuDevice(PerOpGpuDevice* device, DeviceContext* dc,
78+
Allocator* allocator) override;
7979

8080
protected:
8181
Allocator* gpu_allocator_; // not owned
@@ -90,7 +90,8 @@ class BaseGPUDevice : public LocalDevice {
9090
const bool sync_every_op_ = false;
9191
std::unique_ptr<EventMgr> em_;
9292

93-
const PerOpGpuDevice* NewDevice(int stream_id, Allocator* allocator);
93+
void ReinitializeDevice(PerOpGpuDevice* device, int stream_id,
94+
Allocator* allocator);
9495
};
9596

9697
class BaseGPUDeviceFactory : public DeviceFactory {

tensorflow/core/framework/device_base.h

Lines changed: 13 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -48,7 +48,9 @@ namespace thread {
4848
class ThreadPool;
4949
}
5050

51-
// A wrapper for an Eigen Gpu Device that includes per-op state
51+
// A wrapper for an Eigen Gpu Device that includes per-op state. The
52+
// class is defined even for non-GPU devices since the
53+
// OpKernelContext::Params structure wants to fill it in.
5254
class PerOpGpuDevice {
5355
public:
5456
virtual ~PerOpGpuDevice() {}
@@ -161,14 +163,16 @@ class DeviceBase {
161163
return eigen_cpu_device_;
162164
}
163165

164-
// The caller owns the returned device and must free it by calling
165-
// DisposeGpuDevice below
166-
virtual const PerOpGpuDevice* MakeGpuDevice(DeviceContext* /*dc*/,
167-
Allocator* /*allocator*/) {
168-
// The OpKernelContext calls this even for devices that do not
169-
// implement an eigen_gpu_device
170-
return nullptr;
171-
}
166+
// Caller owns the return value. The OpKernelContext calls this even
167+
// for devices that do not implement an eigen_gpu_device. Overridden
168+
// by GPU devices to return a derived type.
169+
virtual PerOpGpuDevice* MakeGpuDevice() { return nullptr; }
170+
171+
// This is overridden by GPU devices to reinitialize the derived
172+
// type returned by MakeGpuDevice.
173+
virtual void ReinitializeGpuDevice(PerOpGpuDevice* /*device*/,
174+
DeviceContext* /*dc*/,
175+
Allocator* /*allocator*/) {}
172176

173177
virtual const DeviceAttributes& attributes() const {
174178
LOG(FATAL) << "Device does not implement attributes()";

0 commit comments

Comments
 (0)