Skip to content

Commit 854f49b

Browse files
committed
TensorFlow: Upstream changes to git
Changes: - Updates to docs - Several changes for Python 3 compatibility - Added license headers Base CL: 108710566
1 parent 9c3043f commit 854f49b

File tree

309 files changed

+4712
-2396
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

309 files changed

+4712
-2396
lines changed

tensorflow/core/BUILD

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,8 +144,8 @@ tf_cuda_library(
144144
name = "gpu_runtime",
145145
srcs = glob(
146146
[
147-
"common_runtime/gpu/**/*.h",
148-
"common_runtime/gpu/**/*.cc",
147+
"common_runtime/gpu/*.h",
148+
"common_runtime/gpu/*.cc",
149149
],
150150
exclude = [
151151
"**/*main.cc",
@@ -628,6 +628,7 @@ filegroup(
628628
"//tensorflow/core:kernels/relu_op.h",
629629
"//tensorflow/core:kernels/softplus_op.cc",
630630
"//tensorflow/core:kernels/softplus_op.h",
631+
"//tensorflow/core:kernels/stack_ops.cc",
631632
"//tensorflow/core:kernels/transpose_op.cc",
632633
"//tensorflow/core:kernels/transpose_op.h",
633634
"//tensorflow/core:kernels/transpose_op_functor.h",
@@ -673,6 +674,7 @@ cc_library(
673674
copts = [
674675
"-mfpu=neon",
675676
"-std=c++11",
677+
"-O2",
676678
],
677679
tags = [
678680
"manual",

tensorflow/core/framework/op.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ const OpDef* OpRegistry::LookUp(const string& op_type_name,
6060
if (op_def == nullptr) {
6161
status->Update(
6262
errors::NotFound("Op type not registered '", op_type_name, "'"));
63+
LOG(INFO) << status->ToString();
6364
static bool first_unregistered = true;
6465
if (first_unregistered) {
6566
OpList op_list;

tensorflow/core/framework/op_kernel.h

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -817,6 +817,11 @@ class OpKernelContext {
817817
return output_allocation_types_[index];
818818
}
819819

820+
// Per-step resource manager for use by white-listed internal ops.
821+
ResourceMgr* step_resource_manager() const {
822+
return params_.step_resource_manager;
823+
}
824+
820825
private:
821826
Allocator* get_allocator(AllocatorAttributes attr) {
822827
Allocator* allocator = params_.device->GetAllocator(attr);
@@ -836,13 +841,6 @@ class OpKernelContext {
836841
}
837842
}
838843

839-
// Per-step resource manager for use by white-listed internal ops.
840-
friend class TemporaryVariableOp;
841-
friend class DestroyTemporaryVariableOp;
842-
ResourceMgr* step_resource_manager() const {
843-
return params_.step_resource_manager;
844-
}
845-
846844
// Internal common method used when allocating tensor memory
847845
Status allocate_tensor(DataType type, const TensorShape& shape,
848846
Tensor* out_tensor, AllocatorAttributes attr);

tensorflow/core/framework/tensor_util_test.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ TEST(TensorUtil, Concat) {
140140
std::vector<Tensor> to_concat;
141141
int64 total_size = 0;
142142
int offset = 0;
143-
for (int entry = 0; entry < sizes.size(); ++entry) {
143+
for (size_t entry = 0; entry < sizes.size(); ++entry) {
144144
const int64 size = sizes[entry];
145145
Tensor tensor(DT_INT32, TensorShape({size, 2}));
146146
for (int i = offset; i < offset + size; ++i) {
@@ -175,7 +175,7 @@ TEST(TensorUtil, Split) {
175175
ASSERT_EQ(sizes.size(), splits.size());
176176

177177
int offset = 0;
178-
for (int entry = 0; entry < splits.size(); ++entry) {
178+
for (size_t entry = 0; entry < splits.size(); ++entry) {
179179
const int64 size = sizes[entry];
180180
const Tensor& split = splits[entry];
181181

tensorflow/core/graph/graph_partition.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1011,6 +1011,10 @@ Status Partition(const PartitionOptions& opts, Graph* g,
10111011

10121012
if (!edge->IsControlEdge() &&
10131013
IsRefType(src->output_type(edge->src_output()))) {
1014+
AddNodeAttr("_start_time", recv_start_time, recv);
1015+
if (real_recv != recv) {
1016+
AddNodeAttr("_start_time", recv_start_time, real_recv);
1017+
}
10141018
// If src is of ref type and the edge is not a control edge, dst has
10151019
// read semantics and therefore we must control the recv.
10161020
ref_recvs.push_back(real_recv);

tensorflow/core/kernels/bias_op_gpu.cu.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ __global__ void BiasOpCustomKernel(int nthreads, const T* input, const T* bias,
3737
T* output) {
3838
CUDA_1D_KERNEL_LOOP(index, nthreads) {
3939
int bias_offset = index % bias_size;
40-
output[index] = __ldg(input + index) + __ldg(bias + bias_offset);
40+
output[index] = ldg(input + index) + ldg(bias + bias_offset);
4141
}
4242
}
4343

tensorflow/core/kernels/constant_op_gpu.cu.cc

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,10 @@ struct scalar_const_op {
4242
return *val;
4343
}
4444

45-
template <typename Index>
46-
EIGEN_STRONG_INLINE const Packet packetOp(Index, Index = 0) const {
47-
return internal::pset1<Packet>(*val);
45+
template <typename Index, typename PacketType = Packet>
46+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType
47+
packetOp(Index, Index = 0) const {
48+
return internal::pset1<PacketType>(*val);
4849
}
4950
};
5051

tensorflow/core/kernels/conv_grad_ops.cc

Lines changed: 110 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -383,12 +383,53 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
383383
// The output image size is the spatial size of the output.
384384
const int output_image_size = out_rows * out_cols;
385385

386+
// TODO(andydavis) Get L2/L3 cache sizes from device.
387+
const size_t l2_cache_size = 256LL << 10;
388+
const size_t l3_cache_size = 30LL << 20;
389+
390+
// Use L3 cache size as target working set size.
391+
const size_t target_working_set_size = l3_cache_size / sizeof(T);
392+
393+
// Calculate size of matrices involved in MatMul: C = A x B.
394+
const size_t size_A = output_image_size * out_depth;
395+
396+
const size_t size_B = filter_total_size * out_depth;
397+
398+
const size_t size_C = output_image_size * filter_total_size;
399+
400+
const size_t work_unit_size = size_A + size_B + size_C;
401+
402+
auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
403+
404+
// Calculate per-thread work unit size.
405+
const size_t thread_work_unit_size =
406+
work_unit_size / worker_threads.num_threads;
407+
408+
// Set minimum per-thread work unit size to size of L2 cache.
409+
const size_t min_thread_work_unit_size = l2_cache_size / sizeof(T);
410+
411+
// Use parallel tensor contractions if there is no batching, or if the
412+
// minimum per-thread work unit size threshold has been exceeded.
413+
// Otherwise, revert to multiple single-threaded matmul ops running in
414+
// parallel to keep all threads busy.
415+
// TODO(andydavis) Explore alternatives to branching the code in this way
416+
// (i.e. run multiple, parallel tensor contractions in another thread pool).
417+
const bool use_parallel_contraction =
418+
batch == 1 || thread_work_unit_size >= min_thread_work_unit_size;
419+
420+
const size_t shard_size =
421+
use_parallel_contraction
422+
? 1
423+
: (target_working_set_size + work_unit_size - 1) / work_unit_size;
424+
386425
Tensor col_buffer;
387-
OP_REQUIRES_OK(
388-
context,
389-
context->allocate_temp(
390-
DataTypeToEnum<T>::value,
391-
TensorShape({output_image_size, filter_total_size}), &col_buffer));
426+
OP_REQUIRES_OK(context,
427+
context->allocate_temp(
428+
DataTypeToEnum<T>::value,
429+
TensorShape({static_cast<int64>(shard_size),
430+
static_cast<int64>(output_image_size),
431+
static_cast<int64>(filter_total_size)}),
432+
&col_buffer));
392433

393434
// The input offset corresponding to a single input image.
394435
const int input_offset = input_rows * input_cols * in_depth;
@@ -400,31 +441,74 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
400441
auto* out_backprop_data = out_backprop.template flat<T>().data();
401442
auto* input_backprop_data = in_backprop->template flat<T>().data();
402443

403-
typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>,
404-
Eigen::Unaligned> TensorMap;
405-
typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor>,
406-
Eigen::Unaligned> ConstTensorMap;
444+
if (use_parallel_contraction) {
445+
typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>,
446+
Eigen::Unaligned> TensorMap;
447+
typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor>,
448+
Eigen::Unaligned> ConstTensorMap;
407449

408-
// Initialize contraction dims (we need to transpose 'B' below).
409-
Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> contract_dims;
410-
contract_dims[0].first = 1;
411-
contract_dims[0].second = 1;
450+
// Initialize contraction dims (we need to transpose 'B' below).
451+
Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> contract_dims;
452+
contract_dims[0].first = 1;
453+
contract_dims[0].second = 1;
412454

413-
for (int image_id = 0; image_id < batch; ++image_id) {
414-
// Compute gradient into col_buffer.
415-
TensorMap C(col_buffer_data, output_image_size, filter_total_size);
455+
for (int image_id = 0; image_id < batch; ++image_id) {
456+
// Compute gradient into col_buffer.
457+
TensorMap C(col_buffer_data, output_image_size, filter_total_size);
416458

417-
ConstTensorMap A(out_backprop_data + output_offset * image_id,
418-
output_image_size, out_depth);
419-
ConstTensorMap B(filter_data, filter_total_size, out_depth);
459+
ConstTensorMap A(out_backprop_data + output_offset * image_id,
460+
output_image_size, out_depth);
461+
ConstTensorMap B(filter_data, filter_total_size, out_depth);
420462

421-
C.device(context->eigen_cpu_device()) = A.contract(B, contract_dims);
463+
C.device(context->eigen_cpu_device()) = A.contract(B, contract_dims);
422464

423-
Col2im<T>(col_buffer_data, in_depth, input_rows, input_cols, filter_rows,
424-
filter_cols, pad_top, pad_left, pad_bottom, pad_right, stride,
425-
stride, input_backprop_data);
465+
Col2im<T>(col_buffer_data, in_depth, input_rows, input_cols,
466+
filter_rows, filter_cols, pad_top, pad_left, pad_bottom,
467+
pad_right, stride, stride, input_backprop_data);
426468

427-
input_backprop_data += input_offset;
469+
input_backprop_data += input_offset;
470+
}
471+
} else {
472+
typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic,
473+
Eigen::RowMajor>> MatrixMap;
474+
typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic,
475+
Eigen::RowMajor>> ConstMatrixMap;
476+
477+
for (int image_id = 0; image_id < batch; image_id += shard_size) {
478+
const int shard_limit = std::min(static_cast<int>(shard_size),
479+
static_cast<int>(batch) - image_id);
480+
481+
auto shard = [&in_depth, &input_rows, &input_cols, &filter_rows,
482+
&filter_cols, &pad_top, &pad_left, &pad_bottom,
483+
&pad_right, &stride, &output_image_size,
484+
&filter_total_size, &out_depth, &input_backprop_data,
485+
&col_buffer_data, &out_backprop_data, &filter_data,
486+
&input_offset, &output_offset,
487+
&size_C](int64 start, int64 limit) {
488+
for (int shard_id = start; shard_id < limit; ++shard_id) {
489+
T* im2col_buf = col_buffer_data + shard_id * size_C;
490+
T* input_data = input_backprop_data + shard_id * input_offset;
491+
const T* out_data = out_backprop_data + shard_id * output_offset;
492+
493+
// Compute gradient into 'im2col_buf'.
494+
MatrixMap C(im2col_buf, output_image_size, filter_total_size);
495+
496+
ConstMatrixMap A(out_data, output_image_size, out_depth);
497+
ConstMatrixMap B(filter_data, filter_total_size, out_depth);
498+
499+
C.noalias() = A * B.transpose();
500+
501+
Col2im<T>(im2col_buf, in_depth, input_rows, input_cols, filter_rows,
502+
filter_cols, pad_top, pad_left, pad_bottom, pad_right,
503+
stride, stride, input_data);
504+
}
505+
};
506+
Shard(worker_threads.num_threads, worker_threads.workers, shard_limit,
507+
work_unit_size, shard);
508+
509+
input_backprop_data += input_offset * shard_limit;
510+
out_backprop_data += output_offset * shard_limit;
511+
}
428512
}
429513
}
430514

@@ -620,8 +704,8 @@ class Conv2DCustomBackpropFilterOp : public OpKernel {
620704
&pad_left, &pad_bottom, &pad_right, &stride, &input_offset,
621705
&size_A](int64 start, int64 limit) {
622706
for (int shard_id = start; shard_id < limit; ++shard_id) {
623-
auto input_data_shard = input_data + shard_id * input_offset;
624-
auto col_data_shard = col_buffer_data + shard_id * size_A;
707+
const T* input_data_shard = input_data + shard_id * input_offset;
708+
T* col_data_shard = col_buffer_data + shard_id * size_A;
625709

626710
// When we compute the gradient with respect to the filters, we need
627711
// to do im2col to allow gemm-type computation.

0 commit comments

Comments
 (0)