typingduck
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
Lines changed: 4 additions & 2 deletions
diff --git a/tensorflow/core/framework/op.cc b/tensorflow/core/framework/op.cc
Lines changed: 1 addition & 0 deletions
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
Lines changed: 5 additions & 7 deletions
diff --git a/tensorflow/core/framework/tensor_util_test.cc b/tensorflow/core/framework/tensor_util_test.cc
Lines changed: 2 additions & 2 deletions
diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc
Lines changed: 4 additions & 0 deletions
diff --git a/tensorflow/core/kernels/bias_op_gpu.cu.cc b/tensorflow/core/kernels/bias_op_gpu.cu.cc
Lines changed: 1 addition & 1 deletion
diff --git a/tensorflow/core/kernels/constant_op_gpu.cu.cc b/tensorflow/core/kernels/constant_op_gpu.cu.cc
Lines changed: 4 additions & 3 deletions
diff --git a/tensorflow/core/kernels/conv_grad_ops.cc b/tensorflow/core/kernels/conv_grad_ops.cc
Lines changed: 110 additions & 26 deletions
@@ -144,8 +144,8 @@ tf_cuda_library(
     name = "gpu_runtime",
     srcs = glob(
         [
-            "common_runtime/gpu/**/*.h",
-            "common_runtime/gpu/**/*.cc",
+            "common_runtime/gpu/*.h",
+            "common_runtime/gpu/*.cc",
         ],
         exclude = [
             "**/*main.cc",
@@ -628,6 +628,7 @@ filegroup(
         "//tensorflow/core:kernels/relu_op.h",
         "//tensorflow/core:kernels/softplus_op.cc",
         "//tensorflow/core:kernels/softplus_op.h",
+        "//tensorflow/core:kernels/stack_ops.cc",
         "//tensorflow/core:kernels/transpose_op.cc",
         "//tensorflow/core:kernels/transpose_op.h",
         "//tensorflow/core:kernels/transpose_op_functor.h",
@@ -673,6 +674,7 @@ cc_library(
     copts = [
         "-mfpu=neon",
         "-std=c++11",
+        "-O2",
     ],
     tags = [
         "manual",
 
@@ -60,6 +60,7 @@ const OpDef* OpRegistry::LookUp(const string& op_type_name,
   if (op_def == nullptr) {
     status->Update(
         errors::NotFound("Op type not registered '", op_type_name, "'"));
+    LOG(INFO) << status->ToString();
     static bool first_unregistered = true;
     if (first_unregistered) {
       OpList op_list;
 
@@ -817,6 +817,11 @@ class OpKernelContext {
     return output_allocation_types_[index];
   }
 
+  // Per-step resource manager for use by white-listed internal ops.
+  ResourceMgr* step_resource_manager() const {
+    return params_.step_resource_manager;
+  }
+
  private:
   Allocator* get_allocator(AllocatorAttributes attr) {
     Allocator* allocator = params_.device->GetAllocator(attr);
@@ -836,13 +841,6 @@ class OpKernelContext {
     }
   }
 
-  // Per-step resource manager for use by white-listed internal ops.
-  friend class TemporaryVariableOp;
-  friend class DestroyTemporaryVariableOp;
-  ResourceMgr* step_resource_manager() const {
-    return params_.step_resource_manager;
-  }
-
   // Internal common method used when allocating tensor memory
   Status allocate_tensor(DataType type, const TensorShape& shape,
                          Tensor* out_tensor, AllocatorAttributes attr);
 
@@ -140,7 +140,7 @@ TEST(TensorUtil, Concat) {
   std::vector<Tensor> to_concat;
   int64 total_size = 0;
   int offset = 0;
-  for (int entry = 0; entry < sizes.size(); ++entry) {
+  for (size_t entry = 0; entry < sizes.size(); ++entry) {
     const int64 size = sizes[entry];
     Tensor tensor(DT_INT32, TensorShape({size, 2}));
     for (int i = offset; i < offset + size; ++i) {
@@ -175,7 +175,7 @@ TEST(TensorUtil, Split) {
   ASSERT_EQ(sizes.size(), splits.size());
 
   int offset = 0;
-  for (int entry = 0; entry < splits.size(); ++entry) {
+  for (size_t entry = 0; entry < splits.size(); ++entry) {
     const int64 size = sizes[entry];
     const Tensor& split = splits[entry];
 
 
@@ -1011,6 +1011,10 @@ Status Partition(const PartitionOptions& opts, Graph* g,
 
       if (!edge->IsControlEdge() &&
           IsRefType(src->output_type(edge->src_output()))) {
+        AddNodeAttr("_start_time", recv_start_time, recv);
+        if (real_recv != recv) {
+          AddNodeAttr("_start_time", recv_start_time, real_recv);
+        }
         // If src is of ref type and the edge is not a control edge, dst has
         // read semantics and therefore we must control the recv.
         ref_recvs.push_back(real_recv);
 
@@ -37,7 +37,7 @@ __global__ void BiasOpCustomKernel(int nthreads, const T* input, const T* bias,
                                    T* output) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     int bias_offset = index % bias_size;
-    output[index] = __ldg(input + index) + __ldg(bias + bias_offset);
+    output[index] = ldg(input + index) + ldg(bias + bias_offset);
   }
 }
 
 
@@ -42,9 +42,10 @@ struct scalar_const_op {
     return *val;
   }
 
-  template <typename Index>
-  EIGEN_STRONG_INLINE const Packet packetOp(Index, Index = 0) const {
-    return internal::pset1<Packet>(*val);
+  template <typename Index, typename PacketType = Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType
+      packetOp(Index, Index = 0) const {
+    return internal::pset1<PacketType>(*val);
   }
 };
 
 
@@ -383,12 +383,53 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
     // The output image size is the spatial size of the output.
     const int output_image_size = out_rows * out_cols;
 
+    // TODO(andydavis) Get L2/L3 cache sizes from device.
+    const size_t l2_cache_size = 256LL << 10;
+    const size_t l3_cache_size = 30LL << 20;
+
+    // Use L3 cache size as target working set size.
+    const size_t target_working_set_size = l3_cache_size / sizeof(T);
+
+    // Calculate size of matrices involved in MatMul: C = A x B.
+    const size_t size_A = output_image_size * out_depth;
+
+    const size_t size_B = filter_total_size * out_depth;
+
+    const size_t size_C = output_image_size * filter_total_size;
+
+    const size_t work_unit_size = size_A + size_B + size_C;
+
+    auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
+
+    // Calculate per-thread work unit size.
+    const size_t thread_work_unit_size =
+        work_unit_size / worker_threads.num_threads;
+
+    // Set minimum per-thread work unit size to size of L2 cache.
+    const size_t min_thread_work_unit_size = l2_cache_size / sizeof(T);
+
+    // Use parallel tensor contractions if there is no batching, or if the
+    // minimum per-thread work unit size threshold has been exceeded.
+    // Otherwise, revert to multiple single-threaded matmul ops running in
+    // parallel to keep all threads busy.
+    // TODO(andydavis) Explore alternatives to branching the code in this way
+    // (i.e. run multiple, parallel tensor contractions in another thread pool).
+    const bool use_parallel_contraction =
+        batch == 1 || thread_work_unit_size >= min_thread_work_unit_size;
+
+    const size_t shard_size =
+        use_parallel_contraction
+            ? 1
+            : (target_working_set_size + work_unit_size - 1) / work_unit_size;
+
     Tensor col_buffer;
-    OP_REQUIRES_OK(
-        context,
-        context->allocate_temp(
-            DataTypeToEnum<T>::value,
-            TensorShape({output_image_size, filter_total_size}), &col_buffer));
+    OP_REQUIRES_OK(context,
+                   context->allocate_temp(
+                       DataTypeToEnum<T>::value,
+                       TensorShape({static_cast<int64>(shard_size),
+                                    static_cast<int64>(output_image_size),
+                                    static_cast<int64>(filter_total_size)}),
+                       &col_buffer));
 
     // The input offset corresponding to a single input image.
     const int input_offset = input_rows * input_cols * in_depth;
@@ -400,31 +441,74 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
     auto* out_backprop_data = out_backprop.template flat<T>().data();
     auto* input_backprop_data = in_backprop->template flat<T>().data();
 
-    typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>,
-                             Eigen::Unaligned> TensorMap;
-    typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor>,
-                             Eigen::Unaligned> ConstTensorMap;
+    if (use_parallel_contraction) {
+      typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>,
+                               Eigen::Unaligned> TensorMap;
+      typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor>,
+                               Eigen::Unaligned> ConstTensorMap;
 
-    // Initialize contraction dims (we need to transpose 'B' below).
-    Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> contract_dims;
-    contract_dims[0].first = 1;
-    contract_dims[0].second = 1;
+      // Initialize contraction dims (we need to transpose 'B' below).
+      Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> contract_dims;
+      contract_dims[0].first = 1;
+      contract_dims[0].second = 1;
 
-    for (int image_id = 0; image_id < batch; ++image_id) {
-      // Compute gradient into col_buffer.
-      TensorMap C(col_buffer_data, output_image_size, filter_total_size);
+      for (int image_id = 0; image_id < batch; ++image_id) {
+        // Compute gradient into col_buffer.
+        TensorMap C(col_buffer_data, output_image_size, filter_total_size);
 
-      ConstTensorMap A(out_backprop_data + output_offset * image_id,
-                       output_image_size, out_depth);
-      ConstTensorMap B(filter_data, filter_total_size, out_depth);
+        ConstTensorMap A(out_backprop_data + output_offset * image_id,
+                         output_image_size, out_depth);
+        ConstTensorMap B(filter_data, filter_total_size, out_depth);
 
-      C.device(context->eigen_cpu_device()) = A.contract(B, contract_dims);
+        C.device(context->eigen_cpu_device()) = A.contract(B, contract_dims);
 
-      Col2im<T>(col_buffer_data, in_depth, input_rows, input_cols, filter_rows,
-                filter_cols, pad_top, pad_left, pad_bottom, pad_right, stride,
-                stride, input_backprop_data);
+        Col2im<T>(col_buffer_data, in_depth, input_rows, input_cols,
+                  filter_rows, filter_cols, pad_top, pad_left, pad_bottom,
+                  pad_right, stride, stride, input_backprop_data);
 
-      input_backprop_data += input_offset;
+        input_backprop_data += input_offset;
+      }
+    } else {
+      typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic,
+                                       Eigen::RowMajor>> MatrixMap;
+      typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic,
+                                             Eigen::RowMajor>> ConstMatrixMap;
+
+      for (int image_id = 0; image_id < batch; image_id += shard_size) {
+        const int shard_limit = std::min(static_cast<int>(shard_size),
+                                         static_cast<int>(batch) - image_id);
+
+        auto shard = [&in_depth, &input_rows, &input_cols, &filter_rows,
+                      &filter_cols, &pad_top, &pad_left, &pad_bottom,
+                      &pad_right, &stride, &output_image_size,
+                      &filter_total_size, &out_depth, &input_backprop_data,
+                      &col_buffer_data, &out_backprop_data, &filter_data,
+                      &input_offset, &output_offset,
+                      &size_C](int64 start, int64 limit) {
+          for (int shard_id = start; shard_id < limit; ++shard_id) {
+            T* im2col_buf = col_buffer_data + shard_id * size_C;
+            T* input_data = input_backprop_data + shard_id * input_offset;
+            const T* out_data = out_backprop_data + shard_id * output_offset;
+
+            // Compute gradient into 'im2col_buf'.
+            MatrixMap C(im2col_buf, output_image_size, filter_total_size);
+
+            ConstMatrixMap A(out_data, output_image_size, out_depth);
+            ConstMatrixMap B(filter_data, filter_total_size, out_depth);
+
+            C.noalias() = A * B.transpose();
+
+            Col2im<T>(im2col_buf, in_depth, input_rows, input_cols, filter_rows,
+                      filter_cols, pad_top, pad_left, pad_bottom, pad_right,
+                      stride, stride, input_data);
+          }
+        };
+        Shard(worker_threads.num_threads, worker_threads.workers, shard_limit,
+              work_unit_size, shard);
+
+        input_backprop_data += input_offset * shard_limit;
+        out_backprop_data += output_offset * shard_limit;
+      }
     }
   }
 
@@ -620,8 +704,8 @@ class Conv2DCustomBackpropFilterOp : public OpKernel {
                     &pad_left, &pad_bottom, &pad_right, &stride, &input_offset,
                     &size_A](int64 start, int64 limit) {
         for (int shard_id = start; shard_id < limit; ++shard_id) {
-          auto input_data_shard = input_data + shard_id * input_offset;
-          auto col_data_shard = col_buffer_data + shard_id * size_A;
+          const T* input_data_shard = input_data + shard_id * input_offset;
+          T* col_data_shard = col_buffer_data + shard_id * size_A;
 
           // When we compute the gradient with respect to the filters, we need
           // to do im2col to allow gemm-type computation.
Original file line number	Diff line number	Diff line change
`@@ -37,7 +37,7 @@ __global__ void BiasOpCustomKernel(int nthreads, const T* input, const T* bias,`
`37`	`37`	`T* output) {`
`38`	`38`	`CUDA_1D_KERNEL_LOOP(index, nthreads) {`
`39`	`39`	`int bias_offset = index % bias_size;`
`40`		`- output[index] = __ldg(input + index) + __ldg(bias + bias_offset);`
	`40`	`+ output[index] = ldg(input + index) + ldg(bias + bias_offset);`
`41`	`41`	`}`
`42`	`42`	`}`
`43`	`43`
Original file line number	Diff line number	Diff line change
`@@ -42,9 +42,10 @@ struct scalar_const_op {`
`42`	`42`	`return *val;`
`43`	`43`	`}`
`44`	`44`
`45`		`- template <typename Index>`
`46`		`- EIGEN_STRONG_INLINE const Packet packetOp(Index, Index = 0) const {`
`47`		`- return internal::pset1<Packet>(*val);`
	`45`	`+ template <typename Index, typename PacketType = Packet>`
	`46`	`+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType`
	`47`	`+ packetOp(Index, Index = 0) const {`
	`48`	`+ return internal::pset1<PacketType>(*val);`
`48`	`49`	`}`
`49`	`50`	`};`
`50`	`51`