Commit 09cc69f
huangruiteng committed on 2025.5.9
1 parent ec816ba commit 09cc69f

File tree: 6 files changed, +200 -52 lines

Notes/snippets/code-reading-pytorch-compile-reduce.log

Whitespace-only changes.

Notes/snippets/code-reading-pytorch-compile.py

Lines changed: 2 additions & 1 deletion
tutorial: https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html
API: https://docs.pytorch.org/docs/stable/generated/torch.compile.html#torch.compile

### depyf: Debugging Tool

Notes/snippets/gpu-ops/code-reading-pytorch-gpu-op.cc

Lines changed: 53 additions & 1 deletion
*** reduce_ops

- Implementation is accumulator- and reduction-op agnostic
- TensorIterator iterates over the tensor elements
- ReduceConfig: holds the kernel launch parameters (block size, number of threads, grid, etc.); it is set in setReduceConfig
- reduce_kernel is where the reduction gets launched
- Reduction strategies: thread level, block level (x, y), or global reduce (see the sketch after this list)
- Vectorization: over input and/or output
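These levels compose as in the following minimal CUDA sketch (my illustration, not PyTorch's actual Reduce.cuh kernel; the kernel name and the 256-thread block size are assumptions):

__global__ void sum_reduce_sketch(const float* in, float* out, int n) {
  __shared__ float smem[256];  // assumes blockDim.x == 256
  int tid = threadIdx.x;
  int idx = blockIdx.x * blockDim.x + tid;

  // Thread level: each thread folds a grid-strided slice into one value.
  float acc = 0.0f;
  for (int i = idx; i < n; i += gridDim.x * blockDim.x) acc += in[i];
  smem[tid] = acc;
  __syncthreads();

  // Block level: tree reduction over shared memory.
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (tid < s) smem[tid] += smem[tid + s];
    __syncthreads();
  }

  // Global reduce: one atomic per block combines the partial sums.
  if (tid == 0) atomicAdd(out, smem[0]);
}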

* Implementation of min:
aten/src/ATen/native/cuda/ReduceOps.cpp
->
aten/src/ATen/native/cuda/ReduceMinValuesKernel.cu
->
aten/src/ATen/native/cuda/Reduce.cuh:

struct ReduceOp
auto config = ReduceConfig(sizeof(arg_t), num_outputs, inputs_per_output);

template<int max_threads, typename R>
static void launch_reduce_kernel(const ReduceConfig& config, const R& reduction) {
  dim3 block = config.block();
  dim3 grid = config.grid();

  auto stream = at::cuda::getCurrentCUDAStream();
  int shared_memory = config.shared_memory_size();

  // Dispatch on output vector size: each case instantiates the kernel with
  // the vectorization width fixed at compile time.
  switch(config.output_vec_size) {
    case 4:
      reduce_kernel<max_threads / 4, 4, R><<<grid, block, shared_memory, stream>>>(reduction);
      C10_CUDA_KERNEL_LAUNCH_CHECK();
      break;
    case 2:
      reduce_kernel<max_threads / 2, 2, R><<<grid, block, shared_memory, stream>>>(reduction);
      C10_CUDA_KERNEL_LAUNCH_CHECK();
      break;
    default:
      reduce_kernel<max_threads / 1, 1, R><<<grid, block, shared_memory, stream>>>(reduction);
      C10_CUDA_KERNEL_LAUNCH_CHECK();
  }
}
gpu_reduce_kernel
- can_accumulate_in_output

// at::Half/at::ComplexHalf overflows easily as its range is very small.
// So when scalar_t and out_scalar_t are at::Half/at::ComplexHalf, we
// set can_accumulate_in_output to False.
static constexpr bool is_inp_out_type_half_or_chalf =
    (std::is_same_v<at::Half, scalar_t> &&
     std::is_same_v<at::Half, out_scalar_t>) ||
    (std::is_same_v<c10::complex<Half>, scalar_t> &&
     std::is_same_v<c10::complex<Half>, out_scalar_t>);
// at::BFloat16 has lower precision and can lead to rounding errors.
// So when scalar_t and out_scalar_t are at::BFloat16, we
// set can_accumulate_in_output to False.
static constexpr bool is_inp_out_type_bfloat16 =
    (std::is_same_v<at::BFloat16, scalar_t> &&
     std::is_same_v<at::BFloat16, out_scalar_t>);
static constexpr bool can_accumulate_in_output =
    std::is_convertible_v<arg_t, out_scalar_t> &&
    !(is_inp_out_type_half_or_chalf || is_inp_out_type_bfloat16);
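A standalone demo of why accumulating in the output type is disabled for half (my illustration, not PyTorch code; needs nvcc with -arch=sm_53 or newer for half arithmetic): once a half accumulator reaches 2048, adding 1.0 rounds back to 2048, while a float accumulator (the arg_t role) stays exact.

#include <cstdio>
#include <cuda_fp16.h>

__global__ void half_accum_demo() {
  __half h_acc = __float2half(0.0f);
  float  f_acc = 0.0f;
  for (int i = 0; i < 4096; ++i) {
    h_acc = __hadd(h_acc, __float2half(1.0f));  // accumulate in half
    f_acc += 1.0f;                              // accumulate in float
  }
  // Prints "half: 2048.000000  float: 4096.000000": the half accumulator
  // gets stuck once half's value spacing exceeds 1.
  printf("half: %f  float: %f\n", __half2float(h_acc), f_acc);
}

int main() {
  half_accum_demo<<<1, 1>>>();
  cudaDeviceSynchronize();
}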

*** _foreach_add_

Notes/snippets/gpu-ops/code-reading-triton.py

Lines changed: 25 additions & 0 deletions
- usage tests in test_core

# reduce op

lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp

struct ReduceOpConversion
    : public ConvertTritonGPUReduceScanToLLVMPattern<triton::ReduceOp>

LogicalResult
matchAndRewrite(triton::ReduceOp op, OpAdaptor adaptor,
                ConversionPatternRewriter &rewriter) const override {

  // First reduce all the values along axis within each thread.
  reduceWithinThreads(helper, srcValues, accs, indices, rewriter);

  // Then reduce across threads within a warp.
  reduceWithinWarps(helper, accs, rewriter);

  // Each warp writes its partial result to shared memory.
  storeWarpReduceToSharedMemory(helper, accs, indices, smemBases, rewriter);

  // Combine the per-warp partials held in shared memory.
  accumulatePartialReductions(helper, smemBases, rewriter);

  // Load the final values and pack them into the op's results.
  loadReductionAndPackResult(helper, smemShape, smemBases, rewriter);
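For reference, the within-warp step in plain CUDA terms (a sketch of the general technique, not Triton's emitted LLVM): an XOR-butterfly shuffle halves the exchange distance each round, so after log2(32) = 5 rounds every lane of the warp holds the full sum.

__device__ float warp_reduce_sum(float v) {
  // Each round, every lane exchanges with its partner lane (lane ^ offset).
  for (int offset = 16; offset > 0; offset >>= 1)
    v += __shfl_xor_sync(0xffffffff, v, offset);
  return v;  // every lane now holds the warp-wide sum
}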
Lines changed: 12 additions & 0 deletions
*** template specialization

// template for changing MAX_NUM_THREADS based on op dtype
template <typename T>
struct mnt_wrapper {
  static constexpr int MAX_NUM_THREADS = 512;
};

template <>
struct mnt_wrapper<c10::complex<double>> {
  static constexpr int MAX_NUM_THREADS = 256;
};
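A usage sketch building on the mnt_wrapper snippet above (pick_block_size is my hypothetical helper, not PyTorch code; c10::complex comes from PyTorch headers): the specialization lets launch code pick a smaller block for the register-heavy complex<double> case, resolved entirely at compile time with no runtime branching on dtype.

#include <algorithm>

template <typename T>
int pick_block_size(int n) {  // hypothetical helper for illustration
  return std::min(n, mnt_wrapper<T>::MAX_NUM_THREADS);
}

// pick_block_size<float>(1024)                -> 512
// pick_block_size<c10::complex<double>>(1024) -> 256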
