Commit 09cc69f
huangruiteng committed on 2025.5.9
1 parent ec816ba commit 09cc69f

File tree: 6 files changed, +200 -52 lines

Notes/snippets/code-reading-pytorch-compile-reduce.log

Whitespace-only changes.

Notes/snippets/code-reading-pytorch-compile.py

Lines changed: 2 additions & 1 deletion
tutorial: https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html
API: https://docs.pytorch.org/docs/stable/generated/torch.compile.html#torch.compile

### depyf: Debugging Tool

Notes/snippets/gpu-ops/code-reading-pytorch-gpu-op.cc

Lines changed: 53 additions & 1 deletion
*** reduce_ops

- Implementation is accumulator- and reduction-op agnostic
- TensorIterator iterates over the tensor elements
- ReduceConfig: holds the kernel launch parameters (block size, number of threads, grid, etc.); it is set in setReduceConfig
- reduce_kernel is where the reduction gets launched
- Reduction strategies: thread level, block level (x, y), or global reduce (see the sketch after this list)
- Vectorization: over input and/or output
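These levels compose as in the following minimal CUDA sketch (my illustration, not PyTorch's actual Reduce.cuh kernel; the kernel name and the 256-thread block size are assumptions):

__global__ void sum_reduce_sketch(const float* in, float* out, int n) {
  __shared__ float smem[256];  // assumes blockDim.x == 256
  int tid = threadIdx.x;
  int idx = blockIdx.x * blockDim.x + tid;

  // Thread level: each thread folds a grid-strided slice into one value.
  float acc = 0.0f;
  for (int i = idx; i < n; i += gridDim.x * blockDim.x) acc += in[i];
  smem[tid] = acc;
  __syncthreads();

  // Block level: tree reduction over shared memory.
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (tid < s) smem[tid] += smem[tid + s];
    __syncthreads();
  }

  // Global reduce: one atomic per block combines the partial sums.
  if (tid == 0) atomicAdd(out, smem[0]);
}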

* Implementation of min:
aten/src/ATen/native/cuda/ReduceOps.cpp
->
aten/src/ATen/native/cuda/ReduceMinValuesKernel.cu
->
aten/src/ATen/native/cuda/Reduce.cuh:

struct ReduceOp
auto config = ReduceConfig(sizeof(arg_t), num_outputs, inputs_per_output);

template<int max_threads, typename R>
static void launch_reduce_kernel(const ReduceConfig& config, const R& reduction) {
  dim3 block = config.block();
  dim3 grid = config.grid();

  auto stream = at::cuda::getCurrentCUDAStream();
  int shared_memory = config.shared_memory_size();

  // Dispatch on output vector size: each case instantiates the kernel with
  // the vectorization width fixed at compile time.
  switch(config.output_vec_size) {
    case 4:
      reduce_kernel<max_threads / 4, 4, R><<<grid, block, shared_memory, stream>>>(reduction);
      C10_CUDA_KERNEL_LAUNCH_CHECK();
      break;
    case 2:
      reduce_kernel<max_threads / 2, 2, R><<<grid, block, shared_memory, stream>>>(reduction);
      C10_CUDA_KERNEL_LAUNCH_CHECK();
      break;
    default:
      reduce_kernel<max_threads / 1, 1, R><<<grid, block, shared_memory, stream>>>(reduction);
      C10_CUDA_KERNEL_LAUNCH_CHECK();
  }
}
gpu_reduce_kernel
- can_accumulate_in_output

// at::Half/at::ComplexHalf overflows easily as its range is very small.
// So when scalar_t and out_scalar_t are at::Half/at::ComplexHalf, we
// set can_accumulate_in_output to False.
static constexpr bool is_inp_out_type_half_or_chalf =
    (std::is_same_v<at::Half, scalar_t> &&
     std::is_same_v<at::Half, out_scalar_t>) ||
    (std::is_same_v<c10::complex<Half>, scalar_t> &&
     std::is_same_v<c10::complex<Half>, out_scalar_t>);
// at::BFloat16 has lower precision and can lead to rounding errors.
// So when scalar_t and out_scalar_t are at::BFloat16, we
// set can_accumulate_in_output to False.
static constexpr bool is_inp_out_type_bfloat16 =
    (std::is_same_v<at::BFloat16, scalar_t> &&
     std::is_same_v<at::BFloat16, out_scalar_t>);
static constexpr bool can_accumulate_in_output =
    std::is_convertible_v<arg_t, out_scalar_t> &&
    !(is_inp_out_type_half_or_chalf || is_inp_out_type_bfloat16);
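A standalone demo of why accumulating in the output type is disabled for half (my illustration, not PyTorch code; needs nvcc with -arch=sm_53 or newer for half arithmetic): once a half accumulator reaches 2048, adding 1.0 rounds back to 2048, while a float accumulator (the arg_t role) stays exact.

#include <cstdio>
#include <cuda_fp16.h>

__global__ void half_accum_demo() {
  __half h_acc = __float2half(0.0f);
  float  f_acc = 0.0f;
  for (int i = 0; i < 4096; ++i) {
    h_acc = __hadd(h_acc, __float2half(1.0f));  // accumulate in half
    f_acc += 1.0f;                              // accumulate in float
  }
  // Prints "half: 2048.000000  float: 4096.000000": the half accumulator
  // gets stuck once half's value spacing exceeds 1.
  printf("half: %f  float: %f\n", __half2float(h_acc), f_acc);
}

int main() {
  half_accum_demo<<<1, 1>>>();
  cudaDeviceSynchronize();
}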

*** _foreach_add_

Notes/snippets/gpu-ops/code-reading-triton.py

Lines changed: 25 additions & 0 deletions
- usage tests in test_core

# reduce op

lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp

struct ReduceOpConversion
    : public ConvertTritonGPUReduceScanToLLVMPattern<triton::ReduceOp>

LogicalResult
matchAndRewrite(triton::ReduceOp op, OpAdaptor adaptor,
                ConversionPatternRewriter &rewriter) const override {

  // First reduce all the values along axis within each thread.
  reduceWithinThreads(helper, srcValues, accs, indices, rewriter);

  // Then reduce across threads within a warp.
  reduceWithinWarps(helper, accs, rewriter);

  // Each warp writes its partial result to shared memory.
  storeWarpReduceToSharedMemory(helper, accs, indices, smemBases, rewriter);

  // Combine the per-warp partials held in shared memory.
  accumulatePartialReductions(helper, smemBases, rewriter);

  // Load the final values and pack them into the op's results.
  loadReductionAndPackResult(helper, smemShape, smemBases, rewriter);
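For reference, the within-warp step in plain CUDA terms (a sketch of the general technique, not Triton's emitted LLVM): an XOR-butterfly shuffle halves the exchange distance each round, so after log2(32) = 5 rounds every lane of the warp holds the full sum.

__device__ float warp_reduce_sum(float v) {
  // Each round, every lane exchanges with its partner lane (lane ^ offset).
  for (int offset = 16; offset > 0; offset >>= 1)
    v += __shfl_xor_sync(0xffffffff, v, offset);
  return v;  // every lane now holds the warp-wide sum
}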
Lines changed: 12 additions & 0 deletions
*** template specialization

// template for changing MAX_NUM_THREADS based on op dtype
template <typename T>
struct mnt_wrapper {
  static constexpr int MAX_NUM_THREADS = 512;
};

template <>
struct mnt_wrapper<c10::complex<double>> {
  static constexpr int MAX_NUM_THREADS = 256;
};
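A usage sketch building on the mnt_wrapper snippet above (pick_block_size is my hypothetical helper, not PyTorch code; c10::complex comes from PyTorch headers): the specialization lets launch code pick a smaller block for the register-heavy complex<double> case, resolved entirely at compile time with no runtime branching on dtype.

#include <algorithm>

template <typename T>
int pick_block_size(int n) {  // hypothetical helper for illustration
  return std::min(n, mnt_wrapper<T>::MAX_NUM_THREADS);
}

// pick_block_size<float>(1024)                -> 512
// pick_block_size<c10::complex<double>>(1024) -> 256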
