| 
14 | 14 | 
 
  | 
15 | 15 | *** reduce_ops  | 
16 | 16 | 
 
  | 
 | 17 | +- Implementation is accumulator and reduction op agnostic  | 
 | 18 | +- TensorIterator to iterate over tensor elements  | 
 | 19 | +- ReduceConfig: Holds kernel launch parameters such as block size, number of threads, grid, etc.; it's set in setReduceConfig  | 
 | 20 | +- reduce_kernel is where the reduction actually gets launched  | 
 | 21 | +- Reduction strategies: thread-level, block-level (along x and/or y), or global reduce  | 
 | 22 | +- Vectorization: Over input and/or output  | 
 | 23 | + | 
 | 24 | + | 
17 | 25 | * Implementation of min (min的实现):  | 
18 | 26 | aten/src/ATen/native/cuda/ReduceOps.cpp  | 
19 | 27 | ->  | 
20 | 28 | aten/src/ATen/native/cuda/ReduceMinValuesKernel.cu  | 
21 | 29 | ->  | 
22 | 30 | aten/src/ATen/native/cuda/Reduce.cuh:   | 
23 | 31 | 
 
  | 
24 |  | -gpu_reduce_kernel  | 
25 | 32 | struct ReduceOp  | 
26 | 33 | 
 
  | 
 | 34 | +auto config = ReduceConfig(sizeof(arg_t), num_outputs, inputs_per_output);  | 
 | 35 | + | 
 | 36 | +template<int max_threads, typename R>  | 
 | 37 | +static void launch_reduce_kernel(const ReduceConfig& config, const R& reduction) {  | 
 | 38 | +  dim3 block = config.block();  | 
 | 39 | +  dim3 grid = config.grid();  | 
 | 40 | + | 
 | 41 | +  auto stream = at::cuda::getCurrentCUDAStream();  | 
 | 42 | +  int shared_memory = config.shared_memory_size();  | 
 | 43 | + | 
 | 44 | +  switch(config.output_vec_size) {  | 
 | 45 | +  case 4:  | 
 | 46 | +    reduce_kernel<max_threads / 4, 4, R><<<grid, block, shared_memory, stream>>>(reduction);  | 
 | 47 | +    C10_CUDA_KERNEL_LAUNCH_CHECK();  | 
 | 48 | +    break;  | 
 | 49 | +  case 2:  | 
 | 50 | +    reduce_kernel<max_threads / 2, 2, R><<<grid, block, shared_memory, stream>>>(reduction);  | 
 | 51 | +    C10_CUDA_KERNEL_LAUNCH_CHECK();  | 
 | 52 | +    break;  | 
 | 53 | +  default:  | 
 | 54 | +    reduce_kernel<max_threads / 1, 1, R><<<grid, block, shared_memory, stream>>>(reduction);  | 
 | 55 | +    C10_CUDA_KERNEL_LAUNCH_CHECK();  | 
 | 56 | +  }  | 
 | 57 | +}  | 
 | 58 | + | 
 | 59 | +gpu_reduce_kernel  | 
 | 60 | +  - can_accumulate_in_output  | 
 | 62 | +  // at::Half/at::ComplexHalf overflows easily as its range is very small.  | 
 | 62 | +  // So when scalar_t and out_scalar_t are at::Half/at::ComplexHalf, we  | 
 | 63 | +  // set can_accumulate_in_output to False.  | 
 | 64 | +  static constexpr bool is_inp_out_type_half_or_chalf =  | 
 | 65 | +      (std::is_same_v<at::Half, scalar_t> &&  | 
 | 66 | +       std::is_same_v<at::Half, out_scalar_t>) ||  | 
 | 67 | +      (std::is_same_v<c10::complex<Half>, scalar_t> &&  | 
 | 68 | +       std::is_same_v<c10::complex<Half>, out_scalar_t>);  | 
 | 69 | +  // at::BFloat16 has lower precision and can lead to rounding errors.  | 
 | 70 | +  // So when scalar_t and out_scalar_t are at::BFloat16, we  | 
 | 71 | +  // set can_accumulate_in_output to False.  | 
 | 72 | +  static constexpr bool is_inp_out_type_bfloat16 =  | 
 | 73 | +      (std::is_same_v<at::BFloat16, scalar_t> &&  | 
 | 74 | +       std::is_same_v<at::BFloat16, out_scalar_t>);  | 
 | 75 | +  static constexpr bool can_accumulate_in_output =  | 
 | 76 | +      std::is_convertible_v<arg_t, out_scalar_t> &&  | 
 | 77 | +      !(is_inp_out_type_half_or_chalf || is_inp_out_type_bfloat16);  | 
 | 78 | + | 
27 | 79 | 
 
  | 
28 | 80 | 
 
  | 
29 | 81 | *** _foreach_add_  | 
 | 
0 commit comments