17 changes: 17 additions & 0 deletions paddle/fluid/pir/drr/src/ir_operation_factory.cc
@@ -448,6 +448,11 @@ void OperationFactory::RegisterManualOpCreator() {
common::errors::InvalidArgument(
"'paddings' Attribute is expected for "
"Pool2dOp. "));
PADDLE_ENFORCE_EQ(attrs.find("dilations") != attrs.end(),
true,
common::errors::InvalidArgument(
"'dilations' Attribute is expected for "
"Pool2dOp. "));
std::vector<int64_t> paddings;
for (size_t i = 0;
i < attrs.at("paddings").dyn_cast<pir::ArrayAttribute>().size();
@@ -459,6 +464,17 @@ void OperationFactory::RegisterManualOpCreator() {
.data());
}

std::vector<int64_t> dilations;
for (size_t i = 0;
i < attrs.at("dilations").dyn_cast<pir::ArrayAttribute>().size();
i++) {
dilations.push_back(attrs.at("dilations")
.dyn_cast<pir::ArrayAttribute>()
.at(i)
.dyn_cast<pir::Int64Attribute>()
.data());
}
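
The paddings and dilations loops above repeat the same unpack pattern. As a sketch, a hypothetical helper (not part of this PR) that factors it out, using only the pir accessors already present in this diff:

```cpp
// Hypothetical helper (not part of this PR); assumes the headers already
// included by ir_operation_factory.cc. Unpacks a pir::ArrayAttribute of
// pir::Int64Attribute into a std::vector<int64_t>, mirroring the loops above.
std::vector<int64_t> UnpackInt64Array(const pir::Attribute& attr) {
  auto array = attr.dyn_cast<pir::ArrayAttribute>();
  std::vector<int64_t> result;
  result.reserve(array.size());
  for (size_t i = 0; i < array.size(); ++i) {
    result.push_back(array.at(i).dyn_cast<pir::Int64Attribute>().data());
  }
  return result;
}
```

With such a helper, the loops above would reduce to `paddings = UnpackInt64Array(attrs.at("paddings"));` and likewise for dilations.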

PADDLE_ENFORCE_EQ(attrs.find("ceil_mode") != attrs.end(),
true,
common::errors::InvalidArgument(
@@ -554,6 +570,7 @@ void OperationFactory::RegisterManualOpCreator() {
inputs[1],
strides,
paddings,
dilations,
ceil_mode,
exclusive,
data_format,
188 changes: 158 additions & 30 deletions paddle/phi/kernels/funcs/pooling.cc
@@ -34,6 +34,7 @@ class Pool2dFunctor<CPUContext, PoolProcess, T> {
const std::vector<int64_t>& ksize,
const std::vector<int64_t>& strides,
const std::vector<int64_t>& paddings,
const std::vector<int64_t>& dilations,
@zhwesky2010 (Contributor) commented on Nov 24, 2025:

infermeta and infer_symbolic have not been updated; the unit tests will likely not pass.

The PR author (Contributor Author) replied:

Hi, the newly added dilations attribute does not affect the output shape, so the existing infermeta / infer_symbolic can be used as-is.
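
A sketch of the shape reasoning behind that reply, assuming the standard non-ceil-mode pool2d output formula: because the kernels below clamp the dilated window back inside the input (rather than enlarging the effective kernel), the output extent keeps its existing form with no dilation term.

```latex
% Assumed standard pool2d output height (ceil_mode = false, non-adaptive);
% the absence of a dilation term is consistent with the reply that
% infermeta / infer_symbolic can stay unchanged.
H_{out} = \left\lfloor \frac{H_{in} + 2\,pad_h - ksize_h}{stride_h} \right\rfloor + 1
```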

const std::string data_format,
bool exclusive,
bool adaptive,
@@ -65,6 +66,9 @@ class Pool2dFunctor<CPUContext, PoolProcess, T> {
const int64_t padding_height = paddings[0];
const int64_t padding_width = paddings[1];

const int64_t dilation_height = dilations[0];
const int64_t dilation_width = dilations[1];

const T* input_data = input.data<T>();
T* output_data = context.template Alloc<T>(output);

@@ -94,15 +98,19 @@ class Pool2dFunctor<CPUContext, PoolProcess, T> {
std::min(wstart + ksize_width, input_width + padding_width);
pool_size = (hend - hstart) * (wend - wstart);

wstart = std::max(wstart, static_cast<int64_t>(0));
hstart = std::max(hstart, static_cast<int64_t>(0));
hend = std::min(hend, input_height);
wend = std::min(wend, input_width);
hend = hstart + (ksize_height - 1) * dilation_height + 1;
wend = wstart + (ksize_width - 1) * dilation_width + 1;
while (hstart < static_cast<int64_t>(0))
hstart += dilation_height;
while (hend > input_height) hend -= dilation_height;
while (wstart < static_cast<int64_t>(0))
wstart += dilation_width;
while (wend > input_width) wend -= dilation_width;
}

T ele = pool_process.initial();
for (int64_t h = hstart; h < hend; ++h) {
for (int64_t w = wstart; w < wend; ++w) {
for (int64_t h = hstart; h < hend; h += dilation_height) {
for (int64_t w = wstart; w < wend; w += dilation_width) {
pool_process.compute(input_data[h * input_width + w], &ele);
}
}
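
To make the new window arithmetic above concrete, a minimal self-contained sketch (not part of the PR, assuming the same clamping scheme as the diff): it widens the nominal window to the dilated kernel extent, then steps the bounds back onto the dilation grid until they fit inside the input.

```cpp
#include <cstdint>
#include <cstdio>

// Widen the nominal [start, end) window to the dilated kernel extent, then
// step the bounds back onto the dilation grid until they lie in [0, input).
void ClampedDilatedWindow(int64_t p, int64_t stride, int64_t padding,
                          int64_t ksize, int64_t dilation, int64_t input_size,
                          int64_t* start, int64_t* end) {
  *start = p * stride - padding;
  *end = *start + (ksize - 1) * dilation + 1;  // dilated kernel extent
  while (*start < 0) *start += dilation;       // stay on the dilation grid
  while (*end > input_size) *end -= dilation;
}

int main() {
  // Output row ph = 0, stride 2, padding 1, ksize 3, dilation 2, input 8:
  // nominal start is -1, dilated end is 4; clamping yields [1, 4).
  int64_t hstart = 0, hend = 0;
  ClampedDilatedWindow(0, 2, 1, 3, 2, 8, &hstart, &hend);
  for (int64_t h = hstart; h < hend; h += 2) {
    std::printf("%lld ", static_cast<long long>(h));  // prints: 1 3
  }
  std::printf("\n");
  return 0;
}
```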
@@ -142,14 +150,18 @@ class Pool2dFunctor<CPUContext, PoolProcess, T> {
std::min(wstart + ksize_width, input_width + padding_width);
pool_size = (hend - hstart) * (wend - wstart);

wstart = std::max(wstart, static_cast<int64_t>(0));
hstart = std::max(hstart, static_cast<int64_t>(0));
hend = std::min(hend, input_height);
wend = std::min(wend, input_width);
hend = hstart + (ksize_height - 1) * dilation_height + 1;
wend = wstart + (ksize_width - 1) * dilation_width + 1;
while (hstart < static_cast<int64_t>(0))
hstart += dilation_height;
while (hend > input_height) hend -= dilation_height;
while (wstart < static_cast<int64_t>(0))
wstart += dilation_width;
while (wend > input_width) wend -= dilation_width;
}
T ele = pool_process.initial();
for (int64_t h = hstart; h < hend; ++h) {
for (int64_t w = wstart; w < wend; ++w) {
for (int64_t h = hstart; h < hend; h += dilation_height) {
for (int64_t w = wstart; w < wend; w += dilation_width) {
pool_process.compute(
input_data[h * input_width * input_channels +
w * input_channels + c],
@@ -189,6 +201,7 @@ class Pool2dGradFunctor<CPUContext, PoolProcess, T> {
const std::vector<int64_t>& ksize,
const std::vector<int64_t>& strides,
const std::vector<int64_t>& paddings,
const std::vector<int64_t>& dilations UNUSED,
const std::string data_format,
bool exclusive,
bool adaptive,
@@ -355,6 +368,7 @@ class MaxPool2dGradFunctor<CPUContext, T> {
const std::vector<int64_t>& ksize,
const std::vector<int64_t>& strides,
const std::vector<int64_t>& paddings,
const std::vector<int64_t>& dilations,
const std::string data_format,
DenseTensor* input_grad) {
bool channel_last = (data_format == "NHWC");
@@ -384,6 +398,9 @@ class MaxPool2dGradFunctor<CPUContext, T> {
const int64_t padding_height = paddings[0];
const int64_t padding_width = paddings[1];

const int64_t dilation_height = dilations[0];
const int64_t dilation_width = dilations[1];

const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
@@ -396,16 +413,21 @@
for (int64_t c = 0; c < output_channels; ++c) {
for (int64_t ph = 0; ph < output_height; ++ph) {
int64_t hstart = ph * stride_height - padding_height;
int64_t hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, static_cast<int64_t>(0));
int64_t hend = hstart + (ksize_height - 1) * dilation_height + 1;
while (hstart < static_cast<int64_t>(0)) hstart += dilation_height;
while (hend > input_height) hend -= dilation_height;

for (int64_t pw = 0; pw < output_width; ++pw) {
int64_t wstart = pw * stride_width - padding_width;
int64_t wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, static_cast<int64_t>(0));
int64_t wend = wstart + (ksize_width - 1) * dilation_width + 1;
while (wstart < static_cast<int64_t>(0)) wstart += dilation_width;
while (wend > input_width) wend -= dilation_width;

bool stop = false;
for (int64_t h = hstart; h < hend && !stop; ++h) {
for (int64_t w = wstart; w < wend && !stop; ++w) {
for (int64_t h = hstart; h < hend && !stop;
h += dilation_height) {
for (int64_t w = wstart; w < wend && !stop;
w += dilation_width) {
int64_t input_idx = h * input_width + w;
int64_t output_idx = ph * output_width + pw;
if (input_data[input_idx] == output_data[output_idx]) {
@@ -430,16 +452,20 @@
for (int64_t c = 0; c < output_channels; ++c) {
for (int64_t ph = 0; ph < output_height; ++ph) {
int64_t hstart = ph * stride_height - padding_height;
int64_t hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, static_cast<int64_t>(0));
int64_t hend = hstart + (ksize_height - 1) * dilation_height + 1;
while (hstart < static_cast<int64_t>(0)) hstart += dilation_height;
while (hend > input_height) hend -= dilation_height;
for (int64_t pw = 0; pw < output_width; ++pw) {
int64_t wstart = pw * stride_width - padding_width;
int64_t wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, static_cast<int64_t>(0));
int64_t wend = wstart + (ksize_width - 1) * dilation_width + 1;
while (wstart < static_cast<int64_t>(0)) wstart += dilation_width;
while (wend > input_width) wend -= dilation_width;

bool stop = false;
for (int64_t h = hstart; h < hend && !stop; ++h) {
for (int64_t w = wstart; w < wend && !stop; ++w) {
for (int64_t h = hstart; h < hend && !stop;
h += dilation_height) {
for (int64_t w = wstart; w < wend && !stop;
w += dilation_width) {
int64_t input_idx =
h * input_width * input_channels + w * input_channels + c;
int64_t output_idx = ph * output_width * output_channels +
@@ -1103,6 +1129,102 @@ template class Pool3dGradFunctor<phi::CPUContext,
template class Pool3dGradFunctor<phi::CPUContext,
AvgPoolGrad<dtype::float16>,
dtype::float16>;

// Native version, kept for now (not deleted yet)
// /*
// * All tensors are in NCHW format.
// * Ksize, strides, paddings, and dilations each have two elements. These
// * two elements represent height and width, respectively.
// */
// template <typename T1, typename T2>
// class MaxPool2dWithIndexFunctor<CPUContext, T1, T2> {
// public:
// void operator()(const CPUContext& context,
// const DenseTensor& input,
// const std::vector<int64_t>& ksize,
// const std::vector<int64_t>& strides,
// const std::vector<int64_t>& paddings,
// const std::vector<int64_t>& dilations,
// bool adaptive,
// DenseTensor* output,
// DenseTensor* mask) {
// const int64_t batch_size = input.dims()[0];
// const int64_t input_height = input.dims()[2];
// const int64_t input_width = input.dims()[3];
// const int64_t output_channels = output->dims()[1];
// const int64_t output_height = output->dims()[2];
// const int64_t output_width = output->dims()[3];
// const int64_t ksize_height = ksize[0];
// const int64_t ksize_width = ksize[1];
// const int64_t stride_height = strides[0];
// const int64_t stride_width = strides[1];
// const int64_t padding_height = paddings[0];
// const int64_t padding_width = paddings[1];
// const int64_t dilation_height = dilations[0];
// const int64_t dilation_width = dilations[1];
// const int64_t input_stride = input_height * input_width;
// const int64_t output_stride = output_height * output_width;

// const T1* input_data = input.data<T1>();
// T1* output_data = context.template Alloc<T1>(output);
// T2* mask_data = context.template Alloc<T2>(mask);

// int64_t hstart = 0, hend = 0;
// int64_t wstart = 0, wend = 0;
// for (int64_t i = 0; i < batch_size; i++) {
// for (int64_t c = 0; c < output_channels; ++c) {
// for (int64_t ph = 0; ph < output_height; ++ph) {
// if (adaptive) {
// hstart = AdaptStartIndex(ph, input_height, output_height);
// hend = AdaptEndIndex(ph, input_height, output_height);
// } else {
// hstart = ph * stride_height - padding_height;
// }
// for (int64_t pw = 0; pw < output_width; ++pw) {
// T1 ele = static_cast<T1>(-FLT_MAX);
// int64_t index = -1;
// if (adaptive) {
// wstart = AdaptStartIndex(pw, input_width, output_width);
// wend = AdaptEndIndex(pw, input_width, output_width);
// for (int64_t h = hstart; h < hend; ++h) {
// for (int64_t w = wstart; w < wend; ++w) {
// if (ele < input_data[h * input_width + w]) {
// ele = input_data[h * input_width + w];
// index = h * input_width + w;
// }
// }
// }
// } else {
// wstart = pw * stride_width - padding_width;
// for (int64_t kh = 0; kh < ksize_height; ++kh) {
// int64_t h = hstart + kh * dilation_height;
// if (h < 0 || h >= input_height)
// continue;
// for (int64_t kw = 0; kw < ksize_width; ++kw) {
// int64_t w = wstart + kw * dilation_width;
// if (w < 0 || w >= input_width)
// continue;
// if (ele < input_data[h * input_width + w]) {
// ele = input_data[h * input_width + w];
// index = h * input_width + w;
// }
// }
// }
// }
// output_data[ph * output_width + pw] = ele;
// mask_data[ph * output_width + pw] = index;
// }
// }
// // offset
// input_data += input_stride;
// output_data += output_stride;
// mask_data += output_stride;
// }
// }
// }
// };
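
The commented-out "native" version above checks every tap (kh, kw) individually, while the active implementation below clamps the [start, end) range once and then strides by the dilation. A small self-check sketch (not part of the PR), under the assumption that both traversals should visit identical in-bounds rows:

```cpp
#include <cstdint>
#include <cstdio>
#include <set>

int main() {
  const int64_t input = 7, ksize = 3;
  bool ok = true;
  for (int64_t stride = 1; stride <= 3; ++stride)
    for (int64_t padding = 0; padding <= 2; ++padding)
      for (int64_t dilation = 1; dilation <= 3; ++dilation)
        for (int64_t ph = 0; ph < 4; ++ph) {
          // Range clamping (active implementation).
          int64_t hstart = ph * stride - padding;
          int64_t hend = hstart + (ksize - 1) * dilation + 1;
          while (hstart < 0) hstart += dilation;
          while (hend > input) hend -= dilation;
          std::set<int64_t> a;
          for (int64_t h = hstart; h < hend; h += dilation) a.insert(h);
          // Per-tap bounds check ("native" version).
          std::set<int64_t> b;
          for (int64_t kh = 0; kh < ksize; ++kh) {
            int64_t h = ph * stride - padding + kh * dilation;
            if (h >= 0 && h < input) b.insert(h);
          }
          if (a != b) ok = false;
        }
  std::printf(ok ? "traversals match\n" : "mismatch\n");
  return 0;
}
```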

/*
* All tensors are in NCHW format.
* Ksize, strides, paddings are two elements. These two elements represent
@@ -1116,6 +1238,7 @@ class MaxPool2dWithIndexFunctor<CPUContext, T1, T2> {
const std::vector<int64_t>& ksize,
const std::vector<int64_t>& strides,
const std::vector<int64_t>& paddings,
const std::vector<int64_t>& dilations,
bool adaptive,
DenseTensor* output,
DenseTensor* mask) {
@@ -1131,6 +1254,8 @@ class MaxPool2dWithIndexFunctor<CPUContext, T1, T2> {
const int64_t stride_width = strides[1];
const int64_t padding_height = paddings[0];
const int64_t padding_width = paddings[1];
const int64_t dilation_height = dilations[0];
const int64_t dilation_width = dilations[1];
const int64_t input_stride = input_height * input_width;
const int64_t output_stride = output_height * output_width;

@@ -1148,23 +1273,25 @@ class MaxPool2dWithIndexFunctor<CPUContext, T1, T2> {
hend = AdaptEndIndex(ph, input_height, output_height);
} else {
hstart = ph * stride_height - padding_height;
hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, static_cast<int64_t>(0));
hend = hstart + (ksize_height - 1) * dilation_height + 1;
while (hstart < static_cast<int64_t>(0)) hstart += dilation_height;
while (hend > input_height) hend -= dilation_height;
}
for (int64_t pw = 0; pw < output_width; ++pw) {
if (adaptive) {
wstart = AdaptStartIndex(pw, input_width, output_width);
wend = AdaptEndIndex(pw, input_width, output_width);
} else {
wstart = pw * stride_width - padding_width;
wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, static_cast<int64_t>(0));
wend = wstart + (ksize_width - 1) * dilation_width + 1;
while (wstart < static_cast<int64_t>(0)) wstart += dilation_width;
while (wend > input_width) wend -= dilation_width;
}

T1 ele = static_cast<T1>(-FLT_MAX);
int64_t index = -1;
for (int64_t h = hstart; h < hend; ++h) {
for (int64_t w = wstart; w < wend; ++w) {
for (int64_t h = hstart; h < hend; h += dilation_height) {
for (int64_t w = wstart; w < wend; w += dilation_width) {
if (ele < input_data[h * input_width + w]) {
ele = input_data[h * input_width + w];
index = h * input_width + w;
@@ -1198,6 +1325,7 @@ class MaxPool2dWithIndexGradFunctor<CPUContext, T1, T2> {
const std::vector<int64_t>& ksize UNUSED,
const std::vector<int64_t>& strides UNUSED,
const std::vector<int64_t>& paddings UNUSED,
const std::vector<int64_t>& dilations UNUSED,
bool adaptive UNUSED,
DenseTensor* input_grad) {
const int64_t batch_size = input_grad->dims()[0];