PaddlePaddle · changeyoung98 · Jul 9, 2025 · Jul 10, 2025 · Jul 10, 2025 · Jul 10, 2025
diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_result.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_result.cc
@@ -216,6 +216,8 @@ OP_SAME_OPERANDS_AND_RESULT(IndexElementwisePut)
 OP_SAME_OPERANDS_AND_RESULT(IndexElementwisePut_)
 OP_SAME_OPERANDS_AND_RESULT(IndexElementwisePutWithTensor)
 OP_SAME_OPERANDS_AND_RESULT(IndexElementwisePutWithTensor_)
+OP_SAME_OPERANDS_AND_RESULT(MaskedFillElementwise)   // out dims == x dims
+OP_SAME_OPERANDS_AND_RESULT(MaskedFillElementwise_)  // presumed in-place op
 
 bool ScaleOpInferSymbolicShape(pir::Operation *op,
                                pir::InferSymbolicShapeContext *infer_context) {

diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_result.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_result.h
@@ -213,6 +213,8 @@ OP_DECLARE_INFER_SYMBOLIC_SHAPE(IndexElementwisePut)
 OP_DECLARE_INFER_SYMBOLIC_SHAPE(IndexElementwisePut_)
 OP_DECLARE_INFER_SYMBOLIC_SHAPE(IndexElementwisePutWithTensor)
 OP_DECLARE_INFER_SYMBOLIC_SHAPE(IndexElementwisePutWithTensor_)
+OP_DECLARE_INFER_SYMBOLIC_SHAPE(MaskedFillElementwise)   // matched in the .cc
+OP_DECLARE_INFER_SYMBOLIC_SHAPE(MaskedFillElementwise_)  // matched in the .cc
 
 }  // namespace paddle::dialect
 

diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc
@@ -2096,14 +2096,36 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self,
       }
     } else {
       paddle::Tensor mask_tensor;
-      if (!out_is_view &&
-          MaskedFillValueDispatching(
+      if (MaskedFillValueDispatching(
               transed_sub_tensor, transed_index, &mask_tensor)) {
-        masked_fill_shortcut = true;
-        paddle::Tensor value_tmp_tensor =
-            full_ad_func({1}, values[0], tensor.dtype(), tensor.place());
-        transed_sub_tensor = masked_fill__ad_func(
-            transed_sub_tensor, mask_tensor, value_tmp_tensor);
+        // Both branches below perform the assignment as a masked fill.
+        masked_fill_shortcut = true;
+        if (out_is_view) {
+          // Fill through the base tensor: the mask is expanded to the
+          // sub-tensor's shape and the sub-tensor is addressed by its dims,
+          // strides and byte distance from the start of `tensor`'s data.
+          mask_tensor = expand_inplace(transed_sub_tensor, mask_tensor);
+          const char* const base_ptr = reinterpret_cast<char*>(tensor.data());
+          const char* const view_ptr =
+              reinterpret_cast<char*>(transed_sub_tensor.data());
+          const int64_t view_byte_offset =
+              static_cast<int64_t>(view_ptr - base_ptr);
+          transed_sub_tensor = masked_fill_elementwise__ad_func(
+              tensor,
+              mask_tensor,
+              values[0],
+              common::vectorize<int64_t>(transed_sub_tensor.dims()),
+              common::vectorize<int64_t>(transed_sub_tensor.strides()),
+              view_byte_offset);
+          // NOTE(review): presumably cleared because the write already went
+          // through `tensor` itself — confirm against the code that follows.
+          out_is_view = false;
+        } else {
+          paddle::Tensor value_tmp_tensor =
+              full_ad_func({1}, values[0], tensor.dtype(), tensor.place());
+          transed_sub_tensor = masked_fill__ad_func(
+              transed_sub_tensor, mask_tensor, value_tmp_tensor);
+        }
       }
     }
 

diff --git a/paddle/fluid/pybind/slice_utils.h b/paddle/fluid/pybind/slice_utils.h
@@ -72,6 +72,23 @@ static inline common::DDim infer_size_symdimvector(common::DDim a,
   return expandedSizes;
 }
 
+// Returns `to_expand` expanded to the dims of `tensor`.
+//  - equal dims: `to_expand` is returned as is;
+//  - equal leading extent: expanded directly;
+//  - otherwise: squeeze_ad_func(..., {-1}) is applied first (presumably
+//    dropping the last axis), then the result is expanded.
+static inline paddle::Tensor expand_inplace(paddle::Tensor tensor,
+                                            paddle::Tensor to_expand) {
+  const auto& target_dims = tensor.dims();
+  if (target_dims == to_expand.dims()) {
+    return to_expand;
+  }
+  if (target_dims[0] != to_expand.dims()[0]) {
+    to_expand = squeeze_ad_func(to_expand, {-1});
+  }
+  return expand_ad_func(to_expand, common::vectorize<int64_t>(target_dims));
+}
+
 static inline std::vector<paddle::Tensor> expandTensors(
     std::vector<paddle::Tensor> indices) {
   // expands bool to int tensors;

diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc
@@ -2959,6 +2959,17 @@ void MaskedFillInferMeta(const MetaTensor& x,
   out->set_dtype(x.dtype());
 }
 
+void MaskedFillElementwiseInferMeta(const MetaTensor& x,
+                                    const MetaTensor& mask,
+                                    const Scalar& value,
+                                    const std::vector<int64_t>& input_dims,
+                                    const std::vector<int64_t>& input_strides,
+                                    const int64_t slice_offset,
+                                    MetaTensor* out) {
+  out->set_dims(x.dims());    // out takes the full dims of x
+  out->set_dtype(x.dtype());  // and its dtype; no other argument is read
+}
+
 void MatmulInferMeta(const MetaTensor& x,
                      const MetaTensor& y,
                      bool trans_x,

diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h
@@ -565,6 +565,14 @@ void MaskedFillInferMeta(const MetaTensor& x,
                          const MetaTensor& value,
                          MetaTensor* out);
 
+void MaskedFillElementwiseInferMeta(const MetaTensor& x,
+                                    const MetaTensor& mask,
+                                    const Scalar& value,
+                                    const std::vector<int64_t>& input_dims,
+                                    const std::vector<int64_t>& input_strides,
+                                    const int64_t slice_offset,
+                                    MetaTensor* out);  // gets x's dims/dtype
+
 void MatmulInferMeta(const MetaTensor& x,
                      const MetaTensor& y,
                      bool trans_x,

diff --git a/paddle/phi/kernels/cpu/masked_fill_elementwise_grad_kernel.cc b/paddle/phi/kernels/cpu/masked_fill_elementwise_grad_kernel.cc
@@ -0,0 +1,138 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <array>
+#include <cstdint>
+#include <vector>
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/full_kernel.h"
+#include "paddle/phi/kernels/funcs/index_elementwise.h"
+#include "paddle/phi/kernels/funcs/stride_utils.h"
+#include "paddle/phi/kernels/masked_fill_grad_kernel.h"
+
+namespace phi {
+
+// Zeroes the masked elements of a strided slice inside `x_grad`.
+//
+// The slice is described by `input_dims` / `input_strides` and begins
+// `slice_offset` bytes after the start of `x_grad`'s buffer. `x` is consulted
+// only for its dtype; `dev_ctx` and `out_grad` are not read. `x_grad` must be
+// non-null and allocated before the call.
+template <typename T>
+void CPUMaskedFillElementwiseGrad(const phi::CPUContext& dev_ctx,
+                                  const DenseTensor& x,
+                                  const DenseTensor& mask,
+                                  const DenseTensor& out_grad,
+                                  const std::vector<int64_t>& input_dims,
+                                  const std::vector<int64_t>& input_strides,
+                                  const int64_t slice_offset,
+                                  DenseTensor* x_grad) {
+  const bool* mask_data = mask.data<bool>();
+  char* const grad_bytes = reinterpret_cast<char*>(x_grad->data<T>());
+
+  int64_t numel = 0;
+  std::array<int64_t*, 3> strides_array;
+  std::vector<int64_t> desired_shape;
+  std::array<std::vector<int64_t>, 3> strides_vec;
+  // The second operand slot is a placeholder: it gets empty dims/strides and
+  // its offset is never read below, so the element size 4 looks arbitrary.
+  // NOTE(review): confirm IndexPutStride does not otherwise depend on it.
+  funcs::IndexPutStride<3>(input_dims,
+                           input_strides,
+                           phi::SizeOf(x.dtype()),
+                           std::vector<int64_t>(),
+                           std::vector<int64_t>(),
+                           4,
+                           common::vectorize<int64_t>(mask.dims()),
+                           common::vectorize<int64_t>(mask.strides()),
+                           phi::SizeOf(mask.dtype()),
+                           &desired_shape,
+                           &strides_array,
+                           &numel,
+                           strides_vec);
+  auto offset_calc =
+      funcs::CPUmake_offset_calculator_put<3>(desired_shape, strides_array);
+
+  // NOTE(review): the mask is read with the flat element index, which assumes
+  // it is contiguous and holds `numel` entries — confirm against the caller.
+  for (int64_t idx = 0; idx < numel; ++idx) {
+    if (!mask_data[idx]) {
+      continue;
+    }
+    const auto offsets = offset_calc.cpu_get(idx);
+    *reinterpret_cast<T*>(grad_bytes + offsets[0] + slice_offset) = T{0};
+  }
+}
+
+// CPU backward kernel of masked_fill_elementwise.
+//
+// `x_grad` receives a copy of `out_grad` in which the slice positions selected
+// by `mask` are set to zero. `value` does not influence the result.
+template <typename T, typename Context>
+void MaskedFillElementwiseGradKernel(const Context& dev_ctx,
+                                     const DenseTensor& x,
+                                     const DenseTensor& mask,
+                                     const DenseTensor& out_grad,
+                                     const Scalar& value UNUSED,
+                                     const std::vector<int64_t>& input_dims,
+                                     const std::vector<int64_t>& input_strides,
+                                     const int64_t slice_offset,
+                                     DenseTensor* x_grad) {
+  // No gradient requested: there is nothing to write, and the helper below
+  // would dereference the null pointer.
+  if (x_grad == nullptr) {
+    return;
+  }
+
+  dev_ctx.template Alloc<T>(x_grad);
+  phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad);
+
+  // With an empty gradient or an empty mask no position can have been filled,
+  // so the plain copy above is already the final answer.
+  if (out_grad.numel() == 0 || mask.numel() == 0) {
+    return;
+  }
+
+  CPUMaskedFillElementwiseGrad<T>(dev_ctx,
+                                  x,
+                                  mask,
+                                  out_grad,
+                                  input_dims,
+                                  input_strides,
+                                  slice_offset,
+                                  x_grad);
+}
+}  // namespace phi
+
+PD_REGISTER_KERNEL(masked_fill_elementwise_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::MaskedFillElementwiseGradKernel,
+                   bool,
+                   float,
+                   double,
+                   int,
+                   int8_t,
+                   int64_t,
+                   int16_t,
+                   uint8_t,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {
+  // Input #1 is always read as bool, whatever the kernel's element type is.
+  kernel->InputAt(1).SetDataType(phi::DataType::BOOL);
+}
diff --git a/paddle/phi/kernels/cpu/masked_fill_elementwise_kernel.cc b/paddle/phi/kernels/cpu/masked_fill_elementwise_kernel.cc
@@ -0,0 +1,142 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <array>
+#include <cstdint>
+#include <vector>
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/index_elementwise.h"
+#include "paddle/phi/kernels/funcs/stride_utils.h"
+#include "paddle/phi/kernels/masked_fill_kernel.h"
+
+namespace phi {
+
+// Allocates `out` and makes sure it holds the contents of `x`.
+//
+// The copy is skipped when `out` was already initialized on the same place as
+// `x` before the call. NOTE(review): presumably that is the in-place case in
+// which `out` shares x's buffer — confirm.
+template <typename T, typename Context>
+void PrepareMaskedFillElementwiseOutput(const Context& dev_ctx,
+                                        const DenseTensor& x,
+                                        DenseTensor* out) {
+  const bool needs_copy =
+      !out->initialized() || !(x.place() == out->place());
+  dev_ctx.template Alloc<T>(out);
+  if (needs_copy) {
+    phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
+  }
+}
+
+// Writes `value` into the masked elements of a strided slice of `output`.
+//
+// `output` is first made to hold `x`. The slice is described by `input_dims` /
+// `input_strides` and begins `slice_offset` bytes after the start of
+// `output`'s buffer; an element is overwritten when its mask entry is true.
+template <typename T>
+void CPUMaskedFillElementwise(const phi::CPUContext& dev_ctx,
+                              const DenseTensor& x,
+                              const DenseTensor& mask,
+                              const Scalar& value,
+                              const std::vector<int64_t>& input_dims,
+                              const std::vector<int64_t>& input_strides,
+                              const int64_t slice_offset,
+                              DenseTensor* output) {
+  const bool* mask_data = mask.data<bool>();
+  PrepareMaskedFillElementwiseOutput<T>(dev_ctx, x, output);
+  char* const out_bytes = reinterpret_cast<char*>(output->data<T>());
+  const T fill_value = value.to<T>();
+
+  int64_t numel = 0;
+  std::array<int64_t*, 3> strides_array;
+  std::vector<int64_t> desired_shape;
+  std::array<std::vector<int64_t>, 3> strides_vec;
+  funcs::IndexPutStride<3>(input_dims,
+                           input_strides,
+                           phi::SizeOf(x.dtype()),
+                           std::vector<int64_t>(),
+                           std::vector<int64_t>(),
+                           phi::SizeOf(value.dtype()),
+                           common::vectorize<int64_t>(mask.dims()),
+                           common::vectorize<int64_t>(mask.strides()),
+                           phi::SizeOf(mask.dtype()),
+                           &desired_shape,
+                           &strides_array,
+                           &numel,
+                           strides_vec);
+  auto offset_calc =
+      funcs::CPUmake_offset_calculator_put<3>(desired_shape, strides_array);
+
+  // NOTE(review): the mask is read with the flat element index, which assumes
+  // it is contiguous and holds `numel` entries — confirm against the caller.
+  for (int64_t idx = 0; idx < numel; ++idx) {
+    if (!mask_data[idx]) {
+      continue;
+    }
+    const auto offsets = offset_calc.cpu_get(idx);
+    *reinterpret_cast<T*>(out_bytes + offsets[0] + slice_offset) = fill_value;
+  }
+}
+
+// CPU forward kernel of masked_fill_elementwise.
+//
+// `out` ends up as `x` with the masked elements of the described slice set to
+// `value`. Empty inputs are handled without running the fill loop.
+template <typename T, typename Context>
+void MaskedFillElementwiseKernel(const Context& dev_ctx,
+                                 const DenseTensor& x,
+                                 const DenseTensor& mask,
+                                 const Scalar& value,
+                                 const std::vector<int64_t>& input_dims,
+                                 const std::vector<int64_t>& input_strides,
+                                 const int64_t slice_offset,
+                                 DenseTensor* out) {
+  if (x.numel() == 0) {
+    dev_ctx.template Alloc<T>(out);
+    return;
+  }
+  if (mask.numel() == 0) {
+    // Nothing is selected, so the result is simply x. A bare Alloc would leave
+    // `out` uninitialized whenever it does not already hold x.
+    PrepareMaskedFillElementwiseOutput<T>(dev_ctx, x, out);
+    return;
+  }
+
+  CPUMaskedFillElementwise<T>(
+      dev_ctx, x, mask, value, input_dims, input_strides, slice_offset, out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(masked_fill_elementwise,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::MaskedFillElementwiseKernel,
+                   bool,
+                   float,
+                   double,
+                   int,
+                   int8_t,
+                   int64_t,
+                   int16_t,
+                   uint8_t,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {
+  // Input #1 is always read as bool, whatever the kernel's element type is.
+  kernel->InputAt(1).SetDataType(phi::DataType::BOOL);
+}