[CIR][CUDA] Support device-side printf

AdUhTkJm · AdUhTkJm · commit 408ca37e050f · 2025-03-13T15:13:34.000Z
diff --git a/clang/lib/CIR/CodeGen/CIRGPUBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGPUBuiltin.cpp
@@ -0,0 +1,98 @@
+#include "CIRGenFunction.h"
+
+using namespace cir;
+using namespace clang;
+using namespace clang::CIRGen;
+
+/// Pack the varargs of a device-side printf into the buffer vprintf expects.
+///
+/// vprintf takes two args: a format string, and a pointer to a buffer
+/// containing the varargs. For example, the call
+///
+///   printf("format string", arg1, arg2, arg3);
+///
+/// is converted into something resembling
+///
+///   struct Tmp { Arg1 a1; Arg2 a2; Arg3 a3; };
+///   char* buf = alloca(sizeof(Tmp));
+///   *(Tmp*)buf = {a1, a2, a3};
+///   vprintf("format string", buf);
+///
+/// `buf` is aligned to the max of {alignof(Arg1), ...}, and each arg is
+/// itself aligned to its preferred alignment. By the time this function runs,
+/// the arguments have already undergone the standard C vararg promotion
+/// (short -> int, float -> double, etc.).
+///
+/// \p args is the full call argument list, format string first. Returns a
+/// null `void *` when there are no varargs, otherwise the buffer as `void *`.
+mlir::Value packArgsIntoNVPTXFormatBuffer(CIRGenFunction &cgf,
+                                          const CallArgList &args,
+                                          mlir::Location loc) {
+  const CIRDataLayout &dataLayout = cgf.CGM.getDataLayout();
+  CIRGenBuilderTy &builder = cgf.getBuilder();
+
+  // With only the format string present, vprintf receives a null buffer.
+  if (args.size() <= 1)
+    return builder.getNullPtr(cgf.VoidPtrTy, loc);
+
+  // Evaluate each vararg exactly once; the value is reused for both the
+  // struct layout and the store below.
+  llvm::SmallVector<mlir::Value, 8> argValues;
+  llvm::SmallVector<mlir::Type, 8> argTypes;
+  for (const CallArg &arg : llvm::drop_begin(args)) {
+    argValues.push_back(arg.getRValue(cgf, loc).getScalarVal());
+    argTypes.push_back(argValues.back().getType());
+  }
+
+  // vprintf does not accept aggregates, so a plain (non-packed) struct of the
+  // scalar types already places every member at a correct offset.
+  mlir::Type allocaTy =
+      cir::StructType::get(&cgf.getMLIRContext(), argTypes, /*packed=*/false,
+                           /*padded=*/false, StructType::Struct);
+  mlir::Value alloca =
+      cgf.CreateTempAlloca(allocaTy, loc, "printf_args", nullptr);
+
+  // Store each value into its struct member at the type's preferred alignment.
+  for (auto [i, value] : llvm::enumerate(argValues)) {
+    mlir::Value member =
+        builder.createGetMember(loc, cir::PointerType::get(argTypes[i]), alloca,
+                                /*name=*/"", /*index=*/i);
+    auto preferredAlign = clang::CharUnits::fromQuantity(
+        dataLayout.getPrefTypeAlign(argTypes[i]).value());
+    builder.createAlignedStore(loc, value, member, preferredAlign);
+  }
+
+  return builder.createBitcast(alloca, cgf.VoidPtrTy);
+}
+
+/// Lower a device-side `printf` call on NVPTX to a call of the runtime
+/// function `vprintf(format, packedArgs)`. If any argument after the format
+/// string is not scalar, the call is reported through CGM.ErrorUnsupported
+/// and a constant 0 of type SInt32Ty is returned instead of a call result.
+mlir::Value
+CIRGenFunction::emitNVPTXDevicePrintfCallExpr(const CallExpr *expr) {
+  assert(CGM.getTriple().isNVPTX());
+  const FunctionDecl *callee = expr->getDirectCallee();
+  const auto *proto = callee->getType()->getAs<FunctionProtoType>();
+  CallArgList callArgs;
+  emitCallArgs(callArgs, proto, expr->arguments(), callee);
+  mlir::Location loc = getLoc(expr->getBeginLoc());
+
+  // Device-side printf accepts only scalars after the format string.
+  auto isScalarArg = [&](const CallArg &arg) {
+    return arg.getRValue(*this, loc).isScalar();
+  };
+  if (!llvm::all_of(llvm::drop_begin(callArgs), isScalarArg)) {
+    CGM.ErrorUnsupported(expr, "non-scalar args to printf");
+    return builder.getConstInt(loc, SInt32Ty, 0);
+  }
+
+  // Runtime signature: int vprintf(char *format, void *packedData);
+  mlir::Value argBuffer = packArgsIntoNVPTXFormatBuffer(*this, callArgs, loc);
+  auto vprintfTy =
+      FuncType::get({cir::PointerType::get(SInt8Ty), VoidPtrTy}, SInt32Ty);
+  auto vprintfFn = CGM.createRuntimeFunction(vprintfTy, "vprintf");
+  mlir::Value format = callArgs[0].getRValue(*this, loc).getScalarVal();
+  auto call = builder.createCallOp(loc, vprintfFn, {format, argBuffer});
+  return call.getResult();
+}
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -2340,12 +2340,14 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
     llvm_unreachable("BI__builtin_load_halff NYI");
 
   case Builtin::BI__builtin_printf:
-    llvm_unreachable("BI__builtin_printf NYI");
   case Builtin::BIprintf:
-    if (getTarget().getTriple().isNVPTX() ||
-        getTarget().getTriple().isAMDGCN()) {
+    assert(E->getNumArgs() >= 1);
+    if (getTarget().getTriple().isAMDGCN()) {
       llvm_unreachable("BIprintf NYI");
     }
+    if (getTarget().getTriple().isNVPTX()) {
+      return RValue::get(emitNVPTXDevicePrintfCallExpr(E));
+    }
     break;
 
   case Builtin::BI__builtin_canonicalize:
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -1478,6 +1478,8 @@ class CIRGenFunction : public CIRGenTypeCache {
   mlir::Value emitX86BuiltinExpr(unsigned BuiltinID, const CallExpr *E);
   mlir::Value emitNVPTXBuiltinExpr(unsigned builtinID, const CallExpr *expr);
 
+  mlir::Value emitNVPTXDevicePrintfCallExpr(const CallExpr *expr);
+
   /// Given an expression with a pointer type, emit the value and compute our
   /// best estimate of the alignment of the pointee.
   ///
diff --git a/clang/lib/CIR/CodeGen/CMakeLists.txt b/clang/lib/CIR/CodeGen/CMakeLists.txt
@@ -43,6 +43,7 @@ add_clang_library(clangCIR
   CIRGenTypes.cpp
   CIRGenVTables.cpp
   CIRGenerator.cpp
+  CIRGPUBuiltin.cpp
   CIRPasses.cpp
   CIRRecordLayoutBuilder.cpp
   ConstantInitBuilder.cpp
diff --git a/clang/test/CIR/CodeGen/CUDA/printf.cu b/clang/test/CIR/CodeGen/CUDA/printf.cu
@@ -0,0 +1,34 @@
+#include "../Inputs/cuda.h"
+
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fclangir \
+// RUN:            -fcuda-is-device -emit-cir -target-sdk-version=12.3 \
+// RUN:            %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR-DEVICE --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fclangir \
+// RUN:            -fcuda-is-device -emit-llvm -target-sdk-version=12.3 \
+// RUN:            %s -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM-DEVICE --input-file=%t.ll %s
+
+// Device-side printf should lower to vprintf(fmt, pointer-to-packed-args).
+__device__ void printer() {
+  printf("%d", 0);
+}
+
+// CIR-DEVICE: cir.func @_Z7printerv() extra({{.*}}) {
+// CIR-DEVICE:   %[[#Packed:]] = cir.alloca !ty_anon_struct
+// CIR-DEVICE:   %[[#Zero:]] = cir.const #cir.int<0> : !s32i
+// CIR-DEVICE:   %[[#Field0:]] = cir.get_member %[[#Packed]][0]
+// CIR-DEVICE:   cir.store align(4) %[[#Zero]], %[[#Field0]]
+// CIR-DEVICE:   %[[#Output:]] = cir.cast(bitcast, %[[#Packed]] : !cir.ptr<!ty_anon_struct>)
+// CIR-DEVICE:   cir.call @vprintf(%{{.+}}, %[[#Output]])
+// CIR-DEVICE:   cir.return
+// CIR-DEVICE: }
+
+// LLVM-DEVICE: define dso_local void @_Z7printerv() {{.*}} {
+// LLVM-DEVICE:   %[[#LLVMPacked:]] = alloca { i32 }, i64 1, align 8
+// LLVM-DEVICE:   %[[#LLVMField0:]] = getelementptr { i32 }, ptr %[[#LLVMPacked]], i32 0, i32 0
+// LLVM-DEVICE:   store i32 0, ptr %[[#LLVMField0]], align 4
+// LLVM-DEVICE:   call i32 @vprintf(ptr @.str, ptr %[[#LLVMPacked]])
+// LLVM-DEVICE:   ret void
+// LLVM-DEVICE: }