Skip to content

[AArch64] fix trampoline implementation: use X15 #126743

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions compiler-rt/lib/builtins/README.txt
Original file line number Diff line number Diff line change
Expand Up @@ -272,11 +272,6 @@ switch32
switch8
switchu8

// This function generates a custom trampoline function with the specific
// realFunc and localsPtr values.
void __trampoline_setup(uint32_t* trampOnStack, int trampSizeAllocated,
const void* realFunc, void* localsPtr);

// There is no C interface to the *_vfp_d8_d15_regs functions. These are
// called in the prolog and epilog of Thumb1 functions. When the C++ ABI uses
// SJLJ for exceptions, each function with a catch clause or destructors needs
Expand Down
42 changes: 0 additions & 42 deletions compiler-rt/lib/builtins/trampoline_setup.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,45 +41,3 @@ COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack,
__clear_cache(trampOnStack, &trampOnStack[10]);
}
#endif // __powerpc__ && !defined(__powerpc64__)

// The AArch64 compiler generates calls to __trampoline_setup() when creating
// trampoline functions on the stack for use with nested functions.
// This function creates a custom 36-byte trampoline function on the stack
// which loads x18 with a pointer to the outer function's locals
// and then jumps to the target nested function.
// Note: x18 is a reserved platform register on Windows and macOS.

#if defined(__aarch64__) && defined(__ELF__)
COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack,
int trampSizeAllocated,
const void *realFunc, void *localsPtr) {
// This should never happen, but if compiler did not allocate
// enough space on stack for the trampoline, abort.
if (trampSizeAllocated < 36)
compilerrt_abort();

// create trampoline
// Load realFunc into x17. mov/movk 16 bits at a time.
trampOnStack[0] =
0xd2800000u | ((((uint64_t)realFunc >> 0) & 0xffffu) << 5) | 0x11;
trampOnStack[1] =
0xf2a00000u | ((((uint64_t)realFunc >> 16) & 0xffffu) << 5) | 0x11;
trampOnStack[2] =
0xf2c00000u | ((((uint64_t)realFunc >> 32) & 0xffffu) << 5) | 0x11;
trampOnStack[3] =
0xf2e00000u | ((((uint64_t)realFunc >> 48) & 0xffffu) << 5) | 0x11;
// Load localsPtr into x18
trampOnStack[4] =
0xd2800000u | ((((uint64_t)localsPtr >> 0) & 0xffffu) << 5) | 0x12;
trampOnStack[5] =
0xf2a00000u | ((((uint64_t)localsPtr >> 16) & 0xffffu) << 5) | 0x12;
trampOnStack[6] =
0xf2c00000u | ((((uint64_t)localsPtr >> 32) & 0xffffu) << 5) | 0x12;
trampOnStack[7] =
0xf2e00000u | ((((uint64_t)localsPtr >> 48) & 0xffffu) << 5) | 0x12;
trampOnStack[8] = 0xd61f0220; // br x17

// Clear instruction cache.
__clear_cache(trampOnStack, &trampOnStack[9]);
}
#endif // defined(__aarch64__) && defined(__ELF__)
2 changes: 1 addition & 1 deletion compiler-rt/test/builtins/Unit/trampoline_setup_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

/*
* Tests nested functions
* The ppc and aarch64 compilers generates a call to __trampoline_setup
* The ppc compiler generates a call to __trampoline_setup
* The i386 and x86_64 compilers generate a call to ___enable_execute_stack
*/

Expand Down
8 changes: 4 additions & 4 deletions flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -274,12 +274,12 @@ class BoxedProcedurePass
auto loc = embox.getLoc();
mlir::Type i8Ty = builder.getI8Type();
mlir::Type i8Ptr = builder.getRefType(i8Ty);
// For AArch64, PPC32 and PPC64, the thunk is populated by a call to
// For PPC32 and PPC64, the thunk is populated by a call to
// __trampoline_setup, which is defined in
// compiler-rt/lib/builtins/trampoline_setup.c and requires the
// thunk size greater than 32 bytes. For RISCV and x86_64, the
// thunk setup doesn't go through __trampoline_setup and fits in 32
// bytes.
// thunk size greater than 32 bytes. For AArch64, RISCV and x86_64,
// the thunk setup doesn't go through __trampoline_setup and fits in
// 32 bytes.
fir::SequenceType::Extent thunkSize = triple.getTrampolineSize();
mlir::Type buffTy = SequenceType::get({thunkSize}, i8Ty);
auto buffer = builder.create<AllocaOp>(loc, buffTy);
Expand Down
4 changes: 2 additions & 2 deletions flang/test/Fir/boxproc.fir
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
// RUN: %if powerpc-registered-target %{tco --target=powerpc64le-unknown-linux-gnu %s | FileCheck %s --check-prefixes=CHECK,CHECK-PPC %}

// CHECK-LABEL: define void @_QPtest_proc_dummy()
// CHECK-AARCH64: %[[VAL_3:.*]] = alloca [36 x i8], i64 1, align 1
// CHECK-AARCH64: %[[VAL_3:.*]] = alloca [32 x i8], i64 1, align 1
// CHECK-X86: %[[VAL_3:.*]] = alloca [32 x i8], i64 1, align 1
// CHECK-PPC: %[[VAL_3:.*]] = alloca [4{{[0-8]+}} x i8], i64 1, align 1
// CHECK: %[[VAL_1:.*]] = alloca { ptr }, i64 1, align 8
Expand Down Expand Up @@ -63,7 +63,7 @@ func.func @_QPtest_proc_dummy_other(%arg0: !fir.boxproc<() -> ()>) {
}

// CHECK-LABEL: define void @_QPtest_proc_dummy_char()
// CHECK-AARCH64: %[[VAL_20:.*]] = alloca [36 x i8], i64 1, align 1
// CHECK-AARCH64: %[[VAL_20:.*]] = alloca [32 x i8], i64 1, align 1
// CHECK-X86: %[[VAL_20:.*]] = alloca [32 x i8], i64 1, align 1
// CHECK-PPC: %[[VAL_20:.*]] = alloca [4{{[0-8]+}} x i8], i64 1, align 1
// CHECK: %[[VAL_2:.*]] = alloca { { ptr, i64 } }, i64 1, align 8
Expand Down
17 changes: 11 additions & 6 deletions llvm/docs/LangRef.rst
Original file line number Diff line number Diff line change
Expand Up @@ -410,8 +410,8 @@ added in the future:
calling convention: on most platforms, they are not preserved and need to
be saved by the caller, but on Windows, xmm6-xmm15 are preserved.

- On AArch64 the callee preserve all general purpose registers, except X0-X8
and X16-X18.
- On AArch64 the callee preserves all general purpose registers, except X0-X9
and X15-X18. X9 can be used as a scratch register.

The idea behind this convention is to support calls to runtime functions
that have a hot path and a cold path. The hot path is usually a small piece
Expand Down Expand Up @@ -447,9 +447,9 @@ added in the future:
R11. R11 can be used as a scratch register. Furthermore it also preserves
all floating-point registers (XMMs/YMMs).

- On AArch64 the callee preserve all general purpose registers, except X0-X8
and X16-X18. Furthermore it also preserves lower 128 bits of V8-V31 SIMD -
floating point registers.
- On AArch64 the callee preserves all general purpose registers, except X0-X9
and X15-X18. Furthermore it also preserves the lower 128 bits of the V8-V31
SIMD/floating point registers. X9 can be used as a scratch register.

The idea behind this convention is to support calls to runtime functions
that don't need to call out to any other functions.
Expand Down Expand Up @@ -20903,7 +20903,12 @@ sufficiently aligned block of memory; this memory is written to by the
intrinsic. Note that the size and the alignment are target-specific -
LLVM currently provides no portable way of determining them, so a
front-end that generates this intrinsic needs to have some
target-specific knowledge. The ``func`` argument must hold a function.
target-specific knowledge.

The ``func`` argument must be a constant (potentially bitcasted) pointer to a
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How are bitcasts relevant here?

Probably we should tighten the verifier check to just require a function. And if some target eventually needs trampolines where any of the operands is in a non-zero address-space, we can make the intrinsic overloaded.

Probably we also should tighten llvm::canReplaceOperandWithVariable.

And please land this separately.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It probably isn't of much relevance since opaque pointers. I think it is just also fairly silly of a restriction, since llvm doesn't care at all about the source of the value here, it just needs to know what calling convention it should have (to know which register to use). That info probably could just as easily be passed as a separate immarg value, since calling conventions are defined in the langref to be numeric.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That all makes sense.

function declaration or definition, since the calling convention may affect the
content of the trampoline that is created.


Semantics:
""""""""""
Expand Down
38 changes: 24 additions & 14 deletions llvm/lib/Target/AArch64/AArch64CallingConvention.td
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@ class CCIfSubtarget<string F, CCAction A>
//===----------------------------------------------------------------------===//

defvar AArch64_Common = [
// The 'nest' parameter, if any, is passed in X15.
// The previous register used here (X18) is also defined to be unavailable
// for this purpose, while all of X9-X15 were defined to be free for LLVM to
// use for this, so use X15 (which LLVM often already clobbers anyway).
CCIfNest<CCAssignToReg<[X15]>>,

CCIfType<[iPTR], CCBitConvertToType<i64>>,
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
Expand Down Expand Up @@ -117,16 +123,12 @@ defvar AArch64_Common = [
];

let Entry = 1 in
def CC_AArch64_AAPCS : CallingConv<!listconcat(
// The 'nest' parameter, if any, is passed in X18.
// Darwin and Windows use X18 as the platform register and hence 'nest' isn't
// currently supported there.
[CCIfNest<CCAssignToReg<[X18]>>],
AArch64_Common
)>;
def CC_AArch64_AAPCS : CallingConv<AArch64_Common>;

let Entry = 1 in
def RetCC_AArch64_AAPCS : CallingConv<[
CCIfNest<CCAssignToReg<[X15]>>,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A return value can't be "nest"?


CCIfType<[iPTR], CCBitConvertToType<i64>>,
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
Expand Down Expand Up @@ -177,6 +179,8 @@ def CC_AArch64_Win64_VarArg : CallingConv<[
// a stack layout compatible with the x64 calling convention.
let Entry = 1 in
def CC_AArch64_Arm64EC_VarArg : CallingConv<[
CCIfNest<CCAssignToReg<[X15]>>,

// Convert small floating-point values to integer.
CCIfType<[f16, bf16], CCBitConvertToType<i16>>,
CCIfType<[f32], CCBitConvertToType<i32>>,
Expand Down Expand Up @@ -295,6 +299,8 @@ def CC_AArch64_Arm64EC_Thunk_Native : CallingConv<[

let Entry = 1 in
def RetCC_AArch64_Arm64EC_Thunk : CallingConv<[
CCIfNest<CCAssignToReg<[X15]>>,

// The X86-Win64 calling convention always returns __m64 values in RAX.
CCIfType<[x86mmx], CCBitConvertToType<i64>>,

Expand Down Expand Up @@ -353,6 +359,8 @@ def RetCC_AArch64_Arm64EC_CFGuard_Check : CallingConv<[
// + Stack slots are sized as needed rather than being at least 64-bit.
let Entry = 1 in
def CC_AArch64_DarwinPCS : CallingConv<[
CCIfNest<CCAssignToReg<[X15]>>,

CCIfType<[iPTR], CCBitConvertToType<i64>>,
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
Expand Down Expand Up @@ -427,6 +435,8 @@ def CC_AArch64_DarwinPCS : CallingConv<[

let Entry = 1 in
def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
CCIfNest<CCAssignToReg<[X15]>>,

CCIfType<[iPTR], CCBitConvertToType<i64>>,
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
Expand All @@ -450,6 +460,8 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
// same as the normal Darwin VarArgs handling.
let Entry = 1 in
def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[
CCIfNest<CCAssignToReg<[X15]>>,

CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,

Expand Down Expand Up @@ -494,6 +506,8 @@ def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[

let Entry = 1 in
def CC_AArch64_GHC : CallingConv<[
CCIfNest<CCAssignToReg<[X15]>>,

CCIfType<[iPTR], CCBitConvertToType<i64>>,

// Handle all vector types as either f64 or v2f64.
Expand Down Expand Up @@ -523,6 +537,7 @@ def CC_AArch64_Preserve_None : CallingConv<[
// We can pass arguments in all general registers, except:
// - X8, used for sret
// - X16/X17, used by the linker as IP0/IP1
// - X15, the nest register, which is also used by Windows for stack allocation
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of reducing the utility preservenone/preservemost/etc., can we just forbid using "nest" arguments with them? I can't see why you'd want to use them together.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That sounds reasonable. If I read the code correctly, the comment about "X15 for stack allocation" is not quite accurate as well (the code doesn't necessarily have to clobber that, since it is implemented to use a temp register first--though such a register does need to be made available for it). Other comments here seem contradictory also, since it assigns X9 last "because it is needed as a scratch register"–but it seems like either it is needed as a scratch register (in which case it should not have been allowed to assign it), or it doesn't need it as a scratch register (in which case the comment is wrong since it doesn't get used as a scratch register)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, the x15 thing isn't a fundamental limitation; it's just a bit complicated to generate the correct code, and the author of the preservenonecc code didn't want to try to implement it.

For the x9 thing, see #99434 .

// - X18, the platform register
// - X19, the base pointer
// - X29, the frame pointer
Expand All @@ -533,6 +548,7 @@ def CC_AArch64_Preserve_None : CallingConv<[
// normal functions without saving and reloading arguments.
// X9 is assigned last as it is used in FrameLowering as the first
// choice for a scratch register.
CCIfNest<CCAssignToReg<[X15]>>,
CCIfType<[i32], CCAssignToReg<[W20, W21, W22, W23,
W24, W25, W26, W27, W28,
W0, W1, W2, W3, W4, W5,
Expand All @@ -544,12 +560,6 @@ def CC_AArch64_Preserve_None : CallingConv<[
X6, X7, X10, X11,
X12, X13, X14, X9]>>,

// Windows uses X15 for stack allocation
CCIf<"!State.getMachineFunction().getSubtarget<AArch64Subtarget>().isTargetWindows()",
CCIfType<[i32], CCAssignToReg<[W15]>>>,
CCIf<"!State.getMachineFunction().getSubtarget<AArch64Subtarget>().isTargetWindows()",
CCIfType<[i64], CCAssignToReg<[X15]>>>,

CCDelegateTo<CC_AArch64_AAPCS>
]>;

Expand Down Expand Up @@ -681,7 +691,7 @@ def CSR_AArch64_NoRegs : CalleeSavedRegs<(add)>;
def CSR_AArch64_NoneRegs : CalleeSavedRegs<(add LR, FP)>;

def CSR_AArch64_RT_MostRegs : CalleeSavedRegs<(add CSR_AArch64_AAPCS,
(sequence "X%u", 9, 15))>;
(sequence "X%u", 10, 14))>;

def CSR_AArch64_RT_AllRegs : CalleeSavedRegs<(add CSR_AArch64_RT_MostRegs,
(sequence "Q%u", 8, 31))>;
Expand Down
27 changes: 27 additions & 0 deletions llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2044,6 +2044,26 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
: 0;

if (windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding)) {
// Find an available register to store value of VG to.
unsigned X15Scratch = AArch64::NoRegister;
const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
if (llvm::any_of(MBB.liveins(),
[&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) {
return STI.getRegisterInfo()->isSuperOrSubRegisterEq(
AArch64::X15, LiveIn.PhysReg);
})) {
X15Scratch = findScratchNonCalleeSaveRegister(&MBB);
assert(X15Scratch != AArch64::NoRegister);
#ifndef NDEBUG
LiveRegs.removeReg(AArch64::X15); // ignore X15 since we restore it
#endif
BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), X15Scratch)
.addReg(AArch64::XZR)
.addReg(AArch64::X15, RegState::Undef)
.addReg(AArch64::X15, RegState::Implicit)
.setMIFlag(MachineInstr::FrameSetup);
}

uint64_t NumWords = (NumBytes + RealignmentPadding) >> 4;
if (NeedsWinCFI) {
HasWinCFI = true;
Expand Down Expand Up @@ -2166,6 +2186,13 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// we've set a frame pointer and already finished the SEH prologue.
assert(!NeedsWinCFI);
}
if (X15Scratch != AArch64::NoRegister) {
BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), AArch64::X15)
.addReg(AArch64::XZR)
.addReg(X15Scratch, RegState::Undef)
.addReg(X15Scratch, RegState::Implicit)
.setMIFlag(MachineInstr::FrameSetup);
}
}

StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize;
Expand Down
Loading