Skip to content

[AArch64] fix trampoline implementation: use X15 #126743

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions compiler-rt/lib/builtins/README.txt
Original file line number Diff line number Diff line change
Expand Up @@ -272,11 +272,6 @@ switch32
switch8
switchu8

// This function generates a custom trampoline function with the specific
// realFunc and localsPtr values.
void __trampoline_setup(uint32_t* trampOnStack, int trampSizeAllocated,
const void* realFunc, void* localsPtr);

// There is no C interface to the *_vfp_d8_d15_regs functions. These are
// called in the prolog and epilog of Thumb1 functions. When the C++ ABI uses
// SJLJ for exceptions, each function with a catch clause or destructors needs
Expand Down
42 changes: 0 additions & 42 deletions compiler-rt/lib/builtins/trampoline_setup.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,45 +41,3 @@ COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack,
__clear_cache(trampOnStack, &trampOnStack[10]);
}
#endif // __powerpc__ && !defined(__powerpc64__)

// The AArch64 compiler generates calls to __trampoline_setup() when creating
// trampoline functions on the stack for use with nested functions.
// This function creates a custom 36-byte trampoline function on the stack
// which loads x18 with a pointer to the outer function's locals
// and then jumps to the target nested function.
// Note: x18 is a reserved platform register on Windows and macOS.

#if defined(__aarch64__) && defined(__ELF__)
COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack,
int trampSizeAllocated,
const void *realFunc, void *localsPtr) {
// This should never happen, but if compiler did not allocate
// enough space on stack for the trampoline, abort.
if (trampSizeAllocated < 36)
compilerrt_abort();

// create trampoline
// Load realFunc into x17. mov/movk 16 bits at a time.
trampOnStack[0] =
0xd2800000u | ((((uint64_t)realFunc >> 0) & 0xffffu) << 5) | 0x11;
trampOnStack[1] =
0xf2a00000u | ((((uint64_t)realFunc >> 16) & 0xffffu) << 5) | 0x11;
trampOnStack[2] =
0xf2c00000u | ((((uint64_t)realFunc >> 32) & 0xffffu) << 5) | 0x11;
trampOnStack[3] =
0xf2e00000u | ((((uint64_t)realFunc >> 48) & 0xffffu) << 5) | 0x11;
// Load localsPtr into x18
trampOnStack[4] =
0xd2800000u | ((((uint64_t)localsPtr >> 0) & 0xffffu) << 5) | 0x12;
trampOnStack[5] =
0xf2a00000u | ((((uint64_t)localsPtr >> 16) & 0xffffu) << 5) | 0x12;
trampOnStack[6] =
0xf2c00000u | ((((uint64_t)localsPtr >> 32) & 0xffffu) << 5) | 0x12;
trampOnStack[7] =
0xf2e00000u | ((((uint64_t)localsPtr >> 48) & 0xffffu) << 5) | 0x12;
trampOnStack[8] = 0xd61f0220; // br x17

// Clear instruction cache.
__clear_cache(trampOnStack, &trampOnStack[9]);
}
#endif // defined(__aarch64__) && defined(__ELF__)
2 changes: 1 addition & 1 deletion compiler-rt/test/builtins/Unit/trampoline_setup_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

/*
* Tests nested functions
* The ppc and aarch64 compilers generates a call to __trampoline_setup
* The ppc compiler generates a call to __trampoline_setup
* The i386 and x86_64 compilers generate a call to ___enable_execute_stack
*/

Expand Down
8 changes: 4 additions & 4 deletions flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -274,12 +274,12 @@ class BoxedProcedurePass
auto loc = embox.getLoc();
mlir::Type i8Ty = builder.getI8Type();
mlir::Type i8Ptr = builder.getRefType(i8Ty);
// For AArch64, PPC32 and PPC64, the thunk is populated by a call to
// For PPC32 and PPC64, the thunk is populated by a call to
// __trampoline_setup, which is defined in
// compiler-rt/lib/builtins/trampoline_setup.c and requires the
// thunk size greater than 32 bytes. For RISCV and x86_64, the
// thunk setup doesn't go through __trampoline_setup and fits in 32
// bytes.
// thunk size greater than 32 bytes. For AArch64, RISCV and x86_64,
// the thunk setup doesn't go through __trampoline_setup and fits in
// 32 bytes.
fir::SequenceType::Extent thunkSize = triple.getTrampolineSize();
mlir::Type buffTy = SequenceType::get({thunkSize}, i8Ty);
auto buffer = builder.create<AllocaOp>(loc, buffTy);
Expand Down
4 changes: 2 additions & 2 deletions flang/test/Fir/boxproc.fir
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
// RUN: %if powerpc-registered-target %{tco --target=powerpc64le-unknown-linux-gnu %s | FileCheck %s --check-prefixes=CHECK,CHECK-PPC %}

// CHECK-LABEL: define void @_QPtest_proc_dummy()
// CHECK-AARCH64: %[[VAL_3:.*]] = alloca [36 x i8], i64 1, align 1
// CHECK-AARCH64: %[[VAL_3:.*]] = alloca [32 x i8], i64 1, align 1
// CHECK-X86: %[[VAL_3:.*]] = alloca [32 x i8], i64 1, align 1
// CHECK-PPC: %[[VAL_3:.*]] = alloca [4{{[0-8]+}} x i8], i64 1, align 1
// CHECK: %[[VAL_1:.*]] = alloca { ptr }, i64 1, align 8
Expand Down Expand Up @@ -63,7 +63,7 @@ func.func @_QPtest_proc_dummy_other(%arg0: !fir.boxproc<() -> ()>) {
}

// CHECK-LABEL: define void @_QPtest_proc_dummy_char()
// CHECK-AARCH64: %[[VAL_20:.*]] = alloca [36 x i8], i64 1, align 1
// CHECK-AARCH64: %[[VAL_20:.*]] = alloca [32 x i8], i64 1, align 1
// CHECK-X86: %[[VAL_20:.*]] = alloca [32 x i8], i64 1, align 1
// CHECK-PPC: %[[VAL_20:.*]] = alloca [4{{[0-8]+}} x i8], i64 1, align 1
// CHECK: %[[VAL_2:.*]] = alloca { { ptr, i64 } }, i64 1, align 8
Expand Down
17 changes: 11 additions & 6 deletions llvm/docs/LangRef.rst
Original file line number Diff line number Diff line change
Expand Up @@ -410,8 +410,8 @@ added in the future:
calling convention: on most platforms, they are not preserved and need to
be saved by the caller, but on Windows, xmm6-xmm15 are preserved.

- On AArch64 the callee preserve all general purpose registers, except X0-X8
and X16-X18.
- On AArch64 the callee preserves all general purpose registers, except X0-X9
and X15-X18. X9 can be used as a scratch register.

The idea behind this convention is to support calls to runtime functions
that have a hot path and a cold path. The hot path is usually a small piece
Expand Down Expand Up @@ -447,9 +447,9 @@ added in the future:
R11. R11 can be used as a scratch register. Furthermore it also preserves
all floating-point registers (XMMs/YMMs).

- On AArch64 the callee preserve all general purpose registers, except X0-X8
and X16-X18. Furthermore it also preserves lower 128 bits of V8-V31 SIMD -
floating point registers.
- On AArch64 the callee preserves all general purpose registers, except X0-X9
and X15-X18. Furthermore it also preserves the lower 128 bits of the V8-V31
SIMD/floating point registers. X9 can be used as a scratch register.

The idea behind this convention is to support calls to runtime functions
that don't need to call out to any other functions.
Expand Down Expand Up @@ -20903,7 +20903,12 @@ sufficiently aligned block of memory; this memory is written to by the
intrinsic. Note that the size and the alignment are target-specific -
LLVM currently provides no portable way of determining them, so a
front-end that generates this intrinsic needs to have some
target-specific knowledge. The ``func`` argument must hold a function.
target-specific knowledge.

The ``func`` argument must be a constant (potentially bitcasted) pointer to a
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How are bitcasts relevant here?

Probably we should tighten the verifier check to just require a function. And if some target eventually needs trampolines where any of the operands is in a non-zero address-space, we can make the intrinsic overloaded.

Probably we also should tighten llvm::canReplaceOperandWithVariable.

And please land this separately.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It probably isn't of much relevance since opaque pointers. I think it is just also fairly silly of a restriction, since llvm doesn't care at all about the source of the value here, it just needs to know what calling convention it should have (to know which register to use). That info probably could just as easily be passed as a separate immarg value, since calling conventions are defined in the langref to be numeric.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That all makes sense.

function declaration or definition, since the calling convention may affect the
content of the trampoline that is created.


Semantics:
""""""""""
Expand Down
38 changes: 24 additions & 14 deletions llvm/lib/Target/AArch64/AArch64CallingConvention.td
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@ class CCIfSubtarget<string F, CCAction A>
//===----------------------------------------------------------------------===//

defvar AArch64_Common = [
// The 'nest' parameter, if any, is passed in X15.
// The previous register used here (X18) is also defined to be unavailable
// for this purpose, while all of X9-X15 were defined to be free for LLVM to
// use for this, so use X15 (which LLVM often already clobbers anyway).
CCIfNest<CCAssignToReg<[X15]>>,

CCIfType<[iPTR], CCBitConvertToType<i64>>,
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
Expand Down Expand Up @@ -117,16 +123,12 @@ defvar AArch64_Common = [
];

let Entry = 1 in
def CC_AArch64_AAPCS : CallingConv<!listconcat(
// The 'nest' parameter, if any, is passed in X18.
// Darwin and Windows use X18 as the platform register and hence 'nest' isn't
// currently supported there.
[CCIfNest<CCAssignToReg<[X18]>>],
AArch64_Common
)>;
def CC_AArch64_AAPCS : CallingConv<AArch64_Common>;

let Entry = 1 in
def RetCC_AArch64_AAPCS : CallingConv<[
CCIfNest<CCAssignToReg<[X15]>>,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A return value can't be "nest"?


CCIfType<[iPTR], CCBitConvertToType<i64>>,
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
Expand Down Expand Up @@ -177,6 +179,8 @@ def CC_AArch64_Win64_VarArg : CallingConv<[
// a stack layout compatible with the x64 calling convention.
let Entry = 1 in
def CC_AArch64_Arm64EC_VarArg : CallingConv<[
CCIfNest<CCAssignToReg<[X15]>>,

// Convert small floating-point values to integer.
CCIfType<[f16, bf16], CCBitConvertToType<i16>>,
CCIfType<[f32], CCBitConvertToType<i32>>,
Expand Down Expand Up @@ -295,6 +299,8 @@ def CC_AArch64_Arm64EC_Thunk_Native : CallingConv<[

let Entry = 1 in
def RetCC_AArch64_Arm64EC_Thunk : CallingConv<[
CCIfNest<CCAssignToReg<[X15]>>,

// The X86-Win64 calling convention always returns __m64 values in RAX.
CCIfType<[x86mmx], CCBitConvertToType<i64>>,

Expand Down Expand Up @@ -353,6 +359,8 @@ def RetCC_AArch64_Arm64EC_CFGuard_Check : CallingConv<[
// + Stack slots are sized as needed rather than being at least 64-bit.
let Entry = 1 in
def CC_AArch64_DarwinPCS : CallingConv<[
CCIfNest<CCAssignToReg<[X15]>>,

CCIfType<[iPTR], CCBitConvertToType<i64>>,
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
Expand Down Expand Up @@ -427,6 +435,8 @@ def CC_AArch64_DarwinPCS : CallingConv<[

let Entry = 1 in
def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
CCIfNest<CCAssignToReg<[X15]>>,

CCIfType<[iPTR], CCBitConvertToType<i64>>,
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
Expand All @@ -450,6 +460,8 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
// same as the normal Darwin VarArgs handling.
let Entry = 1 in
def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[
CCIfNest<CCAssignToReg<[X15]>>,

CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,

Expand Down Expand Up @@ -494,6 +506,8 @@ def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[

let Entry = 1 in
def CC_AArch64_GHC : CallingConv<[
CCIfNest<CCAssignToReg<[X15]>>,

CCIfType<[iPTR], CCBitConvertToType<i64>>,

// Handle all vector types as either f64 or v2f64.
Expand Down Expand Up @@ -523,6 +537,7 @@ def CC_AArch64_Preserve_None : CallingConv<[
// We can pass arguments in all general registers, except:
// - X8, used for sret
// - X16/X17, used by the linker as IP0/IP1
// - X15, the nest register, which is also used by Windows for stack allocation
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of reducing the utility preservenone/preservemost/etc., can we just forbid using "nest" arguments with them? I can't see why you'd want to use them together.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That sounds reasonable. If I read the code correctly, the comment about "X15 for stack allocation" is not quite accurate as well (the code doesn't necessarily have to clobber that, since it is implemented to use a temp register first--though such a register does need to be made available for it). Other comments here seem contradictory also, since it assigns X9 last "because it is needed as a scratch register"–but it seems like either it is needed as a scratch register (in which case it should not have been allowed to assign it), or it doesn't need it as a scratch register (in which case the comment is wrong since it doesn't get used as a scratch register)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, the x15 thing isn't a fundamental limitation; it's just a bit complicated to generate the correct code, and the author of the preservenonecc code didn't want to try to implement it.

For the x9 thing, see #99434 .

// - X18, the platform register
// - X19, the base pointer
// - X29, the frame pointer
Expand All @@ -533,6 +548,7 @@ def CC_AArch64_Preserve_None : CallingConv<[
// normal functions without saving and reloading arguments.
// X9 is assigned last as it is used in FrameLowering as the first
// choice for a scratch register.
CCIfNest<CCAssignToReg<[X15]>>,
CCIfType<[i32], CCAssignToReg<[W20, W21, W22, W23,
W24, W25, W26, W27, W28,
W0, W1, W2, W3, W4, W5,
Expand All @@ -544,12 +560,6 @@ def CC_AArch64_Preserve_None : CallingConv<[
X6, X7, X10, X11,
X12, X13, X14, X9]>>,

// Windows uses X15 for stack allocation
CCIf<"!State.getMachineFunction().getSubtarget<AArch64Subtarget>().isTargetWindows()",
CCIfType<[i32], CCAssignToReg<[W15]>>>,
CCIf<"!State.getMachineFunction().getSubtarget<AArch64Subtarget>().isTargetWindows()",
CCIfType<[i64], CCAssignToReg<[X15]>>>,

CCDelegateTo<CC_AArch64_AAPCS>
]>;

Expand Down Expand Up @@ -681,7 +691,7 @@ def CSR_AArch64_NoRegs : CalleeSavedRegs<(add)>;
def CSR_AArch64_NoneRegs : CalleeSavedRegs<(add LR, FP)>;

def CSR_AArch64_RT_MostRegs : CalleeSavedRegs<(add CSR_AArch64_AAPCS,
(sequence "X%u", 9, 15))>;
(sequence "X%u", 10, 14))>;

def CSR_AArch64_RT_AllRegs : CalleeSavedRegs<(add CSR_AArch64_RT_MostRegs,
(sequence "Q%u", 8, 31))>;
Expand Down
27 changes: 27 additions & 0 deletions llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2044,6 +2044,26 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
: 0;

if (windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding)) {
// Find an available register to store value of VG to.
unsigned X15Scratch = AArch64::NoRegister;
const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
if (llvm::any_of(MBB.liveins(),
[&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) {
return STI.getRegisterInfo()->isSuperOrSubRegisterEq(
AArch64::X15, LiveIn.PhysReg);
})) {
X15Scratch = findScratchNonCalleeSaveRegister(&MBB);
assert(X15Scratch != AArch64::NoRegister);
#ifndef NDEBUG
LiveRegs.removeReg(AArch64::X15); // ignore X15 since we restore it
#endif
BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), X15Scratch)
.addReg(AArch64::XZR)
.addReg(AArch64::X15, RegState::Undef)
.addReg(AArch64::X15, RegState::Implicit)
.setMIFlag(MachineInstr::FrameSetup);
}

uint64_t NumWords = (NumBytes + RealignmentPadding) >> 4;
if (NeedsWinCFI) {
HasWinCFI = true;
Expand Down Expand Up @@ -2166,6 +2186,13 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// we've set a frame pointer and already finished the SEH prologue.
assert(!NeedsWinCFI);
}
if (X15Scratch != AArch64::NoRegister) {
BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), AArch64::X15)
.addReg(AArch64::XZR)
.addReg(X15Scratch, RegState::Undef)
.addReg(X15Scratch, RegState::Implicit)
.setMIFlag(MachineInstr::FrameSetup);
}
}

StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize;
Expand Down
Loading