
[x86] Enable indirect tail calls with more arguments #137643


Open: zmodem wants to merge 9 commits into main

Conversation

@zmodem zmodem (Collaborator) commented Apr 28, 2025

X86ISelDAGToDAG's isCalleeLoad / moveBelowOrigChain tries to move the load instruction next to the call so the two can be folded, but previously allowed only a single CopyToReg node in between.

This patch makes it look through multiple CopyToReg, while being careful to only perform the transformation when the load+call can be folded.

Fixes #136848

zmodem added 2 commits April 25, 2025 18:34

  • …two args
  • this folds more stuff, but also finds new breakages

    Fixes llvm#136848
@llvmbot llvmbot (Member) commented Apr 28, 2025

@llvm/pr-subscribers-backend-x86

Author: Hans Wennborg (zmodem)


Full diff: https://github.com/llvm/llvm-project/pull/137643.diff

3 Files Affected:

  • (modified) llvm/lib/Target/X86/X86ISelDAGToDAG.cpp (+46-13)
  • (modified) llvm/test/CodeGen/X86/cfguard-checks.ll (+1-2)
  • (added) llvm/test/CodeGen/X86/fold-call-4.ll (+13)
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 01118beb9cf5e..7d6359f701368 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -890,6 +890,12 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
       LD->getExtensionType() != ISD::NON_EXTLOAD)
     return false;
 
+  // If the load's outgoing chain has more than one use, we can't (currently)
+  // move the load since we'd most likely create a loop. TODO: Maybe it could
+  // work if moveBelowOrigChain() updated *all* the chain users.
+  if (!Callee.getValue(1).hasOneUse())
+    return false;
+
   // Now let's find the callseq_start.
   while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
     if (!Chain.hasOneUse())
@@ -897,20 +903,31 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
     Chain = Chain.getOperand(0);
   }
 
-  if (!Chain.getNumOperands())
-    return false;
-  // Since we are not checking for AA here, conservatively abort if the chain
-  // writes to memory. It's not safe to move the callee (a load) across a store.
-  if (isa<MemSDNode>(Chain.getNode()) &&
-      cast<MemSDNode>(Chain.getNode())->writeMem())
+  while (true) {
+    if (!Chain.getNumOperands())
+      return false;
+    // Since we are not checking for AA here, conservatively abort if the chain
+    // writes to memory. It's not safe to move the callee (a load) across a store.
+    if (isa<MemSDNode>(Chain.getNode()) &&
+        cast<MemSDNode>(Chain.getNode())->writeMem())
+      return false;
+
+    if (Chain.getOperand(0).getNode() == Callee.getNode())
+      return true;
+    if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
+        Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
+        Callee.getValue(1).hasOneUse())
+      return true;
+
+    // Look past CopyToRegs. We only walk one path, so the chain mustn't branch.
+    if (Chain.getOperand(0).getOpcode() == ISD::CopyToReg &&
+        Chain.getOperand(0).getValue(0).hasOneUse()) {
+      Chain = Chain.getOperand(0);
+      continue;
+    }
+
     return false;
-  if (Chain.getOperand(0).getNode() == Callee.getNode())
-    return true;
-  if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
-      Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
-      Callee.getValue(1).hasOneUse())
-    return true;
-  return false;
+  }
 }
 
 static bool isEndbrImm64(uint64_t Imm) {
@@ -1353,6 +1370,22 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
          (N->getOpcode() == X86ISD::TC_RETURN &&
           (Subtarget->is64Bit() ||
            !getTargetMachine().isPositionIndependent())))) {
+
+      if (N->getOpcode() == X86ISD::TC_RETURN) {
+        // There needs to be enough non-callee-saved GPRs available to compute
+        // the load address if folded into the tailcall. See how the
+        // X86tcret_6regs and X86tcret_1reg classes are used and defined.
+        unsigned NumRegs = 0;
+        for (unsigned I = 3, E = N->getNumOperands(); I != E; ++I) {
+          if (isa<RegisterSDNode>(N->getOperand(I)))
+            ++NumRegs;
+        }
+        if (!Subtarget->is64Bit() && NumRegs > 1)
+          continue;
+        if (NumRegs > 6)
+          continue;
+      }
+
       /// Also try moving call address load from outside callseq_start to just
       /// before the call to allow it to be folded.
       ///
diff --git a/llvm/test/CodeGen/X86/cfguard-checks.ll b/llvm/test/CodeGen/X86/cfguard-checks.ll
index a727bbbfdcbe3..db19efaf910a3 100644
--- a/llvm/test/CodeGen/X86/cfguard-checks.ll
+++ b/llvm/test/CodeGen/X86/cfguard-checks.ll
@@ -210,8 +210,7 @@ entry:
   ; X64-LABEL: vmptr_thunk:
   ; X64:            movq (%rcx), %rax
   ; X64-NEXT:       movq 8(%rax), %rax
-  ; X64-NEXT:       movq __guard_dispatch_icall_fptr(%rip), %rdx
-  ; X64-NEXT:       rex64 jmpq *%rdx            # TAILCALL
+  ; X64-NEXT:       rex64 jmpq *__guard_dispatch_icall_fptr(%rip) # TAILCALL
   ; X64-NOT:   callq
 }
 
diff --git a/llvm/test/CodeGen/X86/fold-call-4.ll b/llvm/test/CodeGen/X86/fold-call-4.ll
new file mode 100644
index 0000000000000..708e05a0bfff0
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fold-call-4.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+; The callee address computation should get folded into the call.
+; CHECK-LABEL: f:
+; CHECK-NOT: mov
+; CHECK: jmpq *(%rdi,%rsi,8)
+define void @f(ptr %table, i64 %idx, i64 %aux1, i64 %aux2, i64 %aux3) {
+entry:
+  %arrayidx = getelementptr inbounds ptr, ptr %table, i64 %idx
+  %funcptr = load ptr, ptr %arrayidx, align 8
+  tail call void %funcptr(ptr %table, i64 %idx, i64 %aux1, i64 %aux2, i64 %aux3)
+  ret void
+}


github-actions bot commented Apr 28, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@rnk rnk (Collaborator) left a comment

Nice! The comments say to me this should be safe, but I'm honestly not sufficiently confident that I can read the SDAG logic well enough to know that it's correct. I would appreciate it if one of the other reviewers can take a look if they have time.

One thing you could do to build more confidence in the meantime would be to add more negative tests, like load ; store ; call load, perhaps substituting in other oddball memory-clobbering write operations. Or just point at this test if it already exists.

@zmodem zmodem (Collaborator, Author) commented Apr 29, 2025

I also don't feel super confident with the DAG stuff. One positive is that we're good at asserting when there's an issue with the graph structure, which makes me more comfortable touching this code.

The current version builds Chromium for Linux (x64) and Windows (x86 and x64) without any assertion failures (I found and fixed one issue).

The load ; store ; call load case is covered by

define void @test2(ptr nocapture %x) {
entry:
  %0 = load ptr, ptr %x
  store ptr null, ptr %x
  tail call void %0()
  ret void
}
Having inline asm in between also seems bad (a pre-existing bug?), so I added a check for that too.

The hasOneUse() checks I added generally prevent cycles: if the load's chain result had additional users, moving the load below the original chain could make a node transitively depend on itself, and such cycles trigger asserts in the lit tests.

@rnk rnk (Collaborator) left a comment

Thanks for the extra tests! Sorry I came up with more questions. My confidence in this code has increased, and I'd be OK to approve it after some iteration.

// the load address if folded into the tailcall. See how the
// X86tcret_6regs and X86tcret_1reg classes are used and defined.
unsigned NumRegs = 0;
for (unsigned I = 3, E = N->getNumOperands(); I != E; ++I) {
rnk (Collaborator):

Is there a way to avoid the magic number 3?

zmodem (Collaborator, Author):

I didn't find one, but I'll try to reduce the number of copies of this code.
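Not part of the patch, but to make the deduplication concrete: a minimal sketch of the kind of shared helper zmodem might extract. The helper name is hypothetical; the operand layout (chain, ptr, imm, regs..., glue) and the start index 3 come from the X86tcret fragments quoted later in this review.

#include "llvm/CodeGen/SelectionDAGNodes.h"

using namespace llvm;

// Hypothetical shared helper: count the RegisterSDNode operands of an
// X86tcret / TC_RETURN node. Operands 0-2 are the chain, the callee
// pointer, and the stack-adjustment immediate; the register operands
// follow, optionally terminated by glue.
static unsigned countTailCallRegOperands(const SDNode *N) {
  unsigned NumRegs = 0;
  for (unsigned I = 3, E = N->getNumOperands(); I != E; ++I)
    if (isa<RegisterSDNode>(N->getOperand(I)))
      ++NumRegs;
  return NumRegs;
}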

// X86tcret_6regs and X86tcret_1reg classes are used and defined.
unsigned NumRegs = 0;
for (unsigned I = 3, E = N->getNumOperands(); I != E; ++I) {
if (isa<RegisterSDNode>(N->getOperand(I)))
rnk (Collaborator):

We can skip XMM / FP register operands, so I would do a GPR64 class check here before counting up.

zmodem (Collaborator, Author):

Done.

}
if (!Subtarget->is64Bit() && NumRegs > 1)
continue;
if (NumRegs > 6)
rnk (Collaborator):

I think these values of 1 and 6 are informed by the SysV C calling conventions, and are incorrect for other calling conventions. You can probably construct a test case with custom conventions that use all available GPRs for parameters and starve the register allocator out.

I think this would fix a real issue on Windows x64, which by my reading only has 7 "volatile" GPRs:
https://learn.microsoft.com/en-us/cpp/build/x64-software-conventions?view=msvc-170#x64-register-usage

  • RAX: return
  • RCX, RDX, R8, R9: 4 param
  • R10, R11: scratch

Total: 7

If there's a way to pass something in R11, maybe via the nest parameter, this might form a tail call we can't register allocate. Or maybe there's some other convention.

I think good general fixes would be to look at the CSR mask from the target calling convention and count up the available GPRs, subtract the number of GPR register operands from that total, and check that we have at least two available GPRs (for base + index).

Alternatively we could only do this fold for C calling conventions where it's known to be safe, if we convince ourselves that it's impossible to use R10 or R11 with the MS ABI convention.
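A minimal sketch of the CSR-mask counting approach described above; this is illustrative, not code from the PR. The helper name countVolatileGPR64s and the final inequality are assumptions.

#include "X86RegisterInfo.h" // target-internal; declares X86::GR64RegClass
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/CallingConv.h"

using namespace llvm;

// Hypothetical helper: count the 64-bit GPRs that the given calling
// convention may clobber (i.e. does not require the callee to preserve),
// since only those are usable as scratch for a folded load address.
static unsigned countVolatileGPR64s(const MachineFunction &MF,
                                    const TargetRegisterInfo &TRI,
                                    CallingConv::ID CC) {
  const uint32_t *Mask = TRI.getCallPreservedMask(MF, CC);
  unsigned Count = 0;
  for (MCPhysReg Reg : X86::GR64RegClass)
    if (MachineOperand::clobbersPhysReg(Mask, Reg))
      ++Count; // not preserved across calls under CC
  // A real implementation would also exclude special registers such as
  // RSP and RIP, which are in GR64 but never available as scratch.
  return Count;
}

// The fold would then be allowed only when, roughly:
//   countVolatileGPR64s(MF, TRI, CC) >= NumRegArgs + 2  // base + index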

zmodem (Collaborator, Author):

You're right, Win64 has one less. I stole my code from the X86tcret_6regs fragment:

def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off),
                             (X86tcret node:$ptr, node:$off), [{
  // X86tcret args: (*chain, ptr, imm, regs..., glue)
  unsigned NumRegs = 0;
  for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i)
    if (isa<RegisterSDNode>(N->getOperand(i)) && ++NumRegs > 6)
      return false;
  return true;
}]>;

which is what the folding pattern for TCRETURNmi64 uses:

// Don't fold loads into X86tcret requiring more than 6 regs.
// There wouldn't be enough scratch registers for base+index.
def : Pat<(X86tcret_6regs (load addr:$dst), timm:$off),
          (TCRETURNmi64 addr:$dst, timm:$off)>,
      Requires<[In64BitMode, NotUseIndirectThunkCalls]>;

So that seems wrong for Win64.

I think the source of truth here is the register class that the folded instruction actually uses: ptr_rc_tailcall, which is defined by X86RegisterInfo::getGPRsForTailCall:

const TargetRegisterClass *
X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  if (IsWin64 || (F.getCallingConv() == CallingConv::Win64))
    return &X86::GR64_TCW64RegClass;
  else if (Is64Bit)
    return &X86::GR64_TCRegClass;
  bool hasHipeCC = (F.getCallingConv() == CallingConv::HiPE);
  if (hasHipeCC)
    return &X86::GR32RegClass;
  return &X86::GR32_TCRegClass;
}

That one seems to handle Win64 correctly, and also takes the calling convention into account in general.


So I think X86tcret_6regs should not hard-code 6, but check the ptr_rc_tailcall register class, and we should extract the code into a function that we can also use when moving the load.

And we should do the same for X86tcret_1reg, which is similar but has some differences:

def X86tcret_1reg : PatFrag<(ops node:$ptr, node:$off),
                            (X86tcret node:$ptr, node:$off), [{
  // X86tcret args: (*chain, ptr, imm, regs..., glue)
  unsigned NumRegs = 1;
  const SDValue &BasePtr = cast<LoadSDNode>(N->getOperand(1))->getBasePtr();
  if (isa<FrameIndexSDNode>(BasePtr))
    NumRegs = 3;
  else if (BasePtr->getNumOperands() &&
           isa<GlobalAddressSDNode>(BasePtr->getOperand(0)))
    NumRegs = 3;
  for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i)
    if (isa<RegisterSDNode>(N->getOperand(i)) && (NumRegs-- == 0))
      return false;
  return true;
}]>;

It's checking whether the load uses a frame slot or a global, in which case it figures that doesn't use up any extra registers. I'm not 100% convinced that's true for the global case? And shouldn't we do the same check in X86tcret_6regs?
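To make the proposed refactoring concrete, the extracted check might look roughly like the following. The function name and the reservation of exactly two registers for base+index are my assumptions; only getGPRsForTailCall and the X86tcret operand layout come from the thread.

#include "X86RegisterInfo.h" // target-internal; declares getGPRsForTailCall()
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"

using namespace llvm;

// Hypothetical extracted check: derive the register budget from the
// tail-call GPR class (ptr_rc_tailcall) instead of hard-coding 6 or 1,
// so Win64 and other conventions fall out correctly.
static bool enoughTailCallRegs(const X86RegisterInfo &TRI,
                               const MachineFunction &MF,
                               const SDNode *TCRet) {
  // X86tcret args: (*chain, ptr, imm, regs..., glue)
  unsigned NumRegs = 0;
  for (unsigned I = 3, E = TCRet->getNumOperands(); I != E; ++I)
    if (isa<RegisterSDNode>(TCRet->getOperand(I)))
      ++NumRegs;
  // Keep two registers free for the folded memory operand's base and index.
  const TargetRegisterClass *RC = TRI.getGPRsForTailCall(MF);
  return NumRegs + 2 <= RC->getNumRegs();
}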

// Moving across inline asm is not safe: it could do anything.
if (Chain.getNode()->getOpcode() == ISD::INLINEASM ||
    Chain.getNode()->getOpcode() == ISD::INLINEASM_BR)
  return false;
rnk (Collaborator):

Please just allow specific nodes, and forbid anything unknown. Trying to list out every possible relevant node is guaranteed to fall out of date at some point, even if you manage to come up with a complete list.

zmodem (Collaborator, Author):

Done.

@zmodem zmodem (Collaborator, Author) commented Apr 30, 2025

Thanks for the review! This should probably be landed as separate patches, but let's keep it as one for the review.

I'd appreciate extra eyes on the code from X86tcret_1reg (see the XXX comment).

Successfully merging this pull request may close these issues.

[x86] Unable to fold table lookup into tail call