From 17117c191b5fd5e9c047af74d39dc3a7be9d2091 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng
Date: Tue, 3 Dec 2024 19:00:18 +0800
Subject: [PATCH 1/3] [RISCV] Correct the limit of RegPressureSet `GPRAll`

The generated limit is 33, which is the total number of scalar registers
plus 1 (for `DUMMY_REG_PAIR_WITH_X0`). This is not correct, because not all
scalar registers are available to the register allocator: 4-6 of them are
reserved (`x0`, `sp`, `gp` and `tp` are always reserved, and the frame
pointer and base pointer may be reserved as well). The limit therefore needs
to be reduced by the number of registers in the reserved set.

This change affects instruction scheduling, MachineLICM, and other passes
that track register pressure. Here are the spill/reload statistics on
`llvm-test-suite` with `-O3 -march=rva23u64`:

```
Metric: regalloc.NumSpills,regalloc.NumReloads

Program                                        regalloc.NumSpills                regalloc.NumReloads
                                               baseline after    diff            baseline after    diff
External/S...NT2017rate/502.gcc_r/502.gcc_r    11812.00 11338.00 -474.00         26813.00 25751.00 -1062.00
External/S...T2017speed/602.gcc_s/602.gcc_s    11812.00 11338.00 -474.00         26813.00 25751.00 -1062.00
External/S...te/526.blender_r/526.blender_r    13514.00 13228.00 -286.00         27456.00 27260.00 -196.00
External/S...00.perlbench_s/600.perlbench_s     4398.00  4274.00 -124.00          9745.00  9341.00 -404.00
External/S...00.perlbench_r/500.perlbench_r     4398.00  4274.00 -124.00          9745.00  9341.00 -404.00
SingleSour...nchmarks/Adobe-C++/loop_unroll     1533.00  1413.00 -120.00          2943.00  2633.00 -310.00
External/S...rate/510.parest_r/510.parest_r    43985.00 43879.00 -106.00         87409.00 87309.00 -100.00
External/S...te/538.imagick_r/538.imagick_r     4160.00  4060.00 -100.00         10338.00 10244.00  -94.00
External/S...ed/638.imagick_s/638.imagick_s     4160.00  4060.00 -100.00         10338.00 10244.00  -94.00
MultiSourc...e/Applications/ClamAV/clamscan     2120.00  2023.00  -97.00          5035.00  4901.00 -134.00
MultiSourc...sumer-typeset/consumer-typeset     1218.00  1129.00  -89.00          3041.00  2887.00 -154.00
MultiSourc.../Applications/JM/ldecod/ldecod     1341.00  1263.00  -78.00          2316.00  2238.00  -78.00
External/S...rate/511.povray_r/511.povray_r     1734.00  1659.00  -75.00          3413.00  3246.00 -167.00
MultiSource/Applications/SPASS/SPASS            1442.00  1376.00  -66.00          2954.00  2837.00 -117.00
MultiSourc.../DOE-ProxyApps-C++/CLAMR/CLAMR     1628.00  1568.00  -60.00          3026.00  2958.00  -68.00

       regalloc.NumSpills                      regalloc.NumReloads
run    baseline  after     diff                baseline    after       diff
mean   86.725206 85.041122 -1.684083           1363.122137 1342.900383 -3.212869
```

Co-authored-by: BoyaoWang430
---
 llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp   |   14 +
 llvm/lib/Target/RISCV/RISCVRegisterInfo.h     |    2 +
 llvm/test/CodeGen/RISCV/pr69586.ll            |  821 ++---
 .../RISCV/rvv/fixed-vectors-masked-scatter.ll |   78 +-
 .../RISCV/rvv/fixed-vectors-setcc-fp-vp.ll    | 2208 +++++------
 .../RISCV/rvv/intrinsic-vector-match.ll       |  700 ++--
 .../RISCV/rvv/vxrm-insert-out-of-loop.ll      |    5 +-
 ...lar-shift-by-byte-multiple-legalization.ll | 3240 ++++++++---------
 .../RISCV/wide-scalar-shift-legalization.ll   |  646 ++--
 9 files changed, 3755 insertions(+), 3959 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index cfcc3119257f6..a73bd1621a739 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -934,3 +934,17 @@ bool RISCVRegisterInfo::getRegAllocationHints(
 
   return BaseImplRetVal;
 }
+
+unsigned RISCVRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
+                                                   unsigned Idx) const {
+  if (Idx == RISCV::RegisterPressureSets::GPRAll) {
+    unsigned Reserved = 0;
+    BitVector ReservedRegs = getReservedRegs(MF);
+    for (MCPhysReg Reg = RISCV::X0_H; Reg <= RISCV::X31_H; Reg++)
+      if (ReservedRegs.test(Reg))
+        Reserved++;
+
+    
return 32 - Reserved; + } + return RISCVGenRegisterInfo::getRegPressureSetLimit(MF, Idx); +} diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h index 3ab79694e175c..ca4934de2f52d 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h @@ -144,6 +144,8 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo { static bool isRVVRegClass(const TargetRegisterClass *RC) { return RISCVRI::isVRegClass(RC->TSFlags); } + unsigned getRegPressureSetLimit(const MachineFunction &MF, + unsigned Idx) const override; }; } // namespace llvm diff --git a/llvm/test/CodeGen/RISCV/pr69586.ll b/llvm/test/CodeGen/RISCV/pr69586.ll index 9fc9a3c42867e..21e64ada7061a 100644 --- a/llvm/test/CodeGen/RISCV/pr69586.ll +++ b/llvm/test/CodeGen/RISCV/pr69586.ll @@ -44,59 +44,50 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: addi a5, a7, 512 ; NOREMAT-NEXT: addi a4, a7, 1024 ; NOREMAT-NEXT: addi a6, a7, 1536 -; NOREMAT-NEXT: li t4, 1 -; NOREMAT-NEXT: li a2, 5 -; NOREMAT-NEXT: li t1, 3 -; NOREMAT-NEXT: li t0, 7 -; NOREMAT-NEXT: lui t5, 1 -; NOREMAT-NEXT: li s4, 9 -; NOREMAT-NEXT: li s6, 11 -; NOREMAT-NEXT: li s9, 13 -; NOREMAT-NEXT: li ra, 15 -; NOREMAT-NEXT: lui t2, 2 -; NOREMAT-NEXT: lui s1, 3 -; NOREMAT-NEXT: lui t3, 4 -; NOREMAT-NEXT: lui s0, 5 -; NOREMAT-NEXT: lui s3, 6 -; NOREMAT-NEXT: lui s7, 7 +; NOREMAT-NEXT: li t1, 1 +; NOREMAT-NEXT: li a3, 5 +; NOREMAT-NEXT: li t0, 3 +; NOREMAT-NEXT: li a2, 7 +; NOREMAT-NEXT: lui t2, 1 +; NOREMAT-NEXT: li s5, 9 +; NOREMAT-NEXT: li s8, 11 +; NOREMAT-NEXT: lui s1, 2 +; NOREMAT-NEXT: lui t5, 3 +; NOREMAT-NEXT: lui s11, 4 +; NOREMAT-NEXT: lui ra, 5 +; NOREMAT-NEXT: lui t3, 6 +; NOREMAT-NEXT: lui s0, 7 ; NOREMAT-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; NOREMAT-NEXT: slli t4, t4, 11 -; NOREMAT-NEXT: sd t4, 512(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: slli a3, a2, 9 -; NOREMAT-NEXT: sd a3, 504(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: slli t6, t1, 10 -; NOREMAT-NEXT: slli s2, t0, 9 -; NOREMAT-NEXT: add a0, a7, t5 -; NOREMAT-NEXT: lui s11, 1 -; NOREMAT-NEXT: slli s4, s4, 9 -; NOREMAT-NEXT: slli s5, a2, 10 -; NOREMAT-NEXT: slli s6, s6, 9 -; NOREMAT-NEXT: slli s8, t1, 11 +; NOREMAT-NEXT: slli t4, t1, 11 +; NOREMAT-NEXT: slli t6, a3, 9 +; NOREMAT-NEXT: slli s2, t0, 10 +; NOREMAT-NEXT: slli s4, a2, 9 +; NOREMAT-NEXT: add a0, a7, t2 ; NOREMAT-NEXT: vle32.v v8, (a5) -; NOREMAT-NEXT: slli s9, s9, 9 -; NOREMAT-NEXT: li t5, 13 +; NOREMAT-NEXT: slli s5, s5, 9 ; NOREMAT-NEXT: vle32.v v10, (a4) ; NOREMAT-NEXT: vle32.v v2, (a4) -; NOREMAT-NEXT: slli s10, t0, 10 +; NOREMAT-NEXT: slli s6, a3, 10 ; NOREMAT-NEXT: vle32.v v0, (a6) ; NOREMAT-NEXT: vle32.v v12, (a6) -; NOREMAT-NEXT: slli ra, ra, 9 +; NOREMAT-NEXT: slli s8, s8, 9 +; NOREMAT-NEXT: slli s9, t0, 11 ; NOREMAT-NEXT: vle32.v v4, (a0) ; NOREMAT-NEXT: vle32.v v20, (a0) -; NOREMAT-NEXT: add a4, a7, t2 +; NOREMAT-NEXT: add a4, a7, s1 ; NOREMAT-NEXT: vle32.v v6, (a4) ; NOREMAT-NEXT: vle32.v v30, (a4) -; NOREMAT-NEXT: add a4, a7, s1 +; NOREMAT-NEXT: add a4, a7, t5 ; NOREMAT-NEXT: vle32.v v28, (a4) ; NOREMAT-NEXT: vle32.v v26, (a4) -; NOREMAT-NEXT: add a4, a7, t3 +; NOREMAT-NEXT: add a4, a7, s11 ; NOREMAT-NEXT: vle32.v v24, (a4) ; NOREMAT-NEXT: vle32.v v22, (a4) -; NOREMAT-NEXT: add a4, a7, s0 +; NOREMAT-NEXT: add a4, a7, ra ; NOREMAT-NEXT: vle32.v v14, (a7) ; NOREMAT-NEXT: vle32.v v18, (a4) ; NOREMAT-NEXT: vle32.v v16, (a4) -; NOREMAT-NEXT: add a4, a7, s3 +; NOREMAT-NEXT: add a4, a7, t3 ; NOREMAT-NEXT: sf.vc.vv 3, 0, v14, v8 ; 
NOREMAT-NEXT: vle32.v v14, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v10 @@ -107,78 +98,86 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: vle32.v v10, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v0 ; NOREMAT-NEXT: vle32.v v2, (a4) -; NOREMAT-NEXT: add a4, a7, a3 +; NOREMAT-NEXT: add a4, a7, t6 ; NOREMAT-NEXT: vle32.v v0, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v10 ; NOREMAT-NEXT: vle32.v v10, (a4) -; NOREMAT-NEXT: add a4, a7, t6 +; NOREMAT-NEXT: add a4, a7, s2 ; NOREMAT-NEXT: vle32.v v12, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v0 ; NOREMAT-NEXT: vle32.v v2, (a4) -; NOREMAT-NEXT: add a4, a7, s2 +; NOREMAT-NEXT: add a4, a7, s4 ; NOREMAT-NEXT: vle32.v v8, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 ; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: add a4, a7, s7 +; NOREMAT-NEXT: add a4, a7, s0 ; NOREMAT-NEXT: vle32.v v0, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v8 ; NOREMAT-NEXT: vle32.v v10, (a4) -; NOREMAT-NEXT: add a4, a7, s4 +; NOREMAT-NEXT: add a4, a7, s5 ; NOREMAT-NEXT: vle32.v v8, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 ; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: add a4, a7, s5 +; NOREMAT-NEXT: add a4, a7, s6 ; NOREMAT-NEXT: vle32.v v4, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v20, v8 ; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: add a4, a7, s6 +; NOREMAT-NEXT: add a4, a7, s8 ; NOREMAT-NEXT: vle32.v v20, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 ; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: add a4, a7, s8 +; NOREMAT-NEXT: add a4, a7, s9 ; NOREMAT-NEXT: vle32.v v4, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v20 ; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: add a4, a7, s9 +; NOREMAT-NEXT: li t5, 13 +; NOREMAT-NEXT: slli a4, t5, 9 +; NOREMAT-NEXT: sd a4, 624(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a4, a7, a4 ; NOREMAT-NEXT: vle32.v v20, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 ; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: add a4, a7, s10 +; NOREMAT-NEXT: slli a4, a2, 10 +; NOREMAT-NEXT: sd a4, 616(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a4, a7, a4 ; NOREMAT-NEXT: vle32.v v4, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v20 ; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: add a4, a7, ra +; NOREMAT-NEXT: li a6, 15 +; NOREMAT-NEXT: slli a4, a6, 9 +; NOREMAT-NEXT: sd a4, 608(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a4, a7, a4 ; NOREMAT-NEXT: vle32.v v2, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 -; NOREMAT-NEXT: lui t4, 8 -; NOREMAT-NEXT: add a5, a7, t4 +; NOREMAT-NEXT: lui t1, 8 +; NOREMAT-NEXT: add a5, a7, t1 ; NOREMAT-NEXT: vle32.v v20, (a5) ; NOREMAT-NEXT: vle32.v v12, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v2 ; NOREMAT-NEXT: li a4, 17 ; NOREMAT-NEXT: slli a4, a4, 9 -; NOREMAT-NEXT: li s1, 17 -; NOREMAT-NEXT: sd a4, 624(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: li t2, 17 +; NOREMAT-NEXT: sd a4, 600(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a4, a7, a4 ; NOREMAT-NEXT: vle32.v v8, (a4) ; NOREMAT-NEXT: vle32.v v4, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v6 ; NOREMAT-NEXT: li a5, 9 ; NOREMAT-NEXT: slli a4, a5, 10 -; NOREMAT-NEXT: sd a4, 616(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd a4, 592(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a4, a7, a4 ; NOREMAT-NEXT: vle32.v v12, (a4) ; NOREMAT-NEXT: vle32.v v6, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8 ; NOREMAT-NEXT: li a4, 19 ; NOREMAT-NEXT: slli a4, a4, 9 -; NOREMAT-NEXT: li t2, 19 -; NOREMAT-NEXT: sd a4, 608(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: li s1, 19 +; NOREMAT-NEXT: sd a4, 584(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a4, a7, a4 ; NOREMAT-NEXT: vle32.v v8, (a4) 
; NOREMAT-NEXT: vle32.v v30, (a4) -; NOREMAT-NEXT: slli a3, a2, 11 -; NOREMAT-NEXT: sd a3, 600(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: slli a3, a3, 11 +; NOREMAT-NEXT: sd a3, 576(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v12 ; NOREMAT-NEXT: add a3, a7, a3 ; NOREMAT-NEXT: vle32.v v12, (a3) @@ -186,46 +185,45 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8 ; NOREMAT-NEXT: li s7, 21 ; NOREMAT-NEXT: slli a3, s7, 9 -; NOREMAT-NEXT: sd a3, 592(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd a3, 568(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a3, a7, a3 ; NOREMAT-NEXT: vle32.v v8, (a3) ; NOREMAT-NEXT: vle32.v v6, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12 -; NOREMAT-NEXT: li a6, 11 -; NOREMAT-NEXT: slli a3, a6, 10 -; NOREMAT-NEXT: sd a3, 584(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: li a4, 11 +; NOREMAT-NEXT: slli a3, a4, 10 +; NOREMAT-NEXT: sd a3, 560(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a3, a7, a3 ; NOREMAT-NEXT: vle32.v v12, (a3) ; NOREMAT-NEXT: vle32.v v30, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v8 ; NOREMAT-NEXT: li s3, 23 -; NOREMAT-NEXT: slli a3, s3, 9 -; NOREMAT-NEXT: sd a3, 576(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: slli s10, s3, 9 +; NOREMAT-NEXT: add a3, a7, s10 ; NOREMAT-NEXT: vle32.v v8, (a3) ; NOREMAT-NEXT: vle32.v v4, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v12 ; NOREMAT-NEXT: li s0, 25 ; NOREMAT-NEXT: slli a3, s0, 9 -; NOREMAT-NEXT: sd a3, 568(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd a3, 552(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a3, a7, a3 ; NOREMAT-NEXT: vle32.v v12, (a3) ; NOREMAT-NEXT: vle32.v v6, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8 ; NOREMAT-NEXT: slli a3, t5, 10 -; NOREMAT-NEXT: sd a3, 560(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd a3, 544(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a3, a7, a3 ; NOREMAT-NEXT: vle32.v v8, (a3) ; NOREMAT-NEXT: vle32.v v30, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v28 ; NOREMAT-NEXT: li t3, 27 ; NOREMAT-NEXT: slli a3, t3, 9 -; NOREMAT-NEXT: sd a3, 552(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd a3, 536(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a3, a7, a3 ; NOREMAT-NEXT: vle32.v v28, (a3) ; NOREMAT-NEXT: vle32.v v4, (a3) -; NOREMAT-NEXT: slli a2, t0, 11 -; NOREMAT-NEXT: sd a2, 544(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: slli a2, a2, 11 +; NOREMAT-NEXT: sd a2, 528(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v12 ; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) @@ -233,39 +231,37 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8 ; NOREMAT-NEXT: li t0, 29 ; NOREMAT-NEXT: slli a2, t0, 9 -; NOREMAT-NEXT: sd a2, 536(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd a2, 520(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v28 -; NOREMAT-NEXT: li a3, 15 -; NOREMAT-NEXT: slli a2, a3, 10 -; NOREMAT-NEXT: sd a2, 528(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: slli a2, a6, 10 +; NOREMAT-NEXT: sd a2, 512(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v12 -; NOREMAT-NEXT: li t1, 31 -; NOREMAT-NEXT: slli a2, t1, 9 -; NOREMAT-NEXT: sd a2, 520(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 -; NOREMAT-NEXT: vle32.v v12, (a2) -; NOREMAT-NEXT: vle32.v v4, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v8 -; NOREMAT-NEXT: lui a4, 4 -; 
NOREMAT-NEXT: addiw a0, a4, 512 -; NOREMAT-NEXT: sd a0, 496(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: li a3, 31 +; NOREMAT-NEXT: slli a0, a3, 9 +; NOREMAT-NEXT: sd a0, 504(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a0, a7, a0 -; NOREMAT-NEXT: vle32.v v8, (a0) -; NOREMAT-NEXT: vle32.v v26, (a0) +; NOREMAT-NEXT: vle32.v v12, (a0) +; NOREMAT-NEXT: vle32.v v4, (a0) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v8 +; NOREMAT-NEXT: addiw a2, s11, 512 +; NOREMAT-NEXT: sd a2, 496(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v8, (a2) +; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v28 -; NOREMAT-NEXT: slli a2, s1, 10 +; NOREMAT-NEXT: slli a2, t2, 10 ; NOREMAT-NEXT: sd a2, 488(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12 -; NOREMAT-NEXT: addiw a2, a4, 1536 +; NOREMAT-NEXT: addiw a2, s11, 1536 ; NOREMAT-NEXT: sd a2, 480(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) @@ -277,27 +273,25 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: vle32.v v24, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v8 -; NOREMAT-NEXT: lui a5, 5 -; NOREMAT-NEXT: addiw a2, a5, -1536 +; NOREMAT-NEXT: addiw a2, ra, -1536 ; NOREMAT-NEXT: sd a2, 464(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v28 -; NOREMAT-NEXT: slli a2, t2, 10 +; NOREMAT-NEXT: slli a2, s1, 10 ; NOREMAT-NEXT: sd a2, 456(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: li t2, 19 ; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v12 -; NOREMAT-NEXT: addiw a2, a5, -512 +; NOREMAT-NEXT: addiw a2, ra, -512 ; NOREMAT-NEXT: sd a2, 448(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v24 -; NOREMAT-NEXT: addiw a2, a5, 512 +; NOREMAT-NEXT: addiw a2, ra, 512 ; NOREMAT-NEXT: sd a2, 440(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v24, (a2) @@ -309,20 +303,20 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v26 -; NOREMAT-NEXT: addiw a2, a5, 1536 +; NOREMAT-NEXT: addiw a2, ra, 1536 ; NOREMAT-NEXT: sd a2, 424(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: vle32.v v26, (a2) -; NOREMAT-NEXT: slli a2, a6, 11 +; NOREMAT-NEXT: slli a2, a4, 11 ; NOREMAT-NEXT: sd a2, 416(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v12 ; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v18 -; NOREMAT-NEXT: lui a6, 6 -; NOREMAT-NEXT: addiw a2, a6, -1536 +; NOREMAT-NEXT: lui a4, 6 +; NOREMAT-NEXT: addiw a2, a4, -1536 ; NOREMAT-NEXT: sd a2, 408(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v18, (a2) @@ -334,13 +328,13 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: vle32.v v16, (a2) ; NOREMAT-NEXT: vle32.v v24, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8 -; NOREMAT-NEXT: addiw a2, a6, -512 +; NOREMAT-NEXT: addiw a2, a4, -512 ; NOREMAT-NEXT: sd a2, 392(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 ; 
NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v22 -; NOREMAT-NEXT: addiw a2, a6, 512 +; NOREMAT-NEXT: addiw a2, a4, 512 ; NOREMAT-NEXT: sd a2, 384(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v22, (a2) @@ -352,7 +346,7 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: vle32.v v2, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v18 -; NOREMAT-NEXT: addiw a2, a6, 1536 +; NOREMAT-NEXT: addiw a2, a4, 1536 ; NOREMAT-NEXT: sd a2, 368(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v18, (a2) @@ -364,8 +358,8 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: vle32.v v16, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v8 -; NOREMAT-NEXT: lui s0, 7 -; NOREMAT-NEXT: addiw a2, s0, -1536 +; NOREMAT-NEXT: lui a5, 7 +; NOREMAT-NEXT: addiw a2, a5, -1536 ; NOREMAT-NEXT: sd a2, 352(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) @@ -379,15 +373,14 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: addi a0, sp, 640 ; NOREMAT-NEXT: vl2r.v v12, (a0) # Unknown-size Folded Reload ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v22 -; NOREMAT-NEXT: addiw a2, s0, -512 +; NOREMAT-NEXT: addiw a2, a5, -512 ; NOREMAT-NEXT: sd a2, 336(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v26 -; NOREMAT-NEXT: addiw a2, s0, 512 +; NOREMAT-NEXT: addiw a2, a5, 512 ; NOREMAT-NEXT: sd a2, 328(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: lui t3, 7 ; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) @@ -398,30 +391,30 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: vle32.v v18, (a2) ; NOREMAT-NEXT: vle32.v v2, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v16 -; NOREMAT-NEXT: addiw a2, t3, 1536 +; NOREMAT-NEXT: addiw a2, a5, 1536 ; NOREMAT-NEXT: sd a2, 312(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v16, (a2) ; NOREMAT-NEXT: vle32.v v28, (a2) -; NOREMAT-NEXT: slli a2, a3, 11 +; NOREMAT-NEXT: slli a2, a6, 11 ; NOREMAT-NEXT: sd a2, 304(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8 ; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v14 -; NOREMAT-NEXT: addiw a2, t4, -1536 +; NOREMAT-NEXT: addiw a2, t1, -1536 ; NOREMAT-NEXT: sd a2, 296(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v14, (a2) ; NOREMAT-NEXT: vle32.v v24, (a2) -; NOREMAT-NEXT: slli a2, t1, 10 +; NOREMAT-NEXT: slli a2, a3, 10 ; NOREMAT-NEXT: sd a2, 288(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v22 ; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) -; NOREMAT-NEXT: addiw a0, t4, -512 +; NOREMAT-NEXT: addiw a0, t1, -512 ; NOREMAT-NEXT: sd a0, 280(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a0, a7, a0 ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v0 @@ -438,32 +431,33 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: addi a0, a1, 1024 ; NOREMAT-NEXT: vse32.v v8, (a0) -; NOREMAT-NEXT: add s11, a1, s11 -; NOREMAT-NEXT: sd s11, 272(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: lui a0, 1 +; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: sd a0, 272(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: lui a0, 2 ; 
NOREMAT-NEXT: add a0, a1, a0 ; NOREMAT-NEXT: sd a0, 264(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: lui a0, 3 ; NOREMAT-NEXT: add a0, a1, a0 ; NOREMAT-NEXT: sd a0, 256(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add s11, a1, s11 +; NOREMAT-NEXT: sd s11, 248(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 240(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a4, a1, a4 -; NOREMAT-NEXT: sd a4, 248(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd a4, 232(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a5, a1, a5 -; NOREMAT-NEXT: sd a5, 240(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a6, a1, a6 -; NOREMAT-NEXT: sd a6, 232(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add t3, a1, t3 -; NOREMAT-NEXT: sd t3, 224(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a0, a1, t4 +; NOREMAT-NEXT: sd a5, 224(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a0, a1, t1 ; NOREMAT-NEXT: sd a0, 216(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: addiw a0, t4, 512 +; NOREMAT-NEXT: addiw a0, t1, 512 ; NOREMAT-NEXT: sd a0, 192(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: addiw a0, t4, 1024 +; NOREMAT-NEXT: addiw a0, t1, 1024 ; NOREMAT-NEXT: sd a0, 176(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: addiw a0, t4, 1536 +; NOREMAT-NEXT: addiw a0, t1, 1536 ; NOREMAT-NEXT: sd a0, 160(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: slli s1, s1, 11 -; NOREMAT-NEXT: sd s1, 128(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: slli t2, t2, 11 +; NOREMAT-NEXT: sd t2, 128(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: lui a0, 9 ; NOREMAT-NEXT: addiw a2, a0, -1536 ; NOREMAT-NEXT: sd a2, 88(sp) # 8-byte Folded Spill @@ -476,7 +470,7 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: addiw s11, a0, 512 ; NOREMAT-NEXT: addiw s7, a0, 1024 ; NOREMAT-NEXT: addiw s3, a0, 1536 -; NOREMAT-NEXT: slli s1, t2, 11 +; NOREMAT-NEXT: slli s1, s1, 11 ; NOREMAT-NEXT: lui a0, 10 ; NOREMAT-NEXT: addiw t2, a0, -1536 ; NOREMAT-NEXT: addiw a7, a0, -1024 @@ -484,52 +478,52 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: add a2, a1, a0 ; NOREMAT-NEXT: sd a2, 200(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: addiw a0, a0, 512 -; NOREMAT-NEXT: ld a2, 512(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a2, a1, a2 -; NOREMAT-NEXT: ld a3, 504(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a3, a1, a3 -; NOREMAT-NEXT: add a5, a1, t6 -; NOREMAT-NEXT: add a6, a1, s2 -; NOREMAT-NEXT: add t0, a1, s4 -; NOREMAT-NEXT: add t1, a1, s5 -; NOREMAT-NEXT: add t3, a1, s6 -; NOREMAT-NEXT: add t4, a1, s8 -; NOREMAT-NEXT: add t5, a1, s9 -; NOREMAT-NEXT: add t6, a1, s10 -; NOREMAT-NEXT: add s0, a1, ra -; NOREMAT-NEXT: ld s2, 624(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add a2, a1, t4 +; NOREMAT-NEXT: add a3, a1, t6 +; NOREMAT-NEXT: add a5, a1, s2 +; NOREMAT-NEXT: add a6, a1, s4 +; NOREMAT-NEXT: add t0, a1, s5 +; NOREMAT-NEXT: add t1, a1, s6 +; NOREMAT-NEXT: add t3, a1, s8 +; NOREMAT-NEXT: add t4, a1, s9 +; NOREMAT-NEXT: ld t5, 624(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add t5, a1, t5 +; NOREMAT-NEXT: ld t6, 616(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add t6, a1, t6 +; NOREMAT-NEXT: ld s0, 608(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add s0, a1, s0 +; NOREMAT-NEXT: ld s2, 600(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s2, a1, s2 -; NOREMAT-NEXT: ld s4, 616(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s4, 592(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s4, a1, s4 -; NOREMAT-NEXT: ld s5, 608(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s5, 584(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s5, a1, s5 -; NOREMAT-NEXT: ld s6, 600(sp) # 
8-byte Folded Reload +; NOREMAT-NEXT: ld s6, 576(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s6, a1, s6 -; NOREMAT-NEXT: ld s8, 592(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s8, 568(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s8, a1, s8 -; NOREMAT-NEXT: ld s9, 584(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s9, 560(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s9, a1, s9 -; NOREMAT-NEXT: ld s10, 576(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s10, a1, s10 -; NOREMAT-NEXT: ld ra, 568(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld ra, 552(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra ; NOREMAT-NEXT: sd ra, 16(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: ld ra, 560(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld ra, 544(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra ; NOREMAT-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: ld ra, 552(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld ra, 536(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra ; NOREMAT-NEXT: sd ra, 32(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: ld ra, 544(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld ra, 528(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra ; NOREMAT-NEXT: sd ra, 48(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: ld ra, 536(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld ra, 520(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra ; NOREMAT-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: ld ra, 528(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld ra, 512(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra ; NOREMAT-NEXT: sd ra, 64(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: ld ra, 520(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld ra, 504(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra ; NOREMAT-NEXT: sd ra, 80(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 496(sp) # 8-byte Folded Reload @@ -923,10 +917,9 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: .cfi_offset s10, -96 ; REMAT-NEXT: .cfi_offset s11, -104 ; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: li a3, 18 -; REMAT-NEXT: mul a2, a2, a3 +; REMAT-NEXT: slli a2, a2, 3 ; REMAT-NEXT: sub sp, sp, a2 -; REMAT-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xa0, 0x04, 0x22, 0x11, 0x12, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 544 + 18 * vlenb +; REMAT-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xa0, 0x04, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 544 + 8 * vlenb ; REMAT-NEXT: li a4, 32 ; REMAT-NEXT: addi a5, a0, 512 ; REMAT-NEXT: addi a3, a0, 1024 @@ -963,23 +956,14 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: slli s6, s6, 9 ; REMAT-NEXT: li s7, 5 ; REMAT-NEXT: slli s7, s7, 11 -; REMAT-NEXT: li s8, 21 -; REMAT-NEXT: slli s8, s8, 9 -; REMAT-NEXT: li s9, 11 -; REMAT-NEXT: slli s9, s9, 10 -; REMAT-NEXT: li s10, 23 -; REMAT-NEXT: slli s10, s10, 9 -; REMAT-NEXT: lui s11, 3 -; REMAT-NEXT: li ra, 25 -; REMAT-NEXT: slli ra, ra, 9 ; REMAT-NEXT: vsetvli zero, a4, e32, m2, ta, ma ; REMAT-NEXT: vle32.v v8, (a5) -; REMAT-NEXT: li a4, 13 -; REMAT-NEXT: slli a4, a4, 10 +; REMAT-NEXT: li a4, 21 +; REMAT-NEXT: slli a4, a4, 9 ; REMAT-NEXT: vle32.v v10, (a3) ; REMAT-NEXT: vle32.v v12, (a3) -; REMAT-NEXT: li a3, 27 -; REMAT-NEXT: slli a3, a3, 9 +; REMAT-NEXT: li a3, 11 +; REMAT-NEXT: slli a3, a3, 10 ; REMAT-NEXT: vle32.v v14, (a2) ; REMAT-NEXT: vle32.v v16, (a2) ; REMAT-NEXT: add a2, a0, a6 @@ -995,7 +979,8 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: vle32.v v30, (a2) ; REMAT-NEXT: vle32.v v6, (a2) ; REMAT-NEXT: csrr a2, vlenb -; 
REMAT-NEXT: slli a2, a2, 4 +; REMAT-NEXT: li a5, 6 +; REMAT-NEXT: mul a2, a2, a5 ; REMAT-NEXT: add a2, sp, a2 ; REMAT-NEXT: addi a2, a2, 432 ; REMAT-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill @@ -1004,8 +989,7 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: vle32.v v2, (a2) ; REMAT-NEXT: vle32.v v6, (a2) ; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: li a5, 14 -; REMAT-NEXT: mul a2, a2, a5 +; REMAT-NEXT: slli a2, a2, 2 ; REMAT-NEXT: add a2, sp, a2 ; REMAT-NEXT: addi a2, a2, 432 ; REMAT-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill @@ -1019,17 +1003,11 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: sf.vc.vv 3, 0, v12, v14 ; REMAT-NEXT: vle32.v v0, (a2) ; REMAT-NEXT: add a2, a0, t5 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v16, v18 ; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: li a5, 12 -; REMAT-NEXT: mul a2, a2, a5 -; REMAT-NEXT: add a2, sp, a2 -; REMAT-NEXT: addi a2, a2, 432 -; REMAT-NEXT: vs2r.v v8, (a2) # Unknown-size Folded Spill -; REMAT-NEXT: add a2, a0, t6 +; REMAT-NEXT: sf.vc.vv 3, 0, v16, v18 ; REMAT-NEXT: vle32.v v18, (a2) +; REMAT-NEXT: add a2, a0, t6 +; REMAT-NEXT: vle32.v v16, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v20, v22 ; REMAT-NEXT: vle32.v v20, (a2) ; REMAT-NEXT: add a2, a0, s0 @@ -1039,403 +1017,340 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: add a2, a0, s1 ; REMAT-NEXT: vle32.v v26, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v28, v30 -; REMAT-NEXT: vle32.v v28, (a2) +; REMAT-NEXT: vle32.v v14, (a2) ; REMAT-NEXT: add a2, a0, s2 -; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: vle32.v v12, (a2) ; REMAT-NEXT: csrr a5, vlenb -; REMAT-NEXT: slli a5, a5, 4 +; REMAT-NEXT: li a6, 6 +; REMAT-NEXT: mul a5, a5, a6 ; REMAT-NEXT: add a5, sp, a5 ; REMAT-NEXT: addi a5, a5, 432 -; REMAT-NEXT: vl2r.v v12, (a5) # Unknown-size Folded Reload -; REMAT-NEXT: sf.vc.vv 3, 0, v12, v2 +; REMAT-NEXT: vl2r.v v28, (a5) # Unknown-size Folded Reload +; REMAT-NEXT: sf.vc.vv 3, 0, v28, v2 ; REMAT-NEXT: vle32.v v2, (a2) ; REMAT-NEXT: add a2, a0, s3 -; REMAT-NEXT: vle32.v v12, (a2) +; REMAT-NEXT: vle32.v v28, (a2) ; REMAT-NEXT: csrr a5, vlenb -; REMAT-NEXT: li a6, 14 -; REMAT-NEXT: mul a5, a5, a6 +; REMAT-NEXT: slli a5, a5, 2 ; REMAT-NEXT: add a5, sp, a5 ; REMAT-NEXT: addi a5, a5, 432 -; REMAT-NEXT: vl2r.v v16, (a5) # Unknown-size Folded Reload -; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4 -; REMAT-NEXT: vle32.v v30, (a2) +; REMAT-NEXT: vl2r.v v30, (a5) # Unknown-size Folded Reload +; REMAT-NEXT: sf.vc.vv 3, 0, v30, v4 +; REMAT-NEXT: vle32.v v4, (a2) ; REMAT-NEXT: add a2, a0, s4 -; REMAT-NEXT: vle32.v v16, (a2) +; REMAT-NEXT: vle32.v v30, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v6, v10 -; REMAT-NEXT: vle32.v v6, (a2) -; REMAT-NEXT: add a2, a0, s5 ; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v0, v14 -; REMAT-NEXT: vle32.v v4, (a2) -; REMAT-NEXT: add a2, a0, s6 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: csrr a5, vlenb -; REMAT-NEXT: li a6, 12 -; REMAT-NEXT: mul a5, a5, a6 -; REMAT-NEXT: add a5, sp, a5 -; REMAT-NEXT: addi a5, a5, 432 -; REMAT-NEXT: vl2r.v v0, (a5) # Unknown-size Folded Reload -; REMAT-NEXT: sf.vc.vv 3, 0, v0, v18 +; REMAT-NEXT: add a2, a0, s5 +; REMAT-NEXT: vle32.v v6, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v0, v8 ; REMAT-NEXT: vle32.v v0, (a2) -; REMAT-NEXT: add a2, a0, s7 +; REMAT-NEXT: add a2, a0, s6 +; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v18, v16 ; REMAT-NEXT: vle32.v v18, (a2) +; REMAT-NEXT: add a2, a0, s7 +; REMAT-NEXT: vle32.v v16, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v20, 
v22 ; REMAT-NEXT: vle32.v v22, (a2) -; REMAT-NEXT: add a2, a0, s8 +; REMAT-NEXT: add a2, a0, a4 ; REMAT-NEXT: vle32.v v20, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v24, v26 -; REMAT-NEXT: vle32.v v26, (a2) -; REMAT-NEXT: add a2, a0, s9 ; REMAT-NEXT: vle32.v v24, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v28, v8 -; REMAT-NEXT: vle32.v v28, (a2) -; REMAT-NEXT: add a2, a0, s10 -; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v2, v12 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: slli a2, a2, 3 -; REMAT-NEXT: add a2, sp, a2 -; REMAT-NEXT: addi a2, a2, 432 -; REMAT-NEXT: vs2r.v v12, (a2) # Unknown-size Folded Spill -; REMAT-NEXT: add a2, a0, s11 +; REMAT-NEXT: addi a2, sp, 432 +; REMAT-NEXT: vs2r.v v24, (a2) # Unknown-size Folded Spill +; REMAT-NEXT: add a2, a0, a3 +; REMAT-NEXT: vle32.v v24, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v14, v12 ; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v30, v16 -; REMAT-NEXT: vle32.v v16, (a2) -; REMAT-NEXT: add a2, a0, ra -; REMAT-NEXT: vle32.v v2, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v6, v10 -; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: slli a2, a2, 1 -; REMAT-NEXT: add a2, sp, a2 -; REMAT-NEXT: addi a2, a2, 432 -; REMAT-NEXT: vs2r.v v10, (a2) # Unknown-size Folded Spill -; REMAT-NEXT: add a2, a0, a4 -; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v4, v14 +; REMAT-NEXT: li a5, 23 +; REMAT-NEXT: slli a5, a5, 9 +; REMAT-NEXT: add a2, a0, a5 +; REMAT-NEXT: vle32.v v26, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v2, v28 ; REMAT-NEXT: vle32.v v14, (a2) ; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: slli a2, a2, 2 +; REMAT-NEXT: li a3, 6 +; REMAT-NEXT: mul a2, a2, a3 ; REMAT-NEXT: add a2, sp, a2 ; REMAT-NEXT: addi a2, a2, 432 ; REMAT-NEXT: vs2r.v v14, (a2) # Unknown-size Folded Spill -; REMAT-NEXT: add a2, a0, a3 +; REMAT-NEXT: lui s8, 3 +; REMAT-NEXT: add a2, a0, s8 +; REMAT-NEXT: vle32.v v28, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v4, v30 ; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v0, v18 -; REMAT-NEXT: vle32.v v18, (a2) -; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: slli a2, a2, 4 -; REMAT-NEXT: add a2, sp, a2 -; REMAT-NEXT: addi a2, a2, 432 -; REMAT-NEXT: vs2r.v v18, (a2) # Unknown-size Folded Spill -; REMAT-NEXT: li a5, 7 -; REMAT-NEXT: slli a5, a5, 11 -; REMAT-NEXT: add a2, a0, a5 -; REMAT-NEXT: vle32.v v18, (a2) -; REMAT-NEXT: addi a3, sp, 432 -; REMAT-NEXT: vs2r.v v18, (a3) # Unknown-size Folded Spill -; REMAT-NEXT: sf.vc.vv 3, 0, v22, v20 -; REMAT-NEXT: vle32.v v18, (a2) ; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: li a3, 14 -; REMAT-NEXT: mul a2, a2, a3 -; REMAT-NEXT: add a2, sp, a2 -; REMAT-NEXT: addi a2, a2, 432 -; REMAT-NEXT: vs2r.v v18, (a2) # Unknown-size Folded Spill -; REMAT-NEXT: li a2, 29 -; REMAT-NEXT: slli a2, a2, 9 -; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v18, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v26, v24 -; REMAT-NEXT: vle32.v v20, (a2) -; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: li a3, 12 -; REMAT-NEXT: mul a2, a2, a3 +; REMAT-NEXT: slli a2, a2, 2 ; REMAT-NEXT: add a2, sp, a2 ; REMAT-NEXT: addi a2, a2, 432 -; REMAT-NEXT: vs2r.v v20, (a2) # Unknown-size Folded Spill -; REMAT-NEXT: li a2, 15 -; REMAT-NEXT: slli a2, a2, 10 -; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: vs2r.v v14, (a2) # Unknown-size Folded Spill +; REMAT-NEXT: li s9, 25 +; REMAT-NEXT: slli s9, s9, 9 +; REMAT-NEXT: add a2, a0, s9 ; REMAT-NEXT: vle32.v v30, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v28, v8 -; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: li a3, 10 
-; REMAT-NEXT: mul a2, a2, a3 -; REMAT-NEXT: add a2, sp, a2 -; REMAT-NEXT: addi a2, a2, 432 -; REMAT-NEXT: vs2r.v v8, (a2) # Unknown-size Folded Spill -; REMAT-NEXT: li a2, 31 -; REMAT-NEXT: slli a2, a2, 9 -; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: sf.vc.vv 3, 0, v10, v6 +; REMAT-NEXT: vle32.v v14, (a2) +; REMAT-NEXT: li s10, 13 +; REMAT-NEXT: slli s10, s10, 10 +; REMAT-NEXT: add a2, a0, s10 ; REMAT-NEXT: vle32.v v6, (a2) -; REMAT-NEXT: csrr a3, vlenb -; REMAT-NEXT: slli a3, a3, 3 -; REMAT-NEXT: add a3, sp, a3 -; REMAT-NEXT: addi a3, a3, 432 -; REMAT-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v12 +; REMAT-NEXT: sf.vc.vv 3, 0, v0, v8 ; REMAT-NEXT: vle32.v v8, (a2) ; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: slli a2, a2, 3 +; REMAT-NEXT: slli a2, a2, 1 ; REMAT-NEXT: add a2, sp, a2 ; REMAT-NEXT: addi a2, a2, 432 ; REMAT-NEXT: vs2r.v v8, (a2) # Unknown-size Folded Spill -; REMAT-NEXT: lui a2, 4 -; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: li s11, 27 +; REMAT-NEXT: slli s11, s11, 9 +; REMAT-NEXT: add a2, a0, s11 ; REMAT-NEXT: vle32.v v4, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v16, v2 -; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: li a3, 6 -; REMAT-NEXT: mul a2, a2, a3 -; REMAT-NEXT: add a2, sp, a2 -; REMAT-NEXT: addi a2, a2, 432 -; REMAT-NEXT: vs2r.v v8, (a2) # Unknown-size Folded Spill -; REMAT-NEXT: lui a2, 4 -; REMAT-NEXT: addiw a2, a2, 512 -; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: sf.vc.vv 3, 0, v18, v16 +; REMAT-NEXT: vle32.v v18, (a2) +; REMAT-NEXT: li ra, 7 +; REMAT-NEXT: slli ra, ra, 11 +; REMAT-NEXT: add a2, a0, ra ; REMAT-NEXT: vle32.v v2, (a2) -; REMAT-NEXT: csrr a3, vlenb -; REMAT-NEXT: slli a3, a3, 1 -; REMAT-NEXT: add a3, sp, a3 -; REMAT-NEXT: addi a3, a3, 432 -; REMAT-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v10 +; REMAT-NEXT: sf.vc.vv 3, 0, v22, v20 ; REMAT-NEXT: vle32.v v20, (a2) -; REMAT-NEXT: li a2, 17 -; REMAT-NEXT: slli a2, a2, 10 +; REMAT-NEXT: li a2, 29 +; REMAT-NEXT: slli a2, a2, 9 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v0, (a2) -; REMAT-NEXT: csrr a3, vlenb -; REMAT-NEXT: slli a3, a3, 2 -; REMAT-NEXT: add a3, sp, a3 -; REMAT-NEXT: addi a3, a3, 432 +; REMAT-NEXT: addi a3, sp, 432 ; REMAT-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 +; REMAT-NEXT: sf.vc.vv 3, 0, v8, v24 ; REMAT-NEXT: vle32.v v22, (a2) -; REMAT-NEXT: lui a2, 4 -; REMAT-NEXT: addiw a2, a2, 1536 +; REMAT-NEXT: li a2, 15 +; REMAT-NEXT: slli a2, a2, 10 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v24, (a2) -; REMAT-NEXT: csrr a3, vlenb -; REMAT-NEXT: slli a3, a3, 4 -; REMAT-NEXT: add a3, sp, a3 -; REMAT-NEXT: addi a3, a3, 432 -; REMAT-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload -; REMAT-NEXT: addi a3, sp, 432 -; REMAT-NEXT: vl2r.v v10, (a3) # Unknown-size Folded Reload -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v10 +; REMAT-NEXT: sf.vc.vv 3, 0, v12, v26 ; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: li a2, 9 -; REMAT-NEXT: slli a2, a2, 11 +; REMAT-NEXT: li a2, 31 +; REMAT-NEXT: slli a2, a2, 9 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v26, (a2) ; REMAT-NEXT: csrr a3, vlenb -; REMAT-NEXT: li a4, 14 +; REMAT-NEXT: li a4, 6 ; REMAT-NEXT: mul a3, a3, a4 ; REMAT-NEXT: add a3, sp, a3 ; REMAT-NEXT: addi a3, a3, 432 ; REMAT-NEXT: vl2r.v v10, (a3) # Unknown-size Folded Reload -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v18 +; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28 ; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: lui a2, 5 -; REMAT-NEXT: addiw a2, a2, 
-1536 +; REMAT-NEXT: lui a2, 4 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v28, (a2) ; REMAT-NEXT: csrr a3, vlenb -; REMAT-NEXT: li a4, 12 -; REMAT-NEXT: mul a3, a3, a4 +; REMAT-NEXT: slli a3, a3, 2 ; REMAT-NEXT: add a3, sp, a3 ; REMAT-NEXT: addi a3, a3, 432 ; REMAT-NEXT: vl2r.v v12, (a3) # Unknown-size Folded Reload ; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30 ; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: li a2, 19 -; REMAT-NEXT: slli a2, a2, 10 +; REMAT-NEXT: lui a2, 4 +; REMAT-NEXT: addiw a2, a2, 512 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v30, (a2) -; REMAT-NEXT: csrr a3, vlenb -; REMAT-NEXT: li a4, 10 -; REMAT-NEXT: mul a3, a3, a4 -; REMAT-NEXT: add a3, sp, a3 -; REMAT-NEXT: addi a3, a3, 432 -; REMAT-NEXT: vl2r.v v14, (a3) # Unknown-size Folded Reload ; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6 ; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: lui a2, 5 -; REMAT-NEXT: addiw a2, a2, -512 +; REMAT-NEXT: li a2, 17 +; REMAT-NEXT: slli a2, a2, 10 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v6, (a2) ; REMAT-NEXT: csrr a3, vlenb -; REMAT-NEXT: slli a3, a3, 3 +; REMAT-NEXT: slli a3, a3, 1 ; REMAT-NEXT: add a3, sp, a3 ; REMAT-NEXT: addi a3, a3, 432 ; REMAT-NEXT: vl2r.v v16, (a3) # Unknown-size Folded Reload ; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4 ; REMAT-NEXT: vle32.v v16, (a2) -; REMAT-NEXT: lui a2, 5 +; REMAT-NEXT: lui a2, 4 +; REMAT-NEXT: addiw a2, a2, 1536 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v4, (a2) -; REMAT-NEXT: csrr a3, vlenb -; REMAT-NEXT: li a4, 6 -; REMAT-NEXT: mul a3, a3, a4 -; REMAT-NEXT: add a3, sp, a3 -; REMAT-NEXT: addi a3, a3, 432 -; REMAT-NEXT: vl2r.v v18, (a3) # Unknown-size Folded Reload ; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2 ; REMAT-NEXT: vle32.v v18, (a2) -; REMAT-NEXT: lui a2, 5 -; REMAT-NEXT: addiw a2, a2, 512 +; REMAT-NEXT: li a2, 9 +; REMAT-NEXT: slli a2, a2, 11 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v2, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0 ; REMAT-NEXT: vle32.v v20, (a2) -; REMAT-NEXT: li s7, 21 -; REMAT-NEXT: slli s7, s7, 10 -; REMAT-NEXT: add a2, a0, s7 +; REMAT-NEXT: lui a2, 5 +; REMAT-NEXT: addiw a2, a2, -1536 +; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v0, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24 ; REMAT-NEXT: vle32.v v22, (a2) -; REMAT-NEXT: lui s4, 5 -; REMAT-NEXT: addiw s4, s4, 1536 -; REMAT-NEXT: add a2, a0, s4 +; REMAT-NEXT: li a2, 19 +; REMAT-NEXT: slli a2, a2, 10 +; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v24, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26 ; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: li a2, 11 -; REMAT-NEXT: slli a2, a2, 11 +; REMAT-NEXT: lui a2, 5 +; REMAT-NEXT: addiw a2, a2, -512 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v26, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28 ; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: lui s3, 6 -; REMAT-NEXT: addiw s3, s3, -1536 -; REMAT-NEXT: add a2, a0, s3 +; REMAT-NEXT: lui a2, 5 +; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v28, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30 ; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: li s2, 23 -; REMAT-NEXT: slli s2, s2, 10 -; REMAT-NEXT: add a2, a0, s2 +; REMAT-NEXT: lui a2, 5 +; REMAT-NEXT: addiw a2, a2, 512 +; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v30, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6 ; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: lui a2, 6 -; REMAT-NEXT: addiw a2, a2, -512 +; REMAT-NEXT: li a2, 21 +; REMAT-NEXT: slli a2, a2, 10 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v6, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4 ; REMAT-NEXT: vle32.v v16, (a2) -; REMAT-NEXT: lui a2, 6 +; 
REMAT-NEXT: lui a2, 5 +; REMAT-NEXT: addiw a2, a2, 1536 ; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: lui s1, 6 ; REMAT-NEXT: vle32.v v4, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2 ; REMAT-NEXT: vle32.v v18, (a2) -; REMAT-NEXT: lui s0, 6 -; REMAT-NEXT: addiw s0, s0, 512 -; REMAT-NEXT: add a2, a0, s0 +; REMAT-NEXT: li a2, 11 +; REMAT-NEXT: slli a2, a2, 11 +; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v2, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0 ; REMAT-NEXT: vle32.v v20, (a2) -; REMAT-NEXT: li a2, 25 -; REMAT-NEXT: slli a2, a2, 10 +; REMAT-NEXT: lui a2, 6 +; REMAT-NEXT: addiw a2, a2, -1536 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v0, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24 ; REMAT-NEXT: vle32.v v22, (a2) -; REMAT-NEXT: lui t6, 6 -; REMAT-NEXT: addiw t6, t6, 1536 -; REMAT-NEXT: add a2, a0, t6 +; REMAT-NEXT: li a2, 23 +; REMAT-NEXT: slli a2, a2, 10 +; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v24, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26 ; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: li t5, 13 -; REMAT-NEXT: slli t5, t5, 11 -; REMAT-NEXT: add a2, a0, t5 +; REMAT-NEXT: lui a2, 6 +; REMAT-NEXT: addiw a2, a2, -512 +; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v26, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28 ; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: lui a2, 7 -; REMAT-NEXT: addiw a2, a2, -1536 +; REMAT-NEXT: lui a2, 6 ; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: lui s1, 6 ; REMAT-NEXT: vle32.v v28, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30 ; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: li t4, 27 -; REMAT-NEXT: slli t4, t4, 10 -; REMAT-NEXT: add a2, a0, t4 +; REMAT-NEXT: lui s0, 6 +; REMAT-NEXT: addiw s0, s0, 512 +; REMAT-NEXT: add a2, a0, s0 ; REMAT-NEXT: vle32.v v30, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6 ; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: lui a2, 7 -; REMAT-NEXT: addiw a2, a2, -512 +; REMAT-NEXT: li a2, 25 +; REMAT-NEXT: slli a2, a2, 10 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v6, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4 ; REMAT-NEXT: vle32.v v16, (a2) -; REMAT-NEXT: lui a2, 7 -; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: lui t3, 7 +; REMAT-NEXT: lui t6, 6 +; REMAT-NEXT: addiw t6, t6, 1536 +; REMAT-NEXT: add a2, a0, t6 ; REMAT-NEXT: vle32.v v4, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2 ; REMAT-NEXT: vle32.v v18, (a2) -; REMAT-NEXT: lui t2, 7 -; REMAT-NEXT: addiw t2, t2, 512 -; REMAT-NEXT: add a2, a0, t2 +; REMAT-NEXT: li t5, 13 +; REMAT-NEXT: slli t5, t5, 11 +; REMAT-NEXT: add a2, a0, t5 ; REMAT-NEXT: vle32.v v2, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0 ; REMAT-NEXT: vle32.v v20, (a2) -; REMAT-NEXT: li t1, 29 -; REMAT-NEXT: slli t1, t1, 10 -; REMAT-NEXT: add a2, a0, t1 +; REMAT-NEXT: lui a2, 7 +; REMAT-NEXT: addiw a2, a2, -1536 +; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v0, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24 ; REMAT-NEXT: vle32.v v22, (a2) -; REMAT-NEXT: lui t0, 7 -; REMAT-NEXT: addiw t0, t0, 1536 -; REMAT-NEXT: add a2, a0, t0 +; REMAT-NEXT: li t4, 27 +; REMAT-NEXT: slli t4, t4, 10 +; REMAT-NEXT: add a2, a0, t4 ; REMAT-NEXT: vle32.v v24, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26 ; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: li a7, 15 -; REMAT-NEXT: slli a7, a7, 11 -; REMAT-NEXT: add a2, a0, a7 +; REMAT-NEXT: lui a2, 7 +; REMAT-NEXT: addiw a2, a2, -512 +; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v26, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28 ; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: lui a6, 8 -; REMAT-NEXT: addiw a6, a6, -1536 -; REMAT-NEXT: add a2, a0, a6 +; REMAT-NEXT: lui a2, 7 +; REMAT-NEXT: add a2, 
a0, a2 +; REMAT-NEXT: lui t3, 7 ; REMAT-NEXT: vle32.v v28, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30 ; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: li a4, 31 -; REMAT-NEXT: slli a4, a4, 10 -; REMAT-NEXT: add a2, a0, a4 +; REMAT-NEXT: lui t2, 7 +; REMAT-NEXT: addiw t2, t2, 512 +; REMAT-NEXT: add a2, a0, t2 ; REMAT-NEXT: vle32.v v30, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6 ; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: lui a3, 8 -; REMAT-NEXT: addiw a3, a3, -512 -; REMAT-NEXT: add a2, a0, a3 +; REMAT-NEXT: li t1, 29 +; REMAT-NEXT: slli t1, t1, 10 +; REMAT-NEXT: add a2, a0, t1 ; REMAT-NEXT: vle32.v v6, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4 ; REMAT-NEXT: vle32.v v16, (a2) -; REMAT-NEXT: lui a2, 8 -; REMAT-NEXT: add a0, a0, a2 -; REMAT-NEXT: vle32.v v4, (a0) +; REMAT-NEXT: lui t0, 7 +; REMAT-NEXT: addiw t0, t0, 1536 +; REMAT-NEXT: add a2, a0, t0 +; REMAT-NEXT: vle32.v v4, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2 +; REMAT-NEXT: vle32.v v18, (a2) +; REMAT-NEXT: li a7, 15 +; REMAT-NEXT: slli a7, a7, 11 +; REMAT-NEXT: add a2, a0, a7 +; REMAT-NEXT: vle32.v v2, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0 +; REMAT-NEXT: vle32.v v20, (a2) +; REMAT-NEXT: lui a6, 8 +; REMAT-NEXT: addiw a6, a6, -1536 +; REMAT-NEXT: add a2, a0, a6 +; REMAT-NEXT: vle32.v v0, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24 +; REMAT-NEXT: vle32.v v22, (a2) +; REMAT-NEXT: li a4, 31 +; REMAT-NEXT: slli a4, a4, 10 +; REMAT-NEXT: add a2, a0, a4 +; REMAT-NEXT: vle32.v v24, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26 +; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: lui a3, 8 +; REMAT-NEXT: addiw a3, a3, -512 +; REMAT-NEXT: add a2, a0, a3 +; REMAT-NEXT: vle32.v v26, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28 +; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: lui a2, 8 +; REMAT-NEXT: add a0, a0, a2 +; REMAT-NEXT: vle32.v v28, (a0) ; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30 ; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6 ; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4 +; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2 +; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0 +; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24 +; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26 +; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28 ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: addi a0, a1, 1024 ; REMAT-NEXT: vse32.v v8, (a0) @@ -1482,45 +1397,38 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: slli a0, a0, 10 ; REMAT-NEXT: add a0, a1, a0 ; REMAT-NEXT: sd a0, 336(sp) # 8-byte Folded Spill -; REMAT-NEXT: li a0, 15 -; REMAT-NEXT: slli a0, a0, 9 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sd a0, 328(sp) # 8-byte Folded Spill -; REMAT-NEXT: lui a0, 2 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sd a0, 320(sp) # 8-byte Folded Spill -; REMAT-NEXT: li a0, 17 -; REMAT-NEXT: slli a0, a0, 9 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sd a0, 312(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s2, a1, s2 +; REMAT-NEXT: sd s2, 328(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s3, a1, s3 +; REMAT-NEXT: sd s3, 320(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s4, a1, s4 +; REMAT-NEXT: sd s4, 312(sp) # 8-byte Folded Spill ; REMAT-NEXT: add s5, a1, s5 ; REMAT-NEXT: sd s5, 304(sp) # 8-byte Folded Spill ; REMAT-NEXT: add s6, a1, s6 ; REMAT-NEXT: sd s6, 296(sp) # 8-byte Folded Spill -; REMAT-NEXT: li a0, 5 -; REMAT-NEXT: slli a0, a0, 11 +; REMAT-NEXT: add s7, a1, s7 +; REMAT-NEXT: sd s7, 288(sp) # 8-byte Folded Spill +; REMAT-NEXT: li a0, 21 +; REMAT-NEXT: slli a0, a0, 9 +; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: sd a0, 280(sp) # 8-byte Folded Spill +; REMAT-NEXT: li a0, 11 +; REMAT-NEXT: slli a0, a0, 10 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: 
sd a0, 288(sp) # 8-byte Folded Spill +; REMAT-NEXT: sd a0, 272(sp) # 8-byte Folded Spill +; REMAT-NEXT: add a5, a1, a5 +; REMAT-NEXT: sd a5, 264(sp) # 8-byte Folded Spill ; REMAT-NEXT: add s8, a1, s8 -; REMAT-NEXT: sd s8, 280(sp) # 8-byte Folded Spill +; REMAT-NEXT: sd s8, 256(sp) # 8-byte Folded Spill ; REMAT-NEXT: add s9, a1, s9 -; REMAT-NEXT: sd s9, 272(sp) # 8-byte Folded Spill +; REMAT-NEXT: sd s9, 248(sp) # 8-byte Folded Spill ; REMAT-NEXT: add s10, a1, s10 -; REMAT-NEXT: sd s10, 264(sp) # 8-byte Folded Spill +; REMAT-NEXT: sd s10, 240(sp) # 8-byte Folded Spill ; REMAT-NEXT: add s11, a1, s11 -; REMAT-NEXT: sd s11, 256(sp) # 8-byte Folded Spill +; REMAT-NEXT: sd s11, 232(sp) # 8-byte Folded Spill ; REMAT-NEXT: add ra, a1, ra -; REMAT-NEXT: sd ra, 248(sp) # 8-byte Folded Spill -; REMAT-NEXT: li a0, 13 -; REMAT-NEXT: slli a0, a0, 10 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sd a0, 240(sp) # 8-byte Folded Spill -; REMAT-NEXT: li a0, 27 -; REMAT-NEXT: slli a0, a0, 9 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sd a0, 232(sp) # 8-byte Folded Spill -; REMAT-NEXT: add a5, a1, a5 -; REMAT-NEXT: sd a5, 224(sp) # 8-byte Folded Spill +; REMAT-NEXT: sd ra, 224(sp) # 8-byte Folded Spill ; REMAT-NEXT: li a0, 29 ; REMAT-NEXT: slli a0, a0, 9 ; REMAT-NEXT: add a0, a1, a0 @@ -1571,18 +1479,26 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: addiw a0, a0, 512 ; REMAT-NEXT: add a0, a1, a0 ; REMAT-NEXT: sd a0, 120(sp) # 8-byte Folded Spill -; REMAT-NEXT: add s7, a1, s7 -; REMAT-NEXT: sd s7, 112(sp) # 8-byte Folded Spill -; REMAT-NEXT: add s4, a1, s4 -; REMAT-NEXT: sd s4, 104(sp) # 8-byte Folded Spill +; REMAT-NEXT: li a0, 21 +; REMAT-NEXT: slli a0, a0, 10 +; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: sd a0, 112(sp) # 8-byte Folded Spill +; REMAT-NEXT: lui a0, 5 +; REMAT-NEXT: addiw a0, a0, 1536 +; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: sd a0, 104(sp) # 8-byte Folded Spill ; REMAT-NEXT: li a0, 11 ; REMAT-NEXT: slli a0, a0, 11 ; REMAT-NEXT: add a0, a1, a0 ; REMAT-NEXT: sd a0, 96(sp) # 8-byte Folded Spill -; REMAT-NEXT: add s3, a1, s3 -; REMAT-NEXT: sd s3, 88(sp) # 8-byte Folded Spill -; REMAT-NEXT: add s2, a1, s2 -; REMAT-NEXT: sd s2, 80(sp) # 8-byte Folded Spill +; REMAT-NEXT: lui a0, 6 +; REMAT-NEXT: addiw a0, a0, -1536 +; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: sd a0, 88(sp) # 8-byte Folded Spill +; REMAT-NEXT: li a0, 23 +; REMAT-NEXT: slli a0, a0, 10 +; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: sd a0, 80(sp) # 8-byte Folded Spill ; REMAT-NEXT: lui a0, 6 ; REMAT-NEXT: addiw a0, a0, -512 ; REMAT-NEXT: add a0, a1, a0 @@ -1879,8 +1795,7 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: csrr a0, vlenb -; REMAT-NEXT: li a1, 18 -; REMAT-NEXT: mul a0, a0, a1 +; REMAT-NEXT: slli a0, a0, 3 ; REMAT-NEXT: add sp, sp, a0 ; REMAT-NEXT: .cfi_def_cfa sp, 544 ; REMAT-NEXT: ld ra, 536(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll index 575a757149ebb..0b5856a7000dd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll @@ -5682,28 +5682,16 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; ; RV32ZVE32F-LABEL: mscatter_baseidx_v8i64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: addi sp, sp, -48 -; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 48 -; RV32ZVE32F-NEXT: sw s0, 44(sp) # 4-byte Folded Spill -; 
RV32ZVE32F-NEXT: sw s1, 40(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s2, 36(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s3, 32(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s4, 28(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s5, 24(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s6, 20(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s7, 16(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s8, 12(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s9, 8(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s2, 4(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s3, 0(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 ; RV32ZVE32F-NEXT: .cfi_offset s2, -12 ; RV32ZVE32F-NEXT: .cfi_offset s3, -16 -; RV32ZVE32F-NEXT: .cfi_offset s4, -20 -; RV32ZVE32F-NEXT: .cfi_offset s5, -24 -; RV32ZVE32F-NEXT: .cfi_offset s6, -28 -; RV32ZVE32F-NEXT: .cfi_offset s7, -32 -; RV32ZVE32F-NEXT: .cfi_offset s8, -36 -; RV32ZVE32F-NEXT: .cfi_offset s9, -40 ; RV32ZVE32F-NEXT: .cfi_remember_state ; RV32ZVE32F-NEXT: lw a3, 56(a0) ; RV32ZVE32F-NEXT: lw a4, 60(a0) @@ -5715,30 +5703,30 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; RV32ZVE32F-NEXT: lw t4, 28(a0) ; RV32ZVE32F-NEXT: lw t1, 32(a0) ; RV32ZVE32F-NEXT: lw t2, 36(a0) +; RV32ZVE32F-NEXT: lw t5, 0(a2) +; RV32ZVE32F-NEXT: lw t6, 8(a2) +; RV32ZVE32F-NEXT: lw s0, 16(a2) +; RV32ZVE32F-NEXT: lw s1, 24(a2) +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vmv.v.x v8, t5 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t6 +; RV32ZVE32F-NEXT: lw t5, 32(a2) +; RV32ZVE32F-NEXT: lw t6, 40(a2) +; RV32ZVE32F-NEXT: lw s2, 48(a2) +; RV32ZVE32F-NEXT: lw s3, 56(a2) +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s0 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s1 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t5 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t6 ; RV32ZVE32F-NEXT: lw s0, 8(a0) ; RV32ZVE32F-NEXT: lw s1, 12(a0) ; RV32ZVE32F-NEXT: lw t5, 16(a0) ; RV32ZVE32F-NEXT: lw t6, 20(a0) -; RV32ZVE32F-NEXT: lw s2, 32(a2) -; RV32ZVE32F-NEXT: lw s3, 40(a2) -; RV32ZVE32F-NEXT: lw s4, 48(a2) -; RV32ZVE32F-NEXT: lw s5, 56(a2) -; RV32ZVE32F-NEXT: lw s6, 0(a2) -; RV32ZVE32F-NEXT: lw s7, 8(a2) -; RV32ZVE32F-NEXT: lw s8, 16(a2) -; RV32ZVE32F-NEXT: lw s9, 24(a2) -; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vmv.v.x v8, s6 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s2 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a2, v0 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s7 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s8 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s9 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s2 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s3 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s4 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s5 ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 ; RV32ZVE32F-NEXT: andi s2, a2, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 @@ -5771,27 +5759,15 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; RV32ZVE32F-NEXT: sw a3, 0(a0) ; RV32ZVE32F-NEXT: sw a4, 4(a0) ; RV32ZVE32F-NEXT: .LBB51_9: # %else14 -; RV32ZVE32F-NEXT: lw s0, 44(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s1, 40(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s2, 
36(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s3, 32(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s4, 28(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s5, 24(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s6, 20(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s7, 16(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s8, 12(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s9, 8(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s2, 4(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s3, 0(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: .cfi_restore s0 ; RV32ZVE32F-NEXT: .cfi_restore s1 ; RV32ZVE32F-NEXT: .cfi_restore s2 ; RV32ZVE32F-NEXT: .cfi_restore s3 -; RV32ZVE32F-NEXT: .cfi_restore s4 -; RV32ZVE32F-NEXT: .cfi_restore s5 -; RV32ZVE32F-NEXT: .cfi_restore s6 -; RV32ZVE32F-NEXT: .cfi_restore s7 -; RV32ZVE32F-NEXT: .cfi_restore s8 -; RV32ZVE32F-NEXT: .cfi_restore s9 -; RV32ZVE32F-NEXT: addi sp, sp, 48 +; RV32ZVE32F-NEXT: addi sp, sp, 16 ; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 0 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB51_10: # %cond.store diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll index 03d5762b4903e..036fee6a13ca4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll @@ -1364,19 +1364,16 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma ; ZVFHMIN32-NEXT: vslidedown.vi v26, v8, 15 -; ZVFHMIN32-NEXT: vslidedown.vi v20, v8, 14 -; ZVFHMIN32-NEXT: vslidedown.vi v28, v8, 13 -; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 12 -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: slli a2, a2, 1 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vslidedown.vi v28, v8, 14 +; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 13 +; ZVFHMIN32-NEXT: addi a2, sp, 848 ; ZVFHMIN32-NEXT: vs2r.v v10, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vslidedown.vi v4, v8, 11 -; ZVFHMIN32-NEXT: vslidedown.vi v2, v8, 10 -; ZVFHMIN32-NEXT: vslidedown.vi v30, v8, 9 -; ZVFHMIN32-NEXT: vslidedown.vi v22, v8, 8 -; ZVFHMIN32-NEXT: vmv.x.s a4, v16 +; ZVFHMIN32-NEXT: vslidedown.vi v6, v8, 12 +; ZVFHMIN32-NEXT: vslidedown.vi v2, v8, 11 +; ZVFHMIN32-NEXT: vslidedown.vi v22, v8, 10 +; ZVFHMIN32-NEXT: vslidedown.vi v20, v8, 9 +; ZVFHMIN32-NEXT: vslidedown.vi v18, v8, 8 +; ZVFHMIN32-NEXT: vmv.x.s a3, v16 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 @@ -1384,52 +1381,51 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: lh a0, 560(sp) ; ZVFHMIN32-NEXT: lh a1, 304(sp) ; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; ZVFHMIN32-NEXT: vslidedown.vi v3, v16, 7 -; ZVFHMIN32-NEXT: vslidedown.vi v31, v16, 6 -; ZVFHMIN32-NEXT: vslidedown.vi v5, v16, 5 +; ZVFHMIN32-NEXT: vslidedown.vi v21, v16, 7 +; ZVFHMIN32-NEXT: vslidedown.vi v3, v16, 6 +; ZVFHMIN32-NEXT: vslidedown.vi v19, v16, 5 ; ZVFHMIN32-NEXT: vslidedown.vi v23, v16, 4 ; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 3 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 21 -; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: li a4, 10 +; ZVFHMIN32-NEXT: mul a2, a2, a4 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; 
ZVFHMIN32-NEXT: addi a2, a2, 848 ; ZVFHMIN32-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 2 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 20 -; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: slli a2, a2, 4 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 ; ZVFHMIN32-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 1 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 22 -; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: slli a4, a2, 4 +; ZVFHMIN32-NEXT: sub a2, a4, a2 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 ; ZVFHMIN32-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma -; ZVFHMIN32-NEXT: vslidedown.vi v18, v16, 15 -; ZVFHMIN32-NEXT: vslidedown.vi v14, v16, 14 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 13 +; ZVFHMIN32-NEXT: vslidedown.vi v14, v16, 15 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 14 +; ZVFHMIN32-NEXT: vslidedown.vi v10, v16, 13 ; ZVFHMIN32-NEXT: vslidedown.vi v12, v16, 12 -; ZVFHMIN32-NEXT: vslidedown.vi v10, v16, 11 -; ZVFHMIN32-NEXT: vslidedown.vi v6, v16, 10 +; ZVFHMIN32-NEXT: vslidedown.vi v30, v16, 11 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 18 -; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: slli a4, a2, 4 +; ZVFHMIN32-NEXT: add a2, a4, a2 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vslidedown.vi v6, v16, 9 +; ZVFHMIN32-NEXT: vs2r.v v30, (a2) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v30, v16, 10 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 14 -; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: li a4, 11 +; ZVFHMIN32-NEXT: mul a2, a2, a4 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vslidedown.vi v6, v16, 8 +; ZVFHMIN32-NEXT: vs2r.v v30, (a2) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v4, v16, 9 +; ZVFHMIN32-NEXT: vslidedown.vi v30, v16, 8 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 @@ -1437,12 +1433,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: lh a0, 558(sp) ; ZVFHMIN32-NEXT: lh a1, 302(sp) ; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; ZVFHMIN32-NEXT: vslidedown.vi v13, v0, 7 -; ZVFHMIN32-NEXT: vslidedown.vi v29, v0, 6 -; ZVFHMIN32-NEXT: vslidedown.vi v11, v0, 5 -; ZVFHMIN32-NEXT: vslidedown.vi v7, v0, 4 -; ZVFHMIN32-NEXT: vslidedown.vi v9, v0, 3 -; ZVFHMIN32-NEXT: vslidedown.vi v21, v0, 2 +; ZVFHMIN32-NEXT: vslidedown.vi v11, v0, 7 +; ZVFHMIN32-NEXT: vslidedown.vi v7, v0, 6 +; ZVFHMIN32-NEXT: vslidedown.vi v9, v0, 5 +; ZVFHMIN32-NEXT: vslidedown.vi v29, v0, 4 +; ZVFHMIN32-NEXT: vslidedown.vi v31, v0, 3 +; ZVFHMIN32-NEXT: vslidedown.vi v5, v0, 2 ; ZVFHMIN32-NEXT: vslidedown.vi v27, v0, 1 ; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma ; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 15 @@ -1453,63 +1449,63 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 14 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: slli a2, a2, 3 +; ZVFHMIN32-NEXT: slli a2, a2, 1 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 ; 
ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 13 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 6 -; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: li a4, 6 +; ZVFHMIN32-NEXT: mul a2, a2, a4 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 ; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 12 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 12 -; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: slli a2, a2, 3 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 ; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 11 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 10 -; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: li a4, 13 +; ZVFHMIN32-NEXT: mul a2, a2, a4 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 ; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 10 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: slli a2, a2, 4 +; ZVFHMIN32-NEXT: li a4, 19 +; ZVFHMIN32-NEXT: mul a2, a2, a4 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 ; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 9 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a4, 21 +; ZVFHMIN32-NEXT: mul a2, a2, a4 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vslidedown.vi v0, v0, 8 -; ZVFHMIN32-NEXT: addi a2, sp, 848 -; ZVFHMIN32-NEXT: vs2r.v v0, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vmv.x.s t4, v26 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 215(sp) ; ZVFHMIN32-NEXT: lh a0, 556(sp) ; ZVFHMIN32-NEXT: lh a1, 300(sp) -; ZVFHMIN32-NEXT: vmv.x.s t3, v20 -; ZVFHMIN32-NEXT: vmv.x.s t1, v28 +; ZVFHMIN32-NEXT: vmv.x.s t3, v26 +; ZVFHMIN32-NEXT: vmv.x.s t2, v28 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 214(sp) ; ZVFHMIN32-NEXT: lh a0, 554(sp) ; ZVFHMIN32-NEXT: lh a1, 298(sp) -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: slli a2, a2, 1 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vl2r.v v0, (a2) # Unknown-size Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s t2, v0 -; ZVFHMIN32-NEXT: vmv.x.s t0, v4 +; ZVFHMIN32-NEXT: addi a2, sp, 848 +; ZVFHMIN32-NEXT: vl2r.v v16, (a2) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s t1, v16 +; ZVFHMIN32-NEXT: vmv.x.s t0, v6 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 @@ -1517,229 +1513,234 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: lh a0, 552(sp) ; ZVFHMIN32-NEXT: lh a1, 296(sp) ; ZVFHMIN32-NEXT: vmv.x.s a7, v2 -; ZVFHMIN32-NEXT: vmv.x.s a6, v30 +; ZVFHMIN32-NEXT: vmv.x.s a6, v22 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 212(sp) ; ZVFHMIN32-NEXT: lh a0, 550(sp) ; ZVFHMIN32-NEXT: lh a1, 294(sp) -; ZVFHMIN32-NEXT: vmv.x.s a5, v22 +; ZVFHMIN32-NEXT: vmv.x.s a5, v20 ; ZVFHMIN32-NEXT: vmv.x.s a2, v18 -; ZVFHMIN32-NEXT: sw a2, 112(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: sw a2, 108(sp) # 
4-byte Folded Spill ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 211(sp) -; ZVFHMIN32-NEXT: lh a1, 548(sp) -; ZVFHMIN32-NEXT: lh t5, 292(sp) -; ZVFHMIN32-NEXT: vmv.x.s a0, v14 -; ZVFHMIN32-NEXT: sw a0, 116(sp) # 4-byte Folded Spill -; ZVFHMIN32-NEXT: vmv.x.s a0, v8 -; ZVFHMIN32-NEXT: sw a0, 124(sp) # 4-byte Folded Spill -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t5 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: sb a1, 210(sp) -; ZVFHMIN32-NEXT: lh a1, 546(sp) -; ZVFHMIN32-NEXT: lh t5, 290(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN32-NEXT: vmv.x.s a4, v24 +; ZVFHMIN32-NEXT: lh a0, 548(sp) +; ZVFHMIN32-NEXT: lh a1, 292(sp) +; ZVFHMIN32-NEXT: vmv.x.s a2, v14 +; ZVFHMIN32-NEXT: sw a2, 116(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: vmv.x.s a2, v8 +; ZVFHMIN32-NEXT: sw a2, 124(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa3, t5 -; ZVFHMIN32-NEXT: feq.h a1, fa4, fa3 -; ZVFHMIN32-NEXT: sb a1, 209(sp) -; ZVFHMIN32-NEXT: lh a1, 544(sp) -; ZVFHMIN32-NEXT: lh t5, 288(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t5 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: sb a4, 192(sp) -; ZVFHMIN32-NEXT: sb a1, 208(sp) -; ZVFHMIN32-NEXT: lh t5, 738(sp) -; ZVFHMIN32-NEXT: lh t6, 482(sp) -; ZVFHMIN32-NEXT: vmv.x.s a0, v12 -; ZVFHMIN32-NEXT: sw a0, 108(sp) # 4-byte Folded Spill -; ZVFHMIN32-NEXT: vmv.x.s a0, v10 -; ZVFHMIN32-NEXT: sw a0, 120(sp) # 4-byte Folded Spill -; ZVFHMIN32-NEXT: fmv.h.x fa5, t5 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t6 -; ZVFHMIN32-NEXT: feq.h t5, fa5, fa4 -; ZVFHMIN32-NEXT: sb t5, 177(sp) -; ZVFHMIN32-NEXT: lh t5, 736(sp) -; ZVFHMIN32-NEXT: lh t6, 480(sp) -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: li a1, 29 -; ZVFHMIN32-NEXT: mul a0, a0, a1 -; ZVFHMIN32-NEXT: add a0, sp, a0 -; ZVFHMIN32-NEXT: lh s5, 848(a0) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: li a1, 28 -; ZVFHMIN32-NEXT: mul a0, a0, a1 -; ZVFHMIN32-NEXT: add a0, sp, a0 -; ZVFHMIN32-NEXT: lh s6, 848(a0) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: fmv.h.x fa5, t5 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t6 -; ZVFHMIN32-NEXT: feq.h t5, fa5, fa4 -; ZVFHMIN32-NEXT: sb t5, 176(sp) -; ZVFHMIN32-NEXT: lh t5, 734(sp) -; ZVFHMIN32-NEXT: lh t6, 478(sp) -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: li a1, 27 -; ZVFHMIN32-NEXT: mul a0, a0, a1 -; ZVFHMIN32-NEXT: add a0, sp, a0 -; ZVFHMIN32-NEXT: lh s7, 848(a0) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: li a1, 26 -; ZVFHMIN32-NEXT: mul a0, a0, a1 -; ZVFHMIN32-NEXT: add a0, sp, a0 -; ZVFHMIN32-NEXT: lh s8, 848(a0) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: fmv.h.x fa5, t5 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t6 -; ZVFHMIN32-NEXT: feq.h t5, fa5, fa4 -; ZVFHMIN32-NEXT: sb t5, 175(sp) -; ZVFHMIN32-NEXT: lh t5, 732(sp) -; ZVFHMIN32-NEXT: lh t6, 476(sp) -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: li a1, 25 -; ZVFHMIN32-NEXT: mul a0, a0, a1 -; ZVFHMIN32-NEXT: add a0, sp, a0 -; ZVFHMIN32-NEXT: lh s4, 848(a0) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: li a1, 24 -; ZVFHMIN32-NEXT: mul a0, a0, a1 -; ZVFHMIN32-NEXT: add a0, sp, a0 -; ZVFHMIN32-NEXT: lh s3, 848(a0) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: fmv.h.x fa5, t5 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t6 -; ZVFHMIN32-NEXT: feq.h t5, fa5, fa4 -; 
ZVFHMIN32-NEXT: sb t5, 174(sp) -; ZVFHMIN32-NEXT: lh t6, 730(sp) -; ZVFHMIN32-NEXT: lh s9, 474(sp) -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: li a1, 23 -; ZVFHMIN32-NEXT: mul a0, a0, a1 -; ZVFHMIN32-NEXT: add a0, sp, a0 -; ZVFHMIN32-NEXT: lh s2, 848(a0) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s t5, v3 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t6 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 210(sp) +; ZVFHMIN32-NEXT: lh a0, 546(sp) +; ZVFHMIN32-NEXT: lh a1, 290(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a3 +; ZVFHMIN32-NEXT: vmv.x.s a3, v24 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa4, fa3 +; ZVFHMIN32-NEXT: sb a0, 209(sp) +; ZVFHMIN32-NEXT: lh a0, 544(sp) +; ZVFHMIN32-NEXT: lh a1, 288(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a3, 192(sp) +; ZVFHMIN32-NEXT: sb a0, 208(sp) +; ZVFHMIN32-NEXT: lh a0, 738(sp) +; ZVFHMIN32-NEXT: lh a1, 482(sp) +; ZVFHMIN32-NEXT: vmv.x.s a2, v10 +; ZVFHMIN32-NEXT: sw a2, 112(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: vmv.x.s a2, v12 +; ZVFHMIN32-NEXT: sw a2, 120(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 177(sp) +; ZVFHMIN32-NEXT: lh a0, 736(sp) +; ZVFHMIN32-NEXT: lh a1, 480(sp) +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 29 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: lh s5, 848(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 28 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: lh s2, 848(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 176(sp) +; ZVFHMIN32-NEXT: lh a0, 734(sp) +; ZVFHMIN32-NEXT: lh a1, 478(sp) +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 27 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: lh s6, 848(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 26 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: lh s3, 848(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 175(sp) +; ZVFHMIN32-NEXT: lh a0, 732(sp) +; ZVFHMIN32-NEXT: lh a1, 476(sp) +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 25 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: lh s7, 848(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 24 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: lh s4, 848(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 174(sp) +; ZVFHMIN32-NEXT: lh a0, 730(sp) +; ZVFHMIN32-NEXT: lh a1, 474(sp) +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 23 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: lh s8, 848(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s t4, v21 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: 
feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 173(sp) +; ZVFHMIN32-NEXT: lh a0, 728(sp) +; ZVFHMIN32-NEXT: lh a1, 472(sp) +; ZVFHMIN32-NEXT: vmv.x.s t6, v3 +; ZVFHMIN32-NEXT: vmv.x.s t5, v19 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 172(sp) +; ZVFHMIN32-NEXT: lh a0, 726(sp) +; ZVFHMIN32-NEXT: lh a1, 470(sp) +; ZVFHMIN32-NEXT: vmv.x.s s10, v11 +; ZVFHMIN32-NEXT: vmv.x.s s11, v7 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 171(sp) +; ZVFHMIN32-NEXT: lh a0, 724(sp) +; ZVFHMIN32-NEXT: lh s9, 468(sp) +; ZVFHMIN32-NEXT: vmv.x.s a4, v9 +; ZVFHMIN32-NEXT: vmv.x.s ra, v29 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, s9 -; ZVFHMIN32-NEXT: feq.h t6, fa5, fa4 -; ZVFHMIN32-NEXT: sb t6, 173(sp) -; ZVFHMIN32-NEXT: lh s9, 728(sp) -; ZVFHMIN32-NEXT: lh s10, 472(sp) -; ZVFHMIN32-NEXT: vmv.x.s t6, v31 -; ZVFHMIN32-NEXT: vmv.x.s ra, v13 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s9 -; ZVFHMIN32-NEXT: fmv.h.x fa4, s10 -; ZVFHMIN32-NEXT: feq.h s9, fa5, fa4 -; ZVFHMIN32-NEXT: sb s9, 172(sp) -; ZVFHMIN32-NEXT: lh s9, 726(sp) -; ZVFHMIN32-NEXT: lh s10, 470(sp) -; ZVFHMIN32-NEXT: vmv.x.s a2, v29 -; ZVFHMIN32-NEXT: vmv.x.s a3, v11 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s9 -; ZVFHMIN32-NEXT: fmv.h.x fa4, s10 -; ZVFHMIN32-NEXT: feq.h s9, fa5, fa4 -; ZVFHMIN32-NEXT: sb s9, 171(sp) -; ZVFHMIN32-NEXT: lh s10, 724(sp) -; ZVFHMIN32-NEXT: lh s11, 468(sp) -; ZVFHMIN32-NEXT: vmv.x.s a4, v7 -; ZVFHMIN32-NEXT: vmv.x.s s9, v9 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s10 -; ZVFHMIN32-NEXT: fmv.h.x fa4, s11 -; ZVFHMIN32-NEXT: feq.h s10, fa5, fa4 -; ZVFHMIN32-NEXT: sb s10, 170(sp) +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 170(sp) ; ZVFHMIN32-NEXT: lh a0, 722(sp) ; ZVFHMIN32-NEXT: lh a1, 466(sp) -; ZVFHMIN32-NEXT: vmv.x.s s10, v21 -; ZVFHMIN32-NEXT: vmv.x.s s11, v27 +; ZVFHMIN32-NEXT: vmv.x.s s9, v31 +; ZVFHMIN32-NEXT: vmv.x.s a3, v5 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 169(sp) ; ZVFHMIN32-NEXT: lh a0, 720(sp) ; ZVFHMIN32-NEXT: lh a1, 464(sp) +; ZVFHMIN32-NEXT: vmv.x.s a2, v27 ; ZVFHMIN32-NEXT: fmv.h.x fa5, s5 -; ZVFHMIN32-NEXT: fmv.h.x fa4, s6 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa4, fa3 ; ZVFHMIN32-NEXT: sb a0, 168(sp) ; ZVFHMIN32-NEXT: lh a0, 718(sp) ; ZVFHMIN32-NEXT: lh a1, 462(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa3, s7 -; ZVFHMIN32-NEXT: fmv.h.x fa2, s8 -; ZVFHMIN32-NEXT: fmv.h.x fa1, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa0, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa1, fa0 -; ZVFHMIN32-NEXT: fmv.h.x fa1, ra +; ZVFHMIN32-NEXT: fmv.h.x fa4, s2 +; ZVFHMIN32-NEXT: fmv.h.x fa3, s6 +; ZVFHMIN32-NEXT: fmv.h.x fa2, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa1, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN32-NEXT: sb a0, 167(sp) ; ZVFHMIN32-NEXT: lh a0, 716(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa0, a2 ; ZVFHMIN32-NEXT: lh a1, 460(sp) -; ZVFHMIN32-NEXT: feq.h s5, fa5, fa1 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: feq.h a0, fa4, fa0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s4 -; ZVFHMIN32-NEXT: sb a1, 166(sp) -; ZVFHMIN32-NEXT: lh a1, 714(sp) -; ZVFHMIN32-NEXT: lh a2, 458(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN32-NEXT: feq.h a3, 
fa3, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a2 -; ZVFHMIN32-NEXT: feq.h a1, fa4, fa3 -; ZVFHMIN32-NEXT: fmv.h.x fa4, s3 -; ZVFHMIN32-NEXT: sb a1, 165(sp) -; ZVFHMIN32-NEXT: lh a1, 712(sp) -; ZVFHMIN32-NEXT: lh a2, 456(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa3, a4 -; ZVFHMIN32-NEXT: feq.h a4, fa2, fa3 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a2 -; ZVFHMIN32-NEXT: feq.h a1, fa3, fa2 -; ZVFHMIN32-NEXT: fmv.h.x fa3, s2 -; ZVFHMIN32-NEXT: sb a1, 164(sp) -; ZVFHMIN32-NEXT: lh a1, 710(sp) -; ZVFHMIN32-NEXT: lh a2, 454(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa2, s9 -; ZVFHMIN32-NEXT: feq.h s2, fa5, fa2 +; ZVFHMIN32-NEXT: fmv.h.x fa2, s3 +; ZVFHMIN32-NEXT: fmv.h.x fa1, s7 +; ZVFHMIN32-NEXT: fmv.h.x fa0, a0 +; ZVFHMIN32-NEXT: fmv.h.x ft0, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa0, ft0 +; ZVFHMIN32-NEXT: sb a0, 166(sp) +; ZVFHMIN32-NEXT: lh a0, 714(sp) +; ZVFHMIN32-NEXT: lh a1, 458(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa0, s4 +; ZVFHMIN32-NEXT: fmv.h.x ft0, s8 +; ZVFHMIN32-NEXT: fmv.h.x ft1, a0 +; ZVFHMIN32-NEXT: fmv.h.x ft2, a1 +; ZVFHMIN32-NEXT: feq.h a0, ft1, ft2 +; ZVFHMIN32-NEXT: sb a0, 165(sp) +; ZVFHMIN32-NEXT: lh a0, 712(sp) +; ZVFHMIN32-NEXT: lh a1, 456(sp) +; ZVFHMIN32-NEXT: fmv.h.x ft1, s10 +; ZVFHMIN32-NEXT: fmv.h.x ft2, s11 +; ZVFHMIN32-NEXT: fmv.h.x ft3, a0 +; ZVFHMIN32-NEXT: fmv.h.x ft4, a1 +; ZVFHMIN32-NEXT: feq.h a0, ft3, ft4 +; ZVFHMIN32-NEXT: sb a0, 164(sp) +; ZVFHMIN32-NEXT: lh a0, 710(sp) +; ZVFHMIN32-NEXT: fmv.h.x ft3, a4 +; ZVFHMIN32-NEXT: lh a1, 454(sp) +; ZVFHMIN32-NEXT: fmv.h.x ft4, ra +; ZVFHMIN32-NEXT: fmv.h.x ft5, a0 +; ZVFHMIN32-NEXT: feq.h a0, fa5, ft1 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a2 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa2 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s10 -; ZVFHMIN32-NEXT: fmv.h.x fa2, s11 +; ZVFHMIN32-NEXT: feq.h a1, ft5, fa5 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a3 ; ZVFHMIN32-NEXT: sb a1, 163(sp) ; ZVFHMIN32-NEXT: lh a1, 708(sp) +; ZVFHMIN32-NEXT: fmv.h.x ft1, a2 ; ZVFHMIN32-NEXT: lh a2, 452(sp) -; ZVFHMIN32-NEXT: feq.h s3, fa4, fa5 -; ZVFHMIN32-NEXT: feq.h s4, fa3, fa2 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: sb a1, 162(sp) -; ZVFHMIN32-NEXT: lh a1, 706(sp) -; ZVFHMIN32-NEXT: lh a2, 450(sp) -; ZVFHMIN32-NEXT: sb s4, 129(sp) -; ZVFHMIN32-NEXT: sb s3, 130(sp) -; ZVFHMIN32-NEXT: sb s2, 131(sp) -; ZVFHMIN32-NEXT: sb a4, 132(sp) +; ZVFHMIN32-NEXT: feq.h a3, fa0, fa5 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a1, ft0, ft1 +; ZVFHMIN32-NEXT: fmv.h.x fa0, a2 +; ZVFHMIN32-NEXT: feq.h a2, fa5, fa0 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s9 +; ZVFHMIN32-NEXT: sb a2, 162(sp) +; ZVFHMIN32-NEXT: lh a2, 706(sp) +; ZVFHMIN32-NEXT: lh a4, 450(sp) +; ZVFHMIN32-NEXT: sb a1, 129(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa1, fa5 +; ZVFHMIN32-NEXT: sb a3, 130(sp) +; ZVFHMIN32-NEXT: feq.h a3, fa2, ft4 +; ZVFHMIN32-NEXT: sb a1, 131(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa4, ft2 +; ZVFHMIN32-NEXT: sb a3, 132(sp) +; ZVFHMIN32-NEXT: feq.h a3, fa3, ft3 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 ; ZVFHMIN32-NEXT: sb a3, 133(sp) -; ZVFHMIN32-NEXT: sb a0, 134(sp) -; ZVFHMIN32-NEXT: sb s5, 135(sp) -; ZVFHMIN32-NEXT: sb a1, 161(sp) +; ZVFHMIN32-NEXT: sb a1, 134(sp) +; ZVFHMIN32-NEXT: sb a0, 135(sp) +; ZVFHMIN32-NEXT: sb a2, 161(sp) ; ZVFHMIN32-NEXT: lh a0, 610(sp) ; ZVFHMIN32-NEXT: lh a1, 354(sp) -; ZVFHMIN32-NEXT: 
vmv.x.s s6, v5 -; ZVFHMIN32-NEXT: vmv.x.s s5, v23 +; ZVFHMIN32-NEXT: vmv.x.s s4, v23 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 10 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: lh s2, 848(a2) # 8-byte Folded Reload ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 @@ -1747,13 +1748,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: lh a0, 608(sp) ; ZVFHMIN32-NEXT: lh a1, 352(sp) ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 21 -; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: slli a2, a2, 4 ; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: lh s4, 848(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: lh s5, 848(a2) # 8-byte Folded Reload ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 20 -; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: slli a3, a2, 4 +; ZVFHMIN32-NEXT: sub a2, a3, a2 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: lh s3, 848(a2) # 8-byte Folded Reload ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 @@ -1762,153 +1762,148 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: sb a0, 240(sp) ; ZVFHMIN32-NEXT: lh a0, 606(sp) ; ZVFHMIN32-NEXT: lh a1, 350(sp) -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 22 -; ZVFHMIN32-NEXT: mul a2, a2, a3 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: lh s2, 848(a2) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: fmv.h.x fa5, t5 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa4, fa3 +; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 7 +; ZVFHMIN32-NEXT: vmv.x.s s6, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 239(sp) ; ZVFHMIN32-NEXT: lh a0, 604(sp) ; ZVFHMIN32-NEXT: lh a1, 348(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, t6 -; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 7 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 6 +; ZVFHMIN32-NEXT: vmv.x.s s7, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 238(sp) ; ZVFHMIN32-NEXT: lh a0, 602(sp) ; ZVFHMIN32-NEXT: lh a1, 346(sp) -; ZVFHMIN32-NEXT: vmv.x.s a2, v8 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 6 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 5 +; ZVFHMIN32-NEXT: vmv.x.s s8, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 237(sp) ; ZVFHMIN32-NEXT: lh a0, 600(sp) ; ZVFHMIN32-NEXT: lh a1, 344(sp) -; ZVFHMIN32-NEXT: vmv.x.s a3, v8 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 5 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 4 +; ZVFHMIN32-NEXT: vmv.x.s s9, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 236(sp) ; ZVFHMIN32-NEXT: lh a0, 598(sp) ; ZVFHMIN32-NEXT: lh a1, 342(sp) -; ZVFHMIN32-NEXT: vmv.x.s a4, v8 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 4 -; ZVFHMIN32-NEXT: 
fmv.h.x fa3, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 3 +; ZVFHMIN32-NEXT: vmv.x.s s10, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 235(sp) ; ZVFHMIN32-NEXT: lh a0, 596(sp) ; ZVFHMIN32-NEXT: lh a1, 340(sp) -; ZVFHMIN32-NEXT: vmv.x.s s8, v8 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 3 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 2 +; ZVFHMIN32-NEXT: vmv.x.s s11, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 234(sp) ; ZVFHMIN32-NEXT: lh a0, 594(sp) ; ZVFHMIN32-NEXT: lh a1, 338(sp) -; ZVFHMIN32-NEXT: vmv.x.s s9, v8 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 2 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 1 +; ZVFHMIN32-NEXT: vmv.x.s ra, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 233(sp) ; ZVFHMIN32-NEXT: lh a0, 592(sp) -; ZVFHMIN32-NEXT: vmv.x.s a1, v8 -; ZVFHMIN32-NEXT: lh t5, 336(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 1 +; ZVFHMIN32-NEXT: lh a1, 336(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, t4 +; ZVFHMIN32-NEXT: fmv.h.x fa4, t6 ; ZVFHMIN32-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN32-NEXT: vmv.x.s s7, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa2, t5 +; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a2 ; ZVFHMIN32-NEXT: sb a0, 232(sp) ; ZVFHMIN32-NEXT: lh a0, 590(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa2, a3 -; ZVFHMIN32-NEXT: lh a2, 334(sp) -; ZVFHMIN32-NEXT: feq.h t5, fa5, fa3 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: feq.h t6, fa4, fa2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s6 +; ZVFHMIN32-NEXT: lh a1, 334(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa3, t5 +; ZVFHMIN32-NEXT: fmv.h.x fa2, s4 +; ZVFHMIN32-NEXT: fmv.h.x fa1, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa0, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa1, fa0 ; ZVFHMIN32-NEXT: sb a0, 231(sp) ; ZVFHMIN32-NEXT: lh a0, 588(sp) -; ZVFHMIN32-NEXT: lh a2, 332(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s5 +; ZVFHMIN32-NEXT: lh a1, 332(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa1, s2 +; ZVFHMIN32-NEXT: fmv.h.x fa0, s5 +; ZVFHMIN32-NEXT: fmv.h.x ft0, a0 +; ZVFHMIN32-NEXT: fmv.h.x ft1, a1 +; ZVFHMIN32-NEXT: feq.h a0, ft0, ft1 ; ZVFHMIN32-NEXT: sb a0, 230(sp) ; ZVFHMIN32-NEXT: lh a0, 586(sp) -; ZVFHMIN32-NEXT: lh a2, 330(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, s8 -; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s4 -; ZVFHMIN32-NEXT: sb a0, 229(sp) -; ZVFHMIN32-NEXT: lh a0, 584(sp) +; ZVFHMIN32-NEXT: fmv.h.x ft0, s3 +; ZVFHMIN32-NEXT: lh a1, 330(sp) +; ZVFHMIN32-NEXT: fmv.h.x ft1, s6 +; ZVFHMIN32-NEXT: fmv.h.x ft2, a0 +; ZVFHMIN32-NEXT: feq.h a0, fa5, ft1 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: feq.h a1, ft2, fa5 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s7 +; ZVFHMIN32-NEXT: sb a1, 229(sp) +; ZVFHMIN32-NEXT: lh a1, 584(sp) +; 
ZVFHMIN32-NEXT: fmv.h.x ft1, s8 ; ZVFHMIN32-NEXT: lh a2, 328(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, s9 -; ZVFHMIN32-NEXT: feq.h s4, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s3 -; ZVFHMIN32-NEXT: sb a0, 228(sp) -; ZVFHMIN32-NEXT: lh a0, 582(sp) -; ZVFHMIN32-NEXT: lh a2, 326(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s2 -; ZVFHMIN32-NEXT: sb a0, 227(sp) -; ZVFHMIN32-NEXT: lh a0, 580(sp) -; ZVFHMIN32-NEXT: lh a2, 324(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, s7 -; ZVFHMIN32-NEXT: feq.h s2, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 226(sp) -; ZVFHMIN32-NEXT: lh a0, 578(sp) -; ZVFHMIN32-NEXT: lh a2, 322(sp) -; ZVFHMIN32-NEXT: sb s2, 193(sp) -; ZVFHMIN32-NEXT: sb a1, 194(sp) -; ZVFHMIN32-NEXT: sb s4, 195(sp) -; ZVFHMIN32-NEXT: sb a4, 196(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: feq.h a3, fa4, fa5 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa3, ft1 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a3, 197(sp) -; ZVFHMIN32-NEXT: sb t6, 198(sp) -; ZVFHMIN32-NEXT: sb t5, 199(sp) -; ZVFHMIN32-NEXT: sb a0, 225(sp) +; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s9 +; ZVFHMIN32-NEXT: sb a2, 228(sp) +; ZVFHMIN32-NEXT: lh a2, 582(sp) +; ZVFHMIN32-NEXT: lh a4, 326(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, s10 +; ZVFHMIN32-NEXT: feq.h t4, fa2, fa5 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a4 +; ZVFHMIN32-NEXT: feq.h a2, fa5, fa3 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s11 +; ZVFHMIN32-NEXT: fmv.h.x fa3, ra +; ZVFHMIN32-NEXT: sb a2, 227(sp) +; ZVFHMIN32-NEXT: lh a2, 580(sp) +; ZVFHMIN32-NEXT: lh a4, 324(sp) +; ZVFHMIN32-NEXT: feq.h t5, fa0, fa5 +; ZVFHMIN32-NEXT: feq.h t6, ft0, fa3 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a4 +; ZVFHMIN32-NEXT: feq.h a2, fa5, fa3 +; ZVFHMIN32-NEXT: sb a2, 226(sp) +; ZVFHMIN32-NEXT: lh a2, 578(sp) +; ZVFHMIN32-NEXT: lh a4, 322(sp) +; ZVFHMIN32-NEXT: sb t6, 193(sp) +; ZVFHMIN32-NEXT: feq.h t6, fa1, fa4 +; ZVFHMIN32-NEXT: sb t5, 194(sp) +; ZVFHMIN32-NEXT: sb t6, 195(sp) +; ZVFHMIN32-NEXT: sb t4, 196(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 197(sp) +; ZVFHMIN32-NEXT: sb a3, 198(sp) +; ZVFHMIN32-NEXT: sb a0, 199(sp) +; ZVFHMIN32-NEXT: sb a2, 225(sp) ; ZVFHMIN32-NEXT: lh a0, 766(sp) ; ZVFHMIN32-NEXT: lh a1, 510(sp) ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 18 -; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: slli a3, a2, 4 +; ZVFHMIN32-NEXT: add a2, a3, a2 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 ; ZVFHMIN32-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload ; ZVFHMIN32-NEXT: vmv.x.s s2, v8 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 14 +; ZVFHMIN32-NEXT: li a3, 11 ; ZVFHMIN32-NEXT: mul a2, a2, a3 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 @@ -1920,165 +1915,171 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: sb a0, 191(sp) ; ZVFHMIN32-NEXT: lh a0, 764(sp) ; ZVFHMIN32-NEXT: lh a1, 508(sp) -; ZVFHMIN32-NEXT: vmv.x.s t5, v6 -; ZVFHMIN32-NEXT: 
csrr a2, vlenb -; ZVFHMIN32-NEXT: slli a2, a2, 2 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s a2, v8 +; ZVFHMIN32-NEXT: vmv.x.s t5, v4 +; ZVFHMIN32-NEXT: vmv.x.s t4, v30 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 190(sp) ; ZVFHMIN32-NEXT: lh a0, 762(sp) ; ZVFHMIN32-NEXT: lh a1, 506(sp) +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: slli a2, a2, 2 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s a2, v8 ; ZVFHMIN32-NEXT: csrr a3, vlenb -; ZVFHMIN32-NEXT: slli a3, a3, 3 +; ZVFHMIN32-NEXT: slli a3, a3, 1 ; ZVFHMIN32-NEXT: add a3, sp, a3 ; ZVFHMIN32-NEXT: addi a3, a3, 848 ; ZVFHMIN32-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload ; ZVFHMIN32-NEXT: vmv.x.s a3, v8 -; ZVFHMIN32-NEXT: csrr a4, vlenb -; ZVFHMIN32-NEXT: li s3, 6 -; ZVFHMIN32-NEXT: mul a4, a4, s3 -; ZVFHMIN32-NEXT: add a4, sp, a4 -; ZVFHMIN32-NEXT: addi a4, a4, 848 -; ZVFHMIN32-NEXT: vl2r.v v8, (a4) # Unknown-size Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s a4, v8 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 189(sp) ; ZVFHMIN32-NEXT: lh a0, 760(sp) ; ZVFHMIN32-NEXT: lh a1, 504(sp) -; ZVFHMIN32-NEXT: csrr s3, vlenb -; ZVFHMIN32-NEXT: li s4, 12 -; ZVFHMIN32-NEXT: mul s3, s3, s4 -; ZVFHMIN32-NEXT: add s3, sp, s3 -; ZVFHMIN32-NEXT: addi s3, s3, 848 -; ZVFHMIN32-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s s6, v8 -; ZVFHMIN32-NEXT: csrr s3, vlenb -; ZVFHMIN32-NEXT: li s4, 10 -; ZVFHMIN32-NEXT: mul s3, s3, s4 -; ZVFHMIN32-NEXT: add s3, sp, s3 -; ZVFHMIN32-NEXT: addi s3, s3, 848 -; ZVFHMIN32-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s s4, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, t3 +; ZVFHMIN32-NEXT: csrr a4, vlenb +; ZVFHMIN32-NEXT: li t3, 6 +; ZVFHMIN32-NEXT: mul a4, a4, t3 +; ZVFHMIN32-NEXT: add a4, sp, a4 +; ZVFHMIN32-NEXT: addi a4, a4, 848 +; ZVFHMIN32-NEXT: vl2r.v v8, (a4) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s a4, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa4, fa3 ; ZVFHMIN32-NEXT: sb a0, 188(sp) ; ZVFHMIN32-NEXT: lh a0, 758(sp) ; ZVFHMIN32-NEXT: lh a1, 502(sp) -; ZVFHMIN32-NEXT: csrr s3, vlenb -; ZVFHMIN32-NEXT: slli s3, s3, 4 -; ZVFHMIN32-NEXT: add s3, sp, s3 -; ZVFHMIN32-NEXT: addi s3, s3, 848 -; ZVFHMIN32-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s s5, v8 -; ZVFHMIN32-NEXT: vmv.x.s s3, v16 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t4 +; ZVFHMIN32-NEXT: fmv.h.x fa4, t2 +; ZVFHMIN32-NEXT: csrr t2, vlenb +; ZVFHMIN32-NEXT: slli t2, t2, 3 +; ZVFHMIN32-NEXT: add t2, sp, t2 +; ZVFHMIN32-NEXT: addi t2, t2, 848 +; ZVFHMIN32-NEXT: vl2r.v v8, (t2) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s t2, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 ; ZVFHMIN32-NEXT: sb a0, 187(sp) ; ZVFHMIN32-NEXT: lh a0, 756(sp) ; ZVFHMIN32-NEXT: lh a1, 500(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h 
t4, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t3 +; ZVFHMIN32-NEXT: fmv.h.x fa3, t1 +; ZVFHMIN32-NEXT: csrr t1, vlenb +; ZVFHMIN32-NEXT: li t3, 13 +; ZVFHMIN32-NEXT: mul t1, t1, t3 +; ZVFHMIN32-NEXT: add t1, sp, t1 +; ZVFHMIN32-NEXT: addi t1, t1, 848 +; ZVFHMIN32-NEXT: vl2r.v v8, (t1) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s t3, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa2, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa1, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN32-NEXT: sb a0, 186(sp) ; ZVFHMIN32-NEXT: lh a0, 754(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa2, t0 ; ZVFHMIN32-NEXT: lh a1, 498(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN32-NEXT: feq.h t3, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t1 -; ZVFHMIN32-NEXT: sb a0, 185(sp) -; ZVFHMIN32-NEXT: lh a0, 752(sp) -; ZVFHMIN32-NEXT: lh a1, 496(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN32-NEXT: feq.h t1, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t2 -; ZVFHMIN32-NEXT: sb a0, 184(sp) -; ZVFHMIN32-NEXT: lh a0, 750(sp) -; ZVFHMIN32-NEXT: lh a1, 494(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, s6 +; ZVFHMIN32-NEXT: csrr t0, vlenb +; ZVFHMIN32-NEXT: li t1, 19 +; ZVFHMIN32-NEXT: mul t0, t0, t1 +; ZVFHMIN32-NEXT: add t0, sp, t0 +; ZVFHMIN32-NEXT: addi t0, t0, 848 +; ZVFHMIN32-NEXT: vl2r.v v8, (t0) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s s3, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa1, a0 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: li t0, 21 +; ZVFHMIN32-NEXT: mul a0, a0, t0 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: addi a0, a0, 848 +; ZVFHMIN32-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s a0, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa0, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa1, fa0 +; ZVFHMIN32-NEXT: fmv.h.x fa1, a2 +; ZVFHMIN32-NEXT: sb a1, 185(sp) +; ZVFHMIN32-NEXT: lh a1, 752(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa0, a3 +; ZVFHMIN32-NEXT: lh a2, 496(sp) +; ZVFHMIN32-NEXT: feq.h t0, fa5, fa1 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: feq.h t1, fa4, fa0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a4 +; ZVFHMIN32-NEXT: sb a1, 184(sp) +; ZVFHMIN32-NEXT: lh a1, 750(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, t2 +; ZVFHMIN32-NEXT: lh a2, 494(sp) +; ZVFHMIN32-NEXT: feq.h a3, fa3, fa5 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa2, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 ; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t0 -; ZVFHMIN32-NEXT: sb a0, 183(sp) -; ZVFHMIN32-NEXT: lh a0, 748(sp) -; ZVFHMIN32-NEXT: lh a1, 492(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, s4 -; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a7 -; ZVFHMIN32-NEXT: sb a0, 182(sp) -; ZVFHMIN32-NEXT: lh a0, 746(sp) -; ZVFHMIN32-NEXT: lh a1, 490(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, s5 -; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a2, 183(sp) +; ZVFHMIN32-NEXT: lh a2, 748(sp) +; ZVFHMIN32-NEXT: lh a4, 
492(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, t3 +; ZVFHMIN32-NEXT: feq.h a7, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a6 -; ZVFHMIN32-NEXT: sb a0, 181(sp) -; ZVFHMIN32-NEXT: lh a0, 744(sp) -; ZVFHMIN32-NEXT: lh a1, 488(sp) +; ZVFHMIN32-NEXT: sb a2, 182(sp) +; ZVFHMIN32-NEXT: lh a2, 746(sp) +; ZVFHMIN32-NEXT: lh a4, 490(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa4, s3 ; ZVFHMIN32-NEXT: feq.h a6, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a5 -; ZVFHMIN32-NEXT: addi a1, sp, 848 -; ZVFHMIN32-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s a1, v8 +; ZVFHMIN32-NEXT: sb a2, 181(sp) +; ZVFHMIN32-NEXT: lh a2, 744(sp) +; ZVFHMIN32-NEXT: lh a4, 488(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN32-NEXT: lw a4, 108(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x fa5, a4 +; ZVFHMIN32-NEXT: vmv.x.s a5, v0 ; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma ; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 15 -; ZVFHMIN32-NEXT: vmv.x.s a5, v8 -; ZVFHMIN32-NEXT: sb a0, 180(sp) -; ZVFHMIN32-NEXT: lh a0, 742(sp) -; ZVFHMIN32-NEXT: lh a7, 486(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a7 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 179(sp) -; ZVFHMIN32-NEXT: lh a0, 740(sp) -; ZVFHMIN32-NEXT: lh a7, 484(sp) -; ZVFHMIN32-NEXT: sb a2, 140(sp) -; ZVFHMIN32-NEXT: sb t1, 141(sp) -; ZVFHMIN32-NEXT: sb t3, 142(sp) -; ZVFHMIN32-NEXT: sb t4, 143(sp) -; ZVFHMIN32-NEXT: sb a1, 136(sp) -; ZVFHMIN32-NEXT: sb a6, 137(sp) -; ZVFHMIN32-NEXT: sb a4, 138(sp) -; ZVFHMIN32-NEXT: sb a3, 139(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a7 +; ZVFHMIN32-NEXT: vmv.x.s a4, v8 +; ZVFHMIN32-NEXT: sb a2, 180(sp) +; ZVFHMIN32-NEXT: lh a2, 742(sp) +; ZVFHMIN32-NEXT: lh t2, 486(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 +; ZVFHMIN32-NEXT: feq.h a5, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN32-NEXT: fmv.h.x fa4, t2 +; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN32-NEXT: sb a2, 179(sp) +; ZVFHMIN32-NEXT: lh a2, 740(sp) +; ZVFHMIN32-NEXT: lh t2, 484(sp) +; ZVFHMIN32-NEXT: sb a1, 140(sp) +; ZVFHMIN32-NEXT: sb a3, 141(sp) +; ZVFHMIN32-NEXT: sb t1, 142(sp) +; ZVFHMIN32-NEXT: sb t0, 143(sp) +; ZVFHMIN32-NEXT: sb a5, 136(sp) +; ZVFHMIN32-NEXT: sb a0, 137(sp) +; ZVFHMIN32-NEXT: sb a6, 138(sp) +; ZVFHMIN32-NEXT: sb a7, 139(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN32-NEXT: fmv.h.x fa4, t2 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 178(sp) ; ZVFHMIN32-NEXT: lh a0, 638(sp) ; ZVFHMIN32-NEXT: lh a1, 382(sp) ; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 14 -; ZVFHMIN32-NEXT: vmv.x.s t3, v8 +; ZVFHMIN32-NEXT: vmv.x.s t2, v8 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 @@ -2086,7 +2087,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: lh a0, 636(sp) ; ZVFHMIN32-NEXT: lh a1, 380(sp) ; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 13 -; ZVFHMIN32-NEXT: vmv.x.s t2, v8 +; ZVFHMIN32-NEXT: vmv.x.s t1, v8 ; 
ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 @@ -2094,7 +2095,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: lh a0, 634(sp) ; ZVFHMIN32-NEXT: lh a1, 378(sp) ; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 12 -; ZVFHMIN32-NEXT: vmv.x.s t1, v8 +; ZVFHMIN32-NEXT: vmv.x.s t0, v8 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 @@ -2102,7 +2103,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: lh a0, 632(sp) ; ZVFHMIN32-NEXT: lh a1, 376(sp) ; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 11 -; ZVFHMIN32-NEXT: vmv.x.s t0, v8 +; ZVFHMIN32-NEXT: vmv.x.s a7, v8 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 @@ -2110,7 +2111,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: lh a0, 630(sp) ; ZVFHMIN32-NEXT: lh a1, 374(sp) ; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 10 -; ZVFHMIN32-NEXT: vmv.x.s a7, v8 +; ZVFHMIN32-NEXT: vmv.x.s a6, v8 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 @@ -2118,102 +2119,101 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: lh a0, 628(sp) ; ZVFHMIN32-NEXT: lh a1, 372(sp) ; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 9 -; ZVFHMIN32-NEXT: vmv.x.s a6, v8 +; ZVFHMIN32-NEXT: vmv.x.s a5, v8 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: lw a1, 112(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: lw a1, 116(sp) # 4-byte Folded Reload ; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: sb a0, 250(sp) ; ZVFHMIN32-NEXT: lh a0, 626(sp) ; ZVFHMIN32-NEXT: lh a1, 370(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: lw a1, 116(sp) # 4-byte Folded Reload -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: sb a0, 249(sp) -; ZVFHMIN32-NEXT: lh a0, 624(sp) -; ZVFHMIN32-NEXT: lh a1, 368(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, t3 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 ; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: lw a1, 124(sp) # 4-byte Folded Reload ; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: sb a0, 248(sp) -; ZVFHMIN32-NEXT: lh a0, 622(sp) -; ZVFHMIN32-NEXT: lh a1, 366(sp) +; ZVFHMIN32-NEXT: sb a0, 249(sp) +; ZVFHMIN32-NEXT: lh a1, 624(sp) +; ZVFHMIN32-NEXT: lh a3, 368(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa4, t2 -; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: lw a1, 108(sp) # 4-byte Folded Reload ; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: sb a0, 247(sp) -; ZVFHMIN32-NEXT: lh a0, 620(sp) -; ZVFHMIN32-NEXT: lh a1, 364(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: lw a3, 112(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x fa5, a3 +; ZVFHMIN32-NEXT: sb a1, 248(sp) +; ZVFHMIN32-NEXT: lh a1, 622(sp) +; ZVFHMIN32-NEXT: lh a3, 366(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa4, t1 -; ZVFHMIN32-NEXT: feq.h a5, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: lw a1, 120(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: sb a0, 246(sp) -; ZVFHMIN32-NEXT: lh a0, 618(sp) -; ZVFHMIN32-NEXT: lh a1, 362(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: lw a3, 120(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x fa5, a3 +; ZVFHMIN32-NEXT: sb a1, 247(sp) +; ZVFHMIN32-NEXT: lh a1, 620(sp) +; ZVFHMIN32-NEXT: lh a3, 364(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa4, t0 ; ZVFHMIN32-NEXT: feq.h t0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, s2 -; ZVFHMIN32-NEXT: sb a0, 245(sp) -; ZVFHMIN32-NEXT: lh a0, 616(sp) -; ZVFHMIN32-NEXT: lh a1, 360(sp) +; ZVFHMIN32-NEXT: sb a1, 246(sp) +; ZVFHMIN32-NEXT: lh a1, 618(sp) +; ZVFHMIN32-NEXT: lh a3, 362(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa4, a7 ; ZVFHMIN32-NEXT: feq.h a7, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, t6 -; ZVFHMIN32-NEXT: sb a0, 244(sp) -; ZVFHMIN32-NEXT: lh a0, 614(sp) -; ZVFHMIN32-NEXT: lh a1, 358(sp) +; ZVFHMIN32-NEXT: sb a1, 245(sp) +; ZVFHMIN32-NEXT: lh a1, 616(sp) +; ZVFHMIN32-NEXT: lh a3, 360(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa4, a6 ; ZVFHMIN32-NEXT: feq.h a6, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, t5 +; ZVFHMIN32-NEXT: sb a1, 244(sp) +; ZVFHMIN32-NEXT: lh a1, 614(sp) +; ZVFHMIN32-NEXT: lh a3, 358(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 +; ZVFHMIN32-NEXT: feq.h a5, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, t4 ; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 8 -; ZVFHMIN32-NEXT: vmv.x.s a1, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: sb a0, 243(sp) -; ZVFHMIN32-NEXT: lh a0, 612(sp) -; ZVFHMIN32-NEXT: lh a1, 356(sp) -; ZVFHMIN32-NEXT: sb a5, 204(sp) +; ZVFHMIN32-NEXT: vmv.x.s a3, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN32-NEXT: sb a1, 243(sp) +; ZVFHMIN32-NEXT: lh a1, 612(sp) +; ZVFHMIN32-NEXT: lh a3, 356(sp) +; ZVFHMIN32-NEXT: sb t0, 204(sp) ; ZVFHMIN32-NEXT: sb a4, 205(sp) -; ZVFHMIN32-NEXT: sb a2, 206(sp) -; ZVFHMIN32-NEXT: sb a3, 207(sp) -; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN32-NEXT: sb a2, 200(sp) -; ZVFHMIN32-NEXT: sb a6, 201(sp) -; ZVFHMIN32-NEXT: sb a7, 202(sp) -; ZVFHMIN32-NEXT: sb t0, 203(sp) -; ZVFHMIN32-NEXT: li a2, 128 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 242(sp) -; ZVFHMIN32-NEXT: addi a0, sp, 128 -; ZVFHMIN32-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; ZVFHMIN32-NEXT: vle8.v v8, (a0) +; ZVFHMIN32-NEXT: sb a0, 206(sp) +; ZVFHMIN32-NEXT: sb a2, 207(sp) +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 200(sp) +; ZVFHMIN32-NEXT: sb a5, 201(sp) +; ZVFHMIN32-NEXT: sb a6, 202(sp) +; ZVFHMIN32-NEXT: sb a7, 203(sp) +; ZVFHMIN32-NEXT: li a0, 128 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 242(sp) +; ZVFHMIN32-NEXT: addi a1, sp, 128 +; ZVFHMIN32-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; ZVFHMIN32-NEXT: vle8.v v8, (a1) ; ZVFHMIN32-NEXT: vand.vi v8, v8, 1 ; ZVFHMIN32-NEXT: vmsne.vi v0, v8, 0 ; ZVFHMIN32-NEXT: addi sp, s0, -896 @@ -2498,19 +2498,16 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma ; ZVFHMIN64-NEXT: vslidedown.vi v26, v8, 15 -; ZVFHMIN64-NEXT: vslidedown.vi v20, v8, 14 -; ZVFHMIN64-NEXT: vslidedown.vi v28, v8, 13 -; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 12 -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: slli a2, a2, 1 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vslidedown.vi v28, v8, 14 +; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 13 +; ZVFHMIN64-NEXT: addi a2, sp, 800 ; ZVFHMIN64-NEXT: vs2r.v v10, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vslidedown.vi v4, v8, 11 -; ZVFHMIN64-NEXT: vslidedown.vi v2, v8, 10 -; ZVFHMIN64-NEXT: vslidedown.vi v30, v8, 9 -; ZVFHMIN64-NEXT: vslidedown.vi v22, v8, 8 -; ZVFHMIN64-NEXT: vmv.x.s a4, v16 +; ZVFHMIN64-NEXT: vslidedown.vi v6, v8, 12 +; ZVFHMIN64-NEXT: vslidedown.vi v2, v8, 11 +; ZVFHMIN64-NEXT: vslidedown.vi v22, v8, 10 +; ZVFHMIN64-NEXT: vslidedown.vi v20, v8, 9 +; ZVFHMIN64-NEXT: vslidedown.vi v18, v8, 8 +; ZVFHMIN64-NEXT: vmv.x.s a3, v16 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 @@ -2518,52 +2515,51 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: lh a0, 560(sp) ; ZVFHMIN64-NEXT: lh a1, 304(sp) ; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; ZVFHMIN64-NEXT: vslidedown.vi v3, v16, 7 -; ZVFHMIN64-NEXT: vslidedown.vi v31, v16, 6 -; ZVFHMIN64-NEXT: vslidedown.vi v5, v16, 5 +; ZVFHMIN64-NEXT: vslidedown.vi v21, v16, 7 +; ZVFHMIN64-NEXT: vslidedown.vi v3, v16, 6 +; ZVFHMIN64-NEXT: vslidedown.vi v19, v16, 5 ; ZVFHMIN64-NEXT: vslidedown.vi v23, v16, 4 ; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 3 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 21 -; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: li a4, 10 +; ZVFHMIN64-NEXT: mul a2, a2, a4 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 2 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 20 -; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: slli a2, a2, 4 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 1 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 22 -; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: slli a4, a2, 4 +; ZVFHMIN64-NEXT: sub a2, a4, a2 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma -; ZVFHMIN64-NEXT: vslidedown.vi v18, v16, 15 -; ZVFHMIN64-NEXT: vslidedown.vi v14, v16, 14 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 13 +; ZVFHMIN64-NEXT: vslidedown.vi v14, v16, 15 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 14 +; ZVFHMIN64-NEXT: vslidedown.vi v10, v16, 13 ; ZVFHMIN64-NEXT: vslidedown.vi v12, v16, 12 -; ZVFHMIN64-NEXT: vslidedown.vi v10, v16, 
11 -; ZVFHMIN64-NEXT: vslidedown.vi v6, v16, 10 +; ZVFHMIN64-NEXT: vslidedown.vi v30, v16, 11 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 18 -; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: slli a4, a2, 4 +; ZVFHMIN64-NEXT: add a2, a4, a2 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vslidedown.vi v6, v16, 9 +; ZVFHMIN64-NEXT: vs2r.v v30, (a2) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v30, v16, 10 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 14 -; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: li a4, 11 +; ZVFHMIN64-NEXT: mul a2, a2, a4 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vslidedown.vi v6, v16, 8 +; ZVFHMIN64-NEXT: vs2r.v v30, (a2) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v4, v16, 9 +; ZVFHMIN64-NEXT: vslidedown.vi v30, v16, 8 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 @@ -2571,12 +2567,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: lh a0, 558(sp) ; ZVFHMIN64-NEXT: lh a1, 302(sp) ; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; ZVFHMIN64-NEXT: vslidedown.vi v13, v0, 7 -; ZVFHMIN64-NEXT: vslidedown.vi v29, v0, 6 -; ZVFHMIN64-NEXT: vslidedown.vi v11, v0, 5 -; ZVFHMIN64-NEXT: vslidedown.vi v7, v0, 4 -; ZVFHMIN64-NEXT: vslidedown.vi v9, v0, 3 -; ZVFHMIN64-NEXT: vslidedown.vi v21, v0, 2 +; ZVFHMIN64-NEXT: vslidedown.vi v11, v0, 7 +; ZVFHMIN64-NEXT: vslidedown.vi v7, v0, 6 +; ZVFHMIN64-NEXT: vslidedown.vi v9, v0, 5 +; ZVFHMIN64-NEXT: vslidedown.vi v29, v0, 4 +; ZVFHMIN64-NEXT: vslidedown.vi v31, v0, 3 +; ZVFHMIN64-NEXT: vslidedown.vi v5, v0, 2 ; ZVFHMIN64-NEXT: vslidedown.vi v27, v0, 1 ; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma ; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 15 @@ -2587,63 +2583,63 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 14 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: slli a2, a2, 3 +; ZVFHMIN64-NEXT: slli a2, a2, 1 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 13 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 6 -; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: li a4, 6 +; ZVFHMIN64-NEXT: mul a2, a2, a4 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 12 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 12 -; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: slli a2, a2, 3 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 11 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 10 -; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: li a4, 13 +; ZVFHMIN64-NEXT: mul a2, a2, a4 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 10 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: slli a2, a2, 4 +; 
ZVFHMIN64-NEXT: li a4, 19 +; ZVFHMIN64-NEXT: mul a2, a2, a4 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 9 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a4, 21 +; ZVFHMIN64-NEXT: mul a2, a2, a4 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vslidedown.vi v0, v0, 8 -; ZVFHMIN64-NEXT: addi a2, sp, 800 -; ZVFHMIN64-NEXT: vs2r.v v0, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vmv.x.s t4, v26 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 215(sp) ; ZVFHMIN64-NEXT: lh a0, 556(sp) ; ZVFHMIN64-NEXT: lh a1, 300(sp) -; ZVFHMIN64-NEXT: vmv.x.s t3, v20 -; ZVFHMIN64-NEXT: vmv.x.s t1, v28 +; ZVFHMIN64-NEXT: vmv.x.s t3, v26 +; ZVFHMIN64-NEXT: vmv.x.s t2, v28 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 214(sp) ; ZVFHMIN64-NEXT: lh a0, 554(sp) ; ZVFHMIN64-NEXT: lh a1, 298(sp) -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: slli a2, a2, 1 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vl2r.v v0, (a2) # Unknown-size Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s t2, v0 -; ZVFHMIN64-NEXT: vmv.x.s t0, v4 +; ZVFHMIN64-NEXT: addi a2, sp, 800 +; ZVFHMIN64-NEXT: vl2r.v v16, (a2) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s t1, v16 +; ZVFHMIN64-NEXT: vmv.x.s t0, v6 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 @@ -2651,229 +2647,234 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: lh a0, 552(sp) ; ZVFHMIN64-NEXT: lh a1, 296(sp) ; ZVFHMIN64-NEXT: vmv.x.s a7, v2 -; ZVFHMIN64-NEXT: vmv.x.s a6, v30 +; ZVFHMIN64-NEXT: vmv.x.s a6, v22 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 212(sp) ; ZVFHMIN64-NEXT: lh a0, 550(sp) ; ZVFHMIN64-NEXT: lh a1, 294(sp) -; ZVFHMIN64-NEXT: vmv.x.s a5, v22 +; ZVFHMIN64-NEXT: vmv.x.s a5, v20 ; ZVFHMIN64-NEXT: vmv.x.s a2, v18 -; ZVFHMIN64-NEXT: sd a2, 96(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: sd a2, 88(sp) # 8-byte Folded Spill ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 211(sp) -; ZVFHMIN64-NEXT: lh a1, 548(sp) -; ZVFHMIN64-NEXT: lh t5, 292(sp) -; ZVFHMIN64-NEXT: vmv.x.s a0, v14 -; ZVFHMIN64-NEXT: sd a0, 104(sp) # 8-byte Folded Spill -; ZVFHMIN64-NEXT: vmv.x.s a0, v8 -; ZVFHMIN64-NEXT: sd a0, 120(sp) # 8-byte Folded Spill -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t5 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: sb a1, 210(sp) -; ZVFHMIN64-NEXT: lh a1, 546(sp) -; ZVFHMIN64-NEXT: lh t5, 290(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN64-NEXT: vmv.x.s a4, v24 +; ZVFHMIN64-NEXT: lh a0, 548(sp) +; ZVFHMIN64-NEXT: lh a1, 292(sp) +; ZVFHMIN64-NEXT: vmv.x.s a2, v14 +; ZVFHMIN64-NEXT: sd a2, 104(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: vmv.x.s a2, v8 +; ZVFHMIN64-NEXT: sd a2, 120(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa3, t5 -; ZVFHMIN64-NEXT: feq.h a1, fa4, fa3 -; ZVFHMIN64-NEXT: sb a1, 209(sp) -; ZVFHMIN64-NEXT: lh a1, 544(sp) 
-; ZVFHMIN64-NEXT: lh t5, 288(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t5 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: sb a4, 192(sp) -; ZVFHMIN64-NEXT: sb a1, 208(sp) -; ZVFHMIN64-NEXT: lh t5, 738(sp) -; ZVFHMIN64-NEXT: lh t6, 482(sp) -; ZVFHMIN64-NEXT: vmv.x.s a0, v12 -; ZVFHMIN64-NEXT: sd a0, 88(sp) # 8-byte Folded Spill -; ZVFHMIN64-NEXT: vmv.x.s a0, v10 -; ZVFHMIN64-NEXT: sd a0, 112(sp) # 8-byte Folded Spill -; ZVFHMIN64-NEXT: fmv.h.x fa5, t5 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t6 -; ZVFHMIN64-NEXT: feq.h t5, fa5, fa4 -; ZVFHMIN64-NEXT: sb t5, 177(sp) -; ZVFHMIN64-NEXT: lh t5, 736(sp) -; ZVFHMIN64-NEXT: lh t6, 480(sp) -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: li a1, 29 -; ZVFHMIN64-NEXT: mul a0, a0, a1 -; ZVFHMIN64-NEXT: add a0, sp, a0 -; ZVFHMIN64-NEXT: lh s5, 800(a0) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: li a1, 28 -; ZVFHMIN64-NEXT: mul a0, a0, a1 -; ZVFHMIN64-NEXT: add a0, sp, a0 -; ZVFHMIN64-NEXT: lh s6, 800(a0) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: fmv.h.x fa5, t5 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t6 -; ZVFHMIN64-NEXT: feq.h t5, fa5, fa4 -; ZVFHMIN64-NEXT: sb t5, 176(sp) -; ZVFHMIN64-NEXT: lh t5, 734(sp) -; ZVFHMIN64-NEXT: lh t6, 478(sp) -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: li a1, 27 -; ZVFHMIN64-NEXT: mul a0, a0, a1 -; ZVFHMIN64-NEXT: add a0, sp, a0 -; ZVFHMIN64-NEXT: lh s7, 800(a0) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: li a1, 26 -; ZVFHMIN64-NEXT: mul a0, a0, a1 -; ZVFHMIN64-NEXT: add a0, sp, a0 -; ZVFHMIN64-NEXT: lh s8, 800(a0) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: fmv.h.x fa5, t5 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t6 -; ZVFHMIN64-NEXT: feq.h t5, fa5, fa4 -; ZVFHMIN64-NEXT: sb t5, 175(sp) -; ZVFHMIN64-NEXT: lh t5, 732(sp) -; ZVFHMIN64-NEXT: lh t6, 476(sp) -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: li a1, 25 -; ZVFHMIN64-NEXT: mul a0, a0, a1 -; ZVFHMIN64-NEXT: add a0, sp, a0 -; ZVFHMIN64-NEXT: lh s4, 800(a0) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: li a1, 24 -; ZVFHMIN64-NEXT: mul a0, a0, a1 -; ZVFHMIN64-NEXT: add a0, sp, a0 -; ZVFHMIN64-NEXT: lh s3, 800(a0) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: fmv.h.x fa5, t5 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t6 -; ZVFHMIN64-NEXT: feq.h t5, fa5, fa4 -; ZVFHMIN64-NEXT: sb t5, 174(sp) -; ZVFHMIN64-NEXT: lh t6, 730(sp) -; ZVFHMIN64-NEXT: lh s9, 474(sp) -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: li a1, 23 -; ZVFHMIN64-NEXT: mul a0, a0, a1 -; ZVFHMIN64-NEXT: add a0, sp, a0 -; ZVFHMIN64-NEXT: lh s2, 800(a0) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s t5, v3 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t6 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 210(sp) +; ZVFHMIN64-NEXT: lh a0, 546(sp) +; ZVFHMIN64-NEXT: lh a1, 290(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a3 +; ZVFHMIN64-NEXT: vmv.x.s a3, v24 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa4, fa3 +; ZVFHMIN64-NEXT: sb a0, 209(sp) +; ZVFHMIN64-NEXT: lh a0, 544(sp) +; ZVFHMIN64-NEXT: lh a1, 288(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a3, 192(sp) +; ZVFHMIN64-NEXT: sb a0, 208(sp) +; ZVFHMIN64-NEXT: lh a0, 738(sp) +; ZVFHMIN64-NEXT: lh a1, 482(sp) +; ZVFHMIN64-NEXT: vmv.x.s a2, v10 +; ZVFHMIN64-NEXT: sd a2, 
96(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: vmv.x.s a2, v12 +; ZVFHMIN64-NEXT: sd a2, 112(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 177(sp) +; ZVFHMIN64-NEXT: lh a0, 736(sp) +; ZVFHMIN64-NEXT: lh a1, 480(sp) +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 29 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: lh s5, 800(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 28 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: lh s2, 800(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 176(sp) +; ZVFHMIN64-NEXT: lh a0, 734(sp) +; ZVFHMIN64-NEXT: lh a1, 478(sp) +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 27 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: lh s6, 800(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 26 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: lh s3, 800(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 175(sp) +; ZVFHMIN64-NEXT: lh a0, 732(sp) +; ZVFHMIN64-NEXT: lh a1, 476(sp) +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 25 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: lh s7, 800(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 24 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: lh s4, 800(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 174(sp) +; ZVFHMIN64-NEXT: lh a0, 730(sp) +; ZVFHMIN64-NEXT: lh a1, 474(sp) +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 23 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: lh s8, 800(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s t4, v21 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 173(sp) +; ZVFHMIN64-NEXT: lh a0, 728(sp) +; ZVFHMIN64-NEXT: lh a1, 472(sp) +; ZVFHMIN64-NEXT: vmv.x.s t6, v3 +; ZVFHMIN64-NEXT: vmv.x.s t5, v19 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 172(sp) +; ZVFHMIN64-NEXT: lh a0, 726(sp) +; ZVFHMIN64-NEXT: lh a1, 470(sp) +; ZVFHMIN64-NEXT: vmv.x.s s10, v11 +; ZVFHMIN64-NEXT: vmv.x.s s11, v7 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 171(sp) +; ZVFHMIN64-NEXT: lh a0, 724(sp) +; ZVFHMIN64-NEXT: lh s9, 468(sp) +; ZVFHMIN64-NEXT: vmv.x.s a4, v9 +; ZVFHMIN64-NEXT: vmv.x.s ra, v29 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, s9 -; ZVFHMIN64-NEXT: feq.h t6, fa5, fa4 -; ZVFHMIN64-NEXT: sb t6, 173(sp) -; ZVFHMIN64-NEXT: lh s9, 728(sp) -; ZVFHMIN64-NEXT: lh s10, 472(sp) -; ZVFHMIN64-NEXT: vmv.x.s t6, v31 -; ZVFHMIN64-NEXT: vmv.x.s ra, v13 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s9 -; ZVFHMIN64-NEXT: fmv.h.x fa4, s10 -; ZVFHMIN64-NEXT: feq.h s9, fa5, fa4 -; 
ZVFHMIN64-NEXT: sb s9, 172(sp) -; ZVFHMIN64-NEXT: lh s9, 726(sp) -; ZVFHMIN64-NEXT: lh s10, 470(sp) -; ZVFHMIN64-NEXT: vmv.x.s a2, v29 -; ZVFHMIN64-NEXT: vmv.x.s a3, v11 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s9 -; ZVFHMIN64-NEXT: fmv.h.x fa4, s10 -; ZVFHMIN64-NEXT: feq.h s9, fa5, fa4 -; ZVFHMIN64-NEXT: sb s9, 171(sp) -; ZVFHMIN64-NEXT: lh s10, 724(sp) -; ZVFHMIN64-NEXT: lh s11, 468(sp) -; ZVFHMIN64-NEXT: vmv.x.s a4, v7 -; ZVFHMIN64-NEXT: vmv.x.s s9, v9 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s10 -; ZVFHMIN64-NEXT: fmv.h.x fa4, s11 -; ZVFHMIN64-NEXT: feq.h s10, fa5, fa4 -; ZVFHMIN64-NEXT: sb s10, 170(sp) +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 170(sp) ; ZVFHMIN64-NEXT: lh a0, 722(sp) ; ZVFHMIN64-NEXT: lh a1, 466(sp) -; ZVFHMIN64-NEXT: vmv.x.s s10, v21 -; ZVFHMIN64-NEXT: vmv.x.s s11, v27 +; ZVFHMIN64-NEXT: vmv.x.s s9, v31 +; ZVFHMIN64-NEXT: vmv.x.s a3, v5 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 169(sp) ; ZVFHMIN64-NEXT: lh a0, 720(sp) ; ZVFHMIN64-NEXT: lh a1, 464(sp) +; ZVFHMIN64-NEXT: vmv.x.s a2, v27 ; ZVFHMIN64-NEXT: fmv.h.x fa5, s5 -; ZVFHMIN64-NEXT: fmv.h.x fa4, s6 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa4, fa3 ; ZVFHMIN64-NEXT: sb a0, 168(sp) ; ZVFHMIN64-NEXT: lh a0, 718(sp) ; ZVFHMIN64-NEXT: lh a1, 462(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa3, s7 -; ZVFHMIN64-NEXT: fmv.h.x fa2, s8 -; ZVFHMIN64-NEXT: fmv.h.x fa1, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa0, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa1, fa0 -; ZVFHMIN64-NEXT: fmv.h.x fa1, ra +; ZVFHMIN64-NEXT: fmv.h.x fa4, s2 +; ZVFHMIN64-NEXT: fmv.h.x fa3, s6 +; ZVFHMIN64-NEXT: fmv.h.x fa2, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa1, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN64-NEXT: sb a0, 167(sp) ; ZVFHMIN64-NEXT: lh a0, 716(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa0, a2 ; ZVFHMIN64-NEXT: lh a1, 460(sp) -; ZVFHMIN64-NEXT: feq.h s5, fa5, fa1 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: feq.h a0, fa4, fa0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s4 -; ZVFHMIN64-NEXT: sb a1, 166(sp) -; ZVFHMIN64-NEXT: lh a1, 714(sp) -; ZVFHMIN64-NEXT: lh a2, 458(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN64-NEXT: feq.h a3, fa3, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a2 -; ZVFHMIN64-NEXT: feq.h a1, fa4, fa3 -; ZVFHMIN64-NEXT: fmv.h.x fa4, s3 -; ZVFHMIN64-NEXT: sb a1, 165(sp) -; ZVFHMIN64-NEXT: lh a1, 712(sp) -; ZVFHMIN64-NEXT: lh a2, 456(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa3, a4 -; ZVFHMIN64-NEXT: feq.h a4, fa2, fa3 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a2 -; ZVFHMIN64-NEXT: feq.h a1, fa3, fa2 -; ZVFHMIN64-NEXT: fmv.h.x fa3, s2 -; ZVFHMIN64-NEXT: sb a1, 164(sp) -; ZVFHMIN64-NEXT: lh a1, 710(sp) -; ZVFHMIN64-NEXT: lh a2, 454(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa2, s9 -; ZVFHMIN64-NEXT: feq.h s2, fa5, fa2 +; ZVFHMIN64-NEXT: fmv.h.x fa2, s3 +; ZVFHMIN64-NEXT: fmv.h.x fa1, s7 +; ZVFHMIN64-NEXT: fmv.h.x fa0, a0 +; ZVFHMIN64-NEXT: fmv.h.x ft0, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa0, ft0 +; ZVFHMIN64-NEXT: sb a0, 166(sp) +; ZVFHMIN64-NEXT: lh a0, 714(sp) +; ZVFHMIN64-NEXT: lh a1, 458(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa0, s4 +; ZVFHMIN64-NEXT: fmv.h.x ft0, s8 +; ZVFHMIN64-NEXT: fmv.h.x ft1, a0 +; ZVFHMIN64-NEXT: fmv.h.x ft2, a1 +; ZVFHMIN64-NEXT: feq.h a0, ft1, ft2 +; ZVFHMIN64-NEXT: sb a0, 165(sp) +; 
ZVFHMIN64-NEXT: lh a0, 712(sp) +; ZVFHMIN64-NEXT: lh a1, 456(sp) +; ZVFHMIN64-NEXT: fmv.h.x ft1, s10 +; ZVFHMIN64-NEXT: fmv.h.x ft2, s11 +; ZVFHMIN64-NEXT: fmv.h.x ft3, a0 +; ZVFHMIN64-NEXT: fmv.h.x ft4, a1 +; ZVFHMIN64-NEXT: feq.h a0, ft3, ft4 +; ZVFHMIN64-NEXT: sb a0, 164(sp) +; ZVFHMIN64-NEXT: lh a0, 710(sp) +; ZVFHMIN64-NEXT: fmv.h.x ft3, a4 +; ZVFHMIN64-NEXT: lh a1, 454(sp) +; ZVFHMIN64-NEXT: fmv.h.x ft4, ra +; ZVFHMIN64-NEXT: fmv.h.x ft5, a0 +; ZVFHMIN64-NEXT: feq.h a0, fa5, ft1 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a2 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa2 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s10 -; ZVFHMIN64-NEXT: fmv.h.x fa2, s11 +; ZVFHMIN64-NEXT: feq.h a1, ft5, fa5 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a3 ; ZVFHMIN64-NEXT: sb a1, 163(sp) ; ZVFHMIN64-NEXT: lh a1, 708(sp) +; ZVFHMIN64-NEXT: fmv.h.x ft1, a2 ; ZVFHMIN64-NEXT: lh a2, 452(sp) -; ZVFHMIN64-NEXT: feq.h s3, fa4, fa5 -; ZVFHMIN64-NEXT: feq.h s4, fa3, fa2 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: sb a1, 162(sp) -; ZVFHMIN64-NEXT: lh a1, 706(sp) -; ZVFHMIN64-NEXT: lh a2, 450(sp) -; ZVFHMIN64-NEXT: sb s4, 129(sp) -; ZVFHMIN64-NEXT: sb s3, 130(sp) -; ZVFHMIN64-NEXT: sb s2, 131(sp) -; ZVFHMIN64-NEXT: sb a4, 132(sp) +; ZVFHMIN64-NEXT: feq.h a3, fa0, fa5 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a1, ft0, ft1 +; ZVFHMIN64-NEXT: fmv.h.x fa0, a2 +; ZVFHMIN64-NEXT: feq.h a2, fa5, fa0 +; ZVFHMIN64-NEXT: fmv.h.x fa5, s9 +; ZVFHMIN64-NEXT: sb a2, 162(sp) +; ZVFHMIN64-NEXT: lh a2, 706(sp) +; ZVFHMIN64-NEXT: lh a4, 450(sp) +; ZVFHMIN64-NEXT: sb a1, 129(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa1, fa5 +; ZVFHMIN64-NEXT: sb a3, 130(sp) +; ZVFHMIN64-NEXT: feq.h a3, fa2, ft4 +; ZVFHMIN64-NEXT: sb a1, 131(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa4, ft2 +; ZVFHMIN64-NEXT: sb a3, 132(sp) +; ZVFHMIN64-NEXT: feq.h a3, fa3, ft3 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 ; ZVFHMIN64-NEXT: sb a3, 133(sp) -; ZVFHMIN64-NEXT: sb a0, 134(sp) -; ZVFHMIN64-NEXT: sb s5, 135(sp) -; ZVFHMIN64-NEXT: sb a1, 161(sp) +; ZVFHMIN64-NEXT: sb a1, 134(sp) +; ZVFHMIN64-NEXT: sb a0, 135(sp) +; ZVFHMIN64-NEXT: sb a2, 161(sp) ; ZVFHMIN64-NEXT: lh a0, 610(sp) ; ZVFHMIN64-NEXT: lh a1, 354(sp) -; ZVFHMIN64-NEXT: vmv.x.s s6, v5 -; ZVFHMIN64-NEXT: vmv.x.s s5, v23 +; ZVFHMIN64-NEXT: vmv.x.s s4, v23 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 10 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: lh s2, 800(a2) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 @@ -2881,13 +2882,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: lh a0, 608(sp) ; ZVFHMIN64-NEXT: lh a1, 352(sp) ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 21 -; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: slli a2, a2, 4 ; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: lh s4, 800(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: lh s5, 800(a2) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 20 -; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: slli a3, a2, 4 +; ZVFHMIN64-NEXT: sub a2, a3, a2 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: lh s3, 800(a2) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 @@ -2896,153 +2896,148 @@ define 
<128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: sb a0, 240(sp) ; ZVFHMIN64-NEXT: lh a0, 606(sp) ; ZVFHMIN64-NEXT: lh a1, 350(sp) -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 22 -; ZVFHMIN64-NEXT: mul a2, a2, a3 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: lh s2, 800(a2) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: fmv.h.x fa5, t5 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa4, fa3 +; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 7 +; ZVFHMIN64-NEXT: vmv.x.s s6, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 239(sp) ; ZVFHMIN64-NEXT: lh a0, 604(sp) ; ZVFHMIN64-NEXT: lh a1, 348(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, t6 -; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 7 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 6 +; ZVFHMIN64-NEXT: vmv.x.s s7, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 238(sp) ; ZVFHMIN64-NEXT: lh a0, 602(sp) ; ZVFHMIN64-NEXT: lh a1, 346(sp) -; ZVFHMIN64-NEXT: vmv.x.s a2, v8 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 6 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 5 +; ZVFHMIN64-NEXT: vmv.x.s s8, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 237(sp) ; ZVFHMIN64-NEXT: lh a0, 600(sp) ; ZVFHMIN64-NEXT: lh a1, 344(sp) -; ZVFHMIN64-NEXT: vmv.x.s a3, v8 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 5 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 4 +; ZVFHMIN64-NEXT: vmv.x.s s9, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 236(sp) ; ZVFHMIN64-NEXT: lh a0, 598(sp) ; ZVFHMIN64-NEXT: lh a1, 342(sp) -; ZVFHMIN64-NEXT: vmv.x.s a4, v8 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 4 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 3 +; ZVFHMIN64-NEXT: vmv.x.s s10, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 235(sp) ; ZVFHMIN64-NEXT: lh a0, 596(sp) ; ZVFHMIN64-NEXT: lh a1, 340(sp) -; ZVFHMIN64-NEXT: vmv.x.s s8, v8 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 3 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 2 +; ZVFHMIN64-NEXT: vmv.x.s s11, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 234(sp) ; ZVFHMIN64-NEXT: lh a0, 594(sp) ; ZVFHMIN64-NEXT: lh a1, 338(sp) -; ZVFHMIN64-NEXT: vmv.x.s s9, v8 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 2 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 1 +; ZVFHMIN64-NEXT: vmv.x.s ra, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; 
ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 233(sp) ; ZVFHMIN64-NEXT: lh a0, 592(sp) -; ZVFHMIN64-NEXT: vmv.x.s a1, v8 -; ZVFHMIN64-NEXT: lh t5, 336(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 1 +; ZVFHMIN64-NEXT: lh a1, 336(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, t4 +; ZVFHMIN64-NEXT: fmv.h.x fa4, t6 ; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN64-NEXT: vmv.x.s s7, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa2, t5 +; ZVFHMIN64-NEXT: fmv.h.x fa2, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a2 ; ZVFHMIN64-NEXT: sb a0, 232(sp) ; ZVFHMIN64-NEXT: lh a0, 590(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa2, a3 -; ZVFHMIN64-NEXT: lh a2, 334(sp) -; ZVFHMIN64-NEXT: feq.h t5, fa5, fa3 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: feq.h t6, fa4, fa2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s6 +; ZVFHMIN64-NEXT: lh a1, 334(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa3, t5 +; ZVFHMIN64-NEXT: fmv.h.x fa2, s4 +; ZVFHMIN64-NEXT: fmv.h.x fa1, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa0, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa1, fa0 ; ZVFHMIN64-NEXT: sb a0, 231(sp) ; ZVFHMIN64-NEXT: lh a0, 588(sp) -; ZVFHMIN64-NEXT: lh a2, 332(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s5 +; ZVFHMIN64-NEXT: lh a1, 332(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa1, s2 +; ZVFHMIN64-NEXT: fmv.h.x fa0, s5 +; ZVFHMIN64-NEXT: fmv.h.x ft0, a0 +; ZVFHMIN64-NEXT: fmv.h.x ft1, a1 +; ZVFHMIN64-NEXT: feq.h a0, ft0, ft1 ; ZVFHMIN64-NEXT: sb a0, 230(sp) ; ZVFHMIN64-NEXT: lh a0, 586(sp) -; ZVFHMIN64-NEXT: lh a2, 330(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, s8 -; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s4 -; ZVFHMIN64-NEXT: sb a0, 229(sp) -; ZVFHMIN64-NEXT: lh a0, 584(sp) +; ZVFHMIN64-NEXT: fmv.h.x ft0, s3 +; ZVFHMIN64-NEXT: lh a1, 330(sp) +; ZVFHMIN64-NEXT: fmv.h.x ft1, s6 +; ZVFHMIN64-NEXT: fmv.h.x ft2, a0 +; ZVFHMIN64-NEXT: feq.h a0, fa5, ft1 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: feq.h a1, ft2, fa5 +; ZVFHMIN64-NEXT: fmv.h.x fa5, s7 +; ZVFHMIN64-NEXT: sb a1, 229(sp) +; ZVFHMIN64-NEXT: lh a1, 584(sp) +; ZVFHMIN64-NEXT: fmv.h.x ft1, s8 ; ZVFHMIN64-NEXT: lh a2, 328(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, s9 -; ZVFHMIN64-NEXT: feq.h s4, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s3 -; ZVFHMIN64-NEXT: sb a0, 228(sp) -; ZVFHMIN64-NEXT: lh a0, 582(sp) -; ZVFHMIN64-NEXT: lh a2, 326(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s2 -; ZVFHMIN64-NEXT: sb a0, 227(sp) -; ZVFHMIN64-NEXT: lh a0, 580(sp) -; ZVFHMIN64-NEXT: lh a2, 324(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, s7 -; ZVFHMIN64-NEXT: feq.h s2, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 226(sp) -; ZVFHMIN64-NEXT: lh a0, 578(sp) -; ZVFHMIN64-NEXT: lh a2, 322(sp) -; ZVFHMIN64-NEXT: sb s2, 193(sp) -; ZVFHMIN64-NEXT: sb a1, 194(sp) -; ZVFHMIN64-NEXT: sb s4, 195(sp) -; ZVFHMIN64-NEXT: sb a4, 196(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; 
ZVFHMIN64-NEXT: feq.h a3, fa4, fa5 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa3, ft1 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a3, 197(sp) -; ZVFHMIN64-NEXT: sb t6, 198(sp) -; ZVFHMIN64-NEXT: sb t5, 199(sp) -; ZVFHMIN64-NEXT: sb a0, 225(sp) +; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, s9 +; ZVFHMIN64-NEXT: sb a2, 228(sp) +; ZVFHMIN64-NEXT: lh a2, 582(sp) +; ZVFHMIN64-NEXT: lh a4, 326(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, s10 +; ZVFHMIN64-NEXT: feq.h t4, fa2, fa5 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a4 +; ZVFHMIN64-NEXT: feq.h a2, fa5, fa3 +; ZVFHMIN64-NEXT: fmv.h.x fa5, s11 +; ZVFHMIN64-NEXT: fmv.h.x fa3, ra +; ZVFHMIN64-NEXT: sb a2, 227(sp) +; ZVFHMIN64-NEXT: lh a2, 580(sp) +; ZVFHMIN64-NEXT: lh a4, 324(sp) +; ZVFHMIN64-NEXT: feq.h t5, fa0, fa5 +; ZVFHMIN64-NEXT: feq.h t6, ft0, fa3 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a4 +; ZVFHMIN64-NEXT: feq.h a2, fa5, fa3 +; ZVFHMIN64-NEXT: sb a2, 226(sp) +; ZVFHMIN64-NEXT: lh a2, 578(sp) +; ZVFHMIN64-NEXT: lh a4, 322(sp) +; ZVFHMIN64-NEXT: sb t6, 193(sp) +; ZVFHMIN64-NEXT: feq.h t6, fa1, fa4 +; ZVFHMIN64-NEXT: sb t5, 194(sp) +; ZVFHMIN64-NEXT: sb t6, 195(sp) +; ZVFHMIN64-NEXT: sb t4, 196(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 197(sp) +; ZVFHMIN64-NEXT: sb a3, 198(sp) +; ZVFHMIN64-NEXT: sb a0, 199(sp) +; ZVFHMIN64-NEXT: sb a2, 225(sp) ; ZVFHMIN64-NEXT: lh a0, 766(sp) ; ZVFHMIN64-NEXT: lh a1, 510(sp) ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 18 -; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: slli a3, a2, 4 +; ZVFHMIN64-NEXT: add a2, a3, a2 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload ; ZVFHMIN64-NEXT: vmv.x.s s2, v8 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 14 +; ZVFHMIN64-NEXT: li a3, 11 ; ZVFHMIN64-NEXT: mul a2, a2, a3 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 @@ -3054,165 +3049,171 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: sb a0, 191(sp) ; ZVFHMIN64-NEXT: lh a0, 764(sp) ; ZVFHMIN64-NEXT: lh a1, 508(sp) -; ZVFHMIN64-NEXT: vmv.x.s t5, v6 -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: slli a2, a2, 2 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s a2, v8 +; ZVFHMIN64-NEXT: vmv.x.s t5, v4 +; ZVFHMIN64-NEXT: vmv.x.s t4, v30 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 190(sp) ; ZVFHMIN64-NEXT: lh a0, 762(sp) ; ZVFHMIN64-NEXT: lh a1, 506(sp) +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: slli a2, a2, 2 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s a2, v8 ; ZVFHMIN64-NEXT: csrr a3, vlenb -; ZVFHMIN64-NEXT: slli a3, a3, 3 +; ZVFHMIN64-NEXT: slli a3, a3, 1 ; ZVFHMIN64-NEXT: add a3, sp, a3 ; ZVFHMIN64-NEXT: addi a3, a3, 800 ; ZVFHMIN64-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload ; ZVFHMIN64-NEXT: vmv.x.s a3, v8 -; ZVFHMIN64-NEXT: csrr a4, vlenb -; ZVFHMIN64-NEXT: li s3, 6 -; ZVFHMIN64-NEXT: mul a4, a4, s3 -; ZVFHMIN64-NEXT: add a4, sp, a4 -; ZVFHMIN64-NEXT: addi a4, a4, 800 -; 
ZVFHMIN64-NEXT: vl2r.v v8, (a4) # Unknown-size Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s a4, v8 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 189(sp) ; ZVFHMIN64-NEXT: lh a0, 760(sp) ; ZVFHMIN64-NEXT: lh a1, 504(sp) -; ZVFHMIN64-NEXT: csrr s3, vlenb -; ZVFHMIN64-NEXT: li s4, 12 -; ZVFHMIN64-NEXT: mul s3, s3, s4 -; ZVFHMIN64-NEXT: add s3, sp, s3 -; ZVFHMIN64-NEXT: addi s3, s3, 800 -; ZVFHMIN64-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s s6, v8 -; ZVFHMIN64-NEXT: csrr s3, vlenb -; ZVFHMIN64-NEXT: li s4, 10 -; ZVFHMIN64-NEXT: mul s3, s3, s4 -; ZVFHMIN64-NEXT: add s3, sp, s3 -; ZVFHMIN64-NEXT: addi s3, s3, 800 -; ZVFHMIN64-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s s4, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, t3 +; ZVFHMIN64-NEXT: csrr a4, vlenb +; ZVFHMIN64-NEXT: li t3, 6 +; ZVFHMIN64-NEXT: mul a4, a4, t3 +; ZVFHMIN64-NEXT: add a4, sp, a4 +; ZVFHMIN64-NEXT: addi a4, a4, 800 +; ZVFHMIN64-NEXT: vl2r.v v8, (a4) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s a4, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa4, fa3 ; ZVFHMIN64-NEXT: sb a0, 188(sp) ; ZVFHMIN64-NEXT: lh a0, 758(sp) ; ZVFHMIN64-NEXT: lh a1, 502(sp) -; ZVFHMIN64-NEXT: csrr s3, vlenb -; ZVFHMIN64-NEXT: slli s3, s3, 4 -; ZVFHMIN64-NEXT: add s3, sp, s3 -; ZVFHMIN64-NEXT: addi s3, s3, 800 -; ZVFHMIN64-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s s5, v8 -; ZVFHMIN64-NEXT: vmv.x.s s3, v16 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t4 +; ZVFHMIN64-NEXT: fmv.h.x fa4, t2 +; ZVFHMIN64-NEXT: csrr t2, vlenb +; ZVFHMIN64-NEXT: slli t2, t2, 3 +; ZVFHMIN64-NEXT: add t2, sp, t2 +; ZVFHMIN64-NEXT: addi t2, t2, 800 +; ZVFHMIN64-NEXT: vl2r.v v8, (t2) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s t2, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa2, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 ; ZVFHMIN64-NEXT: sb a0, 187(sp) ; ZVFHMIN64-NEXT: lh a0, 756(sp) ; ZVFHMIN64-NEXT: lh a1, 500(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h t4, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t3 +; ZVFHMIN64-NEXT: fmv.h.x fa3, t1 +; ZVFHMIN64-NEXT: csrr t1, vlenb +; ZVFHMIN64-NEXT: li t3, 13 +; ZVFHMIN64-NEXT: mul t1, t1, t3 +; ZVFHMIN64-NEXT: add t1, sp, t1 +; ZVFHMIN64-NEXT: addi t1, t1, 800 +; ZVFHMIN64-NEXT: vl2r.v v8, (t1) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s t3, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa2, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa1, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN64-NEXT: sb a0, 186(sp) ; ZVFHMIN64-NEXT: lh a0, 754(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa2, t0 ; ZVFHMIN64-NEXT: lh a1, 498(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN64-NEXT: feq.h t3, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t1 -; ZVFHMIN64-NEXT: sb a0, 185(sp) -; ZVFHMIN64-NEXT: lh a0, 752(sp) -; ZVFHMIN64-NEXT: lh a1, 496(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN64-NEXT: feq.h t1, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t2 -; ZVFHMIN64-NEXT: sb a0, 184(sp) -; ZVFHMIN64-NEXT: lh a0, 750(sp) -; ZVFHMIN64-NEXT: lh a1, 494(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, s6 +; ZVFHMIN64-NEXT: csrr t0, vlenb +; ZVFHMIN64-NEXT: li t1, 19 +; ZVFHMIN64-NEXT: mul t0, t0, t1 +; ZVFHMIN64-NEXT: add t0, sp, t0 +; ZVFHMIN64-NEXT: addi t0, t0, 800 +; ZVFHMIN64-NEXT: vl2r.v v8, (t0) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s s3, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa1, a0 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: li t0, 21 +; ZVFHMIN64-NEXT: mul a0, a0, t0 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: addi a0, a0, 800 +; ZVFHMIN64-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s a0, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa0, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa1, fa0 +; ZVFHMIN64-NEXT: fmv.h.x fa1, a2 +; ZVFHMIN64-NEXT: sb a1, 185(sp) +; ZVFHMIN64-NEXT: lh a1, 752(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa0, a3 +; ZVFHMIN64-NEXT: lh a2, 496(sp) +; ZVFHMIN64-NEXT: feq.h t0, fa5, fa1 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: feq.h t1, fa4, fa0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a4 +; ZVFHMIN64-NEXT: sb a1, 184(sp) +; ZVFHMIN64-NEXT: lh a1, 750(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, t2 +; ZVFHMIN64-NEXT: lh a2, 494(sp) +; ZVFHMIN64-NEXT: feq.h a3, fa3, fa5 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa2, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 ; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t0 -; ZVFHMIN64-NEXT: sb a0, 183(sp) -; ZVFHMIN64-NEXT: lh a0, 748(sp) -; ZVFHMIN64-NEXT: lh a1, 492(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, s4 -; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a7 -; ZVFHMIN64-NEXT: sb a0, 182(sp) -; ZVFHMIN64-NEXT: lh a0, 746(sp) -; ZVFHMIN64-NEXT: lh a1, 490(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, s5 -; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a2, 183(sp) +; ZVFHMIN64-NEXT: lh a2, 748(sp) +; ZVFHMIN64-NEXT: lh a4, 492(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, t3 +; ZVFHMIN64-NEXT: feq.h a7, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a6 -; ZVFHMIN64-NEXT: sb a0, 181(sp) -; ZVFHMIN64-NEXT: lh a0, 744(sp) -; ZVFHMIN64-NEXT: lh a1, 488(sp) +; ZVFHMIN64-NEXT: sb a2, 182(sp) +; ZVFHMIN64-NEXT: lh a2, 746(sp) +; ZVFHMIN64-NEXT: lh a4, 490(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa4, s3 ; ZVFHMIN64-NEXT: feq.h a6, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a5 -; ZVFHMIN64-NEXT: addi a1, sp, 800 -; ZVFHMIN64-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s a1, v8 +; ZVFHMIN64-NEXT: sb a2, 181(sp) +; ZVFHMIN64-NEXT: lh a2, 744(sp) +; ZVFHMIN64-NEXT: lh a4, 488(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN64-NEXT: feq.h a2, 
fa5, fa4 +; ZVFHMIN64-NEXT: ld a4, 88(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fa5, a4 +; ZVFHMIN64-NEXT: vmv.x.s a5, v0 ; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma ; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 15 -; ZVFHMIN64-NEXT: vmv.x.s a5, v8 -; ZVFHMIN64-NEXT: sb a0, 180(sp) -; ZVFHMIN64-NEXT: lh a0, 742(sp) -; ZVFHMIN64-NEXT: lh a7, 486(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a7 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 179(sp) -; ZVFHMIN64-NEXT: lh a0, 740(sp) -; ZVFHMIN64-NEXT: lh a7, 484(sp) -; ZVFHMIN64-NEXT: sb a2, 140(sp) -; ZVFHMIN64-NEXT: sb t1, 141(sp) -; ZVFHMIN64-NEXT: sb t3, 142(sp) -; ZVFHMIN64-NEXT: sb t4, 143(sp) -; ZVFHMIN64-NEXT: sb a1, 136(sp) -; ZVFHMIN64-NEXT: sb a6, 137(sp) -; ZVFHMIN64-NEXT: sb a4, 138(sp) -; ZVFHMIN64-NEXT: sb a3, 139(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a7 +; ZVFHMIN64-NEXT: vmv.x.s a4, v8 +; ZVFHMIN64-NEXT: sb a2, 180(sp) +; ZVFHMIN64-NEXT: lh a2, 742(sp) +; ZVFHMIN64-NEXT: lh t2, 486(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 +; ZVFHMIN64-NEXT: feq.h a5, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN64-NEXT: fmv.h.x fa4, t2 +; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN64-NEXT: sb a2, 179(sp) +; ZVFHMIN64-NEXT: lh a2, 740(sp) +; ZVFHMIN64-NEXT: lh t2, 484(sp) +; ZVFHMIN64-NEXT: sb a1, 140(sp) +; ZVFHMIN64-NEXT: sb a3, 141(sp) +; ZVFHMIN64-NEXT: sb t1, 142(sp) +; ZVFHMIN64-NEXT: sb t0, 143(sp) +; ZVFHMIN64-NEXT: sb a5, 136(sp) +; ZVFHMIN64-NEXT: sb a0, 137(sp) +; ZVFHMIN64-NEXT: sb a6, 138(sp) +; ZVFHMIN64-NEXT: sb a7, 139(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN64-NEXT: fmv.h.x fa4, t2 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 178(sp) ; ZVFHMIN64-NEXT: lh a0, 638(sp) ; ZVFHMIN64-NEXT: lh a1, 382(sp) ; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 14 -; ZVFHMIN64-NEXT: vmv.x.s t3, v8 +; ZVFHMIN64-NEXT: vmv.x.s t2, v8 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 @@ -3220,7 +3221,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: lh a0, 636(sp) ; ZVFHMIN64-NEXT: lh a1, 380(sp) ; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 13 -; ZVFHMIN64-NEXT: vmv.x.s t2, v8 +; ZVFHMIN64-NEXT: vmv.x.s t1, v8 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 @@ -3228,7 +3229,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: lh a0, 634(sp) ; ZVFHMIN64-NEXT: lh a1, 378(sp) ; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 12 -; ZVFHMIN64-NEXT: vmv.x.s t1, v8 +; ZVFHMIN64-NEXT: vmv.x.s t0, v8 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 @@ -3236,7 +3237,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: lh a0, 632(sp) ; ZVFHMIN64-NEXT: lh a1, 376(sp) ; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 11 -; ZVFHMIN64-NEXT: vmv.x.s t0, v8 +; ZVFHMIN64-NEXT: vmv.x.s a7, v8 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 @@ -3244,7 +3245,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: lh a0, 630(sp) ; ZVFHMIN64-NEXT: lh a1, 374(sp) ; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 10 -; ZVFHMIN64-NEXT: vmv.x.s a7, v8 +; ZVFHMIN64-NEXT: vmv.x.s a6, v8 ; 
ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 @@ -3252,102 +3253,101 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: lh a0, 628(sp) ; ZVFHMIN64-NEXT: lh a1, 372(sp) ; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 9 -; ZVFHMIN64-NEXT: vmv.x.s a6, v8 +; ZVFHMIN64-NEXT: vmv.x.s a5, v8 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: ld a1, 96(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: ld a1, 104(sp) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: sb a0, 250(sp) ; ZVFHMIN64-NEXT: lh a0, 626(sp) ; ZVFHMIN64-NEXT: lh a1, 370(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: ld a1, 104(sp) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: sb a0, 249(sp) -; ZVFHMIN64-NEXT: lh a0, 624(sp) -; ZVFHMIN64-NEXT: lh a1, 368(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, t3 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 ; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: ld a1, 120(sp) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: sb a0, 248(sp) -; ZVFHMIN64-NEXT: lh a0, 622(sp) -; ZVFHMIN64-NEXT: lh a1, 366(sp) +; ZVFHMIN64-NEXT: sb a0, 249(sp) +; ZVFHMIN64-NEXT: lh a1, 624(sp) +; ZVFHMIN64-NEXT: lh a3, 368(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa4, t2 -; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: ld a1, 88(sp) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: sb a0, 247(sp) -; ZVFHMIN64-NEXT: lh a0, 620(sp) -; ZVFHMIN64-NEXT: lh a1, 364(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: ld a3, 96(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fa5, a3 +; ZVFHMIN64-NEXT: sb a1, 248(sp) +; ZVFHMIN64-NEXT: lh a1, 622(sp) +; ZVFHMIN64-NEXT: lh a3, 366(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa4, t1 -; ZVFHMIN64-NEXT: feq.h a5, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: ld a1, 112(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: sb a0, 246(sp) -; ZVFHMIN64-NEXT: lh a0, 618(sp) -; ZVFHMIN64-NEXT: lh a1, 362(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: ld a3, 112(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fa5, a3 +; ZVFHMIN64-NEXT: sb a1, 247(sp) +; ZVFHMIN64-NEXT: lh a1, 620(sp) +; ZVFHMIN64-NEXT: lh a3, 364(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa4, t0 ; ZVFHMIN64-NEXT: feq.h t0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, s2 -; ZVFHMIN64-NEXT: sb a0, 245(sp) -; ZVFHMIN64-NEXT: lh a0, 616(sp) -; ZVFHMIN64-NEXT: lh a1, 360(sp) +; ZVFHMIN64-NEXT: sb a1, 246(sp) +; ZVFHMIN64-NEXT: lh a1, 618(sp) +; ZVFHMIN64-NEXT: lh a3, 362(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa4, a7 ; ZVFHMIN64-NEXT: feq.h a7, fa5, fa4 -; ZVFHMIN64-NEXT: 
fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, t6 -; ZVFHMIN64-NEXT: sb a0, 244(sp) -; ZVFHMIN64-NEXT: lh a0, 614(sp) -; ZVFHMIN64-NEXT: lh a1, 358(sp) +; ZVFHMIN64-NEXT: sb a1, 245(sp) +; ZVFHMIN64-NEXT: lh a1, 616(sp) +; ZVFHMIN64-NEXT: lh a3, 360(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa4, a6 ; ZVFHMIN64-NEXT: feq.h a6, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, t5 +; ZVFHMIN64-NEXT: sb a1, 244(sp) +; ZVFHMIN64-NEXT: lh a1, 614(sp) +; ZVFHMIN64-NEXT: lh a3, 358(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 +; ZVFHMIN64-NEXT: feq.h a5, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, t4 ; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 8 -; ZVFHMIN64-NEXT: vmv.x.s a1, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: sb a0, 243(sp) -; ZVFHMIN64-NEXT: lh a0, 612(sp) -; ZVFHMIN64-NEXT: lh a1, 356(sp) -; ZVFHMIN64-NEXT: sb a5, 204(sp) +; ZVFHMIN64-NEXT: vmv.x.s a3, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN64-NEXT: sb a1, 243(sp) +; ZVFHMIN64-NEXT: lh a1, 612(sp) +; ZVFHMIN64-NEXT: lh a3, 356(sp) +; ZVFHMIN64-NEXT: sb t0, 204(sp) ; ZVFHMIN64-NEXT: sb a4, 205(sp) -; ZVFHMIN64-NEXT: sb a2, 206(sp) -; ZVFHMIN64-NEXT: sb a3, 207(sp) -; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN64-NEXT: sb a2, 200(sp) -; ZVFHMIN64-NEXT: sb a6, 201(sp) -; ZVFHMIN64-NEXT: sb a7, 202(sp) -; ZVFHMIN64-NEXT: sb t0, 203(sp) -; ZVFHMIN64-NEXT: li a2, 128 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 242(sp) -; ZVFHMIN64-NEXT: addi a0, sp, 128 -; ZVFHMIN64-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; ZVFHMIN64-NEXT: vle8.v v8, (a0) +; ZVFHMIN64-NEXT: sb a0, 206(sp) +; ZVFHMIN64-NEXT: sb a2, 207(sp) +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 200(sp) +; ZVFHMIN64-NEXT: sb a5, 201(sp) +; ZVFHMIN64-NEXT: sb a6, 202(sp) +; ZVFHMIN64-NEXT: sb a7, 203(sp) +; ZVFHMIN64-NEXT: li a0, 128 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 242(sp) +; ZVFHMIN64-NEXT: addi a1, sp, 128 +; ZVFHMIN64-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; ZVFHMIN64-NEXT: vle8.v v8, (a1) ; ZVFHMIN64-NEXT: vand.vi v8, v8, 1 ; ZVFHMIN64-NEXT: vmsne.vi v0, v8, 0 ; ZVFHMIN64-NEXT: addi sp, s0, -896 diff --git a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll index e70dcd16d02cd..dd2a8240ee253 100644 --- a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll +++ b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll @@ -507,37 +507,28 @@ define <8 x i1> @match_v8i8_v16i8(<8 x i8> %op1, <16 x i8> %op2, <8 x i1> %mask) define @match_nxv16i8_v32i8( %op1, <32 x i8> %op2, %mask) { ; RV32-LABEL: match_nxv16i8_v32i8: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -64 -; RV32-NEXT: .cfi_def_cfa_offset 64 -; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s1, 52(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s2, 48(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s3, 44(sp) # 4-byte Folded 
Spill -; RV32-NEXT: sw s4, 40(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s5, 36(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s6, 32(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s7, 28(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s8, 24(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s9, 20(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s10, 16(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s11, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: .cfi_offset s1, -12 -; RV32-NEXT: .cfi_offset s2, -16 -; RV32-NEXT: .cfi_offset s3, -20 -; RV32-NEXT: .cfi_offset s4, -24 -; RV32-NEXT: .cfi_offset s5, -28 -; RV32-NEXT: .cfi_offset s6, -32 -; RV32-NEXT: .cfi_offset s7, -36 -; RV32-NEXT: .cfi_offset s8, -40 -; RV32-NEXT: .cfi_offset s9, -44 -; RV32-NEXT: .cfi_offset s10, -48 -; RV32-NEXT: .cfi_offset s11, -52 +; RV32-NEXT: addi sp, sp, -48 +; RV32-NEXT: .cfi_def_cfa_offset 48 +; RV32-NEXT: sw s0, 44(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 40(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 36(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 32(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s6, 20(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s7, 16(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s8, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset s0, -4 +; RV32-NEXT: .cfi_offset s1, -8 +; RV32-NEXT: .cfi_offset s2, -12 +; RV32-NEXT: .cfi_offset s3, -16 +; RV32-NEXT: .cfi_offset s4, -20 +; RV32-NEXT: .cfi_offset s5, -24 +; RV32-NEXT: .cfi_offset s6, -28 +; RV32-NEXT: .cfi_offset s7, -32 +; RV32-NEXT: .cfi_offset s8, -36 ; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: sw a0, 8(sp) # 4-byte Folded Spill ; RV32-NEXT: vslidedown.vi v12, v10, 1 ; RV32-NEXT: vslidedown.vi v13, v10, 2 ; RV32-NEXT: vslidedown.vi v14, v10, 3 @@ -593,95 +584,89 @@ define @match_nxv16i8_v32i8( %op1, <32 x i8 ; RV32-NEXT: vmv.x.s s5, v15 ; RV32-NEXT: vmv.x.s s6, v16 ; RV32-NEXT: vmv.x.s s7, v17 -; RV32-NEXT: vmv.x.s s8, v18 -; RV32-NEXT: vmv.x.s s9, v19 -; RV32-NEXT: vmv.x.s s10, v20 -; RV32-NEXT: vmv.x.s s11, v21 -; RV32-NEXT: vmv.x.s ra, v22 -; RV32-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; RV32-NEXT: lw a0, 8(sp) # 4-byte Folded Reload +; RV32-NEXT: vsetvli s8, zero, e8, m2, ta, ma ; RV32-NEXT: vmseq.vx v12, v8, a0 -; RV32-NEXT: vmv.x.s a0, v23 +; RV32-NEXT: vmv.x.s a0, v18 ; RV32-NEXT: vmseq.vx v13, v8, s2 -; RV32-NEXT: vmv.x.s s2, v11 -; RV32-NEXT: vmseq.vx v11, v8, s3 -; RV32-NEXT: vmv.x.s s3, v24 -; RV32-NEXT: vmseq.vx v14, v8, s4 -; RV32-NEXT: vmv.x.s s4, v10 -; RV32-NEXT: vmseq.vx v10, v8, s5 -; RV32-NEXT: vmor.mm v12, v12, v13 -; RV32-NEXT: vmseq.vx v13, v8, s6 -; RV32-NEXT: vmor.mm v11, v12, v11 -; RV32-NEXT: vmseq.vx v12, v8, s7 -; RV32-NEXT: vmor.mm v11, v11, v14 -; RV32-NEXT: vmseq.vx v14, v8, s8 -; RV32-NEXT: vmor.mm v10, v11, v10 -; RV32-NEXT: vmseq.vx v11, v8, s9 -; RV32-NEXT: vmor.mm v10, v10, v13 -; RV32-NEXT: vmseq.vx v13, v8, s10 -; RV32-NEXT: vmor.mm v10, v10, v12 -; RV32-NEXT: vmseq.vx v12, v8, s11 -; RV32-NEXT: vmor.mm v10, v10, v14 -; RV32-NEXT: vmseq.vx v14, v8, ra -; RV32-NEXT: vmor.mm v10, v10, v11 +; RV32-NEXT: vmv.x.s s2, v19 +; RV32-NEXT: vmseq.vx v14, v8, s3 +; RV32-NEXT: vmv.x.s s3, v20 +; RV32-NEXT: vmseq.vx v15, v8, s4 +; RV32-NEXT: vmv.x.s s4, v21 +; RV32-NEXT: vmseq.vx v16, v8, s5 +; RV32-NEXT: vmv.x.s s5, v22 +; RV32-NEXT: vmseq.vx v17, v8, s6 +; RV32-NEXT: vmv.x.s s6, v23 +; RV32-NEXT: vmseq.vx v18, v8, s7 +; 
RV32-NEXT: vmv.x.s s7, v11 ; RV32-NEXT: vmseq.vx v11, v8, a0 -; RV32-NEXT: vmor.mm v10, v10, v13 -; RV32-NEXT: vmseq.vx v13, v8, s2 -; RV32-NEXT: vmor.mm v10, v10, v12 -; RV32-NEXT: vmseq.vx v12, v8, s3 +; RV32-NEXT: vmv.x.s a0, v24 +; RV32-NEXT: vmseq.vx v19, v8, s2 +; RV32-NEXT: vmv.x.s s2, v10 +; RV32-NEXT: vmor.mm v10, v12, v13 ; RV32-NEXT: vmor.mm v10, v10, v14 -; RV32-NEXT: vmseq.vx v14, v8, s4 +; RV32-NEXT: vmor.mm v10, v10, v15 +; RV32-NEXT: vmor.mm v10, v10, v16 +; RV32-NEXT: vmor.mm v10, v10, v17 +; RV32-NEXT: vmseq.vx v12, v8, s3 +; RV32-NEXT: vmor.mm v10, v10, v18 +; RV32-NEXT: vmseq.vx v13, v8, s4 ; RV32-NEXT: vmor.mm v10, v10, v11 -; RV32-NEXT: vmseq.vx v11, v8, a1 -; RV32-NEXT: vmor.mm v10, v10, v13 -; RV32-NEXT: vmseq.vx v13, v8, a2 +; RV32-NEXT: vmseq.vx v11, v8, s5 +; RV32-NEXT: vmor.mm v10, v10, v19 +; RV32-NEXT: vmseq.vx v14, v8, s6 ; RV32-NEXT: vmor.mm v10, v10, v12 -; RV32-NEXT: vmseq.vx v12, v8, a3 -; RV32-NEXT: vmor.mm v10, v10, v14 -; RV32-NEXT: vmseq.vx v14, v8, a4 -; RV32-NEXT: vmor.mm v10, v10, v11 -; RV32-NEXT: vmseq.vx v11, v8, a5 +; RV32-NEXT: vmseq.vx v12, v8, s7 ; RV32-NEXT: vmor.mm v10, v10, v13 -; RV32-NEXT: vmseq.vx v13, v8, a6 -; RV32-NEXT: vmor.mm v10, v10, v12 -; RV32-NEXT: vmseq.vx v12, v8, a7 -; RV32-NEXT: vmor.mm v10, v10, v14 -; RV32-NEXT: vmseq.vx v14, v8, t0 +; RV32-NEXT: vmseq.vx v13, v8, a0 ; RV32-NEXT: vmor.mm v10, v10, v11 -; RV32-NEXT: vmseq.vx v11, v8, t1 -; RV32-NEXT: vmor.mm v10, v10, v13 -; RV32-NEXT: vmseq.vx v13, v8, t2 -; RV32-NEXT: vmor.mm v10, v10, v12 -; RV32-NEXT: vmseq.vx v12, v8, t3 +; RV32-NEXT: vmseq.vx v11, v8, s2 ; RV32-NEXT: vmor.mm v10, v10, v14 -; RV32-NEXT: vmseq.vx v14, v8, t4 -; RV32-NEXT: vmor.mm v10, v10, v11 -; RV32-NEXT: vmseq.vx v11, v8, t5 +; RV32-NEXT: vmseq.vx v14, v8, a1 +; RV32-NEXT: vmor.mm v10, v10, v12 +; RV32-NEXT: vmseq.vx v12, v8, a2 ; RV32-NEXT: vmor.mm v10, v10, v13 -; RV32-NEXT: vmseq.vx v13, v8, t6 +; RV32-NEXT: vmseq.vx v13, v8, a3 +; RV32-NEXT: vmor.mm v10, v10, v11 +; RV32-NEXT: vmseq.vx v11, v8, a4 +; RV32-NEXT: vmor.mm v10, v10, v14 +; RV32-NEXT: vmseq.vx v14, v8, a5 ; RV32-NEXT: vmor.mm v10, v10, v12 -; RV32-NEXT: vmseq.vx v12, v8, s0 +; RV32-NEXT: vmseq.vx v12, v8, a6 +; RV32-NEXT: vmor.mm v10, v10, v13 +; RV32-NEXT: vmseq.vx v13, v8, a7 +; RV32-NEXT: vmor.mm v10, v10, v11 +; RV32-NEXT: vmseq.vx v11, v8, t0 ; RV32-NEXT: vmor.mm v10, v10, v14 +; RV32-NEXT: vmseq.vx v14, v8, t1 +; RV32-NEXT: vmor.mm v10, v10, v12 +; RV32-NEXT: vmseq.vx v12, v8, t2 +; RV32-NEXT: vmor.mm v10, v10, v13 +; RV32-NEXT: vmseq.vx v13, v8, t3 ; RV32-NEXT: vmor.mm v10, v10, v11 +; RV32-NEXT: vmseq.vx v11, v8, t4 +; RV32-NEXT: vmor.mm v10, v10, v14 +; RV32-NEXT: vmseq.vx v14, v8, t5 +; RV32-NEXT: vmor.mm v10, v10, v12 +; RV32-NEXT: vmseq.vx v12, v8, t6 ; RV32-NEXT: vmor.mm v10, v10, v13 +; RV32-NEXT: vmseq.vx v13, v8, s0 +; RV32-NEXT: vmor.mm v10, v10, v11 +; RV32-NEXT: vmor.mm v10, v10, v14 ; RV32-NEXT: vmor.mm v10, v10, v12 +; RV32-NEXT: vmor.mm v10, v10, v13 ; RV32-NEXT: vmseq.vx v11, v8, s1 ; RV32-NEXT: vmor.mm v8, v10, v11 ; RV32-NEXT: vmand.mm v0, v8, v0 -; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s1, 52(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s2, 48(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s3, 44(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s4, 40(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s5, 36(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s6, 32(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s7, 28(sp) # 4-byte Folded Reload -; 
RV32-NEXT: lw s8, 24(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s9, 20(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s10, 16(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s11, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: .cfi_restore ra +; RV32-NEXT: lw s0, 44(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 40(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 36(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s3, 32(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s4, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s5, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s6, 20(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s7, 16(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s8, 12(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore s0 ; RV32-NEXT: .cfi_restore s1 ; RV32-NEXT: .cfi_restore s2 @@ -691,46 +676,34 @@ define @match_nxv16i8_v32i8( %op1, <32 x i8 ; RV32-NEXT: .cfi_restore s6 ; RV32-NEXT: .cfi_restore s7 ; RV32-NEXT: .cfi_restore s8 -; RV32-NEXT: .cfi_restore s9 -; RV32-NEXT: .cfi_restore s10 -; RV32-NEXT: .cfi_restore s11 -; RV32-NEXT: addi sp, sp, 64 +; RV32-NEXT: addi sp, sp, 48 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: match_nxv16i8_v32i8: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -112 -; RV64-NEXT: .cfi_def_cfa_offset 112 -; RV64-NEXT: sd ra, 104(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 96(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s1, 88(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s2, 80(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s3, 72(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s4, 64(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s5, 56(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s6, 48(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s7, 40(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s8, 32(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s9, 24(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s10, 16(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s11, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: .cfi_offset s1, -24 -; RV64-NEXT: .cfi_offset s2, -32 -; RV64-NEXT: .cfi_offset s3, -40 -; RV64-NEXT: .cfi_offset s4, -48 -; RV64-NEXT: .cfi_offset s5, -56 -; RV64-NEXT: .cfi_offset s6, -64 -; RV64-NEXT: .cfi_offset s7, -72 -; RV64-NEXT: .cfi_offset s8, -80 -; RV64-NEXT: .cfi_offset s9, -88 -; RV64-NEXT: .cfi_offset s10, -96 -; RV64-NEXT: .cfi_offset s11, -104 +; RV64-NEXT: addi sp, sp, -80 +; RV64-NEXT: .cfi_def_cfa_offset 80 +; RV64-NEXT: sd s0, 72(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 64(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s2, 56(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s3, 48(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s4, 40(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s5, 32(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s6, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s7, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s8, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset s0, -8 +; RV64-NEXT: .cfi_offset s1, -16 +; RV64-NEXT: .cfi_offset s2, -24 +; RV64-NEXT: .cfi_offset s3, -32 +; RV64-NEXT: .cfi_offset s4, -40 +; RV64-NEXT: .cfi_offset s5, -48 +; RV64-NEXT: .cfi_offset s6, -56 +; RV64-NEXT: .cfi_offset s7, -64 +; RV64-NEXT: .cfi_offset s8, -72 ; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64-NEXT: vmv.x.s a0, v10 -; RV64-NEXT: sd a0, 0(sp) # 8-byte Folded Spill ; RV64-NEXT: vslidedown.vi v12, v10, 1 ; RV64-NEXT: vslidedown.vi v13, v10, 2 ; RV64-NEXT: vslidedown.vi v14, v10, 3 @@ -786,95 +759,89 @@ define @match_nxv16i8_v32i8( %op1, <32 x i8 ; RV64-NEXT: vmv.x.s s5, v15 ; RV64-NEXT: vmv.x.s s6, v16 ; 
RV64-NEXT: vmv.x.s s7, v17 -; RV64-NEXT: vmv.x.s s8, v18 -; RV64-NEXT: vmv.x.s s9, v19 -; RV64-NEXT: vmv.x.s s10, v20 -; RV64-NEXT: vmv.x.s s11, v21 -; RV64-NEXT: vmv.x.s ra, v22 -; RV64-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; RV64-NEXT: ld a0, 0(sp) # 8-byte Folded Reload +; RV64-NEXT: vsetvli s8, zero, e8, m2, ta, ma ; RV64-NEXT: vmseq.vx v12, v8, a0 -; RV64-NEXT: vmv.x.s a0, v23 +; RV64-NEXT: vmv.x.s a0, v18 ; RV64-NEXT: vmseq.vx v13, v8, s2 -; RV64-NEXT: vmv.x.s s2, v11 -; RV64-NEXT: vmseq.vx v11, v8, s3 -; RV64-NEXT: vmv.x.s s3, v24 -; RV64-NEXT: vmseq.vx v14, v8, s4 -; RV64-NEXT: vmv.x.s s4, v10 -; RV64-NEXT: vmseq.vx v10, v8, s5 -; RV64-NEXT: vmor.mm v12, v12, v13 -; RV64-NEXT: vmseq.vx v13, v8, s6 -; RV64-NEXT: vmor.mm v11, v12, v11 -; RV64-NEXT: vmseq.vx v12, v8, s7 -; RV64-NEXT: vmor.mm v11, v11, v14 -; RV64-NEXT: vmseq.vx v14, v8, s8 -; RV64-NEXT: vmor.mm v10, v11, v10 -; RV64-NEXT: vmseq.vx v11, v8, s9 -; RV64-NEXT: vmor.mm v10, v10, v13 -; RV64-NEXT: vmseq.vx v13, v8, s10 -; RV64-NEXT: vmor.mm v10, v10, v12 -; RV64-NEXT: vmseq.vx v12, v8, s11 -; RV64-NEXT: vmor.mm v10, v10, v14 -; RV64-NEXT: vmseq.vx v14, v8, ra -; RV64-NEXT: vmor.mm v10, v10, v11 +; RV64-NEXT: vmv.x.s s2, v19 +; RV64-NEXT: vmseq.vx v14, v8, s3 +; RV64-NEXT: vmv.x.s s3, v20 +; RV64-NEXT: vmseq.vx v15, v8, s4 +; RV64-NEXT: vmv.x.s s4, v21 +; RV64-NEXT: vmseq.vx v16, v8, s5 +; RV64-NEXT: vmv.x.s s5, v22 +; RV64-NEXT: vmseq.vx v17, v8, s6 +; RV64-NEXT: vmv.x.s s6, v23 +; RV64-NEXT: vmseq.vx v18, v8, s7 +; RV64-NEXT: vmv.x.s s7, v11 ; RV64-NEXT: vmseq.vx v11, v8, a0 -; RV64-NEXT: vmor.mm v10, v10, v13 -; RV64-NEXT: vmseq.vx v13, v8, s2 -; RV64-NEXT: vmor.mm v10, v10, v12 -; RV64-NEXT: vmseq.vx v12, v8, s3 +; RV64-NEXT: vmv.x.s a0, v24 +; RV64-NEXT: vmseq.vx v19, v8, s2 +; RV64-NEXT: vmv.x.s s2, v10 +; RV64-NEXT: vmor.mm v10, v12, v13 ; RV64-NEXT: vmor.mm v10, v10, v14 -; RV64-NEXT: vmseq.vx v14, v8, s4 +; RV64-NEXT: vmor.mm v10, v10, v15 +; RV64-NEXT: vmor.mm v10, v10, v16 +; RV64-NEXT: vmor.mm v10, v10, v17 +; RV64-NEXT: vmseq.vx v12, v8, s3 +; RV64-NEXT: vmor.mm v10, v10, v18 +; RV64-NEXT: vmseq.vx v13, v8, s4 ; RV64-NEXT: vmor.mm v10, v10, v11 -; RV64-NEXT: vmseq.vx v11, v8, a1 -; RV64-NEXT: vmor.mm v10, v10, v13 -; RV64-NEXT: vmseq.vx v13, v8, a2 +; RV64-NEXT: vmseq.vx v11, v8, s5 +; RV64-NEXT: vmor.mm v10, v10, v19 +; RV64-NEXT: vmseq.vx v14, v8, s6 ; RV64-NEXT: vmor.mm v10, v10, v12 -; RV64-NEXT: vmseq.vx v12, v8, a3 -; RV64-NEXT: vmor.mm v10, v10, v14 -; RV64-NEXT: vmseq.vx v14, v8, a4 -; RV64-NEXT: vmor.mm v10, v10, v11 -; RV64-NEXT: vmseq.vx v11, v8, a5 +; RV64-NEXT: vmseq.vx v12, v8, s7 ; RV64-NEXT: vmor.mm v10, v10, v13 -; RV64-NEXT: vmseq.vx v13, v8, a6 -; RV64-NEXT: vmor.mm v10, v10, v12 -; RV64-NEXT: vmseq.vx v12, v8, a7 -; RV64-NEXT: vmor.mm v10, v10, v14 -; RV64-NEXT: vmseq.vx v14, v8, t0 +; RV64-NEXT: vmseq.vx v13, v8, a0 ; RV64-NEXT: vmor.mm v10, v10, v11 -; RV64-NEXT: vmseq.vx v11, v8, t1 -; RV64-NEXT: vmor.mm v10, v10, v13 -; RV64-NEXT: vmseq.vx v13, v8, t2 -; RV64-NEXT: vmor.mm v10, v10, v12 -; RV64-NEXT: vmseq.vx v12, v8, t3 +; RV64-NEXT: vmseq.vx v11, v8, s2 ; RV64-NEXT: vmor.mm v10, v10, v14 -; RV64-NEXT: vmseq.vx v14, v8, t4 -; RV64-NEXT: vmor.mm v10, v10, v11 -; RV64-NEXT: vmseq.vx v11, v8, t5 +; RV64-NEXT: vmseq.vx v14, v8, a1 +; RV64-NEXT: vmor.mm v10, v10, v12 +; RV64-NEXT: vmseq.vx v12, v8, a2 ; RV64-NEXT: vmor.mm v10, v10, v13 -; RV64-NEXT: vmseq.vx v13, v8, t6 +; RV64-NEXT: vmseq.vx v13, v8, a3 +; RV64-NEXT: vmor.mm v10, v10, v11 +; RV64-NEXT: vmseq.vx v11, v8, a4 +; RV64-NEXT: 
vmor.mm v10, v10, v14 +; RV64-NEXT: vmseq.vx v14, v8, a5 ; RV64-NEXT: vmor.mm v10, v10, v12 -; RV64-NEXT: vmseq.vx v12, v8, s0 +; RV64-NEXT: vmseq.vx v12, v8, a6 +; RV64-NEXT: vmor.mm v10, v10, v13 +; RV64-NEXT: vmseq.vx v13, v8, a7 +; RV64-NEXT: vmor.mm v10, v10, v11 +; RV64-NEXT: vmseq.vx v11, v8, t0 ; RV64-NEXT: vmor.mm v10, v10, v14 +; RV64-NEXT: vmseq.vx v14, v8, t1 +; RV64-NEXT: vmor.mm v10, v10, v12 +; RV64-NEXT: vmseq.vx v12, v8, t2 +; RV64-NEXT: vmor.mm v10, v10, v13 +; RV64-NEXT: vmseq.vx v13, v8, t3 ; RV64-NEXT: vmor.mm v10, v10, v11 +; RV64-NEXT: vmseq.vx v11, v8, t4 +; RV64-NEXT: vmor.mm v10, v10, v14 +; RV64-NEXT: vmseq.vx v14, v8, t5 +; RV64-NEXT: vmor.mm v10, v10, v12 +; RV64-NEXT: vmseq.vx v12, v8, t6 ; RV64-NEXT: vmor.mm v10, v10, v13 +; RV64-NEXT: vmseq.vx v13, v8, s0 +; RV64-NEXT: vmor.mm v10, v10, v11 +; RV64-NEXT: vmor.mm v10, v10, v14 ; RV64-NEXT: vmor.mm v10, v10, v12 +; RV64-NEXT: vmor.mm v10, v10, v13 ; RV64-NEXT: vmseq.vx v11, v8, s1 ; RV64-NEXT: vmor.mm v8, v10, v11 ; RV64-NEXT: vmand.mm v0, v8, v0 -; RV64-NEXT: ld ra, 104(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s0, 96(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s1, 88(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s2, 80(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s3, 72(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s4, 64(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s5, 56(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s6, 48(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s7, 40(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s8, 32(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s9, 24(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s10, 16(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s11, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: .cfi_restore ra +; RV64-NEXT: ld s0, 72(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 64(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 56(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s3, 48(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s4, 40(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s5, 32(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s6, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s7, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s8, 8(sp) # 8-byte Folded Reload ; RV64-NEXT: .cfi_restore s0 ; RV64-NEXT: .cfi_restore s1 ; RV64-NEXT: .cfi_restore s2 @@ -884,10 +851,7 @@ define @match_nxv16i8_v32i8( %op1, <32 x i8 ; RV64-NEXT: .cfi_restore s6 ; RV64-NEXT: .cfi_restore s7 ; RV64-NEXT: .cfi_restore s8 -; RV64-NEXT: .cfi_restore s9 -; RV64-NEXT: .cfi_restore s10 -; RV64-NEXT: .cfi_restore s11 -; RV64-NEXT: addi sp, sp, 112 +; RV64-NEXT: addi sp, sp, 80 ; RV64-NEXT: .cfi_def_cfa_offset 0 ; RV64-NEXT: ret %r = tail call @llvm.experimental.vector.match( %op1, <32 x i8> %op2, %mask) @@ -897,34 +861,24 @@ define @match_nxv16i8_v32i8( %op1, <32 x i8 define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask) { ; RV32-LABEL: match_v16i8_v32i8: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -64 -; RV32-NEXT: .cfi_def_cfa_offset 64 -; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s1, 52(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s2, 48(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s3, 44(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s4, 40(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s5, 36(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s6, 32(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s7, 28(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s8, 24(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s9, 20(sp) # 4-byte Folded Spill -; RV32-NEXT: sw 
s10, 16(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s11, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: .cfi_offset s1, -12 -; RV32-NEXT: .cfi_offset s2, -16 -; RV32-NEXT: .cfi_offset s3, -20 -; RV32-NEXT: .cfi_offset s4, -24 -; RV32-NEXT: .cfi_offset s5, -28 -; RV32-NEXT: .cfi_offset s6, -32 -; RV32-NEXT: .cfi_offset s7, -36 -; RV32-NEXT: .cfi_offset s8, -40 -; RV32-NEXT: .cfi_offset s9, -44 -; RV32-NEXT: .cfi_offset s10, -48 -; RV32-NEXT: .cfi_offset s11, -52 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw s0, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 20(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 16(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 8(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s6, 4(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s7, 0(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset s0, -4 +; RV32-NEXT: .cfi_offset s1, -8 +; RV32-NEXT: .cfi_offset s2, -12 +; RV32-NEXT: .cfi_offset s3, -16 +; RV32-NEXT: .cfi_offset s4, -20 +; RV32-NEXT: .cfi_offset s5, -24 +; RV32-NEXT: .cfi_offset s6, -28 +; RV32-NEXT: .cfi_offset s7, -32 ; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: vslidedown.vi v9, v10, 1 @@ -982,93 +936,87 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m ; RV32-NEXT: vmv.x.s s5, v14 ; RV32-NEXT: vmv.x.s s6, v15 ; RV32-NEXT: vmv.x.s s7, v16 -; RV32-NEXT: vmv.x.s s8, v17 -; RV32-NEXT: vmv.x.s s9, v18 -; RV32-NEXT: vmv.x.s s10, v19 -; RV32-NEXT: vmv.x.s s11, v20 -; RV32-NEXT: vmv.x.s ra, v21 ; RV32-NEXT: vmseq.vx v9, v8, a0 -; RV32-NEXT: vmv.x.s a0, v22 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: vmseq.vx v12, v8, s2 -; RV32-NEXT: vmv.x.s s2, v11 -; RV32-NEXT: vmseq.vx v11, v8, s3 -; RV32-NEXT: vmv.x.s s3, v23 -; RV32-NEXT: vmseq.vx v13, v8, s4 -; RV32-NEXT: vmv.x.s s4, v10 -; RV32-NEXT: vmseq.vx v10, v8, s5 +; RV32-NEXT: vmv.x.s s2, v18 +; RV32-NEXT: vmseq.vx v13, v8, s3 +; RV32-NEXT: vmv.x.s s3, v19 +; RV32-NEXT: vmseq.vx v14, v8, s4 +; RV32-NEXT: vmv.x.s s4, v20 +; RV32-NEXT: vmseq.vx v15, v8, s5 +; RV32-NEXT: vmv.x.s s5, v21 +; RV32-NEXT: vmseq.vx v16, v8, s6 +; RV32-NEXT: vmv.x.s s6, v22 +; RV32-NEXT: vmseq.vx v17, v8, s7 +; RV32-NEXT: vmv.x.s s7, v11 +; RV32-NEXT: vmseq.vx v11, v8, a0 +; RV32-NEXT: vmv.x.s a0, v23 +; RV32-NEXT: vmseq.vx v18, v8, s2 +; RV32-NEXT: vmv.x.s s2, v10 ; RV32-NEXT: vmor.mm v9, v9, v12 -; RV32-NEXT: vmseq.vx v12, v8, s6 -; RV32-NEXT: vmor.mm v9, v9, v11 -; RV32-NEXT: vmseq.vx v11, v8, s7 ; RV32-NEXT: vmor.mm v9, v9, v13 -; RV32-NEXT: vmseq.vx v13, v8, s8 -; RV32-NEXT: vmor.mm v9, v9, v10 -; RV32-NEXT: vmseq.vx v10, v8, s9 -; RV32-NEXT: vmor.mm v9, v9, v12 -; RV32-NEXT: vmseq.vx v12, v8, s10 +; RV32-NEXT: vmor.mm v9, v9, v14 +; RV32-NEXT: vmor.mm v9, v9, v15 +; RV32-NEXT: vmor.mm v9, v9, v16 +; RV32-NEXT: vmseq.vx v10, v8, s3 +; RV32-NEXT: vmor.mm v9, v9, v17 +; RV32-NEXT: vmseq.vx v12, v8, s4 ; RV32-NEXT: vmor.mm v9, v9, v11 -; RV32-NEXT: vmseq.vx v11, v8, s11 -; RV32-NEXT: vmor.mm v9, v9, v13 -; RV32-NEXT: vmseq.vx v13, v8, ra +; RV32-NEXT: vmseq.vx v11, v8, s5 +; RV32-NEXT: vmor.mm v9, v9, v18 +; RV32-NEXT: vmseq.vx v13, v8, s6 ; RV32-NEXT: vmor.mm v9, v9, v10 -; RV32-NEXT: vmseq.vx v10, v8, a0 +; RV32-NEXT: vmseq.vx v10, v8, s7 ; RV32-NEXT: vmor.mm v9, v9, v12 -; RV32-NEXT: vmseq.vx v12, v8, s2 +; RV32-NEXT: vmseq.vx v12, v8, a0 ; RV32-NEXT: vmor.mm v9, v9, v11 
-; RV32-NEXT: vmseq.vx v11, v8, s3 +; RV32-NEXT: vmseq.vx v11, v8, s2 ; RV32-NEXT: vmor.mm v9, v9, v13 -; RV32-NEXT: vmseq.vx v13, v8, s4 +; RV32-NEXT: vmseq.vx v13, v8, a1 ; RV32-NEXT: vmor.mm v9, v9, v10 -; RV32-NEXT: vmseq.vx v10, v8, a1 +; RV32-NEXT: vmseq.vx v10, v8, a2 ; RV32-NEXT: vmor.mm v9, v9, v12 -; RV32-NEXT: vmseq.vx v12, v8, a2 +; RV32-NEXT: vmseq.vx v12, v8, a3 ; RV32-NEXT: vmor.mm v9, v9, v11 -; RV32-NEXT: vmseq.vx v11, v8, a3 +; RV32-NEXT: vmseq.vx v11, v8, a4 ; RV32-NEXT: vmor.mm v9, v9, v13 -; RV32-NEXT: vmseq.vx v13, v8, a4 +; RV32-NEXT: vmseq.vx v13, v8, a5 ; RV32-NEXT: vmor.mm v9, v9, v10 -; RV32-NEXT: vmseq.vx v10, v8, a5 +; RV32-NEXT: vmseq.vx v10, v8, a6 ; RV32-NEXT: vmor.mm v9, v9, v12 -; RV32-NEXT: vmseq.vx v12, v8, a6 +; RV32-NEXT: vmseq.vx v12, v8, a7 ; RV32-NEXT: vmor.mm v9, v9, v11 -; RV32-NEXT: vmseq.vx v11, v8, a7 +; RV32-NEXT: vmseq.vx v11, v8, t0 ; RV32-NEXT: vmor.mm v9, v9, v13 -; RV32-NEXT: vmseq.vx v13, v8, t0 +; RV32-NEXT: vmseq.vx v13, v8, t1 ; RV32-NEXT: vmor.mm v9, v9, v10 -; RV32-NEXT: vmseq.vx v10, v8, t1 +; RV32-NEXT: vmseq.vx v10, v8, t2 ; RV32-NEXT: vmor.mm v9, v9, v12 -; RV32-NEXT: vmseq.vx v12, v8, t2 +; RV32-NEXT: vmseq.vx v12, v8, t3 ; RV32-NEXT: vmor.mm v9, v9, v11 -; RV32-NEXT: vmseq.vx v11, v8, t3 +; RV32-NEXT: vmseq.vx v11, v8, t4 ; RV32-NEXT: vmor.mm v9, v9, v13 -; RV32-NEXT: vmseq.vx v13, v8, t4 +; RV32-NEXT: vmseq.vx v13, v8, t5 ; RV32-NEXT: vmor.mm v9, v9, v10 -; RV32-NEXT: vmseq.vx v10, v8, t5 +; RV32-NEXT: vmseq.vx v10, v8, t6 ; RV32-NEXT: vmor.mm v9, v9, v12 -; RV32-NEXT: vmseq.vx v12, v8, t6 +; RV32-NEXT: vmseq.vx v12, v8, s0 ; RV32-NEXT: vmor.mm v9, v9, v11 -; RV32-NEXT: vmseq.vx v11, v8, s0 ; RV32-NEXT: vmor.mm v9, v9, v13 ; RV32-NEXT: vmor.mm v9, v9, v10 ; RV32-NEXT: vmor.mm v9, v9, v12 -; RV32-NEXT: vmor.mm v9, v9, v11 ; RV32-NEXT: vmseq.vx v8, v8, s1 ; RV32-NEXT: vmor.mm v8, v9, v8 ; RV32-NEXT: vmand.mm v0, v8, v0 -; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s1, 52(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s2, 48(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s3, 44(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s4, 40(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s5, 36(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s6, 32(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s7, 28(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s8, 24(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s9, 20(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s10, 16(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s11, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: .cfi_restore ra +; RV32-NEXT: lw s0, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 20(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s3, 16(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s4, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s5, 8(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s6, 4(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s7, 0(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore s0 ; RV32-NEXT: .cfi_restore s1 ; RV32-NEXT: .cfi_restore s2 @@ -1077,44 +1025,30 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m ; RV32-NEXT: .cfi_restore s5 ; RV32-NEXT: .cfi_restore s6 ; RV32-NEXT: .cfi_restore s7 -; RV32-NEXT: .cfi_restore s8 -; RV32-NEXT: .cfi_restore s9 -; RV32-NEXT: .cfi_restore s10 -; RV32-NEXT: .cfi_restore s11 -; RV32-NEXT: addi sp, sp, 64 +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: 
match_v16i8_v32i8: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -112 -; RV64-NEXT: .cfi_def_cfa_offset 112 -; RV64-NEXT: sd ra, 104(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 96(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s1, 88(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s2, 80(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s3, 72(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s4, 64(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s5, 56(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s6, 48(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s7, 40(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s8, 32(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s9, 24(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s10, 16(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s11, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: .cfi_offset s1, -24 -; RV64-NEXT: .cfi_offset s2, -32 -; RV64-NEXT: .cfi_offset s3, -40 -; RV64-NEXT: .cfi_offset s4, -48 -; RV64-NEXT: .cfi_offset s5, -56 -; RV64-NEXT: .cfi_offset s6, -64 -; RV64-NEXT: .cfi_offset s7, -72 -; RV64-NEXT: .cfi_offset s8, -80 -; RV64-NEXT: .cfi_offset s9, -88 -; RV64-NEXT: .cfi_offset s10, -96 -; RV64-NEXT: .cfi_offset s11, -104 +; RV64-NEXT: addi sp, sp, -64 +; RV64-NEXT: .cfi_def_cfa_offset 64 +; RV64-NEXT: sd s0, 56(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 48(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s2, 40(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s3, 32(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s4, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s5, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s6, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s7, 0(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset s0, -8 +; RV64-NEXT: .cfi_offset s1, -16 +; RV64-NEXT: .cfi_offset s2, -24 +; RV64-NEXT: .cfi_offset s3, -32 +; RV64-NEXT: .cfi_offset s4, -40 +; RV64-NEXT: .cfi_offset s5, -48 +; RV64-NEXT: .cfi_offset s6, -56 +; RV64-NEXT: .cfi_offset s7, -64 ; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: vslidedown.vi v9, v10, 1 @@ -1172,93 +1106,87 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m ; RV64-NEXT: vmv.x.s s5, v14 ; RV64-NEXT: vmv.x.s s6, v15 ; RV64-NEXT: vmv.x.s s7, v16 -; RV64-NEXT: vmv.x.s s8, v17 -; RV64-NEXT: vmv.x.s s9, v18 -; RV64-NEXT: vmv.x.s s10, v19 -; RV64-NEXT: vmv.x.s s11, v20 -; RV64-NEXT: vmv.x.s ra, v21 ; RV64-NEXT: vmseq.vx v9, v8, a0 -; RV64-NEXT: vmv.x.s a0, v22 +; RV64-NEXT: vmv.x.s a0, v17 ; RV64-NEXT: vmseq.vx v12, v8, s2 -; RV64-NEXT: vmv.x.s s2, v11 -; RV64-NEXT: vmseq.vx v11, v8, s3 -; RV64-NEXT: vmv.x.s s3, v23 -; RV64-NEXT: vmseq.vx v13, v8, s4 -; RV64-NEXT: vmv.x.s s4, v10 -; RV64-NEXT: vmseq.vx v10, v8, s5 +; RV64-NEXT: vmv.x.s s2, v18 +; RV64-NEXT: vmseq.vx v13, v8, s3 +; RV64-NEXT: vmv.x.s s3, v19 +; RV64-NEXT: vmseq.vx v14, v8, s4 +; RV64-NEXT: vmv.x.s s4, v20 +; RV64-NEXT: vmseq.vx v15, v8, s5 +; RV64-NEXT: vmv.x.s s5, v21 +; RV64-NEXT: vmseq.vx v16, v8, s6 +; RV64-NEXT: vmv.x.s s6, v22 +; RV64-NEXT: vmseq.vx v17, v8, s7 +; RV64-NEXT: vmv.x.s s7, v11 +; RV64-NEXT: vmseq.vx v11, v8, a0 +; RV64-NEXT: vmv.x.s a0, v23 +; RV64-NEXT: vmseq.vx v18, v8, s2 +; RV64-NEXT: vmv.x.s s2, v10 ; RV64-NEXT: vmor.mm v9, v9, v12 -; RV64-NEXT: vmseq.vx v12, v8, s6 -; RV64-NEXT: vmor.mm v9, v9, v11 -; RV64-NEXT: vmseq.vx v11, v8, s7 ; RV64-NEXT: vmor.mm v9, v9, v13 -; RV64-NEXT: vmseq.vx v13, v8, s8 -; RV64-NEXT: vmor.mm v9, v9, v10 -; RV64-NEXT: vmseq.vx v10, v8, s9 -; RV64-NEXT: vmor.mm v9, v9, v12 -; RV64-NEXT: vmseq.vx v12, v8, s10 +; 
RV64-NEXT: vmor.mm v9, v9, v14 +; RV64-NEXT: vmor.mm v9, v9, v15 +; RV64-NEXT: vmor.mm v9, v9, v16 +; RV64-NEXT: vmseq.vx v10, v8, s3 +; RV64-NEXT: vmor.mm v9, v9, v17 +; RV64-NEXT: vmseq.vx v12, v8, s4 ; RV64-NEXT: vmor.mm v9, v9, v11 -; RV64-NEXT: vmseq.vx v11, v8, s11 -; RV64-NEXT: vmor.mm v9, v9, v13 -; RV64-NEXT: vmseq.vx v13, v8, ra +; RV64-NEXT: vmseq.vx v11, v8, s5 +; RV64-NEXT: vmor.mm v9, v9, v18 +; RV64-NEXT: vmseq.vx v13, v8, s6 ; RV64-NEXT: vmor.mm v9, v9, v10 -; RV64-NEXT: vmseq.vx v10, v8, a0 +; RV64-NEXT: vmseq.vx v10, v8, s7 ; RV64-NEXT: vmor.mm v9, v9, v12 -; RV64-NEXT: vmseq.vx v12, v8, s2 +; RV64-NEXT: vmseq.vx v12, v8, a0 ; RV64-NEXT: vmor.mm v9, v9, v11 -; RV64-NEXT: vmseq.vx v11, v8, s3 +; RV64-NEXT: vmseq.vx v11, v8, s2 ; RV64-NEXT: vmor.mm v9, v9, v13 -; RV64-NEXT: vmseq.vx v13, v8, s4 +; RV64-NEXT: vmseq.vx v13, v8, a1 ; RV64-NEXT: vmor.mm v9, v9, v10 -; RV64-NEXT: vmseq.vx v10, v8, a1 +; RV64-NEXT: vmseq.vx v10, v8, a2 ; RV64-NEXT: vmor.mm v9, v9, v12 -; RV64-NEXT: vmseq.vx v12, v8, a2 +; RV64-NEXT: vmseq.vx v12, v8, a3 ; RV64-NEXT: vmor.mm v9, v9, v11 -; RV64-NEXT: vmseq.vx v11, v8, a3 +; RV64-NEXT: vmseq.vx v11, v8, a4 ; RV64-NEXT: vmor.mm v9, v9, v13 -; RV64-NEXT: vmseq.vx v13, v8, a4 +; RV64-NEXT: vmseq.vx v13, v8, a5 ; RV64-NEXT: vmor.mm v9, v9, v10 -; RV64-NEXT: vmseq.vx v10, v8, a5 +; RV64-NEXT: vmseq.vx v10, v8, a6 ; RV64-NEXT: vmor.mm v9, v9, v12 -; RV64-NEXT: vmseq.vx v12, v8, a6 +; RV64-NEXT: vmseq.vx v12, v8, a7 ; RV64-NEXT: vmor.mm v9, v9, v11 -; RV64-NEXT: vmseq.vx v11, v8, a7 +; RV64-NEXT: vmseq.vx v11, v8, t0 ; RV64-NEXT: vmor.mm v9, v9, v13 -; RV64-NEXT: vmseq.vx v13, v8, t0 +; RV64-NEXT: vmseq.vx v13, v8, t1 ; RV64-NEXT: vmor.mm v9, v9, v10 -; RV64-NEXT: vmseq.vx v10, v8, t1 +; RV64-NEXT: vmseq.vx v10, v8, t2 ; RV64-NEXT: vmor.mm v9, v9, v12 -; RV64-NEXT: vmseq.vx v12, v8, t2 +; RV64-NEXT: vmseq.vx v12, v8, t3 ; RV64-NEXT: vmor.mm v9, v9, v11 -; RV64-NEXT: vmseq.vx v11, v8, t3 +; RV64-NEXT: vmseq.vx v11, v8, t4 ; RV64-NEXT: vmor.mm v9, v9, v13 -; RV64-NEXT: vmseq.vx v13, v8, t4 +; RV64-NEXT: vmseq.vx v13, v8, t5 ; RV64-NEXT: vmor.mm v9, v9, v10 -; RV64-NEXT: vmseq.vx v10, v8, t5 +; RV64-NEXT: vmseq.vx v10, v8, t6 ; RV64-NEXT: vmor.mm v9, v9, v12 -; RV64-NEXT: vmseq.vx v12, v8, t6 +; RV64-NEXT: vmseq.vx v12, v8, s0 ; RV64-NEXT: vmor.mm v9, v9, v11 -; RV64-NEXT: vmseq.vx v11, v8, s0 ; RV64-NEXT: vmor.mm v9, v9, v13 ; RV64-NEXT: vmor.mm v9, v9, v10 ; RV64-NEXT: vmor.mm v9, v9, v12 -; RV64-NEXT: vmor.mm v9, v9, v11 ; RV64-NEXT: vmseq.vx v8, v8, s1 ; RV64-NEXT: vmor.mm v8, v9, v8 ; RV64-NEXT: vmand.mm v0, v8, v0 -; RV64-NEXT: ld ra, 104(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s0, 96(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s1, 88(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s2, 80(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s3, 72(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s4, 64(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s5, 56(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s6, 48(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s7, 40(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s8, 32(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s9, 24(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s10, 16(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s11, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: .cfi_restore ra +; RV64-NEXT: ld s0, 56(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 48(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 40(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s3, 32(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s4, 24(sp) # 8-byte Folded Reload +; 
RV64-NEXT: ld s5, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s6, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s7, 0(sp) # 8-byte Folded Reload ; RV64-NEXT: .cfi_restore s0 ; RV64-NEXT: .cfi_restore s1 ; RV64-NEXT: .cfi_restore s2 @@ -1267,11 +1195,7 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m ; RV64-NEXT: .cfi_restore s5 ; RV64-NEXT: .cfi_restore s6 ; RV64-NEXT: .cfi_restore s7 -; RV64-NEXT: .cfi_restore s8 -; RV64-NEXT: .cfi_restore s9 -; RV64-NEXT: .cfi_restore s10 -; RV64-NEXT: .cfi_restore s11 -; RV64-NEXT: addi sp, sp, 112 +; RV64-NEXT: addi sp, sp, 64 ; RV64-NEXT: .cfi_def_cfa_offset 0 ; RV64-NEXT: ret %r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask) diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll index c35f05be304cc..ec2448cb3965f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll @@ -489,8 +489,9 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV64-NEXT: j .LBB0_11 ; RV64-NEXT: .LBB0_8: # %vector.ph ; RV64-NEXT: # in Loop: Header=BB0_6 Depth=1 -; RV64-NEXT: slli t6, t0, 28 -; RV64-NEXT: sub t6, t6, t1 +; RV64-NEXT: slli t6, t0, 1 +; RV64-NEXT: slli s0, t0, 28 +; RV64-NEXT: sub t6, s0, t6 ; RV64-NEXT: and t6, t6, a6 ; RV64-NEXT: csrwi vxrm, 0 ; RV64-NEXT: mv s0, a2 diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll index 437b7e557718c..22e6f23d4d6e6 100644 --- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -2203,139 +2203,136 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: lshr_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -128 -; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu s1, 0(a0) +; RV32I-NEXT: addi sp, sp, -112 +; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a0) ; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu t1, 4(a0) -; RV32I-NEXT: lbu 
t3, 5(a0) -; RV32I-NEXT: lbu t4, 6(a0) -; RV32I-NEXT: lbu s0, 7(a0) -; RV32I-NEXT: lbu t2, 8(a0) -; RV32I-NEXT: lbu s3, 9(a0) -; RV32I-NEXT: lbu s6, 10(a0) -; RV32I-NEXT: lbu s8, 11(a0) -; RV32I-NEXT: lbu s9, 12(a0) -; RV32I-NEXT: lbu s10, 13(a0) -; RV32I-NEXT: lbu s4, 14(a0) -; RV32I-NEXT: lbu s7, 15(a0) -; RV32I-NEXT: lbu s5, 16(a0) -; RV32I-NEXT: lbu s11, 17(a0) -; RV32I-NEXT: lbu ra, 18(a0) -; RV32I-NEXT: lbu a3, 19(a0) -; RV32I-NEXT: lbu t5, 20(a0) -; RV32I-NEXT: lbu t6, 21(a0) -; RV32I-NEXT: lbu a7, 22(a0) -; RV32I-NEXT: lbu t0, 23(a0) +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t3, 8(a0) +; RV32I-NEXT: lbu t4, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) +; RV32I-NEXT: lbu s0, 12(a0) +; RV32I-NEXT: lbu s1, 13(a0) +; RV32I-NEXT: lbu s2, 14(a0) +; RV32I-NEXT: lbu s3, 15(a0) +; RV32I-NEXT: lbu s4, 16(a0) +; RV32I-NEXT: lbu s5, 17(a0) +; RV32I-NEXT: lbu s6, 18(a0) +; RV32I-NEXT: lbu s7, 19(a0) ; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: slli t3, t3, 8 -; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli s0, s0, 24 -; RV32I-NEXT: or a4, a4, s1 -; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or a3, a4, a3 ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t3, t1 -; RV32I-NEXT: or a6, s0, t4 -; RV32I-NEXT: lbu t1, 24(a0) +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: lbu s8, 20(a0) +; RV32I-NEXT: lbu s9, 21(a0) +; RV32I-NEXT: lbu s10, 22(a0) +; RV32I-NEXT: lbu s11, 23(a0) +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: slli s1, s1, 8 +; RV32I-NEXT: slli s2, s2, 16 +; RV32I-NEXT: slli s3, s3, 24 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, t6, t5 +; RV32I-NEXT: or t1, s1, s0 +; RV32I-NEXT: or t2, s3, s2 +; RV32I-NEXT: lbu t6, 24(a0) ; RV32I-NEXT: lbu s0, 25(a0) ; RV32I-NEXT: lbu s1, 26(a0) ; RV32I-NEXT: lbu s2, 27(a0) -; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: slli s5, s5, 8 ; RV32I-NEXT: slli s6, s6, 16 -; RV32I-NEXT: slli s8, s8, 24 -; RV32I-NEXT: slli s10, s10, 8 -; RV32I-NEXT: or t2, s3, t2 -; RV32I-NEXT: or t3, s8, s6 -; RV32I-NEXT: or t4, s10, s9 -; RV32I-NEXT: lbu s3, 28(a0) -; RV32I-NEXT: lbu s6, 29(a0) -; RV32I-NEXT: lbu s8, 30(a0) -; RV32I-NEXT: lbu s9, 31(a0) -; RV32I-NEXT: slli s4, s4, 16 ; RV32I-NEXT: slli s7, s7, 24 -; RV32I-NEXT: slli s11, s11, 8 -; RV32I-NEXT: slli ra, ra, 16 -; RV32I-NEXT: slli a3, a3, 24 -; RV32I-NEXT: or a0, s7, s4 -; RV32I-NEXT: or s4, s11, s5 -; RV32I-NEXT: or s5, a3, ra -; RV32I-NEXT: lbu a3, 0(a1) -; RV32I-NEXT: lbu s7, 1(a1) -; RV32I-NEXT: lbu s10, 2(a1) +; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: or t3, s5, s4 +; RV32I-NEXT: or t4, s7, s6 +; RV32I-NEXT: or t5, s9, s8 +; RV32I-NEXT: lbu s3, 28(a0) +; RV32I-NEXT: lbu s4, 29(a0) +; RV32I-NEXT: lbu s5, 30(a0) +; RV32I-NEXT: lbu s6, 31(a0) +; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: slli s11, s11, 24 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: slli s1, s1, 16 +; RV32I-NEXT: slli s2, s2, 24 +; RV32I-NEXT: or a0, s11, s10 +; RV32I-NEXT: or t6, s0, t6 +; RV32I-NEXT: or s0, s2, s1 +; RV32I-NEXT: lbu s1, 0(a1) +; RV32I-NEXT: lbu s2, 1(a1) +; RV32I-NEXT: lbu s7, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: sw zero, 48(sp) +; RV32I-NEXT: sw zero, 52(sp) ; RV32I-NEXT: sw zero, 56(sp) ; RV32I-NEXT: sw zero, 60(sp) -; RV32I-NEXT: sw zero, 
64(sp) -; RV32I-NEXT: sw zero, 68(sp) +; RV32I-NEXT: sw zero, 32(sp) +; RV32I-NEXT: sw zero, 36(sp) ; RV32I-NEXT: sw zero, 40(sp) ; RV32I-NEXT: sw zero, 44(sp) -; RV32I-NEXT: sw zero, 48(sp) -; RV32I-NEXT: sw zero, 52(sp) -; RV32I-NEXT: slli t6, t6, 8 -; RV32I-NEXT: or t5, t6, t5 -; RV32I-NEXT: addi t6, sp, 8 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli s2, s2, 24 -; RV32I-NEXT: slli s6, s6, 8 -; RV32I-NEXT: slli s8, s8, 16 -; RV32I-NEXT: slli s9, s9, 24 -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: or s3, s4, s3 +; RV32I-NEXT: mv s4, sp +; RV32I-NEXT: slli s5, s5, 16 +; RV32I-NEXT: slli s6, s6, 24 +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s7, s7, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: or t0, s0, t1 -; RV32I-NEXT: or t1, s2, s1 -; RV32I-NEXT: or s0, s6, s3 -; RV32I-NEXT: or s1, s9, s8 -; RV32I-NEXT: or a3, s7, a3 -; RV32I-NEXT: or a1, a1, s10 -; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: or a4, a4, s2 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: or a6, t3, t2 -; RV32I-NEXT: or a0, a0, t4 -; RV32I-NEXT: or t2, s5, s4 -; RV32I-NEXT: or a7, a7, t5 -; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: or s0, s1, s0 -; RV32I-NEXT: or a1, a1, a3 -; RV32I-NEXT: sw t2, 24(sp) -; RV32I-NEXT: sw a7, 28(sp) -; RV32I-NEXT: sw t0, 32(sp) -; RV32I-NEXT: sw s0, 36(sp) -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) -; RV32I-NEXT: sw a6, 16(sp) +; RV32I-NEXT: or s5, s6, s5 +; RV32I-NEXT: or s1, s2, s1 +; RV32I-NEXT: or a1, a1, s7 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or a0, a0, t5 +; RV32I-NEXT: or t0, s0, t6 +; RV32I-NEXT: or t1, s5, s3 +; RV32I-NEXT: or a1, a1, s1 +; RV32I-NEXT: sw a7, 16(sp) ; RV32I-NEXT: sw a0, 20(sp) +; RV32I-NEXT: sw t0, 24(sp) +; RV32I-NEXT: sw t1, 28(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a6, 12(sp) ; RV32I-NEXT: slli t1, a1, 3 ; RV32I-NEXT: andi a1, a1, 28 -; RV32I-NEXT: add a1, t6, a1 +; RV32I-NEXT: add a1, s4, a1 ; RV32I-NEXT: andi a0, t1, 24 -; RV32I-NEXT: xori t0, a0, 31 +; RV32I-NEXT: xori a7, a0, 31 ; RV32I-NEXT: lw a3, 0(a1) ; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: lw a5, 8(a1) ; RV32I-NEXT: lw a6, 12(a1) -; RV32I-NEXT: lw a7, 16(a1) +; RV32I-NEXT: lw t0, 16(a1) ; RV32I-NEXT: lw t2, 20(a1) ; RV32I-NEXT: lw t3, 24(a1) ; RV32I-NEXT: lw t4, 28(a1) @@ -2344,33 +2341,33 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srl a1, a3, t1 ; RV32I-NEXT: slli t6, a4, 1 ; RV32I-NEXT: srl a3, a6, t1 -; RV32I-NEXT: slli s0, a7, 1 +; RV32I-NEXT: slli s0, t0, 1 ; RV32I-NEXT: srl a4, a5, t1 ; RV32I-NEXT: slli s1, a6, 1 ; RV32I-NEXT: srl a5, t2, t1 ; RV32I-NEXT: slli s2, t3, 1 -; RV32I-NEXT: srl a6, a7, t1 +; RV32I-NEXT: srl a6, t0, t1 ; RV32I-NEXT: slli t2, t2, 1 -; RV32I-NEXT: srl a7, t3, t1 +; RV32I-NEXT: srl t0, t3, t1 ; RV32I-NEXT: slli t3, t4, 1 ; RV32I-NEXT: srl t1, t4, t1 -; RV32I-NEXT: sll t4, t5, t0 -; RV32I-NEXT: sll t5, t6, t0 -; RV32I-NEXT: sll t6, s0, t0 -; RV32I-NEXT: sll s0, s1, t0 -; RV32I-NEXT: sll s1, s2, t0 -; RV32I-NEXT: sll t2, t2, t0 -; RV32I-NEXT: sll t3, t3, t0 +; RV32I-NEXT: sll t4, t5, a7 +; RV32I-NEXT: sll t5, t6, a7 +; RV32I-NEXT: sll t6, s0, a7 +; RV32I-NEXT: sll s0, s1, a7 +; RV32I-NEXT: sll s1, s2, a7 +; 
RV32I-NEXT: sll t2, t2, a7 +; RV32I-NEXT: sll t3, t3, a7 ; RV32I-NEXT: srli s2, t1, 24 ; RV32I-NEXT: srli s3, t1, 16 ; RV32I-NEXT: srli s4, t1, 8 -; RV32I-NEXT: or t0, a0, t4 +; RV32I-NEXT: or a7, a0, t4 ; RV32I-NEXT: or t4, a1, t5 ; RV32I-NEXT: or t5, a3, t6 ; RV32I-NEXT: or s0, a4, s0 ; RV32I-NEXT: or s1, a5, s1 ; RV32I-NEXT: or t2, a6, t2 -; RV32I-NEXT: or t3, a7, t3 +; RV32I-NEXT: or t3, t0, t3 ; RV32I-NEXT: sb t1, 28(a2) ; RV32I-NEXT: sb s4, 29(a2) ; RV32I-NEXT: sb s3, 30(a2) @@ -2387,23 +2384,23 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli s6, s0, 24 ; RV32I-NEXT: srli s7, s0, 16 ; RV32I-NEXT: srli s0, s0, 8 -; RV32I-NEXT: srli s8, t5, 24 -; RV32I-NEXT: srli s9, t5, 16 -; RV32I-NEXT: srli t5, t5, 8 -; RV32I-NEXT: srli s10, t4, 24 -; RV32I-NEXT: srli s11, t4, 16 -; RV32I-NEXT: srli t4, t4, 8 -; RV32I-NEXT: sb a7, 24(a2) +; RV32I-NEXT: sb t0, 24(a2) +; RV32I-NEXT: srli t0, t5, 24 ; RV32I-NEXT: sb t3, 25(a2) +; RV32I-NEXT: srli t3, t5, 16 +; RV32I-NEXT: srli t5, t5, 8 ; RV32I-NEXT: sb t6, 26(a2) +; RV32I-NEXT: srli t6, t4, 24 ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli a7, t0, 24 +; RV32I-NEXT: srli t1, t4, 16 +; RV32I-NEXT: srli t4, t4, 8 ; RV32I-NEXT: sb a6, 16(a2) +; RV32I-NEXT: srli a6, a7, 24 ; RV32I-NEXT: sb t2, 17(a2) ; RV32I-NEXT: sb s3, 18(a2) ; RV32I-NEXT: sb s2, 19(a2) -; RV32I-NEXT: srli a6, t0, 16 -; RV32I-NEXT: srli t0, t0, 8 +; RV32I-NEXT: srli t2, a7, 16 +; RV32I-NEXT: srli a7, a7, 8 ; RV32I-NEXT: sb a5, 20(a2) ; RV32I-NEXT: sb s1, 21(a2) ; RV32I-NEXT: sb s5, 22(a2) @@ -2414,30 +2411,29 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb s6, 11(a2) ; RV32I-NEXT: sb a3, 12(a2) ; RV32I-NEXT: sb t5, 13(a2) -; RV32I-NEXT: sb s9, 14(a2) -; RV32I-NEXT: sb s8, 15(a2) +; RV32I-NEXT: sb t3, 14(a2) +; RV32I-NEXT: sb t0, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) ; RV32I-NEXT: sb t4, 1(a2) -; RV32I-NEXT: sb s11, 2(a2) -; RV32I-NEXT: sb s10, 3(a2) +; RV32I-NEXT: sb t1, 2(a2) +; RV32I-NEXT: sb t6, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: sb t0, 5(a2) -; RV32I-NEXT: sb a6, 6(a2) -; RV32I-NEXT: sb a7, 7(a2) -; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 128 +; RV32I-NEXT: sb a7, 5(a2) +; RV32I-NEXT: sb t2, 6(a2) +; RV32I-NEXT: sb a6, 7(a2) +; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 
64(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 112 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %byteOff = load i256, ptr %byteOff.ptr, align 1 @@ -2682,129 +2678,128 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; ; RV32I-LABEL: lshr_32bytes_wordOff: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -128 -; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a7, 0(a0) -; RV32I-NEXT: lbu t0, 1(a0) -; RV32I-NEXT: lbu t1, 2(a0) -; RV32I-NEXT: lbu s1, 3(a0) -; RV32I-NEXT: lbu s7, 4(a0) -; RV32I-NEXT: lbu s8, 5(a0) -; RV32I-NEXT: lbu s4, 6(a0) -; RV32I-NEXT: lbu s6, 7(a0) -; RV32I-NEXT: lbu s5, 8(a0) -; RV32I-NEXT: lbu s10, 9(a0) -; RV32I-NEXT: lbu s11, 10(a0) -; RV32I-NEXT: lbu ra, 11(a0) -; RV32I-NEXT: lbu t4, 12(a0) -; RV32I-NEXT: lbu t6, 13(a0) -; RV32I-NEXT: lbu a5, 14(a0) -; RV32I-NEXT: lbu a6, 15(a0) -; RV32I-NEXT: lbu a3, 16(a0) -; RV32I-NEXT: lbu t2, 17(a0) -; RV32I-NEXT: lbu t3, 18(a0) -; RV32I-NEXT: lbu t5, 19(a0) -; RV32I-NEXT: lbu a4, 20(a0) -; RV32I-NEXT: lbu s0, 21(a0) -; RV32I-NEXT: lbu s2, 22(a0) -; RV32I-NEXT: lbu s3, 23(a0) +; RV32I-NEXT: addi sp, sp, -112 +; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t3, 8(a0) +; RV32I-NEXT: lbu t4, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) +; RV32I-NEXT: lbu s2, 12(a0) +; RV32I-NEXT: lbu s3, 13(a0) +; RV32I-NEXT: lbu s4, 14(a0) +; RV32I-NEXT: lbu s5, 15(a0) +; RV32I-NEXT: lbu s6, 16(a0) +; RV32I-NEXT: lbu s7, 17(a0) +; RV32I-NEXT: lbu s8, 18(a0) +; RV32I-NEXT: lbu s9, 19(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: slli t0, t0, 8 ; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli s1, s1, 24 -; RV32I-NEXT: slli s8, s8, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: or t0, s1, t1 -; RV32I-NEXT: or t1, s8, s7 -; RV32I-NEXT: lbu s1, 24(a0) -; RV32I-NEXT: lbu s7, 25(a0) -; RV32I-NEXT: lbu s8, 26(a0) -; RV32I-NEXT: lbu s9, 27(a0) +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 
+; RV32I-NEXT: lbu s10, 20(a0) +; RV32I-NEXT: lbu s11, 21(a0) +; RV32I-NEXT: lbu s0, 22(a0) +; RV32I-NEXT: lbu s1, 23(a0) +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: slli s3, s3, 8 ; RV32I-NEXT: slli s4, s4, 16 -; RV32I-NEXT: slli s6, s6, 24 -; RV32I-NEXT: slli s10, s10, 8 -; RV32I-NEXT: slli s11, s11, 16 -; RV32I-NEXT: slli ra, ra, 24 -; RV32I-NEXT: or s4, s6, s4 -; RV32I-NEXT: or s5, s10, s5 -; RV32I-NEXT: or s6, ra, s11 -; RV32I-NEXT: lbu s10, 28(a0) -; RV32I-NEXT: lbu s11, 29(a0) -; RV32I-NEXT: lbu ra, 30(a0) +; RV32I-NEXT: slli s5, s5, 24 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, t6, t5 +; RV32I-NEXT: or t1, s3, s2 +; RV32I-NEXT: or t2, s5, s4 +; RV32I-NEXT: lbu t3, 24(a0) +; RV32I-NEXT: lbu s2, 25(a0) +; RV32I-NEXT: lbu s3, 26(a0) +; RV32I-NEXT: lbu s4, 27(a0) +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: or t4, s7, s6 +; RV32I-NEXT: or t5, s9, s8 +; RV32I-NEXT: or t6, s11, s10 +; RV32I-NEXT: lbu s5, 28(a0) +; RV32I-NEXT: lbu s6, 29(a0) +; RV32I-NEXT: lbu s7, 30(a0) ; RV32I-NEXT: lbu a0, 31(a0) ; RV32I-NEXT: lbu a1, 0(a1) +; RV32I-NEXT: sw zero, 48(sp) +; RV32I-NEXT: sw zero, 52(sp) ; RV32I-NEXT: sw zero, 56(sp) ; RV32I-NEXT: sw zero, 60(sp) -; RV32I-NEXT: sw zero, 64(sp) -; RV32I-NEXT: sw zero, 68(sp) +; RV32I-NEXT: sw zero, 32(sp) +; RV32I-NEXT: sw zero, 36(sp) ; RV32I-NEXT: sw zero, 40(sp) ; RV32I-NEXT: sw zero, 44(sp) -; RV32I-NEXT: sw zero, 48(sp) -; RV32I-NEXT: sw zero, 52(sp) -; RV32I-NEXT: slli t6, t6, 8 -; RV32I-NEXT: or t4, t6, t4 -; RV32I-NEXT: addi t6, sp, 8 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: slli t3, t3, 16 -; RV32I-NEXT: slli t5, t5, 24 -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: slli s2, s2, 16 -; RV32I-NEXT: slli s3, s3, 24 -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: slli s8, s8, 16 -; RV32I-NEXT: slli s9, s9, 24 -; RV32I-NEXT: slli s11, s11, 8 -; RV32I-NEXT: slli ra, ra, 16 +; RV32I-NEXT: slli s0, s0, 16 +; RV32I-NEXT: slli s1, s1, 24 +; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: mv s1, sp +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s3, s3, 16 +; RV32I-NEXT: slli s4, s4, 24 +; RV32I-NEXT: slli s6, s6, 8 +; RV32I-NEXT: slli s7, s7, 16 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: slli a1, a1, 2 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: or a3, t2, a3 -; RV32I-NEXT: or a6, t5, t3 -; RV32I-NEXT: or a4, s0, a4 -; RV32I-NEXT: or t2, s3, s2 -; RV32I-NEXT: or t3, s7, s1 -; RV32I-NEXT: or t5, s9, s8 -; RV32I-NEXT: or s0, s11, s10 -; RV32I-NEXT: or a0, a0, ra +; RV32I-NEXT: or t3, s2, t3 +; RV32I-NEXT: or s2, s4, s3 +; RV32I-NEXT: or s3, s6, s5 +; RV32I-NEXT: or a0, a0, s7 ; RV32I-NEXT: andi a1, a1, 28 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: or t0, s4, t1 -; RV32I-NEXT: or t1, s6, s5 -; RV32I-NEXT: or a5, a5, t4 -; RV32I-NEXT: or a3, a6, a3 -; RV32I-NEXT: or a4, t2, a4 -; RV32I-NEXT: or a6, t5, t3 -; RV32I-NEXT: or a0, a0, s0 -; RV32I-NEXT: add t6, t6, a1 -; RV32I-NEXT: sw a3, 24(sp) -; RV32I-NEXT: sw a4, 28(sp) -; RV32I-NEXT: sw a6, 32(sp) -; RV32I-NEXT: sw a0, 36(sp) -; RV32I-NEXT: sw a7, 8(sp) -; RV32I-NEXT: sw t0, 12(sp) -; RV32I-NEXT: sw t1, 16(sp) -; RV32I-NEXT: sw a5, 20(sp) -; RV32I-NEXT: lw a6, 16(t6) -; RV32I-NEXT: lw a5, 20(t6) -; RV32I-NEXT: lw a7, 24(t6) -; RV32I-NEXT: lw a1, 0(t6) -; RV32I-NEXT: lw a0, 4(t6) -; RV32I-NEXT: lw a4, 8(t6) -; RV32I-NEXT: lw a3, 12(t6) -; RV32I-NEXT: lw t0, 28(t6) +; RV32I-NEXT: or a3, a4, a3 
+; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, t5, t4 +; RV32I-NEXT: or t0, s0, t6 +; RV32I-NEXT: or t1, s2, t3 +; RV32I-NEXT: or a0, a0, s3 +; RV32I-NEXT: add s1, s1, a1 +; RV32I-NEXT: sw a7, 16(sp) +; RV32I-NEXT: sw t0, 20(sp) +; RV32I-NEXT: sw t1, 24(sp) +; RV32I-NEXT: sw a0, 28(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a6, 12(sp) +; RV32I-NEXT: lw a6, 16(s1) +; RV32I-NEXT: lw a5, 20(s1) +; RV32I-NEXT: lw a7, 24(s1) +; RV32I-NEXT: lw a1, 0(s1) +; RV32I-NEXT: lw a0, 4(s1) +; RV32I-NEXT: lw a4, 8(s1) +; RV32I-NEXT: lw a3, 12(s1) +; RV32I-NEXT: lw t0, 28(s1) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 ; RV32I-NEXT: srli t3, a7, 8 @@ -2819,21 +2814,21 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: srli s5, a5, 8 ; RV32I-NEXT: srli s6, a4, 24 ; RV32I-NEXT: srli s7, a4, 16 -; RV32I-NEXT: srli s8, a4, 8 -; RV32I-NEXT: srli s9, a3, 24 -; RV32I-NEXT: srli s10, a3, 16 -; RV32I-NEXT: srli s11, a3, 8 -; RV32I-NEXT: srli ra, a1, 24 ; RV32I-NEXT: sb a7, 24(a2) +; RV32I-NEXT: srli a7, a4, 8 ; RV32I-NEXT: sb t3, 25(a2) +; RV32I-NEXT: srli t3, a3, 24 ; RV32I-NEXT: sb t2, 26(a2) +; RV32I-NEXT: srli t2, a3, 16 ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli a7, a1, 16 +; RV32I-NEXT: srli t1, a3, 8 ; RV32I-NEXT: sb t0, 28(a2) +; RV32I-NEXT: srli t0, a1, 24 ; RV32I-NEXT: sb t6, 29(a2) +; RV32I-NEXT: srli t6, a1, 16 ; RV32I-NEXT: sb t5, 30(a2) ; RV32I-NEXT: sb t4, 31(a2) -; RV32I-NEXT: srli t0, a1, 8 +; RV32I-NEXT: srli t4, a1, 8 ; RV32I-NEXT: sb a6, 16(a2) ; RV32I-NEXT: sb s2, 17(a2) ; RV32I-NEXT: sb s1, 18(a2) @@ -2845,36 +2840,35 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: sb s3, 23(a2) ; RV32I-NEXT: srli a5, a0, 16 ; RV32I-NEXT: sb a4, 8(a2) -; RV32I-NEXT: sb s8, 9(a2) +; RV32I-NEXT: sb a7, 9(a2) ; RV32I-NEXT: sb s7, 10(a2) ; RV32I-NEXT: sb s6, 11(a2) ; RV32I-NEXT: srli a4, a0, 8 ; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb s11, 13(a2) -; RV32I-NEXT: sb s10, 14(a2) -; RV32I-NEXT: sb s9, 15(a2) +; RV32I-NEXT: sb t1, 13(a2) +; RV32I-NEXT: sb t2, 14(a2) +; RV32I-NEXT: sb t3, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb t0, 1(a2) -; RV32I-NEXT: sb a7, 2(a2) -; RV32I-NEXT: sb ra, 3(a2) +; RV32I-NEXT: sb t4, 1(a2) +; RV32I-NEXT: sb t6, 2(a2) +; RV32I-NEXT: sb t0, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) ; RV32I-NEXT: sb a5, 6(a2) ; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 128 +; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 
88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 112 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %wordOff = load i256, ptr %wordOff.ptr, align 1 @@ -2900,111 +2894,111 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a5, 0(a0) -; RV64I-NEXT: lbu a7, 1(a0) -; RV64I-NEXT: lbu t2, 2(a0) -; RV64I-NEXT: lbu s3, 3(a0) -; RV64I-NEXT: lbu t0, 4(a0) -; RV64I-NEXT: lbu s8, 5(a0) -; RV64I-NEXT: lbu s9, 6(a0) -; RV64I-NEXT: lbu s10, 7(a0) -; RV64I-NEXT: lbu s2, 8(a0) -; RV64I-NEXT: lbu s4, 9(a0) -; RV64I-NEXT: lbu s5, 10(a0) -; RV64I-NEXT: lbu s6, 11(a0) -; RV64I-NEXT: lbu s7, 12(a0) -; RV64I-NEXT: lbu s11, 13(a0) -; RV64I-NEXT: lbu t1, 14(a0) -; RV64I-NEXT: lbu t3, 15(a0) -; RV64I-NEXT: lbu a3, 16(a0) -; RV64I-NEXT: lbu a6, 17(a0) -; RV64I-NEXT: lbu t4, 18(a0) -; RV64I-NEXT: lbu t5, 19(a0) -; RV64I-NEXT: lbu a4, 20(a0) -; RV64I-NEXT: lbu t6, 21(a0) -; RV64I-NEXT: lbu s0, 22(a0) -; RV64I-NEXT: lbu s1, 23(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: lbu s0, 12(a0) +; RV64I-NEXT: lbu s1, 13(a0) +; RV64I-NEXT: lbu s2, 14(a0) +; RV64I-NEXT: lbu s3, 15(a0) +; RV64I-NEXT: lbu s4, 16(a0) +; RV64I-NEXT: lbu s5, 17(a0) +; RV64I-NEXT: lbu s6, 18(a0) +; RV64I-NEXT: lbu s7, 19(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: lbu s8, 20(a0) +; RV64I-NEXT: lbu s9, 21(a0) +; RV64I-NEXT: lbu s10, 22(a0) +; RV64I-NEXT: lbu s11, 23(a0) +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: slli t6, t6, 24 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: slli s2, s2, 16 ; RV64I-NEXT: slli s3, s3, 24 -; RV64I-NEXT: slli s8, s8, 8 -; RV64I-NEXT: slli s9, s9, 16 -; RV64I-NEXT: slli s10, s10, 24 -; RV64I-NEXT: or a5, a7, a5 -; RV64I-NEXT: or a7, s3, t2 -; RV64I-NEXT: or t0, s8, t0 -; RV64I-NEXT: or t2, s10, s9 -; RV64I-NEXT: lbu s3, 24(a0) -; RV64I-NEXT: lbu s8, 25(a0) -; RV64I-NEXT: lbu s9, 26(a0) -; RV64I-NEXT: lbu s10, 27(a0) -; RV64I-NEXT: slli s4, s4, 8 -; RV64I-NEXT: slli s5, s5, 16 -; RV64I-NEXT: slli s6, s6, 24 -; RV64I-NEXT: slli s11, s11, 8 -; RV64I-NEXT: or s2, s4, s2 -; RV64I-NEXT: or s4, s6, s5 -; RV64I-NEXT: or s5, s11, s7 -; RV64I-NEXT: lbu s6, 28(a0) -; RV64I-NEXT: lbu s7, 29(a0) -; RV64I-NEXT: lbu s11, 30(a0) +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: or t1, s1, s0 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: lbu t3, 24(a0) +; RV64I-NEXT: lbu t4, 25(a0) +; RV64I-NEXT: lbu t5, 26(a0) +; 
RV64I-NEXT: lbu t6, 27(a0) +; RV64I-NEXT: slli s5, s5, 8 +; RV64I-NEXT: slli s6, s6, 16 +; RV64I-NEXT: slli s7, s7, 24 +; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: or s0, s5, s4 +; RV64I-NEXT: or s1, s7, s6 +; RV64I-NEXT: or s2, s9, s8 +; RV64I-NEXT: lbu s3, 28(a0) +; RV64I-NEXT: lbu s4, 29(a0) +; RV64I-NEXT: lbu s5, 30(a0) ; RV64I-NEXT: lbu a0, 31(a0) ; RV64I-NEXT: lbu a1, 0(a1) ; RV64I-NEXT: sd zero, 32(sp) ; RV64I-NEXT: sd zero, 40(sp) ; RV64I-NEXT: sd zero, 48(sp) ; RV64I-NEXT: sd zero, 56(sp) -; RV64I-NEXT: slli t1, t1, 16 -; RV64I-NEXT: slli t3, t3, 24 -; RV64I-NEXT: or t1, t3, t1 -; RV64I-NEXT: mv t3, sp -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: slli t4, t4, 16 -; RV64I-NEXT: slli t5, t5, 24 -; RV64I-NEXT: slli t6, t6, 8 -; RV64I-NEXT: slli s0, s0, 16 -; RV64I-NEXT: slli s1, s1, 24 -; RV64I-NEXT: slli s8, s8, 8 -; RV64I-NEXT: slli s9, s9, 16 -; RV64I-NEXT: slli s10, s10, 24 -; RV64I-NEXT: slli s7, s7, 8 -; RV64I-NEXT: slli s11, s11, 16 +; RV64I-NEXT: slli s10, s10, 16 +; RV64I-NEXT: slli s11, s11, 24 +; RV64I-NEXT: or s6, s11, s10 +; RV64I-NEXT: mv s7, sp +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: slli t6, t6, 24 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: slli s5, s5, 16 ; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: slli a1, a1, 3 -; RV64I-NEXT: or a3, a6, a3 -; RV64I-NEXT: or a6, t5, t4 -; RV64I-NEXT: or a4, t6, a4 -; RV64I-NEXT: or s0, s1, s0 -; RV64I-NEXT: or t4, s8, s3 -; RV64I-NEXT: or t5, s10, s9 -; RV64I-NEXT: or t6, s7, s6 -; RV64I-NEXT: or a0, a0, s11 +; RV64I-NEXT: or t3, t4, t3 +; RV64I-NEXT: or t4, t6, t5 +; RV64I-NEXT: or t5, s4, s3 +; RV64I-NEXT: or a0, a0, s5 ; RV64I-NEXT: andi a1, a1, 24 -; RV64I-NEXT: or a5, a7, a5 -; RV64I-NEXT: or a7, t2, t0 -; RV64I-NEXT: or t0, s4, s2 -; RV64I-NEXT: or t1, t1, s5 -; RV64I-NEXT: or a3, a6, a3 -; RV64I-NEXT: or a4, s0, a4 -; RV64I-NEXT: or a6, t5, t4 -; RV64I-NEXT: or a0, a0, t6 -; RV64I-NEXT: add t3, t3, a1 -; RV64I-NEXT: slli a7, a7, 32 -; RV64I-NEXT: slli t1, t1, 32 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: or s0, s1, s0 +; RV64I-NEXT: or a7, s6, s2 +; RV64I-NEXT: or t0, t4, t3 +; RV64I-NEXT: or a0, a0, t5 +; RV64I-NEXT: add s7, s7, a1 ; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: slli a7, a7, 32 ; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: or a1, a7, a5 -; RV64I-NEXT: or a5, t1, t0 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: or a0, a0, a6 -; RV64I-NEXT: sd a1, 0(sp) -; RV64I-NEXT: sd a5, 8(sp) -; RV64I-NEXT: sd a3, 16(sp) +; RV64I-NEXT: or a1, a6, a5 +; RV64I-NEXT: or a4, a7, s0 +; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a1, 8(sp) +; RV64I-NEXT: sd a4, 16(sp) ; RV64I-NEXT: sd a0, 24(sp) -; RV64I-NEXT: ld a4, 16(t3) -; RV64I-NEXT: ld a0, 8(t3) -; RV64I-NEXT: ld a1, 0(t3) -; RV64I-NEXT: ld a3, 24(t3) +; RV64I-NEXT: ld a4, 16(s7) +; RV64I-NEXT: ld a0, 8(s7) +; RV64I-NEXT: ld a1, 0(s7) +; RV64I-NEXT: ld a3, 24(s7) ; RV64I-NEXT: srli a5, a4, 56 ; RV64I-NEXT: srli a6, a4, 48 ; RV64I-NEXT: srli a7, a4, 40 @@ -3023,25 +3017,25 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV64I-NEXT: srli s5, a1, 48 ; RV64I-NEXT: srli s6, a1, 40 ; RV64I-NEXT: srli s7, a1, 32 -; RV64I-NEXT: srli s8, a1, 24 -; RV64I-NEXT: srli s9, a1, 16 -; RV64I-NEXT: srli s10, a1, 8 -; RV64I-NEXT: srli s11, a0, 56 ; RV64I-NEXT: sb t0, 20(a2) +; RV64I-NEXT: srli t0, a1, 24 ; RV64I-NEXT: sb a7, 21(a2) +; RV64I-NEXT: srli a7, a1, 16 ; RV64I-NEXT: sb 
a6, 22(a2) +; RV64I-NEXT: srli a6, a1, 8 ; RV64I-NEXT: sb a5, 23(a2) -; RV64I-NEXT: srli a5, a0, 48 +; RV64I-NEXT: srli a5, a0, 56 ; RV64I-NEXT: sb a4, 16(a2) +; RV64I-NEXT: srli a4, a0, 48 ; RV64I-NEXT: sb t3, 17(a2) ; RV64I-NEXT: sb t2, 18(a2) ; RV64I-NEXT: sb t1, 19(a2) -; RV64I-NEXT: srli a4, a0, 40 +; RV64I-NEXT: srli t1, a0, 40 ; RV64I-NEXT: sb s0, 28(a2) ; RV64I-NEXT: sb t6, 29(a2) ; RV64I-NEXT: sb t5, 30(a2) ; RV64I-NEXT: sb t4, 31(a2) -; RV64I-NEXT: srli a6, a0, 32 +; RV64I-NEXT: srli t2, a0, 32 ; RV64I-NEXT: sb a3, 24(a2) ; RV64I-NEXT: sb s3, 25(a2) ; RV64I-NEXT: sb s2, 26(a2) @@ -3051,19 +3045,19 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV64I-NEXT: sb s6, 5(a2) ; RV64I-NEXT: sb s5, 6(a2) ; RV64I-NEXT: sb s4, 7(a2) -; RV64I-NEXT: srli a7, a0, 16 +; RV64I-NEXT: srli t3, a0, 16 ; RV64I-NEXT: sb a1, 0(a2) -; RV64I-NEXT: sb s10, 1(a2) -; RV64I-NEXT: sb s9, 2(a2) -; RV64I-NEXT: sb s8, 3(a2) +; RV64I-NEXT: sb a6, 1(a2) +; RV64I-NEXT: sb a7, 2(a2) +; RV64I-NEXT: sb t0, 3(a2) ; RV64I-NEXT: srli a1, a0, 8 -; RV64I-NEXT: sb a6, 12(a2) -; RV64I-NEXT: sb a4, 13(a2) -; RV64I-NEXT: sb a5, 14(a2) -; RV64I-NEXT: sb s11, 15(a2) +; RV64I-NEXT: sb t2, 12(a2) +; RV64I-NEXT: sb t1, 13(a2) +; RV64I-NEXT: sb a4, 14(a2) +; RV64I-NEXT: sb a5, 15(a2) ; RV64I-NEXT: sb a0, 8(a2) ; RV64I-NEXT: sb a1, 9(a2) -; RV64I-NEXT: sb a7, 10(a2) +; RV64I-NEXT: sb t3, 10(a2) ; RV64I-NEXT: sb a3, 11(a2) ; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload @@ -3082,129 +3076,128 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; ; RV32I-LABEL: lshr_32bytes_dwordOff: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -128 -; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a7, 0(a0) -; RV32I-NEXT: lbu t0, 1(a0) -; RV32I-NEXT: lbu t1, 2(a0) -; RV32I-NEXT: lbu s1, 3(a0) -; RV32I-NEXT: lbu s7, 4(a0) -; RV32I-NEXT: lbu s8, 5(a0) -; RV32I-NEXT: lbu s4, 6(a0) -; RV32I-NEXT: lbu s6, 7(a0) -; RV32I-NEXT: lbu s5, 8(a0) -; RV32I-NEXT: lbu s10, 9(a0) -; RV32I-NEXT: lbu s11, 10(a0) -; RV32I-NEXT: lbu ra, 11(a0) -; RV32I-NEXT: lbu t4, 12(a0) -; RV32I-NEXT: lbu t6, 13(a0) -; RV32I-NEXT: lbu a5, 14(a0) -; RV32I-NEXT: lbu a6, 15(a0) -; RV32I-NEXT: lbu a3, 16(a0) -; RV32I-NEXT: lbu t2, 17(a0) -; RV32I-NEXT: lbu t3, 18(a0) -; RV32I-NEXT: lbu t5, 19(a0) -; RV32I-NEXT: lbu a4, 20(a0) -; RV32I-NEXT: lbu s0, 21(a0) -; RV32I-NEXT: lbu s2, 22(a0) -; RV32I-NEXT: lbu s3, 23(a0) +; RV32I-NEXT: addi sp, sp, -112 +; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 80(sp) # 
4-byte Folded Spill +; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t3, 8(a0) +; RV32I-NEXT: lbu t4, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) +; RV32I-NEXT: lbu s2, 12(a0) +; RV32I-NEXT: lbu s3, 13(a0) +; RV32I-NEXT: lbu s4, 14(a0) +; RV32I-NEXT: lbu s5, 15(a0) +; RV32I-NEXT: lbu s6, 16(a0) +; RV32I-NEXT: lbu s7, 17(a0) +; RV32I-NEXT: lbu s8, 18(a0) +; RV32I-NEXT: lbu s9, 19(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: slli t0, t0, 8 ; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli s1, s1, 24 -; RV32I-NEXT: slli s8, s8, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: or t0, s1, t1 -; RV32I-NEXT: or t1, s8, s7 -; RV32I-NEXT: lbu s1, 24(a0) -; RV32I-NEXT: lbu s7, 25(a0) -; RV32I-NEXT: lbu s8, 26(a0) -; RV32I-NEXT: lbu s9, 27(a0) +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: lbu s10, 20(a0) +; RV32I-NEXT: lbu s11, 21(a0) +; RV32I-NEXT: lbu s0, 22(a0) +; RV32I-NEXT: lbu s1, 23(a0) +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: slli s3, s3, 8 ; RV32I-NEXT: slli s4, s4, 16 -; RV32I-NEXT: slli s6, s6, 24 -; RV32I-NEXT: slli s10, s10, 8 -; RV32I-NEXT: slli s11, s11, 16 -; RV32I-NEXT: slli ra, ra, 24 -; RV32I-NEXT: or s4, s6, s4 -; RV32I-NEXT: or s5, s10, s5 -; RV32I-NEXT: or s6, ra, s11 -; RV32I-NEXT: lbu s10, 28(a0) -; RV32I-NEXT: lbu s11, 29(a0) -; RV32I-NEXT: lbu ra, 30(a0) +; RV32I-NEXT: slli s5, s5, 24 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, t6, t5 +; RV32I-NEXT: or t1, s3, s2 +; RV32I-NEXT: or t2, s5, s4 +; RV32I-NEXT: lbu t3, 24(a0) +; RV32I-NEXT: lbu s2, 25(a0) +; RV32I-NEXT: lbu s3, 26(a0) +; RV32I-NEXT: lbu s4, 27(a0) +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: or t4, s7, s6 +; RV32I-NEXT: or t5, s9, s8 +; RV32I-NEXT: or t6, s11, s10 +; RV32I-NEXT: lbu s5, 28(a0) +; RV32I-NEXT: lbu s6, 29(a0) +; RV32I-NEXT: lbu s7, 30(a0) ; RV32I-NEXT: lbu a0, 31(a0) ; RV32I-NEXT: lbu a1, 0(a1) +; RV32I-NEXT: sw zero, 48(sp) +; RV32I-NEXT: sw zero, 52(sp) ; RV32I-NEXT: sw zero, 56(sp) ; RV32I-NEXT: sw zero, 60(sp) -; RV32I-NEXT: sw zero, 64(sp) -; RV32I-NEXT: sw zero, 68(sp) +; RV32I-NEXT: sw zero, 32(sp) +; RV32I-NEXT: sw zero, 36(sp) ; RV32I-NEXT: sw zero, 40(sp) ; RV32I-NEXT: sw zero, 44(sp) -; RV32I-NEXT: sw zero, 48(sp) -; RV32I-NEXT: sw zero, 52(sp) -; RV32I-NEXT: slli t6, t6, 8 -; RV32I-NEXT: or t4, t6, t4 -; RV32I-NEXT: addi t6, sp, 8 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: slli t3, t3, 16 -; RV32I-NEXT: slli t5, t5, 24 -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: slli s2, s2, 16 -; RV32I-NEXT: slli s3, s3, 24 -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: slli s8, s8, 16 -; RV32I-NEXT: slli s9, s9, 24 -; RV32I-NEXT: slli s11, s11, 8 -; RV32I-NEXT: slli ra, ra, 16 +; RV32I-NEXT: slli s0, s0, 16 +; RV32I-NEXT: slli s1, s1, 24 +; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: 
mv s1, sp +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s3, s3, 16 +; RV32I-NEXT: slli s4, s4, 24 +; RV32I-NEXT: slli s6, s6, 8 +; RV32I-NEXT: slli s7, s7, 16 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: slli a1, a1, 3 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: or a3, t2, a3 -; RV32I-NEXT: or a6, t5, t3 -; RV32I-NEXT: or a4, s0, a4 -; RV32I-NEXT: or t2, s3, s2 -; RV32I-NEXT: or t3, s7, s1 -; RV32I-NEXT: or t5, s9, s8 -; RV32I-NEXT: or s0, s11, s10 -; RV32I-NEXT: or a0, a0, ra +; RV32I-NEXT: or t3, s2, t3 +; RV32I-NEXT: or s2, s4, s3 +; RV32I-NEXT: or s3, s6, s5 +; RV32I-NEXT: or a0, a0, s7 ; RV32I-NEXT: andi a1, a1, 24 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: or t0, s4, t1 -; RV32I-NEXT: or t1, s6, s5 -; RV32I-NEXT: or a5, a5, t4 -; RV32I-NEXT: or a3, a6, a3 -; RV32I-NEXT: or a4, t2, a4 -; RV32I-NEXT: or a6, t5, t3 -; RV32I-NEXT: or a0, a0, s0 -; RV32I-NEXT: add t6, t6, a1 -; RV32I-NEXT: sw a3, 24(sp) -; RV32I-NEXT: sw a4, 28(sp) -; RV32I-NEXT: sw a6, 32(sp) -; RV32I-NEXT: sw a0, 36(sp) -; RV32I-NEXT: sw a7, 8(sp) -; RV32I-NEXT: sw t0, 12(sp) -; RV32I-NEXT: sw t1, 16(sp) -; RV32I-NEXT: sw a5, 20(sp) -; RV32I-NEXT: lw a6, 16(t6) -; RV32I-NEXT: lw a5, 20(t6) -; RV32I-NEXT: lw a7, 24(t6) -; RV32I-NEXT: lw a1, 0(t6) -; RV32I-NEXT: lw a0, 4(t6) -; RV32I-NEXT: lw a4, 8(t6) -; RV32I-NEXT: lw a3, 12(t6) -; RV32I-NEXT: lw t0, 28(t6) +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, t5, t4 +; RV32I-NEXT: or t0, s0, t6 +; RV32I-NEXT: or t1, s2, t3 +; RV32I-NEXT: or a0, a0, s3 +; RV32I-NEXT: add s1, s1, a1 +; RV32I-NEXT: sw a7, 16(sp) +; RV32I-NEXT: sw t0, 20(sp) +; RV32I-NEXT: sw t1, 24(sp) +; RV32I-NEXT: sw a0, 28(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a6, 12(sp) +; RV32I-NEXT: lw a6, 16(s1) +; RV32I-NEXT: lw a5, 20(s1) +; RV32I-NEXT: lw a7, 24(s1) +; RV32I-NEXT: lw a1, 0(s1) +; RV32I-NEXT: lw a0, 4(s1) +; RV32I-NEXT: lw a4, 8(s1) +; RV32I-NEXT: lw a3, 12(s1) +; RV32I-NEXT: lw t0, 28(s1) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 ; RV32I-NEXT: srli t3, a7, 8 @@ -3219,21 +3212,21 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: srli s5, a5, 8 ; RV32I-NEXT: srli s6, a4, 24 ; RV32I-NEXT: srli s7, a4, 16 -; RV32I-NEXT: srli s8, a4, 8 -; RV32I-NEXT: srli s9, a3, 24 -; RV32I-NEXT: srli s10, a3, 16 -; RV32I-NEXT: srli s11, a3, 8 -; RV32I-NEXT: srli ra, a1, 24 ; RV32I-NEXT: sb a7, 24(a2) +; RV32I-NEXT: srli a7, a4, 8 ; RV32I-NEXT: sb t3, 25(a2) +; RV32I-NEXT: srli t3, a3, 24 ; RV32I-NEXT: sb t2, 26(a2) +; RV32I-NEXT: srli t2, a3, 16 ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli a7, a1, 16 +; RV32I-NEXT: srli t1, a3, 8 ; RV32I-NEXT: sb t0, 28(a2) +; RV32I-NEXT: srli t0, a1, 24 ; RV32I-NEXT: sb t6, 29(a2) +; RV32I-NEXT: srli t6, a1, 16 ; RV32I-NEXT: sb t5, 30(a2) ; RV32I-NEXT: sb t4, 31(a2) -; RV32I-NEXT: srli t0, a1, 8 +; RV32I-NEXT: srli t4, a1, 8 ; RV32I-NEXT: sb a6, 16(a2) ; RV32I-NEXT: sb s2, 17(a2) ; RV32I-NEXT: sb s1, 18(a2) @@ -3245,36 +3238,35 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: sb s3, 23(a2) ; RV32I-NEXT: srli a5, a0, 16 ; RV32I-NEXT: sb a4, 8(a2) -; RV32I-NEXT: sb s8, 9(a2) +; RV32I-NEXT: sb a7, 9(a2) ; RV32I-NEXT: sb s7, 10(a2) ; RV32I-NEXT: sb s6, 11(a2) ; RV32I-NEXT: srli a4, a0, 8 ; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb s11, 13(a2) -; RV32I-NEXT: sb s10, 14(a2) -; RV32I-NEXT: sb s9, 15(a2) +; 
RV32I-NEXT: sb t1, 13(a2) +; RV32I-NEXT: sb t2, 14(a2) +; RV32I-NEXT: sb t3, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb t0, 1(a2) -; RV32I-NEXT: sb a7, 2(a2) -; RV32I-NEXT: sb ra, 3(a2) +; RV32I-NEXT: sb t4, 1(a2) +; RV32I-NEXT: sb t6, 2(a2) +; RV32I-NEXT: sb t0, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) ; RV32I-NEXT: sb a5, 6(a2) ; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 128 +; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 112 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %dwordOff = load i256, ptr %dwordOff.ptr, align 1 @@ -3518,132 +3510,129 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: shl_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -128 -; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu s1, 0(a0) +; RV32I-NEXT: addi sp, sp, -112 +; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a0) ; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu t1, 4(a0) -; 
RV32I-NEXT: lbu t3, 5(a0) -; RV32I-NEXT: lbu t4, 6(a0) -; RV32I-NEXT: lbu s0, 7(a0) -; RV32I-NEXT: lbu t2, 8(a0) -; RV32I-NEXT: lbu s3, 9(a0) -; RV32I-NEXT: lbu s6, 10(a0) -; RV32I-NEXT: lbu s8, 11(a0) -; RV32I-NEXT: lbu s9, 12(a0) -; RV32I-NEXT: lbu s10, 13(a0) -; RV32I-NEXT: lbu s4, 14(a0) -; RV32I-NEXT: lbu s7, 15(a0) -; RV32I-NEXT: lbu s5, 16(a0) -; RV32I-NEXT: lbu s11, 17(a0) -; RV32I-NEXT: lbu ra, 18(a0) -; RV32I-NEXT: lbu a3, 19(a0) -; RV32I-NEXT: lbu t5, 20(a0) -; RV32I-NEXT: lbu t6, 21(a0) -; RV32I-NEXT: lbu a7, 22(a0) -; RV32I-NEXT: lbu t0, 23(a0) +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t3, 8(a0) +; RV32I-NEXT: lbu t4, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) +; RV32I-NEXT: lbu s0, 12(a0) +; RV32I-NEXT: lbu s1, 13(a0) +; RV32I-NEXT: lbu s2, 14(a0) +; RV32I-NEXT: lbu s3, 15(a0) +; RV32I-NEXT: lbu s4, 16(a0) +; RV32I-NEXT: lbu s5, 17(a0) +; RV32I-NEXT: lbu s6, 18(a0) +; RV32I-NEXT: lbu s7, 19(a0) ; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: slli t3, t3, 8 -; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli s0, s0, 24 -; RV32I-NEXT: or a4, a4, s1 -; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or a3, a4, a3 ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t3, t1 -; RV32I-NEXT: or a6, s0, t4 -; RV32I-NEXT: lbu t1, 24(a0) +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: lbu s8, 20(a0) +; RV32I-NEXT: lbu s9, 21(a0) +; RV32I-NEXT: lbu s10, 22(a0) +; RV32I-NEXT: lbu s11, 23(a0) +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: slli s1, s1, 8 +; RV32I-NEXT: slli s2, s2, 16 +; RV32I-NEXT: slli s3, s3, 24 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, t6, t5 +; RV32I-NEXT: or t1, s1, s0 +; RV32I-NEXT: or t2, s3, s2 +; RV32I-NEXT: lbu t6, 24(a0) ; RV32I-NEXT: lbu s0, 25(a0) ; RV32I-NEXT: lbu s1, 26(a0) ; RV32I-NEXT: lbu s2, 27(a0) -; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: slli s5, s5, 8 ; RV32I-NEXT: slli s6, s6, 16 -; RV32I-NEXT: slli s8, s8, 24 -; RV32I-NEXT: slli s10, s10, 8 -; RV32I-NEXT: or t2, s3, t2 -; RV32I-NEXT: or t3, s8, s6 -; RV32I-NEXT: or t4, s10, s9 -; RV32I-NEXT: lbu s3, 28(a0) -; RV32I-NEXT: lbu s6, 29(a0) -; RV32I-NEXT: lbu s8, 30(a0) -; RV32I-NEXT: lbu s9, 31(a0) -; RV32I-NEXT: slli s4, s4, 16 ; RV32I-NEXT: slli s7, s7, 24 -; RV32I-NEXT: slli s11, s11, 8 -; RV32I-NEXT: slli ra, ra, 16 -; RV32I-NEXT: slli a3, a3, 24 -; RV32I-NEXT: or a0, s7, s4 -; RV32I-NEXT: or s4, s11, s5 -; RV32I-NEXT: or s5, a3, ra -; RV32I-NEXT: lbu a3, 0(a1) -; RV32I-NEXT: lbu s7, 1(a1) -; RV32I-NEXT: lbu s10, 2(a1) +; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: or t3, s5, s4 +; RV32I-NEXT: or t4, s7, s6 +; RV32I-NEXT: or t5, s9, s8 +; RV32I-NEXT: lbu s3, 28(a0) +; RV32I-NEXT: lbu s4, 29(a0) +; RV32I-NEXT: lbu s5, 30(a0) +; RV32I-NEXT: lbu s6, 31(a0) +; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: slli s11, s11, 24 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: slli s1, s1, 16 +; RV32I-NEXT: slli s2, s2, 24 +; RV32I-NEXT: or a0, s11, s10 +; RV32I-NEXT: or t6, s0, t6 +; RV32I-NEXT: or s0, s2, s1 +; RV32I-NEXT: lbu s1, 0(a1) +; RV32I-NEXT: lbu s2, 1(a1) +; RV32I-NEXT: lbu s7, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: sw zero, 16(sp) +; RV32I-NEXT: sw zero, 20(sp) ; RV32I-NEXT: sw zero, 24(sp) ; RV32I-NEXT: sw zero, 28(sp) -; 
RV32I-NEXT: sw zero, 32(sp) -; RV32I-NEXT: sw zero, 36(sp) +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: sw zero, 4(sp) ; RV32I-NEXT: sw zero, 8(sp) ; RV32I-NEXT: sw zero, 12(sp) -; RV32I-NEXT: sw zero, 16(sp) -; RV32I-NEXT: sw zero, 20(sp) -; RV32I-NEXT: slli t6, t6, 8 -; RV32I-NEXT: or t5, t6, t5 -; RV32I-NEXT: addi t6, sp, 40 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli s2, s2, 24 -; RV32I-NEXT: slli s6, s6, 8 -; RV32I-NEXT: slli s8, s8, 16 -; RV32I-NEXT: slli s9, s9, 24 -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: or s3, s4, s3 +; RV32I-NEXT: addi s4, sp, 32 +; RV32I-NEXT: slli s5, s5, 16 +; RV32I-NEXT: slli s6, s6, 24 +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s7, s7, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: or t0, s0, t1 -; RV32I-NEXT: or t1, s2, s1 -; RV32I-NEXT: or s0, s6, s3 -; RV32I-NEXT: or s1, s9, s8 -; RV32I-NEXT: or a3, s7, a3 -; RV32I-NEXT: or a1, a1, s10 -; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: or a4, a4, s2 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: or a6, t3, t2 -; RV32I-NEXT: or a0, a0, t4 -; RV32I-NEXT: or t2, s5, s4 -; RV32I-NEXT: or a7, a7, t5 -; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: or s0, s1, s0 -; RV32I-NEXT: or a1, a1, a3 -; RV32I-NEXT: sw t2, 56(sp) -; RV32I-NEXT: sw a7, 60(sp) -; RV32I-NEXT: sw t0, 64(sp) -; RV32I-NEXT: sw s0, 68(sp) -; RV32I-NEXT: sw a4, 40(sp) -; RV32I-NEXT: sw a5, 44(sp) -; RV32I-NEXT: sw a6, 48(sp) +; RV32I-NEXT: or s5, s6, s5 +; RV32I-NEXT: or s1, s2, s1 +; RV32I-NEXT: or a1, a1, s7 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or a0, a0, t5 +; RV32I-NEXT: or t0, s0, t6 +; RV32I-NEXT: or t1, s5, s3 +; RV32I-NEXT: or a1, a1, s1 +; RV32I-NEXT: sw a7, 48(sp) ; RV32I-NEXT: sw a0, 52(sp) +; RV32I-NEXT: sw t0, 56(sp) +; RV32I-NEXT: sw t1, 60(sp) +; RV32I-NEXT: sw a3, 32(sp) +; RV32I-NEXT: sw a4, 36(sp) +; RV32I-NEXT: sw a5, 40(sp) +; RV32I-NEXT: sw a6, 44(sp) ; RV32I-NEXT: slli a3, a1, 3 ; RV32I-NEXT: andi a1, a1, 28 -; RV32I-NEXT: sub a1, t6, a1 +; RV32I-NEXT: sub a1, s4, a1 ; RV32I-NEXT: andi a0, a3, 24 ; RV32I-NEXT: xori a0, a0, 31 ; RV32I-NEXT: lw a4, 0(a1) @@ -3658,10 +3647,10 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli t4, a4, 1 ; RV32I-NEXT: sll t5, a7, a3 ; RV32I-NEXT: srli t6, a6, 1 -; RV32I-NEXT: sll s0, a6, a3 +; RV32I-NEXT: sll a6, a6, a3 ; RV32I-NEXT: srli a5, a5, 1 -; RV32I-NEXT: sll s1, t1, a3 -; RV32I-NEXT: srli a6, t0, 1 +; RV32I-NEXT: sll s0, t1, a3 +; RV32I-NEXT: srli s1, t0, 1 ; RV32I-NEXT: sll s2, t0, a3 ; RV32I-NEXT: srli a7, a7, 1 ; RV32I-NEXT: sll s3, a1, a3 @@ -3669,56 +3658,56 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sll s4, t2, a3 ; RV32I-NEXT: srli t0, t1, 1 ; RV32I-NEXT: sll s5, a4, a3 -; RV32I-NEXT: srl t2, t4, a0 -; RV32I-NEXT: srl t4, t6, a0 -; RV32I-NEXT: srl t6, a5, a0 -; RV32I-NEXT: srl s6, a6, a0 -; RV32I-NEXT: srl s7, a7, a0 -; RV32I-NEXT: srl s8, a1, a0 -; RV32I-NEXT: srl s9, t0, a0 -; RV32I-NEXT: srli t1, s4, 24 -; RV32I-NEXT: srli a7, s3, 24 +; RV32I-NEXT: srl t4, t4, a0 +; RV32I-NEXT: srl a4, t6, a0 +; RV32I-NEXT: srl t1, a5, a0 +; RV32I-NEXT: srl t6, s1, a0 +; RV32I-NEXT: srl s1, a7, a0 +; RV32I-NEXT: srl s6, a1, a0 +; RV32I-NEXT: srl s7, t0, a0 +; 
RV32I-NEXT: srli t2, s4, 24 +; RV32I-NEXT: srli t0, s3, 24 ; RV32I-NEXT: srli a5, s2, 24 -; RV32I-NEXT: srli a3, s1, 24 -; RV32I-NEXT: srli a1, s0, 24 +; RV32I-NEXT: srli a3, s0, 24 +; RV32I-NEXT: srli a1, a6, 24 ; RV32I-NEXT: srli a0, t5, 24 -; RV32I-NEXT: srli s10, s5, 24 -; RV32I-NEXT: srli s11, s5, 16 -; RV32I-NEXT: srli ra, s5, 8 -; RV32I-NEXT: srli a4, t3, 24 -; RV32I-NEXT: or a6, t3, t2 -; RV32I-NEXT: or t0, t5, t4 -; RV32I-NEXT: or t2, s0, t6 -; RV32I-NEXT: or t3, s1, s6 -; RV32I-NEXT: or t4, s2, s7 -; RV32I-NEXT: or t5, s3, s8 -; RV32I-NEXT: or t6, s4, s9 +; RV32I-NEXT: srli s8, s5, 24 +; RV32I-NEXT: or a4, t5, a4 +; RV32I-NEXT: srli t5, s5, 16 +; RV32I-NEXT: or t1, a6, t1 +; RV32I-NEXT: srli s9, s5, 8 +; RV32I-NEXT: or a7, t3, t4 +; RV32I-NEXT: srli a6, t3, 24 +; RV32I-NEXT: or t3, s0, t6 +; RV32I-NEXT: or t4, s2, s1 +; RV32I-NEXT: or t6, s3, s6 +; RV32I-NEXT: or s0, s4, s7 ; RV32I-NEXT: sb s5, 0(a2) -; RV32I-NEXT: sb ra, 1(a2) -; RV32I-NEXT: sb s11, 2(a2) -; RV32I-NEXT: sb s10, 3(a2) -; RV32I-NEXT: srli s0, t6, 16 -; RV32I-NEXT: srli s1, t6, 8 -; RV32I-NEXT: srli s2, t5, 16 -; RV32I-NEXT: srli s3, t5, 8 +; RV32I-NEXT: sb s9, 1(a2) +; RV32I-NEXT: sb t5, 2(a2) +; RV32I-NEXT: sb s8, 3(a2) +; RV32I-NEXT: srli t5, s0, 16 +; RV32I-NEXT: srli s1, s0, 8 +; RV32I-NEXT: srli s2, t6, 16 +; RV32I-NEXT: srli s3, t6, 8 ; RV32I-NEXT: srli s4, t4, 16 ; RV32I-NEXT: srli s5, t4, 8 ; RV32I-NEXT: srli s6, t3, 16 ; RV32I-NEXT: srli s7, t3, 8 -; RV32I-NEXT: srli s8, t2, 16 -; RV32I-NEXT: srli s9, t2, 8 -; RV32I-NEXT: srli s10, t0, 16 -; RV32I-NEXT: srli s11, t0, 8 -; RV32I-NEXT: sb t6, 24(a2) +; RV32I-NEXT: sb s0, 24(a2) +; RV32I-NEXT: srli s0, t1, 16 ; RV32I-NEXT: sb s1, 25(a2) -; RV32I-NEXT: sb s0, 26(a2) -; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli t1, a6, 16 -; RV32I-NEXT: sb t5, 28(a2) +; RV32I-NEXT: srli s1, t1, 8 +; RV32I-NEXT: sb t5, 26(a2) +; RV32I-NEXT: srli t5, a4, 16 +; RV32I-NEXT: sb t2, 27(a2) +; RV32I-NEXT: srli t2, a4, 8 +; RV32I-NEXT: sb t6, 28(a2) +; RV32I-NEXT: srli t6, a7, 16 ; RV32I-NEXT: sb s3, 29(a2) ; RV32I-NEXT: sb s2, 30(a2) -; RV32I-NEXT: sb a7, 31(a2) -; RV32I-NEXT: srli a7, a6, 8 +; RV32I-NEXT: sb t0, 31(a2) +; RV32I-NEXT: srli t0, a7, 8 ; RV32I-NEXT: sb t4, 16(a2) ; RV32I-NEXT: sb s5, 17(a2) ; RV32I-NEXT: sb s4, 18(a2) @@ -3727,32 +3716,31 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb s7, 21(a2) ; RV32I-NEXT: sb s6, 22(a2) ; RV32I-NEXT: sb a3, 23(a2) -; RV32I-NEXT: sb t2, 8(a2) -; RV32I-NEXT: sb s9, 9(a2) -; RV32I-NEXT: sb s8, 10(a2) +; RV32I-NEXT: sb t1, 8(a2) +; RV32I-NEXT: sb s1, 9(a2) +; RV32I-NEXT: sb s0, 10(a2) ; RV32I-NEXT: sb a1, 11(a2) -; RV32I-NEXT: sb t0, 12(a2) -; RV32I-NEXT: sb s11, 13(a2) -; RV32I-NEXT: sb s10, 14(a2) +; RV32I-NEXT: sb a4, 12(a2) +; RV32I-NEXT: sb t2, 13(a2) +; RV32I-NEXT: sb t5, 14(a2) ; RV32I-NEXT: sb a0, 15(a2) -; RV32I-NEXT: sb a6, 4(a2) -; RV32I-NEXT: sb a7, 5(a2) -; RV32I-NEXT: sb t1, 6(a2) -; RV32I-NEXT: sb a4, 7(a2) -; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded 
Reload -; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 128 +; RV32I-NEXT: sb a7, 4(a2) +; RV32I-NEXT: sb t0, 5(a2) +; RV32I-NEXT: sb t6, 6(a2) +; RV32I-NEXT: sb a6, 7(a2) +; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 112 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %byteOff = load i256, ptr %byteOff.ptr, align 1 @@ -3997,129 +3985,128 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; ; RV32I-LABEL: shl_32bytes_wordOff: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -128 -; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a7, 0(a0) -; RV32I-NEXT: lbu t0, 1(a0) -; RV32I-NEXT: lbu t1, 2(a0) -; RV32I-NEXT: lbu s1, 3(a0) -; RV32I-NEXT: lbu s7, 4(a0) -; RV32I-NEXT: lbu s8, 5(a0) -; RV32I-NEXT: lbu s4, 6(a0) -; RV32I-NEXT: lbu s6, 7(a0) -; RV32I-NEXT: lbu s5, 8(a0) -; RV32I-NEXT: lbu s10, 9(a0) -; RV32I-NEXT: lbu s11, 10(a0) -; RV32I-NEXT: lbu ra, 11(a0) -; RV32I-NEXT: lbu t4, 12(a0) -; RV32I-NEXT: lbu t6, 13(a0) -; RV32I-NEXT: lbu a5, 14(a0) -; RV32I-NEXT: lbu a6, 15(a0) -; RV32I-NEXT: lbu a3, 16(a0) -; RV32I-NEXT: lbu t2, 17(a0) -; RV32I-NEXT: lbu t3, 18(a0) -; RV32I-NEXT: lbu t5, 19(a0) -; RV32I-NEXT: lbu a4, 20(a0) -; RV32I-NEXT: lbu s0, 21(a0) -; RV32I-NEXT: lbu s2, 22(a0) -; RV32I-NEXT: lbu s3, 23(a0) +; RV32I-NEXT: addi sp, sp, -112 +; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t3, 8(a0) +; RV32I-NEXT: lbu t4, 9(a0) +; 
RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) +; RV32I-NEXT: lbu s2, 12(a0) +; RV32I-NEXT: lbu s3, 13(a0) +; RV32I-NEXT: lbu s4, 14(a0) +; RV32I-NEXT: lbu s5, 15(a0) +; RV32I-NEXT: lbu s6, 16(a0) +; RV32I-NEXT: lbu s7, 17(a0) +; RV32I-NEXT: lbu s8, 18(a0) +; RV32I-NEXT: lbu s9, 19(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: slli t0, t0, 8 ; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli s1, s1, 24 -; RV32I-NEXT: slli s8, s8, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: or t0, s1, t1 -; RV32I-NEXT: or t1, s8, s7 -; RV32I-NEXT: lbu s1, 24(a0) -; RV32I-NEXT: lbu s7, 25(a0) -; RV32I-NEXT: lbu s8, 26(a0) -; RV32I-NEXT: lbu s9, 27(a0) +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: lbu s10, 20(a0) +; RV32I-NEXT: lbu s11, 21(a0) +; RV32I-NEXT: lbu s0, 22(a0) +; RV32I-NEXT: lbu s1, 23(a0) +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: slli s3, s3, 8 ; RV32I-NEXT: slli s4, s4, 16 -; RV32I-NEXT: slli s6, s6, 24 -; RV32I-NEXT: slli s10, s10, 8 -; RV32I-NEXT: slli s11, s11, 16 -; RV32I-NEXT: slli ra, ra, 24 -; RV32I-NEXT: or s4, s6, s4 -; RV32I-NEXT: or s5, s10, s5 -; RV32I-NEXT: or s6, ra, s11 -; RV32I-NEXT: lbu s10, 28(a0) -; RV32I-NEXT: lbu s11, 29(a0) -; RV32I-NEXT: lbu ra, 30(a0) +; RV32I-NEXT: slli s5, s5, 24 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, t6, t5 +; RV32I-NEXT: or t1, s3, s2 +; RV32I-NEXT: or t2, s5, s4 +; RV32I-NEXT: lbu t3, 24(a0) +; RV32I-NEXT: lbu s2, 25(a0) +; RV32I-NEXT: lbu s3, 26(a0) +; RV32I-NEXT: lbu s4, 27(a0) +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: or t4, s7, s6 +; RV32I-NEXT: or t5, s9, s8 +; RV32I-NEXT: or t6, s11, s10 +; RV32I-NEXT: lbu s5, 28(a0) +; RV32I-NEXT: lbu s6, 29(a0) +; RV32I-NEXT: lbu s7, 30(a0) ; RV32I-NEXT: lbu a0, 31(a0) ; RV32I-NEXT: lbu a1, 0(a1) +; RV32I-NEXT: sw zero, 16(sp) +; RV32I-NEXT: sw zero, 20(sp) ; RV32I-NEXT: sw zero, 24(sp) ; RV32I-NEXT: sw zero, 28(sp) -; RV32I-NEXT: sw zero, 32(sp) -; RV32I-NEXT: sw zero, 36(sp) +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: sw zero, 4(sp) ; RV32I-NEXT: sw zero, 8(sp) ; RV32I-NEXT: sw zero, 12(sp) -; RV32I-NEXT: sw zero, 16(sp) -; RV32I-NEXT: sw zero, 20(sp) -; RV32I-NEXT: slli t6, t6, 8 -; RV32I-NEXT: or t4, t6, t4 -; RV32I-NEXT: addi t6, sp, 40 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: slli t3, t3, 16 -; RV32I-NEXT: slli t5, t5, 24 -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: slli s2, s2, 16 -; RV32I-NEXT: slli s3, s3, 24 -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: slli s8, s8, 16 -; RV32I-NEXT: slli s9, s9, 24 -; RV32I-NEXT: slli s11, s11, 8 -; RV32I-NEXT: slli ra, ra, 16 +; RV32I-NEXT: slli s0, s0, 16 +; RV32I-NEXT: slli s1, s1, 24 +; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: addi s1, sp, 32 +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s3, s3, 16 +; RV32I-NEXT: slli s4, s4, 24 +; RV32I-NEXT: slli s6, s6, 8 +; RV32I-NEXT: slli s7, s7, 16 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: slli a1, a1, 2 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: or a3, t2, a3 -; RV32I-NEXT: or a6, t5, t3 -; RV32I-NEXT: or a4, s0, a4 -; RV32I-NEXT: or t2, s3, s2 -; RV32I-NEXT: or t3, s7, s1 -; RV32I-NEXT: or t5, s9, s8 -; RV32I-NEXT: or s0, s11, s10 -; RV32I-NEXT: or a0, a0, ra +; RV32I-NEXT: or t3, s2, t3 
+; RV32I-NEXT: or s2, s4, s3 +; RV32I-NEXT: or s3, s6, s5 +; RV32I-NEXT: or a0, a0, s7 ; RV32I-NEXT: andi a1, a1, 28 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: or t0, s4, t1 -; RV32I-NEXT: or t1, s6, s5 -; RV32I-NEXT: or a5, a5, t4 -; RV32I-NEXT: or a3, a6, a3 -; RV32I-NEXT: or a4, t2, a4 -; RV32I-NEXT: or a6, t5, t3 -; RV32I-NEXT: or a0, a0, s0 -; RV32I-NEXT: sub t2, t6, a1 -; RV32I-NEXT: sw a3, 56(sp) -; RV32I-NEXT: sw a4, 60(sp) -; RV32I-NEXT: sw a6, 64(sp) -; RV32I-NEXT: sw a0, 68(sp) -; RV32I-NEXT: sw a7, 40(sp) -; RV32I-NEXT: sw t0, 44(sp) -; RV32I-NEXT: sw t1, 48(sp) -; RV32I-NEXT: sw a5, 52(sp) -; RV32I-NEXT: lw a6, 16(t2) -; RV32I-NEXT: lw a5, 20(t2) -; RV32I-NEXT: lw a7, 24(t2) -; RV32I-NEXT: lw a1, 0(t2) -; RV32I-NEXT: lw a0, 4(t2) -; RV32I-NEXT: lw a4, 8(t2) -; RV32I-NEXT: lw a3, 12(t2) -; RV32I-NEXT: lw t0, 28(t2) +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, t5, t4 +; RV32I-NEXT: or t0, s0, t6 +; RV32I-NEXT: or t1, s2, t3 +; RV32I-NEXT: or a0, a0, s3 +; RV32I-NEXT: sub s1, s1, a1 +; RV32I-NEXT: sw a7, 48(sp) +; RV32I-NEXT: sw t0, 52(sp) +; RV32I-NEXT: sw t1, 56(sp) +; RV32I-NEXT: sw a0, 60(sp) +; RV32I-NEXT: sw a3, 32(sp) +; RV32I-NEXT: sw a4, 36(sp) +; RV32I-NEXT: sw a5, 40(sp) +; RV32I-NEXT: sw a6, 44(sp) +; RV32I-NEXT: lw a6, 16(s1) +; RV32I-NEXT: lw a5, 20(s1) +; RV32I-NEXT: lw a7, 24(s1) +; RV32I-NEXT: lw a1, 0(s1) +; RV32I-NEXT: lw a0, 4(s1) +; RV32I-NEXT: lw a4, 8(s1) +; RV32I-NEXT: lw a3, 12(s1) +; RV32I-NEXT: lw t0, 28(s1) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 ; RV32I-NEXT: srli t3, a7, 8 @@ -4134,21 +4121,21 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: srli s5, a5, 8 ; RV32I-NEXT: srli s6, a4, 24 ; RV32I-NEXT: srli s7, a4, 16 -; RV32I-NEXT: srli s8, a4, 8 -; RV32I-NEXT: srli s9, a3, 24 -; RV32I-NEXT: srli s10, a3, 16 -; RV32I-NEXT: srli s11, a3, 8 -; RV32I-NEXT: srli ra, a1, 24 ; RV32I-NEXT: sb a7, 24(a2) +; RV32I-NEXT: srli a7, a4, 8 ; RV32I-NEXT: sb t3, 25(a2) +; RV32I-NEXT: srli t3, a3, 24 ; RV32I-NEXT: sb t2, 26(a2) +; RV32I-NEXT: srli t2, a3, 16 ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli a7, a1, 16 +; RV32I-NEXT: srli t1, a3, 8 ; RV32I-NEXT: sb t0, 28(a2) +; RV32I-NEXT: srli t0, a1, 24 ; RV32I-NEXT: sb t6, 29(a2) +; RV32I-NEXT: srli t6, a1, 16 ; RV32I-NEXT: sb t5, 30(a2) ; RV32I-NEXT: sb t4, 31(a2) -; RV32I-NEXT: srli t0, a1, 8 +; RV32I-NEXT: srli t4, a1, 8 ; RV32I-NEXT: sb a6, 16(a2) ; RV32I-NEXT: sb s2, 17(a2) ; RV32I-NEXT: sb s1, 18(a2) @@ -4160,36 +4147,35 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: sb s3, 23(a2) ; RV32I-NEXT: srli a5, a0, 16 ; RV32I-NEXT: sb a4, 8(a2) -; RV32I-NEXT: sb s8, 9(a2) +; RV32I-NEXT: sb a7, 9(a2) ; RV32I-NEXT: sb s7, 10(a2) ; RV32I-NEXT: sb s6, 11(a2) ; RV32I-NEXT: srli a4, a0, 8 ; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb s11, 13(a2) -; RV32I-NEXT: sb s10, 14(a2) -; RV32I-NEXT: sb s9, 15(a2) +; RV32I-NEXT: sb t1, 13(a2) +; RV32I-NEXT: sb t2, 14(a2) +; RV32I-NEXT: sb t3, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb t0, 1(a2) -; RV32I-NEXT: sb a7, 2(a2) -; RV32I-NEXT: sb ra, 3(a2) +; RV32I-NEXT: sb t4, 1(a2) +; RV32I-NEXT: sb t6, 2(a2) +; RV32I-NEXT: sb t0, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) ; RV32I-NEXT: sb a5, 6(a2) ; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 
116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 128 +; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 112 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %wordOff = load i256, ptr %wordOff.ptr, align 1 @@ -4215,111 +4201,111 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a5, 0(a0) -; RV64I-NEXT: lbu a7, 1(a0) -; RV64I-NEXT: lbu t2, 2(a0) -; RV64I-NEXT: lbu s3, 3(a0) -; RV64I-NEXT: lbu t0, 4(a0) -; RV64I-NEXT: lbu s8, 5(a0) -; RV64I-NEXT: lbu s9, 6(a0) -; RV64I-NEXT: lbu s10, 7(a0) -; RV64I-NEXT: lbu s2, 8(a0) -; RV64I-NEXT: lbu s4, 9(a0) -; RV64I-NEXT: lbu s5, 10(a0) -; RV64I-NEXT: lbu s6, 11(a0) -; RV64I-NEXT: lbu s7, 12(a0) -; RV64I-NEXT: lbu s11, 13(a0) -; RV64I-NEXT: lbu t1, 14(a0) -; RV64I-NEXT: lbu t3, 15(a0) -; RV64I-NEXT: lbu a3, 16(a0) -; RV64I-NEXT: lbu a6, 17(a0) -; RV64I-NEXT: lbu t4, 18(a0) -; RV64I-NEXT: lbu t5, 19(a0) -; RV64I-NEXT: lbu a4, 20(a0) -; RV64I-NEXT: lbu t6, 21(a0) -; RV64I-NEXT: lbu s0, 22(a0) -; RV64I-NEXT: lbu s1, 23(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: lbu s0, 12(a0) +; RV64I-NEXT: lbu s1, 13(a0) +; RV64I-NEXT: lbu s2, 14(a0) +; RV64I-NEXT: lbu s3, 15(a0) +; RV64I-NEXT: lbu s4, 16(a0) +; RV64I-NEXT: lbu s5, 17(a0) +; RV64I-NEXT: lbu s6, 18(a0) +; RV64I-NEXT: lbu s7, 19(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: lbu s8, 20(a0) +; RV64I-NEXT: lbu s9, 21(a0) +; RV64I-NEXT: lbu s10, 22(a0) +; RV64I-NEXT: lbu s11, 23(a0) +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: slli t6, t6, 24 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: slli s2, s2, 16 ; 
RV64I-NEXT: slli s3, s3, 24 -; RV64I-NEXT: slli s8, s8, 8 -; RV64I-NEXT: slli s9, s9, 16 -; RV64I-NEXT: slli s10, s10, 24 -; RV64I-NEXT: or a5, a7, a5 -; RV64I-NEXT: or a7, s3, t2 -; RV64I-NEXT: or t0, s8, t0 -; RV64I-NEXT: or t2, s10, s9 -; RV64I-NEXT: lbu s3, 24(a0) -; RV64I-NEXT: lbu s8, 25(a0) -; RV64I-NEXT: lbu s9, 26(a0) -; RV64I-NEXT: lbu s10, 27(a0) -; RV64I-NEXT: slli s4, s4, 8 -; RV64I-NEXT: slli s5, s5, 16 -; RV64I-NEXT: slli s6, s6, 24 -; RV64I-NEXT: slli s11, s11, 8 -; RV64I-NEXT: or s2, s4, s2 -; RV64I-NEXT: or s4, s6, s5 -; RV64I-NEXT: or s5, s11, s7 -; RV64I-NEXT: lbu s6, 28(a0) -; RV64I-NEXT: lbu s7, 29(a0) -; RV64I-NEXT: lbu s11, 30(a0) +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: or t1, s1, s0 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: lbu t3, 24(a0) +; RV64I-NEXT: lbu t4, 25(a0) +; RV64I-NEXT: lbu t5, 26(a0) +; RV64I-NEXT: lbu t6, 27(a0) +; RV64I-NEXT: slli s5, s5, 8 +; RV64I-NEXT: slli s6, s6, 16 +; RV64I-NEXT: slli s7, s7, 24 +; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: or s0, s5, s4 +; RV64I-NEXT: or s1, s7, s6 +; RV64I-NEXT: or s2, s9, s8 +; RV64I-NEXT: lbu s3, 28(a0) +; RV64I-NEXT: lbu s4, 29(a0) +; RV64I-NEXT: lbu s5, 30(a0) ; RV64I-NEXT: lbu a0, 31(a0) ; RV64I-NEXT: lbu a1, 0(a1) ; RV64I-NEXT: sd zero, 0(sp) ; RV64I-NEXT: sd zero, 8(sp) ; RV64I-NEXT: sd zero, 16(sp) ; RV64I-NEXT: sd zero, 24(sp) -; RV64I-NEXT: slli t1, t1, 16 -; RV64I-NEXT: slli t3, t3, 24 -; RV64I-NEXT: or t1, t3, t1 -; RV64I-NEXT: addi t3, sp, 32 -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: slli t4, t4, 16 -; RV64I-NEXT: slli t5, t5, 24 -; RV64I-NEXT: slli t6, t6, 8 -; RV64I-NEXT: slli s0, s0, 16 -; RV64I-NEXT: slli s1, s1, 24 -; RV64I-NEXT: slli s8, s8, 8 -; RV64I-NEXT: slli s9, s9, 16 -; RV64I-NEXT: slli s10, s10, 24 -; RV64I-NEXT: slli s7, s7, 8 -; RV64I-NEXT: slli s11, s11, 16 +; RV64I-NEXT: slli s10, s10, 16 +; RV64I-NEXT: slli s11, s11, 24 +; RV64I-NEXT: or s6, s11, s10 +; RV64I-NEXT: addi s7, sp, 32 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: slli t6, t6, 24 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: slli s5, s5, 16 ; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: slli a1, a1, 3 -; RV64I-NEXT: or a3, a6, a3 -; RV64I-NEXT: or a6, t5, t4 -; RV64I-NEXT: or a4, t6, a4 -; RV64I-NEXT: or s0, s1, s0 -; RV64I-NEXT: or t4, s8, s3 -; RV64I-NEXT: or t5, s10, s9 -; RV64I-NEXT: or t6, s7, s6 -; RV64I-NEXT: or a0, a0, s11 +; RV64I-NEXT: or t3, t4, t3 +; RV64I-NEXT: or t4, t6, t5 +; RV64I-NEXT: or t5, s4, s3 +; RV64I-NEXT: or a0, a0, s5 ; RV64I-NEXT: andi a1, a1, 24 -; RV64I-NEXT: or a5, a7, a5 -; RV64I-NEXT: or a7, t2, t0 -; RV64I-NEXT: or t0, s4, s2 -; RV64I-NEXT: or t1, t1, s5 -; RV64I-NEXT: or a3, a6, a3 -; RV64I-NEXT: or a4, s0, a4 -; RV64I-NEXT: or a6, t5, t4 -; RV64I-NEXT: or a0, a0, t6 -; RV64I-NEXT: sub t2, t3, a1 -; RV64I-NEXT: slli a7, a7, 32 -; RV64I-NEXT: slli t1, t1, 32 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: or s0, s1, s0 +; RV64I-NEXT: or a7, s6, s2 +; RV64I-NEXT: or t0, t4, t3 +; RV64I-NEXT: or a0, a0, t5 +; RV64I-NEXT: sub t1, s7, a1 ; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: slli a7, a7, 32 ; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: or a1, a7, a5 -; RV64I-NEXT: or a5, t1, t0 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: or a0, a0, a6 -; RV64I-NEXT: sd a1, 32(sp) -; RV64I-NEXT: sd a5, 40(sp) -; RV64I-NEXT: sd a3, 48(sp) +; RV64I-NEXT: or a1, a6, a5 +; RV64I-NEXT: or a4, a7, s0 +; RV64I-NEXT: or a0, a0, t0 
+; RV64I-NEXT: sd a3, 32(sp) +; RV64I-NEXT: sd a1, 40(sp) +; RV64I-NEXT: sd a4, 48(sp) ; RV64I-NEXT: sd a0, 56(sp) -; RV64I-NEXT: ld a4, 16(t2) -; RV64I-NEXT: ld a0, 8(t2) -; RV64I-NEXT: ld a1, 0(t2) -; RV64I-NEXT: ld a3, 24(t2) +; RV64I-NEXT: ld a4, 16(t1) +; RV64I-NEXT: ld a0, 8(t1) +; RV64I-NEXT: ld a1, 0(t1) +; RV64I-NEXT: ld a3, 24(t1) ; RV64I-NEXT: srli a5, a4, 56 ; RV64I-NEXT: srli a6, a4, 48 ; RV64I-NEXT: srli a7, a4, 40 @@ -4338,25 +4324,25 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV64I-NEXT: srli s5, a1, 48 ; RV64I-NEXT: srli s6, a1, 40 ; RV64I-NEXT: srli s7, a1, 32 -; RV64I-NEXT: srli s8, a1, 24 -; RV64I-NEXT: srli s9, a1, 16 -; RV64I-NEXT: srli s10, a1, 8 -; RV64I-NEXT: srli s11, a0, 56 ; RV64I-NEXT: sb t0, 20(a2) +; RV64I-NEXT: srli t0, a1, 24 ; RV64I-NEXT: sb a7, 21(a2) +; RV64I-NEXT: srli a7, a1, 16 ; RV64I-NEXT: sb a6, 22(a2) +; RV64I-NEXT: srli a6, a1, 8 ; RV64I-NEXT: sb a5, 23(a2) -; RV64I-NEXT: srli a5, a0, 48 +; RV64I-NEXT: srli a5, a0, 56 ; RV64I-NEXT: sb a4, 16(a2) +; RV64I-NEXT: srli a4, a0, 48 ; RV64I-NEXT: sb t3, 17(a2) ; RV64I-NEXT: sb t2, 18(a2) ; RV64I-NEXT: sb t1, 19(a2) -; RV64I-NEXT: srli a4, a0, 40 +; RV64I-NEXT: srli t1, a0, 40 ; RV64I-NEXT: sb s0, 28(a2) ; RV64I-NEXT: sb t6, 29(a2) ; RV64I-NEXT: sb t5, 30(a2) ; RV64I-NEXT: sb t4, 31(a2) -; RV64I-NEXT: srli a6, a0, 32 +; RV64I-NEXT: srli t2, a0, 32 ; RV64I-NEXT: sb a3, 24(a2) ; RV64I-NEXT: sb s3, 25(a2) ; RV64I-NEXT: sb s2, 26(a2) @@ -4366,19 +4352,19 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV64I-NEXT: sb s6, 5(a2) ; RV64I-NEXT: sb s5, 6(a2) ; RV64I-NEXT: sb s4, 7(a2) -; RV64I-NEXT: srli a7, a0, 16 +; RV64I-NEXT: srli t3, a0, 16 ; RV64I-NEXT: sb a1, 0(a2) -; RV64I-NEXT: sb s10, 1(a2) -; RV64I-NEXT: sb s9, 2(a2) -; RV64I-NEXT: sb s8, 3(a2) +; RV64I-NEXT: sb a6, 1(a2) +; RV64I-NEXT: sb a7, 2(a2) +; RV64I-NEXT: sb t0, 3(a2) ; RV64I-NEXT: srli a1, a0, 8 -; RV64I-NEXT: sb a6, 12(a2) -; RV64I-NEXT: sb a4, 13(a2) -; RV64I-NEXT: sb a5, 14(a2) -; RV64I-NEXT: sb s11, 15(a2) +; RV64I-NEXT: sb t2, 12(a2) +; RV64I-NEXT: sb t1, 13(a2) +; RV64I-NEXT: sb a4, 14(a2) +; RV64I-NEXT: sb a5, 15(a2) ; RV64I-NEXT: sb a0, 8(a2) ; RV64I-NEXT: sb a1, 9(a2) -; RV64I-NEXT: sb a7, 10(a2) +; RV64I-NEXT: sb t3, 10(a2) ; RV64I-NEXT: sb a3, 11(a2) ; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload @@ -4397,129 +4383,128 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; ; RV32I-LABEL: shl_32bytes_dwordOff: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -128 -; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a7, 0(a0) -; RV32I-NEXT: lbu t0, 1(a0) -; RV32I-NEXT: lbu t1, 2(a0) -; RV32I-NEXT: lbu s1, 3(a0) -; RV32I-NEXT: lbu s7, 4(a0) -; RV32I-NEXT: lbu s8, 5(a0) -; RV32I-NEXT: lbu s4, 6(a0) -; RV32I-NEXT: lbu s6, 7(a0) -; RV32I-NEXT: lbu 
s5, 8(a0) -; RV32I-NEXT: lbu s10, 9(a0) -; RV32I-NEXT: lbu s11, 10(a0) -; RV32I-NEXT: lbu ra, 11(a0) -; RV32I-NEXT: lbu t4, 12(a0) -; RV32I-NEXT: lbu t6, 13(a0) -; RV32I-NEXT: lbu a5, 14(a0) -; RV32I-NEXT: lbu a6, 15(a0) -; RV32I-NEXT: lbu a3, 16(a0) -; RV32I-NEXT: lbu t2, 17(a0) -; RV32I-NEXT: lbu t3, 18(a0) -; RV32I-NEXT: lbu t5, 19(a0) -; RV32I-NEXT: lbu a4, 20(a0) -; RV32I-NEXT: lbu s0, 21(a0) -; RV32I-NEXT: lbu s2, 22(a0) -; RV32I-NEXT: lbu s3, 23(a0) +; RV32I-NEXT: addi sp, sp, -112 +; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t3, 8(a0) +; RV32I-NEXT: lbu t4, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) +; RV32I-NEXT: lbu s2, 12(a0) +; RV32I-NEXT: lbu s3, 13(a0) +; RV32I-NEXT: lbu s4, 14(a0) +; RV32I-NEXT: lbu s5, 15(a0) +; RV32I-NEXT: lbu s6, 16(a0) +; RV32I-NEXT: lbu s7, 17(a0) +; RV32I-NEXT: lbu s8, 18(a0) +; RV32I-NEXT: lbu s9, 19(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: slli t0, t0, 8 ; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli s1, s1, 24 -; RV32I-NEXT: slli s8, s8, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: or t0, s1, t1 -; RV32I-NEXT: or t1, s8, s7 -; RV32I-NEXT: lbu s1, 24(a0) -; RV32I-NEXT: lbu s7, 25(a0) -; RV32I-NEXT: lbu s8, 26(a0) -; RV32I-NEXT: lbu s9, 27(a0) +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: lbu s10, 20(a0) +; RV32I-NEXT: lbu s11, 21(a0) +; RV32I-NEXT: lbu s0, 22(a0) +; RV32I-NEXT: lbu s1, 23(a0) +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: slli s3, s3, 8 ; RV32I-NEXT: slli s4, s4, 16 -; RV32I-NEXT: slli s6, s6, 24 -; RV32I-NEXT: slli s10, s10, 8 -; RV32I-NEXT: slli s11, s11, 16 -; RV32I-NEXT: slli ra, ra, 24 -; RV32I-NEXT: or s4, s6, s4 -; RV32I-NEXT: or s5, s10, s5 -; RV32I-NEXT: or s6, ra, s11 -; RV32I-NEXT: lbu s10, 28(a0) -; RV32I-NEXT: lbu s11, 29(a0) -; RV32I-NEXT: lbu ra, 30(a0) +; RV32I-NEXT: slli s5, s5, 24 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, t6, t5 +; RV32I-NEXT: or t1, s3, s2 +; RV32I-NEXT: or t2, s5, s4 +; RV32I-NEXT: lbu t3, 24(a0) +; RV32I-NEXT: lbu s2, 25(a0) +; RV32I-NEXT: lbu s3, 26(a0) +; RV32I-NEXT: lbu s4, 27(a0) +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: or t4, s7, s6 +; RV32I-NEXT: or t5, s9, s8 +; RV32I-NEXT: or t6, s11, s10 +; RV32I-NEXT: lbu s5, 28(a0) +; RV32I-NEXT: lbu s6, 29(a0) +; RV32I-NEXT: lbu s7, 30(a0) ; RV32I-NEXT: lbu a0, 31(a0) ; RV32I-NEXT: lbu a1, 0(a1) +; RV32I-NEXT: sw zero, 16(sp) +; RV32I-NEXT: sw zero, 20(sp) ; RV32I-NEXT: sw 
zero, 24(sp) ; RV32I-NEXT: sw zero, 28(sp) -; RV32I-NEXT: sw zero, 32(sp) -; RV32I-NEXT: sw zero, 36(sp) +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: sw zero, 4(sp) ; RV32I-NEXT: sw zero, 8(sp) ; RV32I-NEXT: sw zero, 12(sp) -; RV32I-NEXT: sw zero, 16(sp) -; RV32I-NEXT: sw zero, 20(sp) -; RV32I-NEXT: slli t6, t6, 8 -; RV32I-NEXT: or t4, t6, t4 -; RV32I-NEXT: addi t6, sp, 40 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: slli t3, t3, 16 -; RV32I-NEXT: slli t5, t5, 24 -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: slli s2, s2, 16 -; RV32I-NEXT: slli s3, s3, 24 -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: slli s8, s8, 16 -; RV32I-NEXT: slli s9, s9, 24 -; RV32I-NEXT: slli s11, s11, 8 -; RV32I-NEXT: slli ra, ra, 16 +; RV32I-NEXT: slli s0, s0, 16 +; RV32I-NEXT: slli s1, s1, 24 +; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: addi s1, sp, 32 +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s3, s3, 16 +; RV32I-NEXT: slli s4, s4, 24 +; RV32I-NEXT: slli s6, s6, 8 +; RV32I-NEXT: slli s7, s7, 16 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: slli a1, a1, 3 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: or a3, t2, a3 -; RV32I-NEXT: or a6, t5, t3 -; RV32I-NEXT: or a4, s0, a4 -; RV32I-NEXT: or t2, s3, s2 -; RV32I-NEXT: or t3, s7, s1 -; RV32I-NEXT: or t5, s9, s8 -; RV32I-NEXT: or s0, s11, s10 -; RV32I-NEXT: or a0, a0, ra +; RV32I-NEXT: or t3, s2, t3 +; RV32I-NEXT: or s2, s4, s3 +; RV32I-NEXT: or s3, s6, s5 +; RV32I-NEXT: or a0, a0, s7 ; RV32I-NEXT: andi a1, a1, 24 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: or t0, s4, t1 -; RV32I-NEXT: or t1, s6, s5 -; RV32I-NEXT: or a5, a5, t4 -; RV32I-NEXT: or a3, a6, a3 -; RV32I-NEXT: or a4, t2, a4 -; RV32I-NEXT: or a6, t5, t3 -; RV32I-NEXT: or a0, a0, s0 -; RV32I-NEXT: sub t2, t6, a1 -; RV32I-NEXT: sw a3, 56(sp) -; RV32I-NEXT: sw a4, 60(sp) -; RV32I-NEXT: sw a6, 64(sp) -; RV32I-NEXT: sw a0, 68(sp) -; RV32I-NEXT: sw a7, 40(sp) -; RV32I-NEXT: sw t0, 44(sp) -; RV32I-NEXT: sw t1, 48(sp) -; RV32I-NEXT: sw a5, 52(sp) -; RV32I-NEXT: lw a6, 16(t2) -; RV32I-NEXT: lw a5, 20(t2) -; RV32I-NEXT: lw a7, 24(t2) -; RV32I-NEXT: lw a1, 0(t2) -; RV32I-NEXT: lw a0, 4(t2) -; RV32I-NEXT: lw a4, 8(t2) -; RV32I-NEXT: lw a3, 12(t2) -; RV32I-NEXT: lw t0, 28(t2) +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, t5, t4 +; RV32I-NEXT: or t0, s0, t6 +; RV32I-NEXT: or t1, s2, t3 +; RV32I-NEXT: or a0, a0, s3 +; RV32I-NEXT: sub s1, s1, a1 +; RV32I-NEXT: sw a7, 48(sp) +; RV32I-NEXT: sw t0, 52(sp) +; RV32I-NEXT: sw t1, 56(sp) +; RV32I-NEXT: sw a0, 60(sp) +; RV32I-NEXT: sw a3, 32(sp) +; RV32I-NEXT: sw a4, 36(sp) +; RV32I-NEXT: sw a5, 40(sp) +; RV32I-NEXT: sw a6, 44(sp) +; RV32I-NEXT: lw a6, 16(s1) +; RV32I-NEXT: lw a5, 20(s1) +; RV32I-NEXT: lw a7, 24(s1) +; RV32I-NEXT: lw a1, 0(s1) +; RV32I-NEXT: lw a0, 4(s1) +; RV32I-NEXT: lw a4, 8(s1) +; RV32I-NEXT: lw a3, 12(s1) +; RV32I-NEXT: lw t0, 28(s1) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 ; RV32I-NEXT: srli t3, a7, 8 @@ -4534,21 +4519,21 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV32I-NEXT: srli s5, a5, 8 ; RV32I-NEXT: srli s6, a4, 24 ; RV32I-NEXT: srli s7, a4, 16 -; RV32I-NEXT: srli s8, a4, 8 -; RV32I-NEXT: srli s9, a3, 24 -; RV32I-NEXT: srli s10, a3, 16 -; RV32I-NEXT: srli s11, a3, 8 -; RV32I-NEXT: srli ra, a1, 24 ; RV32I-NEXT: sb a7, 24(a2) +; RV32I-NEXT: srli a7, a4, 8 ; RV32I-NEXT: sb t3, 25(a2) +; RV32I-NEXT: srli t3, a3, 24 ; RV32I-NEXT: sb t2, 26(a2) +; 
RV32I-NEXT: srli t2, a3, 16 ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli a7, a1, 16 +; RV32I-NEXT: srli t1, a3, 8 ; RV32I-NEXT: sb t0, 28(a2) +; RV32I-NEXT: srli t0, a1, 24 ; RV32I-NEXT: sb t6, 29(a2) +; RV32I-NEXT: srli t6, a1, 16 ; RV32I-NEXT: sb t5, 30(a2) ; RV32I-NEXT: sb t4, 31(a2) -; RV32I-NEXT: srli t0, a1, 8 +; RV32I-NEXT: srli t4, a1, 8 ; RV32I-NEXT: sb a6, 16(a2) ; RV32I-NEXT: sb s2, 17(a2) ; RV32I-NEXT: sb s1, 18(a2) @@ -4560,36 +4545,35 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV32I-NEXT: sb s3, 23(a2) ; RV32I-NEXT: srli a5, a0, 16 ; RV32I-NEXT: sb a4, 8(a2) -; RV32I-NEXT: sb s8, 9(a2) +; RV32I-NEXT: sb a7, 9(a2) ; RV32I-NEXT: sb s7, 10(a2) ; RV32I-NEXT: sb s6, 11(a2) ; RV32I-NEXT: srli a4, a0, 8 ; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb s11, 13(a2) -; RV32I-NEXT: sb s10, 14(a2) -; RV32I-NEXT: sb s9, 15(a2) +; RV32I-NEXT: sb t1, 13(a2) +; RV32I-NEXT: sb t2, 14(a2) +; RV32I-NEXT: sb t3, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb t0, 1(a2) -; RV32I-NEXT: sb a7, 2(a2) -; RV32I-NEXT: sb ra, 3(a2) +; RV32I-NEXT: sb t4, 1(a2) +; RV32I-NEXT: sb t6, 2(a2) +; RV32I-NEXT: sb t0, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) ; RV32I-NEXT: sb a5, 6(a2) ; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 128 +; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 112 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %dwordOff = load i256, ptr %dwordOff.ptr, align 1 @@ -4834,140 +4818,137 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: ashr_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -128 -; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 80(sp) # 4-byte 
Folded Spill -; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu t6, 0(a0) +; RV32I-NEXT: addi sp, sp, -112 +; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a0) ; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu t1, 4(a0) -; RV32I-NEXT: lbu t3, 5(a0) -; RV32I-NEXT: lbu t4, 6(a0) -; RV32I-NEXT: lbu t5, 7(a0) -; RV32I-NEXT: lbu t2, 8(a0) -; RV32I-NEXT: lbu s1, 9(a0) -; RV32I-NEXT: lbu s7, 10(a0) -; RV32I-NEXT: lbu s8, 11(a0) -; RV32I-NEXT: lbu s9, 12(a0) -; RV32I-NEXT: lbu s10, 13(a0) -; RV32I-NEXT: lbu s4, 14(a0) -; RV32I-NEXT: lbu s6, 15(a0) -; RV32I-NEXT: lbu s5, 16(a0) -; RV32I-NEXT: lbu s11, 17(a0) -; RV32I-NEXT: lbu ra, 18(a0) -; RV32I-NEXT: lbu a3, 19(a0) -; RV32I-NEXT: lbu s2, 20(a0) -; RV32I-NEXT: lbu s3, 21(a0) -; RV32I-NEXT: lbu a7, 22(a0) -; RV32I-NEXT: lbu t0, 23(a0) +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t3, 8(a0) +; RV32I-NEXT: lbu t4, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) +; RV32I-NEXT: lbu s0, 12(a0) +; RV32I-NEXT: lbu s1, 13(a0) +; RV32I-NEXT: lbu s2, 14(a0) +; RV32I-NEXT: lbu s3, 15(a0) +; RV32I-NEXT: lbu s4, 16(a0) +; RV32I-NEXT: lbu s5, 17(a0) +; RV32I-NEXT: lbu s6, 18(a0) +; RV32I-NEXT: lbu s7, 19(a0) ; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: slli t3, t3, 8 -; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli t5, t5, 24 -; RV32I-NEXT: or a4, a4, t6 -; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or a3, a4, a3 ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t3, t1 -; RV32I-NEXT: or a6, t5, t4 -; RV32I-NEXT: lbu t1, 24(a0) -; RV32I-NEXT: lbu t5, 25(a0) -; RV32I-NEXT: lbu t6, 26(a0) -; RV32I-NEXT: lbu s0, 27(a0) +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: lbu s8, 20(a0) +; RV32I-NEXT: lbu s9, 21(a0) +; RV32I-NEXT: lbu s10, 22(a0) +; RV32I-NEXT: lbu s11, 23(a0) +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t6, t6, 24 ; RV32I-NEXT: slli s1, s1, 8 -; RV32I-NEXT: slli s7, s7, 16 -; RV32I-NEXT: slli s8, s8, 24 -; RV32I-NEXT: slli s10, s10, 8 -; RV32I-NEXT: or t2, s1, t2 -; RV32I-NEXT: or t3, s8, s7 -; RV32I-NEXT: or t4, s10, s9 -; RV32I-NEXT: lbu s1, 28(a0) -; RV32I-NEXT: lbu s7, 29(a0) -; RV32I-NEXT: lbu s8, 30(a0) -; RV32I-NEXT: lbu s9, 31(a0) -; RV32I-NEXT: slli s4, s4, 16 -; RV32I-NEXT: slli s6, s6, 24 -; RV32I-NEXT: slli s11, s11, 8 -; RV32I-NEXT: slli ra, ra, 16 -; RV32I-NEXT: slli a3, a3, 24 -; RV32I-NEXT: or a0, s6, s4 -; RV32I-NEXT: or s4, s11, s5 -; RV32I-NEXT: or s5, a3, ra -; RV32I-NEXT: lbu a3, 0(a1) -; RV32I-NEXT: lbu s6, 1(a1) -; RV32I-NEXT: lbu s10, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli s3, s3, 8 -; RV32I-NEXT: or s2, s3, s2 -; RV32I-NEXT: 
addi s3, sp, 8 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: slli t5, t5, 8 -; RV32I-NEXT: slli t6, t6, 16 -; RV32I-NEXT: slli s0, s0, 24 -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: slli s8, s8, 16 -; RV32I-NEXT: slli s9, s9, 24 -; RV32I-NEXT: slli s6, s6, 8 +; RV32I-NEXT: slli s2, s2, 16 +; RV32I-NEXT: slli s3, s3, 24 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, t6, t5 +; RV32I-NEXT: or t1, s1, s0 +; RV32I-NEXT: or t2, s3, s2 +; RV32I-NEXT: lbu t6, 24(a0) +; RV32I-NEXT: lbu s0, 25(a0) +; RV32I-NEXT: lbu s1, 26(a0) +; RV32I-NEXT: lbu s2, 27(a0) +; RV32I-NEXT: slli s5, s5, 8 +; RV32I-NEXT: slli s6, s6, 16 +; RV32I-NEXT: slli s7, s7, 24 +; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: or t3, s5, s4 +; RV32I-NEXT: or t4, s7, s6 +; RV32I-NEXT: or t5, s9, s8 +; RV32I-NEXT: lbu s3, 28(a0) +; RV32I-NEXT: lbu s4, 29(a0) +; RV32I-NEXT: lbu s5, 30(a0) +; RV32I-NEXT: lbu a0, 31(a0) ; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: slli s11, s11, 24 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: slli s1, s1, 16 +; RV32I-NEXT: slli s2, s2, 24 +; RV32I-NEXT: or s6, s11, s10 +; RV32I-NEXT: or t6, s0, t6 +; RV32I-NEXT: or s0, s2, s1 +; RV32I-NEXT: lbu s1, 0(a1) +; RV32I-NEXT: lbu s2, 1(a1) +; RV32I-NEXT: lbu s7, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: or s3, s4, s3 +; RV32I-NEXT: mv s4, sp +; RV32I-NEXT: slli s5, s5, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s7, s7, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: or t0, t5, t1 -; RV32I-NEXT: or t1, s0, t6 -; RV32I-NEXT: or t5, s7, s1 -; RV32I-NEXT: or t6, s9, s8 -; RV32I-NEXT: or a3, s6, a3 -; RV32I-NEXT: or a1, a1, s10 -; RV32I-NEXT: srai s0, s9, 31 -; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: or a4, a4, s1 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: or a6, t3, t2 -; RV32I-NEXT: or a0, a0, t4 -; RV32I-NEXT: or t2, s5, s4 -; RV32I-NEXT: or a7, a7, s2 -; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: or t1, t6, t5 -; RV32I-NEXT: or a1, a1, a3 -; RV32I-NEXT: sw s0, 56(sp) -; RV32I-NEXT: sw s0, 60(sp) -; RV32I-NEXT: sw s0, 64(sp) -; RV32I-NEXT: sw s0, 68(sp) -; RV32I-NEXT: sw s0, 40(sp) -; RV32I-NEXT: sw s0, 44(sp) -; RV32I-NEXT: sw s0, 48(sp) -; RV32I-NEXT: sw s0, 52(sp) -; RV32I-NEXT: sw t2, 24(sp) -; RV32I-NEXT: sw a7, 28(sp) -; RV32I-NEXT: sw t0, 32(sp) -; RV32I-NEXT: sw t1, 36(sp) -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) -; RV32I-NEXT: sw a6, 16(sp) -; RV32I-NEXT: sw a0, 20(sp) +; RV32I-NEXT: or s5, a0, s5 +; RV32I-NEXT: or s1, s2, s1 +; RV32I-NEXT: or a1, a1, s7 +; RV32I-NEXT: srai a0, a0, 31 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, s6, t5 +; RV32I-NEXT: or t1, s0, t6 +; RV32I-NEXT: or t2, s5, s3 +; RV32I-NEXT: or a1, a1, s1 +; RV32I-NEXT: sw a0, 48(sp) +; RV32I-NEXT: sw a0, 52(sp) +; RV32I-NEXT: sw a0, 56(sp) +; RV32I-NEXT: sw a0, 60(sp) +; RV32I-NEXT: sw a0, 32(sp) +; RV32I-NEXT: sw a0, 36(sp) +; RV32I-NEXT: sw a0, 40(sp) +; RV32I-NEXT: sw a0, 44(sp) +; RV32I-NEXT: sw a7, 16(sp) +; RV32I-NEXT: sw t0, 20(sp) +; RV32I-NEXT: sw t1, 24(sp) +; RV32I-NEXT: sw t2, 28(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a6, 12(sp) ; RV32I-NEXT: slli t1, a1, 3 ; RV32I-NEXT: andi a1, a1, 28 -; RV32I-NEXT: add a1, s3, a1 +; RV32I-NEXT: add a1, s4, a1 ; RV32I-NEXT: andi a0, t1, 24 -; RV32I-NEXT: xori t0, a0, 31 +; 
RV32I-NEXT: xori a7, a0, 31 ; RV32I-NEXT: lw a3, 0(a1) ; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: lw a5, 8(a1) ; RV32I-NEXT: lw a6, 12(a1) -; RV32I-NEXT: lw a7, 16(a1) +; RV32I-NEXT: lw t0, 16(a1) ; RV32I-NEXT: lw t2, 20(a1) ; RV32I-NEXT: lw t3, 24(a1) ; RV32I-NEXT: lw t4, 28(a1) @@ -4976,33 +4957,33 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srl a1, a3, t1 ; RV32I-NEXT: slli t6, a4, 1 ; RV32I-NEXT: srl a3, a6, t1 -; RV32I-NEXT: slli s0, a7, 1 +; RV32I-NEXT: slli s0, t0, 1 ; RV32I-NEXT: srl a4, a5, t1 ; RV32I-NEXT: slli s1, a6, 1 ; RV32I-NEXT: srl a5, t2, t1 ; RV32I-NEXT: slli s2, t3, 1 -; RV32I-NEXT: srl a6, a7, t1 +; RV32I-NEXT: srl a6, t0, t1 ; RV32I-NEXT: slli t2, t2, 1 -; RV32I-NEXT: srl a7, t3, t1 +; RV32I-NEXT: srl t0, t3, t1 ; RV32I-NEXT: slli t3, t4, 1 ; RV32I-NEXT: sra t1, t4, t1 -; RV32I-NEXT: sll t4, t5, t0 -; RV32I-NEXT: sll t5, t6, t0 -; RV32I-NEXT: sll t6, s0, t0 -; RV32I-NEXT: sll s0, s1, t0 -; RV32I-NEXT: sll s1, s2, t0 -; RV32I-NEXT: sll t2, t2, t0 -; RV32I-NEXT: sll t3, t3, t0 +; RV32I-NEXT: sll t4, t5, a7 +; RV32I-NEXT: sll t5, t6, a7 +; RV32I-NEXT: sll t6, s0, a7 +; RV32I-NEXT: sll s0, s1, a7 +; RV32I-NEXT: sll s1, s2, a7 +; RV32I-NEXT: sll t2, t2, a7 +; RV32I-NEXT: sll t3, t3, a7 ; RV32I-NEXT: srli s2, t1, 24 ; RV32I-NEXT: srli s3, t1, 16 ; RV32I-NEXT: srli s4, t1, 8 -; RV32I-NEXT: or t0, a0, t4 +; RV32I-NEXT: or a7, a0, t4 ; RV32I-NEXT: or t4, a1, t5 ; RV32I-NEXT: or t5, a3, t6 ; RV32I-NEXT: or s0, a4, s0 ; RV32I-NEXT: or s1, a5, s1 ; RV32I-NEXT: or t2, a6, t2 -; RV32I-NEXT: or t3, a7, t3 +; RV32I-NEXT: or t3, t0, t3 ; RV32I-NEXT: sb t1, 28(a2) ; RV32I-NEXT: sb s4, 29(a2) ; RV32I-NEXT: sb s3, 30(a2) @@ -5019,23 +5000,23 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli s6, s0, 24 ; RV32I-NEXT: srli s7, s0, 16 ; RV32I-NEXT: srli s0, s0, 8 -; RV32I-NEXT: srli s8, t5, 24 -; RV32I-NEXT: srli s9, t5, 16 -; RV32I-NEXT: srli t5, t5, 8 -; RV32I-NEXT: srli s10, t4, 24 -; RV32I-NEXT: srli s11, t4, 16 -; RV32I-NEXT: srli t4, t4, 8 -; RV32I-NEXT: sb a7, 24(a2) +; RV32I-NEXT: sb t0, 24(a2) +; RV32I-NEXT: srli t0, t5, 24 ; RV32I-NEXT: sb t3, 25(a2) +; RV32I-NEXT: srli t3, t5, 16 +; RV32I-NEXT: srli t5, t5, 8 ; RV32I-NEXT: sb t6, 26(a2) +; RV32I-NEXT: srli t6, t4, 24 ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli a7, t0, 24 +; RV32I-NEXT: srli t1, t4, 16 +; RV32I-NEXT: srli t4, t4, 8 ; RV32I-NEXT: sb a6, 16(a2) +; RV32I-NEXT: srli a6, a7, 24 ; RV32I-NEXT: sb t2, 17(a2) ; RV32I-NEXT: sb s3, 18(a2) ; RV32I-NEXT: sb s2, 19(a2) -; RV32I-NEXT: srli a6, t0, 16 -; RV32I-NEXT: srli t0, t0, 8 +; RV32I-NEXT: srli t2, a7, 16 +; RV32I-NEXT: srli a7, a7, 8 ; RV32I-NEXT: sb a5, 20(a2) ; RV32I-NEXT: sb s1, 21(a2) ; RV32I-NEXT: sb s5, 22(a2) @@ -5046,30 +5027,29 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb s6, 11(a2) ; RV32I-NEXT: sb a3, 12(a2) ; RV32I-NEXT: sb t5, 13(a2) -; RV32I-NEXT: sb s9, 14(a2) -; RV32I-NEXT: sb s8, 15(a2) +; RV32I-NEXT: sb t3, 14(a2) +; RV32I-NEXT: sb t0, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) ; RV32I-NEXT: sb t4, 1(a2) -; RV32I-NEXT: sb s11, 2(a2) -; RV32I-NEXT: sb s10, 3(a2) +; RV32I-NEXT: sb t1, 2(a2) +; RV32I-NEXT: sb t6, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: sb t0, 5(a2) -; RV32I-NEXT: sb a6, 6(a2) -; RV32I-NEXT: sb a7, 7(a2) -; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 
112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 128 +; RV32I-NEXT: sb a7, 5(a2) +; RV32I-NEXT: sb t2, 6(a2) +; RV32I-NEXT: sb a6, 7(a2) +; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 112 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %byteOff = load i256, ptr %byteOff.ptr, align 1 @@ -5315,130 +5295,129 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; ; RV32I-LABEL: ashr_32bytes_wordOff: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -128 -; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a6, 0(a0) -; RV32I-NEXT: lbu t0, 1(a0) -; RV32I-NEXT: lbu t1, 2(a0) -; RV32I-NEXT: lbu t6, 3(a0) -; RV32I-NEXT: lbu s7, 4(a0) -; RV32I-NEXT: lbu s8, 5(a0) -; RV32I-NEXT: lbu s3, 6(a0) -; RV32I-NEXT: lbu s5, 7(a0) -; RV32I-NEXT: lbu s4, 8(a0) -; RV32I-NEXT: lbu s9, 9(a0) -; RV32I-NEXT: lbu s10, 10(a0) -; RV32I-NEXT: lbu s11, 11(a0) -; RV32I-NEXT: lbu s2, 12(a0) -; RV32I-NEXT: lbu s6, 13(a0) -; RV32I-NEXT: lbu a5, 14(a0) -; RV32I-NEXT: lbu a7, 15(a0) -; RV32I-NEXT: lbu a3, 16(a0) -; RV32I-NEXT: lbu t2, 17(a0) -; RV32I-NEXT: lbu t3, 18(a0) -; RV32I-NEXT: lbu t4, 19(a0) -; RV32I-NEXT: lbu a4, 20(a0) -; RV32I-NEXT: lbu t5, 21(a0) -; RV32I-NEXT: lbu s0, 22(a0) -; RV32I-NEXT: lbu s1, 23(a0) +; RV32I-NEXT: addi sp, sp, -112 +; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 68(sp) # 4-byte 
Folded Spill +; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t3, 8(a0) +; RV32I-NEXT: lbu t4, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) +; RV32I-NEXT: lbu s0, 12(a0) +; RV32I-NEXT: lbu s1, 13(a0) +; RV32I-NEXT: lbu s2, 14(a0) +; RV32I-NEXT: lbu s3, 15(a0) +; RV32I-NEXT: lbu s4, 16(a0) +; RV32I-NEXT: lbu s5, 17(a0) +; RV32I-NEXT: lbu s6, 18(a0) +; RV32I-NEXT: lbu s7, 19(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: slli t0, t0, 8 ; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: lbu s8, 20(a0) +; RV32I-NEXT: lbu s9, 21(a0) +; RV32I-NEXT: lbu s10, 22(a0) +; RV32I-NEXT: lbu s11, 23(a0) +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: slli t5, t5, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: slli s8, s8, 8 -; RV32I-NEXT: or a6, t0, a6 -; RV32I-NEXT: or t0, t6, t1 -; RV32I-NEXT: or t1, s8, s7 -; RV32I-NEXT: lbu t6, 24(a0) -; RV32I-NEXT: lbu s7, 25(a0) -; RV32I-NEXT: lbu s8, 26(a0) -; RV32I-NEXT: lbu ra, 27(a0) -; RV32I-NEXT: slli s3, s3, 16 -; RV32I-NEXT: slli s5, s5, 24 +; RV32I-NEXT: slli s1, s1, 8 +; RV32I-NEXT: slli s2, s2, 16 +; RV32I-NEXT: slli s3, s3, 24 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, t6, t5 +; RV32I-NEXT: or t1, s1, s0 +; RV32I-NEXT: or t2, s3, s2 +; RV32I-NEXT: lbu t3, 24(a0) +; RV32I-NEXT: lbu t5, 25(a0) +; RV32I-NEXT: lbu t6, 26(a0) +; RV32I-NEXT: lbu s0, 27(a0) +; RV32I-NEXT: slli s5, s5, 8 +; RV32I-NEXT: slli s6, s6, 16 +; RV32I-NEXT: slli s7, s7, 24 ; RV32I-NEXT: slli s9, s9, 8 -; RV32I-NEXT: slli s10, s10, 16 -; RV32I-NEXT: slli s11, s11, 24 -; RV32I-NEXT: or s3, s5, s3 -; RV32I-NEXT: or s4, s9, s4 -; RV32I-NEXT: or s5, s11, s10 -; RV32I-NEXT: lbu s9, 28(a0) -; RV32I-NEXT: lbu s10, 29(a0) -; RV32I-NEXT: lbu s11, 30(a0) +; RV32I-NEXT: or t4, s5, s4 +; RV32I-NEXT: or s1, s7, s6 +; RV32I-NEXT: or s2, s9, s8 +; RV32I-NEXT: lbu s3, 28(a0) +; RV32I-NEXT: lbu s4, 29(a0) +; RV32I-NEXT: lbu s5, 30(a0) ; RV32I-NEXT: lbu a0, 31(a0) ; RV32I-NEXT: lbu a1, 0(a1) -; RV32I-NEXT: slli s6, s6, 8 -; RV32I-NEXT: or s2, s6, s2 -; RV32I-NEXT: addi s6, sp, 8 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: slli t3, t3, 16 -; RV32I-NEXT: slli t4, t4, 24 +; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: slli s11, s11, 24 +; RV32I-NEXT: or s6, s11, s10 +; RV32I-NEXT: mv s7, sp ; RV32I-NEXT: slli t5, t5, 8 -; RV32I-NEXT: slli s0, s0, 16 -; RV32I-NEXT: slli s1, s1, 24 -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: slli s8, s8, 16 -; RV32I-NEXT: slli ra, ra, 24 -; RV32I-NEXT: slli s10, s10, 8 -; RV32I-NEXT: slli s11, s11, 16 +; RV32I-NEXT: slli t6, t6, 16 +; RV32I-NEXT: slli s0, s0, 24 +; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: slli s5, s5, 16 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: slli a1, a1, 2 -; RV32I-NEXT: or a5, a7, a5 -; RV32I-NEXT: or a3, t2, a3 -; RV32I-NEXT: or a7, t4, t3 -; RV32I-NEXT: or a4, t5, a4 -; RV32I-NEXT: or s0, s1, s0 -; RV32I-NEXT: or t2, s7, t6 -; RV32I-NEXT: or t3, ra, s8 -; RV32I-NEXT: or t4, s10, s9 -; RV32I-NEXT: or t5, a0, s11 +; RV32I-NEXT: or t3, t5, t3 +; RV32I-NEXT: or t5, s0, t6 +; RV32I-NEXT: or t6, s4, s3 +; RV32I-NEXT: 
or s0, a0, s5 ; RV32I-NEXT: srai a0, a0, 31 ; RV32I-NEXT: andi a1, a1, 28 -; RV32I-NEXT: or a6, t0, a6 -; RV32I-NEXT: or t0, s3, t1 -; RV32I-NEXT: or t1, s5, s4 -; RV32I-NEXT: or a5, a5, s2 -; RV32I-NEXT: or a3, a7, a3 -; RV32I-NEXT: or a4, s0, a4 -; RV32I-NEXT: or a7, t3, t2 -; RV32I-NEXT: or t2, t5, t4 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, s1, t4 +; RV32I-NEXT: or t0, s6, s2 +; RV32I-NEXT: or t1, t5, t3 +; RV32I-NEXT: or t2, s0, t6 +; RV32I-NEXT: sw a0, 48(sp) +; RV32I-NEXT: sw a0, 52(sp) ; RV32I-NEXT: sw a0, 56(sp) ; RV32I-NEXT: sw a0, 60(sp) -; RV32I-NEXT: sw a0, 64(sp) -; RV32I-NEXT: sw a0, 68(sp) +; RV32I-NEXT: sw a0, 32(sp) +; RV32I-NEXT: sw a0, 36(sp) ; RV32I-NEXT: sw a0, 40(sp) ; RV32I-NEXT: sw a0, 44(sp) -; RV32I-NEXT: sw a0, 48(sp) -; RV32I-NEXT: sw a0, 52(sp) -; RV32I-NEXT: add s6, s6, a1 -; RV32I-NEXT: sw a3, 24(sp) -; RV32I-NEXT: sw a4, 28(sp) -; RV32I-NEXT: sw a7, 32(sp) -; RV32I-NEXT: sw t2, 36(sp) -; RV32I-NEXT: sw a6, 8(sp) -; RV32I-NEXT: sw t0, 12(sp) -; RV32I-NEXT: sw t1, 16(sp) -; RV32I-NEXT: sw a5, 20(sp) -; RV32I-NEXT: lw a6, 16(s6) -; RV32I-NEXT: lw a5, 20(s6) -; RV32I-NEXT: lw a7, 24(s6) -; RV32I-NEXT: lw a1, 0(s6) -; RV32I-NEXT: lw a0, 4(s6) -; RV32I-NEXT: lw a4, 8(s6) -; RV32I-NEXT: lw a3, 12(s6) -; RV32I-NEXT: lw t0, 28(s6) +; RV32I-NEXT: add s7, s7, a1 +; RV32I-NEXT: sw a7, 16(sp) +; RV32I-NEXT: sw t0, 20(sp) +; RV32I-NEXT: sw t1, 24(sp) +; RV32I-NEXT: sw t2, 28(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a6, 12(sp) +; RV32I-NEXT: lw a6, 16(s7) +; RV32I-NEXT: lw a5, 20(s7) +; RV32I-NEXT: lw a7, 24(s7) +; RV32I-NEXT: lw a1, 0(s7) +; RV32I-NEXT: lw a0, 4(s7) +; RV32I-NEXT: lw a4, 8(s7) +; RV32I-NEXT: lw a3, 12(s7) +; RV32I-NEXT: lw t0, 28(s7) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 ; RV32I-NEXT: srli t3, a7, 8 @@ -5453,21 +5432,21 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: srli s5, a5, 8 ; RV32I-NEXT: srli s6, a4, 24 ; RV32I-NEXT: srli s7, a4, 16 -; RV32I-NEXT: srli s8, a4, 8 -; RV32I-NEXT: srli s9, a3, 24 -; RV32I-NEXT: srli s10, a3, 16 -; RV32I-NEXT: srli s11, a3, 8 -; RV32I-NEXT: srli ra, a1, 24 ; RV32I-NEXT: sb a7, 24(a2) +; RV32I-NEXT: srli a7, a4, 8 ; RV32I-NEXT: sb t3, 25(a2) +; RV32I-NEXT: srli t3, a3, 24 ; RV32I-NEXT: sb t2, 26(a2) +; RV32I-NEXT: srli t2, a3, 16 ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli a7, a1, 16 +; RV32I-NEXT: srli t1, a3, 8 ; RV32I-NEXT: sb t0, 28(a2) +; RV32I-NEXT: srli t0, a1, 24 ; RV32I-NEXT: sb t6, 29(a2) +; RV32I-NEXT: srli t6, a1, 16 ; RV32I-NEXT: sb t5, 30(a2) ; RV32I-NEXT: sb t4, 31(a2) -; RV32I-NEXT: srli t0, a1, 8 +; RV32I-NEXT: srli t4, a1, 8 ; RV32I-NEXT: sb a6, 16(a2) ; RV32I-NEXT: sb s2, 17(a2) ; RV32I-NEXT: sb s1, 18(a2) @@ -5479,36 +5458,35 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: sb s3, 23(a2) ; RV32I-NEXT: srli a5, a0, 16 ; RV32I-NEXT: sb a4, 8(a2) -; RV32I-NEXT: sb s8, 9(a2) +; RV32I-NEXT: sb a7, 9(a2) ; RV32I-NEXT: sb s7, 10(a2) ; RV32I-NEXT: sb s6, 11(a2) ; RV32I-NEXT: srli a4, a0, 8 ; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb s11, 13(a2) -; RV32I-NEXT: sb s10, 14(a2) -; RV32I-NEXT: sb s9, 15(a2) +; RV32I-NEXT: sb t1, 13(a2) +; RV32I-NEXT: sb t2, 14(a2) +; RV32I-NEXT: sb t3, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb t0, 1(a2) -; RV32I-NEXT: sb a7, 2(a2) -; RV32I-NEXT: sb ra, 3(a2) +; RV32I-NEXT: sb t4, 1(a2) 
+; RV32I-NEXT: sb t6, 2(a2) +; RV32I-NEXT: sb t0, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) ; RV32I-NEXT: sb a5, 6(a2) ; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 128 +; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 112 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %wordOff = load i256, ptr %wordOff.ptr, align 1 @@ -5534,112 +5512,112 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a5, 0(a0) -; RV64I-NEXT: lbu a7, 1(a0) -; RV64I-NEXT: lbu t1, 2(a0) -; RV64I-NEXT: lbu s3, 3(a0) -; RV64I-NEXT: lbu t0, 4(a0) -; RV64I-NEXT: lbu s8, 5(a0) -; RV64I-NEXT: lbu s9, 6(a0) -; RV64I-NEXT: lbu s10, 7(a0) -; RV64I-NEXT: lbu s2, 8(a0) -; RV64I-NEXT: lbu s4, 9(a0) -; RV64I-NEXT: lbu s5, 10(a0) -; RV64I-NEXT: lbu s6, 11(a0) -; RV64I-NEXT: lbu s7, 12(a0) -; RV64I-NEXT: lbu s11, 13(a0) -; RV64I-NEXT: lbu t4, 14(a0) -; RV64I-NEXT: lbu t5, 15(a0) -; RV64I-NEXT: lbu a3, 16(a0) -; RV64I-NEXT: lbu a6, 17(a0) -; RV64I-NEXT: lbu t2, 18(a0) -; RV64I-NEXT: lbu t3, 19(a0) -; RV64I-NEXT: lbu a4, 20(a0) -; RV64I-NEXT: lbu t6, 21(a0) -; RV64I-NEXT: lbu s0, 22(a0) -; RV64I-NEXT: lbu s1, 23(a0) -; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: lbu s0, 12(a0) +; RV64I-NEXT: lbu s1, 13(a0) +; RV64I-NEXT: lbu s2, 14(a0) +; RV64I-NEXT: lbu s3, 15(a0) +; RV64I-NEXT: lbu s4, 16(a0) +; RV64I-NEXT: lbu s5, 17(a0) +; RV64I-NEXT: lbu s6, 18(a0) +; RV64I-NEXT: lbu s7, 19(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: 
lbu s8, 20(a0) +; RV64I-NEXT: lbu s9, 21(a0) +; RV64I-NEXT: lbu s10, 22(a0) +; RV64I-NEXT: lbu s11, 23(a0) +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: slli t6, t6, 24 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: slli s2, s2, 16 ; RV64I-NEXT: slli s3, s3, 24 -; RV64I-NEXT: slli s8, s8, 8 -; RV64I-NEXT: slli s9, s9, 16 -; RV64I-NEXT: slli s10, s10, 24 -; RV64I-NEXT: or a5, a7, a5 -; RV64I-NEXT: or a7, s3, t1 -; RV64I-NEXT: or t0, s8, t0 -; RV64I-NEXT: or t1, s10, s9 -; RV64I-NEXT: lbu s3, 24(a0) -; RV64I-NEXT: lbu s8, 25(a0) -; RV64I-NEXT: lbu s9, 26(a0) -; RV64I-NEXT: lbu s10, 27(a0) -; RV64I-NEXT: slli s4, s4, 8 -; RV64I-NEXT: slli s5, s5, 16 -; RV64I-NEXT: slli s6, s6, 24 -; RV64I-NEXT: slli s11, s11, 8 -; RV64I-NEXT: or s2, s4, s2 -; RV64I-NEXT: or s4, s6, s5 -; RV64I-NEXT: or s5, s11, s7 -; RV64I-NEXT: lbu s6, 28(a0) -; RV64I-NEXT: lbu s7, 29(a0) -; RV64I-NEXT: lbu s11, 30(a0) +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: or t1, s1, s0 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: lbu t3, 24(a0) +; RV64I-NEXT: lbu t4, 25(a0) +; RV64I-NEXT: lbu t5, 26(a0) +; RV64I-NEXT: lbu t6, 27(a0) +; RV64I-NEXT: slli s5, s5, 8 +; RV64I-NEXT: slli s6, s6, 16 +; RV64I-NEXT: slli s7, s7, 24 +; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: or s0, s5, s4 +; RV64I-NEXT: or s1, s7, s6 +; RV64I-NEXT: or s2, s9, s8 +; RV64I-NEXT: lbu s3, 28(a0) +; RV64I-NEXT: lbu s4, 29(a0) +; RV64I-NEXT: lbu s5, 30(a0) ; RV64I-NEXT: lbu a0, 31(a0) ; RV64I-NEXT: lbu a1, 0(a1) -; RV64I-NEXT: slli t4, t4, 16 -; RV64I-NEXT: slli t5, t5, 24 -; RV64I-NEXT: or t4, t5, t4 -; RV64I-NEXT: mv t5, sp -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: slli t2, t2, 16 -; RV64I-NEXT: slli t3, t3, 24 -; RV64I-NEXT: slli t6, t6, 8 -; RV64I-NEXT: slli s0, s0, 16 -; RV64I-NEXT: slli s1, s1, 24 -; RV64I-NEXT: slli s8, s8, 8 -; RV64I-NEXT: slli s9, s9, 16 -; RV64I-NEXT: slli s10, s10, 24 -; RV64I-NEXT: slli s7, s7, 8 -; RV64I-NEXT: slli s11, s11, 16 +; RV64I-NEXT: slli s10, s10, 16 +; RV64I-NEXT: slli s11, s11, 24 +; RV64I-NEXT: or s6, s11, s10 +; RV64I-NEXT: mv s7, sp +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: slli t6, t6, 24 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: slli s5, s5, 16 ; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: slli a1, a1, 3 -; RV64I-NEXT: or a3, a6, a3 -; RV64I-NEXT: or a6, t3, t2 -; RV64I-NEXT: or a4, t6, a4 -; RV64I-NEXT: or s0, s1, s0 -; RV64I-NEXT: or t2, s8, s3 -; RV64I-NEXT: or t3, s10, s9 -; RV64I-NEXT: or t6, s7, s6 -; RV64I-NEXT: or a0, a0, s11 +; RV64I-NEXT: or t3, t4, t3 +; RV64I-NEXT: or t4, t6, t5 +; RV64I-NEXT: or t5, s4, s3 +; RV64I-NEXT: or a0, a0, s5 ; RV64I-NEXT: andi a1, a1, 24 -; RV64I-NEXT: or a5, a7, a5 -; RV64I-NEXT: or a7, t1, t0 -; RV64I-NEXT: or t0, s4, s2 -; RV64I-NEXT: or t1, t4, s5 -; RV64I-NEXT: or a3, a6, a3 -; RV64I-NEXT: or a4, s0, a4 -; RV64I-NEXT: or a6, t3, t2 -; RV64I-NEXT: or a0, a0, t6 -; RV64I-NEXT: add t5, t5, a1 -; RV64I-NEXT: slli a7, a7, 32 -; RV64I-NEXT: slli t1, t1, 32 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: or s0, s1, s0 +; RV64I-NEXT: or a7, s6, s2 +; RV64I-NEXT: or t0, t4, t3 +; RV64I-NEXT: or a0, a0, t5 +; RV64I-NEXT: add s7, s7, a1 ; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: slli a7, a7, 32 ; RV64I-NEXT: slli a1, a0, 32 ; RV64I-NEXT: sraiw a0, a0, 31 -; RV64I-NEXT: or a5, a7, a5 -; RV64I-NEXT: or a7, t1, t0 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: or a1, a1, a6 +; 
RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, a7, s0 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: sd a0, 32(sp) ; RV64I-NEXT: sd a0, 40(sp) ; RV64I-NEXT: sd a0, 48(sp) ; RV64I-NEXT: sd a0, 56(sp) -; RV64I-NEXT: sd a5, 0(sp) -; RV64I-NEXT: sd a7, 8(sp) -; RV64I-NEXT: sd a3, 16(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a4, 8(sp) +; RV64I-NEXT: sd a5, 16(sp) ; RV64I-NEXT: sd a1, 24(sp) -; RV64I-NEXT: ld a4, 16(t5) -; RV64I-NEXT: ld a0, 8(t5) -; RV64I-NEXT: ld a1, 0(t5) -; RV64I-NEXT: ld a3, 24(t5) +; RV64I-NEXT: ld a4, 16(s7) +; RV64I-NEXT: ld a0, 8(s7) +; RV64I-NEXT: ld a1, 0(s7) +; RV64I-NEXT: ld a3, 24(s7) ; RV64I-NEXT: srli a5, a4, 56 ; RV64I-NEXT: srli a6, a4, 48 ; RV64I-NEXT: srli a7, a4, 40 @@ -5658,25 +5636,25 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV64I-NEXT: srli s5, a1, 48 ; RV64I-NEXT: srli s6, a1, 40 ; RV64I-NEXT: srli s7, a1, 32 -; RV64I-NEXT: srli s8, a1, 24 -; RV64I-NEXT: srli s9, a1, 16 -; RV64I-NEXT: srli s10, a1, 8 -; RV64I-NEXT: srli s11, a0, 56 ; RV64I-NEXT: sb t0, 20(a2) +; RV64I-NEXT: srli t0, a1, 24 ; RV64I-NEXT: sb a7, 21(a2) +; RV64I-NEXT: srli a7, a1, 16 ; RV64I-NEXT: sb a6, 22(a2) +; RV64I-NEXT: srli a6, a1, 8 ; RV64I-NEXT: sb a5, 23(a2) -; RV64I-NEXT: srli a5, a0, 48 +; RV64I-NEXT: srli a5, a0, 56 ; RV64I-NEXT: sb a4, 16(a2) +; RV64I-NEXT: srli a4, a0, 48 ; RV64I-NEXT: sb t3, 17(a2) ; RV64I-NEXT: sb t2, 18(a2) ; RV64I-NEXT: sb t1, 19(a2) -; RV64I-NEXT: srli a4, a0, 40 +; RV64I-NEXT: srli t1, a0, 40 ; RV64I-NEXT: sb s0, 28(a2) ; RV64I-NEXT: sb t6, 29(a2) ; RV64I-NEXT: sb t5, 30(a2) ; RV64I-NEXT: sb t4, 31(a2) -; RV64I-NEXT: srli a6, a0, 32 +; RV64I-NEXT: srli t2, a0, 32 ; RV64I-NEXT: sb a3, 24(a2) ; RV64I-NEXT: sb s3, 25(a2) ; RV64I-NEXT: sb s2, 26(a2) @@ -5686,19 +5664,19 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV64I-NEXT: sb s6, 5(a2) ; RV64I-NEXT: sb s5, 6(a2) ; RV64I-NEXT: sb s4, 7(a2) -; RV64I-NEXT: srli a7, a0, 16 +; RV64I-NEXT: srli t3, a0, 16 ; RV64I-NEXT: sb a1, 0(a2) -; RV64I-NEXT: sb s10, 1(a2) -; RV64I-NEXT: sb s9, 2(a2) -; RV64I-NEXT: sb s8, 3(a2) +; RV64I-NEXT: sb a6, 1(a2) +; RV64I-NEXT: sb a7, 2(a2) +; RV64I-NEXT: sb t0, 3(a2) ; RV64I-NEXT: srli a1, a0, 8 -; RV64I-NEXT: sb a6, 12(a2) -; RV64I-NEXT: sb a4, 13(a2) -; RV64I-NEXT: sb a5, 14(a2) -; RV64I-NEXT: sb s11, 15(a2) +; RV64I-NEXT: sb t2, 12(a2) +; RV64I-NEXT: sb t1, 13(a2) +; RV64I-NEXT: sb a4, 14(a2) +; RV64I-NEXT: sb a5, 15(a2) ; RV64I-NEXT: sb a0, 8(a2) ; RV64I-NEXT: sb a1, 9(a2) -; RV64I-NEXT: sb a7, 10(a2) +; RV64I-NEXT: sb t3, 10(a2) ; RV64I-NEXT: sb a3, 11(a2) ; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload @@ -5717,130 +5695,129 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; ; RV32I-LABEL: ashr_32bytes_dwordOff: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -128 -; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 
76(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a6, 0(a0) -; RV32I-NEXT: lbu t0, 1(a0) -; RV32I-NEXT: lbu t1, 2(a0) -; RV32I-NEXT: lbu t6, 3(a0) -; RV32I-NEXT: lbu s7, 4(a0) -; RV32I-NEXT: lbu s8, 5(a0) -; RV32I-NEXT: lbu s3, 6(a0) -; RV32I-NEXT: lbu s5, 7(a0) -; RV32I-NEXT: lbu s4, 8(a0) -; RV32I-NEXT: lbu s9, 9(a0) -; RV32I-NEXT: lbu s10, 10(a0) -; RV32I-NEXT: lbu s11, 11(a0) -; RV32I-NEXT: lbu s2, 12(a0) -; RV32I-NEXT: lbu s6, 13(a0) -; RV32I-NEXT: lbu a5, 14(a0) -; RV32I-NEXT: lbu a7, 15(a0) -; RV32I-NEXT: lbu a3, 16(a0) -; RV32I-NEXT: lbu t2, 17(a0) -; RV32I-NEXT: lbu t3, 18(a0) -; RV32I-NEXT: lbu t4, 19(a0) -; RV32I-NEXT: lbu a4, 20(a0) -; RV32I-NEXT: lbu t5, 21(a0) -; RV32I-NEXT: lbu s0, 22(a0) -; RV32I-NEXT: lbu s1, 23(a0) +; RV32I-NEXT: addi sp, sp, -112 +; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t3, 8(a0) +; RV32I-NEXT: lbu t4, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) +; RV32I-NEXT: lbu s0, 12(a0) +; RV32I-NEXT: lbu s1, 13(a0) +; RV32I-NEXT: lbu s2, 14(a0) +; RV32I-NEXT: lbu s3, 15(a0) +; RV32I-NEXT: lbu s4, 16(a0) +; RV32I-NEXT: lbu s5, 17(a0) +; RV32I-NEXT: lbu s6, 18(a0) +; RV32I-NEXT: lbu s7, 19(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: slli t0, t0, 8 ; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: lbu s8, 20(a0) +; RV32I-NEXT: lbu s9, 21(a0) +; RV32I-NEXT: lbu s10, 22(a0) +; RV32I-NEXT: lbu s11, 23(a0) +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: slli t5, t5, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: slli s8, s8, 8 -; RV32I-NEXT: or a6, t0, a6 -; RV32I-NEXT: or t0, t6, t1 -; RV32I-NEXT: or t1, s8, s7 -; RV32I-NEXT: lbu t6, 24(a0) -; RV32I-NEXT: lbu s7, 25(a0) -; RV32I-NEXT: lbu s8, 26(a0) -; RV32I-NEXT: lbu ra, 27(a0) -; RV32I-NEXT: slli s3, s3, 16 -; RV32I-NEXT: slli s5, s5, 24 +; RV32I-NEXT: slli s1, s1, 8 +; RV32I-NEXT: slli s2, s2, 16 +; RV32I-NEXT: slli s3, s3, 24 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, t6, t5 +; RV32I-NEXT: or t1, s1, s0 +; RV32I-NEXT: or t2, s3, s2 +; RV32I-NEXT: lbu t3, 24(a0) +; RV32I-NEXT: lbu t5, 25(a0) +; RV32I-NEXT: lbu t6, 26(a0) +; RV32I-NEXT: lbu s0, 27(a0) +; RV32I-NEXT: slli s5, s5, 8 +; RV32I-NEXT: slli s6, s6, 16 +; RV32I-NEXT: slli s7, s7, 24 ; RV32I-NEXT: slli s9, s9, 8 -; RV32I-NEXT: slli s10, s10, 16 -; RV32I-NEXT: slli s11, s11, 24 -; RV32I-NEXT: or s3, s5, s3 -; RV32I-NEXT: or s4, s9, s4 -; RV32I-NEXT: or s5, s11, s10 -; RV32I-NEXT: lbu s9, 28(a0) -; RV32I-NEXT: lbu s10, 29(a0) -; RV32I-NEXT: lbu s11, 30(a0) +; RV32I-NEXT: or t4, s5, s4 +; RV32I-NEXT: or s1, s7, s6 +; 
RV32I-NEXT: or s2, s9, s8 +; RV32I-NEXT: lbu s3, 28(a0) +; RV32I-NEXT: lbu s4, 29(a0) +; RV32I-NEXT: lbu s5, 30(a0) ; RV32I-NEXT: lbu a0, 31(a0) ; RV32I-NEXT: lbu a1, 0(a1) -; RV32I-NEXT: slli s6, s6, 8 -; RV32I-NEXT: or s2, s6, s2 -; RV32I-NEXT: addi s6, sp, 8 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: slli t3, t3, 16 -; RV32I-NEXT: slli t4, t4, 24 +; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: slli s11, s11, 24 +; RV32I-NEXT: or s6, s11, s10 +; RV32I-NEXT: mv s7, sp ; RV32I-NEXT: slli t5, t5, 8 -; RV32I-NEXT: slli s0, s0, 16 -; RV32I-NEXT: slli s1, s1, 24 -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: slli s8, s8, 16 -; RV32I-NEXT: slli ra, ra, 24 -; RV32I-NEXT: slli s10, s10, 8 -; RV32I-NEXT: slli s11, s11, 16 +; RV32I-NEXT: slli t6, t6, 16 +; RV32I-NEXT: slli s0, s0, 24 +; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: slli s5, s5, 16 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: slli a1, a1, 3 -; RV32I-NEXT: or a5, a7, a5 -; RV32I-NEXT: or a3, t2, a3 -; RV32I-NEXT: or a7, t4, t3 -; RV32I-NEXT: or a4, t5, a4 -; RV32I-NEXT: or s0, s1, s0 -; RV32I-NEXT: or t2, s7, t6 -; RV32I-NEXT: or t3, ra, s8 -; RV32I-NEXT: or t4, s10, s9 -; RV32I-NEXT: or t5, a0, s11 +; RV32I-NEXT: or t3, t5, t3 +; RV32I-NEXT: or t5, s0, t6 +; RV32I-NEXT: or t6, s4, s3 +; RV32I-NEXT: or s0, a0, s5 ; RV32I-NEXT: srai a0, a0, 31 ; RV32I-NEXT: andi a1, a1, 24 -; RV32I-NEXT: or a6, t0, a6 -; RV32I-NEXT: or t0, s3, t1 -; RV32I-NEXT: or t1, s5, s4 -; RV32I-NEXT: or a5, a5, s2 -; RV32I-NEXT: or a3, a7, a3 -; RV32I-NEXT: or a4, s0, a4 -; RV32I-NEXT: or a7, t3, t2 -; RV32I-NEXT: or t2, t5, t4 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, s1, t4 +; RV32I-NEXT: or t0, s6, s2 +; RV32I-NEXT: or t1, t5, t3 +; RV32I-NEXT: or t2, s0, t6 +; RV32I-NEXT: sw a0, 48(sp) +; RV32I-NEXT: sw a0, 52(sp) ; RV32I-NEXT: sw a0, 56(sp) ; RV32I-NEXT: sw a0, 60(sp) -; RV32I-NEXT: sw a0, 64(sp) -; RV32I-NEXT: sw a0, 68(sp) +; RV32I-NEXT: sw a0, 32(sp) +; RV32I-NEXT: sw a0, 36(sp) ; RV32I-NEXT: sw a0, 40(sp) ; RV32I-NEXT: sw a0, 44(sp) -; RV32I-NEXT: sw a0, 48(sp) -; RV32I-NEXT: sw a0, 52(sp) -; RV32I-NEXT: add s6, s6, a1 -; RV32I-NEXT: sw a3, 24(sp) -; RV32I-NEXT: sw a4, 28(sp) -; RV32I-NEXT: sw a7, 32(sp) -; RV32I-NEXT: sw t2, 36(sp) -; RV32I-NEXT: sw a6, 8(sp) -; RV32I-NEXT: sw t0, 12(sp) -; RV32I-NEXT: sw t1, 16(sp) -; RV32I-NEXT: sw a5, 20(sp) -; RV32I-NEXT: lw a6, 16(s6) -; RV32I-NEXT: lw a5, 20(s6) -; RV32I-NEXT: lw a7, 24(s6) -; RV32I-NEXT: lw a1, 0(s6) -; RV32I-NEXT: lw a0, 4(s6) -; RV32I-NEXT: lw a4, 8(s6) -; RV32I-NEXT: lw a3, 12(s6) -; RV32I-NEXT: lw t0, 28(s6) +; RV32I-NEXT: add s7, s7, a1 +; RV32I-NEXT: sw a7, 16(sp) +; RV32I-NEXT: sw t0, 20(sp) +; RV32I-NEXT: sw t1, 24(sp) +; RV32I-NEXT: sw t2, 28(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a6, 12(sp) +; RV32I-NEXT: lw a6, 16(s7) +; RV32I-NEXT: lw a5, 20(s7) +; RV32I-NEXT: lw a7, 24(s7) +; RV32I-NEXT: lw a1, 0(s7) +; RV32I-NEXT: lw a0, 4(s7) +; RV32I-NEXT: lw a4, 8(s7) +; RV32I-NEXT: lw a3, 12(s7) +; RV32I-NEXT: lw t0, 28(s7) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 ; RV32I-NEXT: srli t3, a7, 8 @@ -5855,21 +5832,21 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: srli s5, a5, 8 ; RV32I-NEXT: srli s6, a4, 24 ; RV32I-NEXT: srli s7, a4, 16 -; RV32I-NEXT: srli s8, a4, 8 -; RV32I-NEXT: srli s9, a3, 24 -; RV32I-NEXT: 
srli s10, a3, 16 -; RV32I-NEXT: srli s11, a3, 8 -; RV32I-NEXT: srli ra, a1, 24 ; RV32I-NEXT: sb a7, 24(a2) +; RV32I-NEXT: srli a7, a4, 8 ; RV32I-NEXT: sb t3, 25(a2) +; RV32I-NEXT: srli t3, a3, 24 ; RV32I-NEXT: sb t2, 26(a2) +; RV32I-NEXT: srli t2, a3, 16 ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli a7, a1, 16 +; RV32I-NEXT: srli t1, a3, 8 ; RV32I-NEXT: sb t0, 28(a2) +; RV32I-NEXT: srli t0, a1, 24 ; RV32I-NEXT: sb t6, 29(a2) +; RV32I-NEXT: srli t6, a1, 16 ; RV32I-NEXT: sb t5, 30(a2) ; RV32I-NEXT: sb t4, 31(a2) -; RV32I-NEXT: srli t0, a1, 8 +; RV32I-NEXT: srli t4, a1, 8 ; RV32I-NEXT: sb a6, 16(a2) ; RV32I-NEXT: sb s2, 17(a2) ; RV32I-NEXT: sb s1, 18(a2) @@ -5881,36 +5858,35 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: sb s3, 23(a2) ; RV32I-NEXT: srli a5, a0, 16 ; RV32I-NEXT: sb a4, 8(a2) -; RV32I-NEXT: sb s8, 9(a2) +; RV32I-NEXT: sb a7, 9(a2) ; RV32I-NEXT: sb s7, 10(a2) ; RV32I-NEXT: sb s6, 11(a2) ; RV32I-NEXT: srli a4, a0, 8 ; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb s11, 13(a2) -; RV32I-NEXT: sb s10, 14(a2) -; RV32I-NEXT: sb s9, 15(a2) +; RV32I-NEXT: sb t1, 13(a2) +; RV32I-NEXT: sb t2, 14(a2) +; RV32I-NEXT: sb t3, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb t0, 1(a2) -; RV32I-NEXT: sb a7, 2(a2) -; RV32I-NEXT: sb ra, 3(a2) +; RV32I-NEXT: sb t4, 1(a2) +; RV32I-NEXT: sb t6, 2(a2) +; RV32I-NEXT: sb t0, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) ; RV32I-NEXT: sb a5, 6(a2) ; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 128 +; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 112 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %dwordOff = load i256, ptr %dwordOff.ptr, align 1 diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll index b2c130c2d7c10..b8952d2cb2b29 100644 --- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll @@ -1530,25 +1530,24 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: lshr_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -128 -; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 120(sp) # 
4-byte Folded Spill -; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: addi sp, sp, -112 +; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill ; RV32I-NEXT: lbu a3, 0(a0) ; RV32I-NEXT: lbu a4, 1(a0) -; RV32I-NEXT: lbu a6, 2(a0) -; RV32I-NEXT: lbu a7, 3(a0) -; RV32I-NEXT: lbu a5, 4(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: lbu a7, 4(a0) ; RV32I-NEXT: lbu t0, 5(a0) ; RV32I-NEXT: lbu t1, 6(a0) ; RV32I-NEXT: lbu t2, 7(a0) @@ -1557,107 +1556,105 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu t5, 10(a0) ; RV32I-NEXT: lbu t6, 11(a0) ; RV32I-NEXT: lbu s0, 12(a0) -; RV32I-NEXT: lbu s2, 13(a0) -; RV32I-NEXT: lbu s4, 14(a0) -; RV32I-NEXT: lbu s5, 15(a0) -; RV32I-NEXT: lbu s6, 16(a0) -; RV32I-NEXT: lbu s7, 17(a0) -; RV32I-NEXT: lbu s8, 18(a0) -; RV32I-NEXT: lbu s9, 19(a0) +; RV32I-NEXT: lbu s1, 13(a0) +; RV32I-NEXT: lbu s2, 14(a0) +; RV32I-NEXT: lbu s3, 15(a0) +; RV32I-NEXT: lbu s4, 16(a0) +; RV32I-NEXT: lbu s5, 17(a0) +; RV32I-NEXT: lbu s6, 18(a0) +; RV32I-NEXT: lbu s7, 19(a0) ; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: or a4, a7, a6 -; RV32I-NEXT: lbu s10, 20(a0) -; RV32I-NEXT: lbu s11, 21(a0) -; RV32I-NEXT: lbu ra, 22(a0) -; RV32I-NEXT: lbu a3, 23(a0) +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: slli t0, t0, 8 ; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: lbu s8, 20(a0) +; RV32I-NEXT: lbu s9, 21(a0) +; RV32I-NEXT: lbu s10, 22(a0) +; RV32I-NEXT: lbu s11, 23(a0) ; RV32I-NEXT: slli t4, t4, 8 ; RV32I-NEXT: slli t5, t5, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: slli s1, s1, 8 +; RV32I-NEXT: slli s2, s2, 16 +; RV32I-NEXT: slli s3, s3, 24 ; RV32I-NEXT: or a7, t4, t3 ; RV32I-NEXT: or t0, t6, t5 -; RV32I-NEXT: lbu s1, 24(a0) -; RV32I-NEXT: lbu s3, 25(a0) -; RV32I-NEXT: lbu t4, 26(a0) -; RV32I-NEXT: lbu t5, 27(a0) -; RV32I-NEXT: slli s2, s2, 8 -; RV32I-NEXT: slli s4, s4, 16 -; RV32I-NEXT: slli s5, s5, 24 -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: or t1, s2, s0 -; RV32I-NEXT: or t2, s5, s4 -; RV32I-NEXT: or t3, s7, s6 -; RV32I-NEXT: lbu t6, 28(a0) +; RV32I-NEXT: or t1, s1, 
s0 +; RV32I-NEXT: or t2, s3, s2 +; RV32I-NEXT: lbu t6, 24(a0) +; RV32I-NEXT: lbu s0, 25(a0) +; RV32I-NEXT: lbu s1, 26(a0) +; RV32I-NEXT: lbu s2, 27(a0) +; RV32I-NEXT: slli s5, s5, 8 +; RV32I-NEXT: slli s6, s6, 16 +; RV32I-NEXT: slli s7, s7, 24 +; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: or t3, s5, s4 +; RV32I-NEXT: or t4, s7, s6 +; RV32I-NEXT: or t5, s9, s8 +; RV32I-NEXT: lbu s3, 28(a0) ; RV32I-NEXT: lbu s4, 29(a0) ; RV32I-NEXT: lbu s5, 30(a0) ; RV32I-NEXT: lbu s6, 31(a0) -; RV32I-NEXT: slli s8, s8, 16 -; RV32I-NEXT: slli s9, s9, 24 -; RV32I-NEXT: slli s11, s11, 8 -; RV32I-NEXT: slli ra, ra, 16 -; RV32I-NEXT: slli a3, a3, 24 -; RV32I-NEXT: or a0, s9, s8 -; RV32I-NEXT: or s0, s11, s10 -; RV32I-NEXT: or s2, a3, ra -; RV32I-NEXT: lbu a3, 0(a1) -; RV32I-NEXT: lbu s7, 1(a1) -; RV32I-NEXT: lbu s8, 2(a1) +; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: slli s11, s11, 24 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: slli s1, s1, 16 +; RV32I-NEXT: slli s2, s2, 24 +; RV32I-NEXT: or a0, s11, s10 +; RV32I-NEXT: or t6, s0, t6 +; RV32I-NEXT: or s0, s2, s1 +; RV32I-NEXT: lbu s1, 0(a1) +; RV32I-NEXT: lbu s2, 1(a1) +; RV32I-NEXT: lbu s7, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: sw zero, 48(sp) +; RV32I-NEXT: sw zero, 52(sp) ; RV32I-NEXT: sw zero, 56(sp) ; RV32I-NEXT: sw zero, 60(sp) -; RV32I-NEXT: sw zero, 64(sp) -; RV32I-NEXT: sw zero, 68(sp) +; RV32I-NEXT: sw zero, 32(sp) +; RV32I-NEXT: sw zero, 36(sp) ; RV32I-NEXT: sw zero, 40(sp) ; RV32I-NEXT: sw zero, 44(sp) -; RV32I-NEXT: sw zero, 48(sp) -; RV32I-NEXT: sw zero, 52(sp) -; RV32I-NEXT: slli s3, s3, 8 -; RV32I-NEXT: or s1, s3, s1 -; RV32I-NEXT: addi s3, sp, 8 -; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli t5, t5, 24 ; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: or s3, s4, s3 +; RV32I-NEXT: mv s4, sp ; RV32I-NEXT: slli s5, s5, 16 ; RV32I-NEXT: slli s6, s6, 24 -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s7, s7, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or t4, t5, t4 -; RV32I-NEXT: or t5, s4, t6 -; RV32I-NEXT: or t6, s6, s5 -; RV32I-NEXT: or a3, s7, a3 -; RV32I-NEXT: or a1, a1, s8 -; RV32I-NEXT: lw s4, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: or a4, a4, s4 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: or a6, t0, a7 -; RV32I-NEXT: or a7, t2, t1 -; RV32I-NEXT: or t0, a0, t3 -; RV32I-NEXT: or t1, s2, s0 -; RV32I-NEXT: or t2, t4, s1 -; RV32I-NEXT: or t3, t6, t5 -; RV32I-NEXT: or a0, a1, a3 -; RV32I-NEXT: sw t0, 24(sp) -; RV32I-NEXT: sw t1, 28(sp) -; RV32I-NEXT: sw t2, 32(sp) -; RV32I-NEXT: sw t3, 36(sp) -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) -; RV32I-NEXT: sw a6, 16(sp) -; RV32I-NEXT: sw a7, 20(sp) +; RV32I-NEXT: or s5, s6, s5 +; RV32I-NEXT: or s1, s2, s1 +; RV32I-NEXT: or a1, a1, s7 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, a0, t5 +; RV32I-NEXT: or t1, s0, t6 +; RV32I-NEXT: or t2, s5, s3 +; RV32I-NEXT: or a0, a1, s1 +; RV32I-NEXT: sw a7, 16(sp) +; RV32I-NEXT: sw t0, 20(sp) +; RV32I-NEXT: sw t1, 24(sp) +; RV32I-NEXT: sw t2, 28(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a6, 12(sp) ; RV32I-NEXT: srli a1, a0, 3 ; RV32I-NEXT: andi a3, a0, 31 ; RV32I-NEXT: andi a4, a1, 28 ; RV32I-NEXT: xori a1, a3, 31 -; RV32I-NEXT: add a4, s3, a4 +; RV32I-NEXT: add a4, s4, a4 ; RV32I-NEXT: lw a3, 0(a4) ; RV32I-NEXT: lw a5, 4(a4) ; RV32I-NEXT: lw a6, 8(a4) @@ -1717,13 +1714,13 @@ define void 
@lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli s5, a3, 24 ; RV32I-NEXT: srli s6, a3, 16 ; RV32I-NEXT: srli s7, a3, 8 -; RV32I-NEXT: srli s8, a1, 24 -; RV32I-NEXT: srli s9, a1, 16 ; RV32I-NEXT: sb a7, 24(a2) +; RV32I-NEXT: srli a7, a1, 24 ; RV32I-NEXT: sb t2, 25(a2) +; RV32I-NEXT: srli t2, a1, 16 ; RV32I-NEXT: sb t1, 26(a2) ; RV32I-NEXT: sb t0, 27(a2) -; RV32I-NEXT: srli a7, a1, 8 +; RV32I-NEXT: srli t0, a1, 8 ; RV32I-NEXT: sb a6, 16(a2) ; RV32I-NEXT: sb t5, 17(a2) ; RV32I-NEXT: sb t4, 18(a2) @@ -1744,27 +1741,26 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb s6, 14(a2) ; RV32I-NEXT: sb s5, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb a7, 1(a2) -; RV32I-NEXT: sb s9, 2(a2) -; RV32I-NEXT: sb s8, 3(a2) +; RV32I-NEXT: sb t0, 1(a2) +; RV32I-NEXT: sb t2, 2(a2) +; RV32I-NEXT: sb a7, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) ; RV32I-NEXT: sb a5, 6(a2) ; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 128 +; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 112 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %bitOff = load i256, ptr %bitOff.ptr, align 1 @@ -2006,25 +2002,24 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: shl_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -128 -; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: addi sp, sp, -112 +; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 96(sp) # 
4-byte Folded Spill +; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill ; RV32I-NEXT: lbu a3, 0(a0) ; RV32I-NEXT: lbu a4, 1(a0) -; RV32I-NEXT: lbu a6, 2(a0) -; RV32I-NEXT: lbu a7, 3(a0) -; RV32I-NEXT: lbu a5, 4(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: lbu a7, 4(a0) ; RV32I-NEXT: lbu t0, 5(a0) ; RV32I-NEXT: lbu t1, 6(a0) ; RV32I-NEXT: lbu t2, 7(a0) @@ -2033,107 +2028,105 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu t5, 10(a0) ; RV32I-NEXT: lbu t6, 11(a0) ; RV32I-NEXT: lbu s0, 12(a0) -; RV32I-NEXT: lbu s2, 13(a0) -; RV32I-NEXT: lbu s4, 14(a0) -; RV32I-NEXT: lbu s5, 15(a0) -; RV32I-NEXT: lbu s6, 16(a0) -; RV32I-NEXT: lbu s7, 17(a0) -; RV32I-NEXT: lbu s8, 18(a0) -; RV32I-NEXT: lbu s9, 19(a0) +; RV32I-NEXT: lbu s1, 13(a0) +; RV32I-NEXT: lbu s2, 14(a0) +; RV32I-NEXT: lbu s3, 15(a0) +; RV32I-NEXT: lbu s4, 16(a0) +; RV32I-NEXT: lbu s5, 17(a0) +; RV32I-NEXT: lbu s6, 18(a0) +; RV32I-NEXT: lbu s7, 19(a0) ; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: or a4, a7, a6 -; RV32I-NEXT: lbu s10, 20(a0) -; RV32I-NEXT: lbu s11, 21(a0) -; RV32I-NEXT: lbu ra, 22(a0) -; RV32I-NEXT: lbu a3, 23(a0) +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: slli t0, t0, 8 ; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: lbu s8, 20(a0) +; RV32I-NEXT: lbu s9, 21(a0) +; RV32I-NEXT: lbu s10, 22(a0) +; RV32I-NEXT: lbu s11, 23(a0) ; RV32I-NEXT: slli t4, t4, 8 ; RV32I-NEXT: slli t5, t5, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: slli s1, s1, 8 +; RV32I-NEXT: slli s2, s2, 16 +; RV32I-NEXT: slli s3, s3, 24 ; RV32I-NEXT: or a7, t4, t3 ; RV32I-NEXT: or t0, t6, t5 -; RV32I-NEXT: lbu s1, 24(a0) -; RV32I-NEXT: lbu s3, 25(a0) -; RV32I-NEXT: lbu t4, 26(a0) -; RV32I-NEXT: lbu t5, 27(a0) -; RV32I-NEXT: slli s2, s2, 8 -; RV32I-NEXT: slli s4, s4, 16 -; RV32I-NEXT: slli s5, s5, 24 -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: or t1, s2, s0 -; RV32I-NEXT: or t2, s5, s4 -; RV32I-NEXT: or t3, s7, s6 -; RV32I-NEXT: lbu t6, 28(a0) +; RV32I-NEXT: or t1, s1, s0 +; RV32I-NEXT: or t2, s3, s2 +; RV32I-NEXT: lbu t6, 24(a0) +; RV32I-NEXT: lbu s0, 25(a0) +; RV32I-NEXT: lbu s1, 26(a0) +; RV32I-NEXT: lbu s2, 27(a0) +; RV32I-NEXT: slli s5, s5, 8 +; RV32I-NEXT: slli s6, s6, 16 +; RV32I-NEXT: slli s7, s7, 24 +; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: or t3, s5, s4 +; RV32I-NEXT: or t4, s7, s6 +; RV32I-NEXT: or t5, s9, s8 +; RV32I-NEXT: lbu s3, 28(a0) ; RV32I-NEXT: lbu s4, 29(a0) ; RV32I-NEXT: lbu s5, 30(a0) ; RV32I-NEXT: lbu s6, 31(a0) -; RV32I-NEXT: slli s8, s8, 16 -; RV32I-NEXT: slli s9, s9, 24 -; RV32I-NEXT: slli s11, s11, 8 -; RV32I-NEXT: slli ra, ra, 16 -; RV32I-NEXT: slli a3, a3, 24 -; RV32I-NEXT: or a0, s9, s8 -; RV32I-NEXT: or s0, s11, s10 -; RV32I-NEXT: or s2, a3, ra -; RV32I-NEXT: lbu a3, 0(a1) -; RV32I-NEXT: lbu s7, 1(a1) -; RV32I-NEXT: lbu s8, 2(a1) 
+; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: slli s11, s11, 24 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: slli s1, s1, 16 +; RV32I-NEXT: slli s2, s2, 24 +; RV32I-NEXT: or a0, s11, s10 +; RV32I-NEXT: or t6, s0, t6 +; RV32I-NEXT: or s0, s2, s1 +; RV32I-NEXT: lbu s1, 0(a1) +; RV32I-NEXT: lbu s2, 1(a1) +; RV32I-NEXT: lbu s7, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: sw zero, 16(sp) +; RV32I-NEXT: sw zero, 20(sp) ; RV32I-NEXT: sw zero, 24(sp) ; RV32I-NEXT: sw zero, 28(sp) -; RV32I-NEXT: sw zero, 32(sp) -; RV32I-NEXT: sw zero, 36(sp) +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: sw zero, 4(sp) ; RV32I-NEXT: sw zero, 8(sp) ; RV32I-NEXT: sw zero, 12(sp) -; RV32I-NEXT: sw zero, 16(sp) -; RV32I-NEXT: sw zero, 20(sp) -; RV32I-NEXT: slli s3, s3, 8 -; RV32I-NEXT: or s1, s3, s1 -; RV32I-NEXT: addi s3, sp, 40 -; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli t5, t5, 24 ; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: or s3, s4, s3 +; RV32I-NEXT: addi s4, sp, 32 ; RV32I-NEXT: slli s5, s5, 16 ; RV32I-NEXT: slli s6, s6, 24 -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s7, s7, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or t4, t5, t4 -; RV32I-NEXT: or t5, s4, t6 -; RV32I-NEXT: or t6, s6, s5 -; RV32I-NEXT: or a3, s7, a3 -; RV32I-NEXT: or a1, a1, s8 -; RV32I-NEXT: lw s4, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: or a4, a4, s4 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: or a6, t0, a7 -; RV32I-NEXT: or a7, t2, t1 -; RV32I-NEXT: or t0, a0, t3 -; RV32I-NEXT: or t1, s2, s0 -; RV32I-NEXT: or t2, t4, s1 -; RV32I-NEXT: or t3, t6, t5 -; RV32I-NEXT: or a0, a1, a3 -; RV32I-NEXT: sw t0, 56(sp) -; RV32I-NEXT: sw t1, 60(sp) -; RV32I-NEXT: sw t2, 64(sp) -; RV32I-NEXT: sw t3, 68(sp) -; RV32I-NEXT: sw a4, 40(sp) -; RV32I-NEXT: sw a5, 44(sp) -; RV32I-NEXT: sw a6, 48(sp) -; RV32I-NEXT: sw a7, 52(sp) +; RV32I-NEXT: or s5, s6, s5 +; RV32I-NEXT: or s1, s2, s1 +; RV32I-NEXT: or a1, a1, s7 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, a0, t5 +; RV32I-NEXT: or t1, s0, t6 +; RV32I-NEXT: or t2, s5, s3 +; RV32I-NEXT: or a0, a1, s1 +; RV32I-NEXT: sw a7, 48(sp) +; RV32I-NEXT: sw t0, 52(sp) +; RV32I-NEXT: sw t1, 56(sp) +; RV32I-NEXT: sw t2, 60(sp) +; RV32I-NEXT: sw a3, 32(sp) +; RV32I-NEXT: sw a4, 36(sp) +; RV32I-NEXT: sw a5, 40(sp) +; RV32I-NEXT: sw a6, 44(sp) ; RV32I-NEXT: srli a1, a0, 3 ; RV32I-NEXT: andi a3, a0, 31 ; RV32I-NEXT: andi a4, a1, 28 ; RV32I-NEXT: xori a1, a3, 31 -; RV32I-NEXT: sub a3, s3, a4 +; RV32I-NEXT: sub a3, s4, a4 ; RV32I-NEXT: lw a4, 0(a3) ; RV32I-NEXT: lw a5, 4(a3) ; RV32I-NEXT: lw a6, 8(a3) @@ -2193,13 +2186,13 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli s5, a3, 24 ; RV32I-NEXT: srli s6, a3, 16 ; RV32I-NEXT: srli s7, a3, 8 -; RV32I-NEXT: srli s8, a1, 24 -; RV32I-NEXT: srli s9, a1, 16 ; RV32I-NEXT: sb a7, 24(a2) +; RV32I-NEXT: srli a7, a1, 24 ; RV32I-NEXT: sb t2, 25(a2) +; RV32I-NEXT: srli t2, a1, 16 ; RV32I-NEXT: sb t1, 26(a2) ; RV32I-NEXT: sb t0, 27(a2) -; RV32I-NEXT: srli a7, a1, 8 +; RV32I-NEXT: srli t0, a1, 8 ; RV32I-NEXT: sb a6, 28(a2) ; RV32I-NEXT: sb t5, 29(a2) ; RV32I-NEXT: sb t4, 30(a2) @@ -2220,27 +2213,26 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb s6, 10(a2) ; RV32I-NEXT: sb s5, 11(a2) ; RV32I-NEXT: sb a1, 12(a2) -; RV32I-NEXT: sb a7, 13(a2) -; RV32I-NEXT: sb s9, 14(a2) -; RV32I-NEXT: sb s8, 15(a2) +; 
RV32I-NEXT: sb t0, 13(a2) +; RV32I-NEXT: sb t2, 14(a2) +; RV32I-NEXT: sb a7, 15(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) ; RV32I-NEXT: sb a5, 6(a2) ; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 128 +; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 112 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %bitOff = load i256, ptr %bitOff.ptr, align 1 @@ -2483,25 +2475,24 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: ashr_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -128 -; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: addi sp, sp, -112 +; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill ; RV32I-NEXT: lbu a3, 0(a0) ; RV32I-NEXT: lbu a4, 1(a0) -; RV32I-NEXT: lbu a6, 2(a0) -; RV32I-NEXT: lbu a7, 3(a0) -; RV32I-NEXT: lbu a5, 4(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: lbu a7, 4(a0) ; RV32I-NEXT: lbu t0, 5(a0) ; RV32I-NEXT: lbu t1, 6(a0) ; RV32I-NEXT: lbu t2, 7(a0) @@ -2518,100 +2509,98 @@ define void @ashr_32bytes(ptr 
%src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu s6, 18(a0) ; RV32I-NEXT: lbu s7, 19(a0) ; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 ; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: or a4, a7, a6 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 ; RV32I-NEXT: lbu s8, 20(a0) ; RV32I-NEXT: lbu s9, 21(a0) ; RV32I-NEXT: lbu s10, 22(a0) ; RV32I-NEXT: lbu s11, 23(a0) -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli t2, t2, 24 ; RV32I-NEXT: slli t4, t4, 8 ; RV32I-NEXT: slli t5, t5, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: or a6, t2, t1 -; RV32I-NEXT: or a7, t4, t3 -; RV32I-NEXT: or t0, t6, t5 -; RV32I-NEXT: lbu ra, 24(a0) -; RV32I-NEXT: lbu a3, 25(a0) -; RV32I-NEXT: lbu t4, 26(a0) -; RV32I-NEXT: lbu t5, 27(a0) ; RV32I-NEXT: slli s1, s1, 8 ; RV32I-NEXT: slli s2, s2, 16 ; RV32I-NEXT: slli s3, s3, 24 -; RV32I-NEXT: slli s5, s5, 8 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, t6, t5 ; RV32I-NEXT: or t1, s1, s0 ; RV32I-NEXT: or t2, s3, s2 -; RV32I-NEXT: or t3, s5, s4 -; RV32I-NEXT: lbu t6, 28(a0) -; RV32I-NEXT: lbu s0, 29(a0) -; RV32I-NEXT: lbu s1, 30(a0) -; RV32I-NEXT: lbu a0, 31(a0) +; RV32I-NEXT: lbu t6, 24(a0) +; RV32I-NEXT: lbu s0, 25(a0) +; RV32I-NEXT: lbu s1, 26(a0) +; RV32I-NEXT: lbu s2, 27(a0) +; RV32I-NEXT: slli s5, s5, 8 ; RV32I-NEXT: slli s6, s6, 16 ; RV32I-NEXT: slli s7, s7, 24 ; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: or t3, s5, s4 +; RV32I-NEXT: or t4, s7, s6 +; RV32I-NEXT: or t5, s9, s8 +; RV32I-NEXT: lbu s3, 28(a0) +; RV32I-NEXT: lbu s4, 29(a0) +; RV32I-NEXT: lbu s5, 30(a0) +; RV32I-NEXT: lbu a0, 31(a0) ; RV32I-NEXT: slli s10, s10, 16 ; RV32I-NEXT: slli s11, s11, 24 -; RV32I-NEXT: or s2, s7, s6 -; RV32I-NEXT: or s3, s9, s8 -; RV32I-NEXT: or s4, s11, s10 -; RV32I-NEXT: lbu s5, 0(a1) -; RV32I-NEXT: lbu s6, 1(a1) -; RV32I-NEXT: lbu s7, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, ra -; RV32I-NEXT: addi s8, sp, 8 -; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli t5, t5, 24 ; RV32I-NEXT: slli s0, s0, 8 ; RV32I-NEXT: slli s1, s1, 16 +; RV32I-NEXT: slli s2, s2, 24 +; RV32I-NEXT: or s6, s11, s10 +; RV32I-NEXT: or t6, s0, t6 +; RV32I-NEXT: or s0, s2, s1 +; RV32I-NEXT: lbu s1, 0(a1) +; RV32I-NEXT: lbu s2, 1(a1) +; RV32I-NEXT: lbu s7, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: or s3, s4, s3 +; RV32I-NEXT: mv s4, sp +; RV32I-NEXT: slli s5, s5, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: slli s6, s6, 8 +; RV32I-NEXT: slli s2, s2, 8 ; RV32I-NEXT: slli s7, s7, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or t4, t5, t4 -; RV32I-NEXT: or t5, s0, t6 -; RV32I-NEXT: or s1, a0, s1 -; RV32I-NEXT: or t6, s6, s5 +; RV32I-NEXT: or s5, a0, s5 +; RV32I-NEXT: or s1, s2, s1 ; RV32I-NEXT: or a1, a1, s7 -; RV32I-NEXT: srai s0, a0, 31 -; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: or a4, a4, a0 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: or a6, t0, a7 -; RV32I-NEXT: or a7, t2, t1 -; RV32I-NEXT: or t0, s2, t3 -; RV32I-NEXT: or t1, s4, s3 -; RV32I-NEXT: or a3, t4, a3 -; RV32I-NEXT: or t2, s1, t5 -; RV32I-NEXT: or a0, a1, t6 -; RV32I-NEXT: sw s0, 56(sp) -; RV32I-NEXT: sw s0, 60(sp) -; RV32I-NEXT: sw s0, 64(sp) -; RV32I-NEXT: sw s0, 68(sp) -; 
RV32I-NEXT: sw s0, 40(sp) -; RV32I-NEXT: sw s0, 44(sp) -; RV32I-NEXT: sw s0, 48(sp) -; RV32I-NEXT: sw s0, 52(sp) -; RV32I-NEXT: sw t0, 24(sp) -; RV32I-NEXT: sw t1, 28(sp) -; RV32I-NEXT: sw a3, 32(sp) -; RV32I-NEXT: sw t2, 36(sp) -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) -; RV32I-NEXT: sw a6, 16(sp) -; RV32I-NEXT: sw a7, 20(sp) +; RV32I-NEXT: srai s2, a0, 31 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, s6, t5 +; RV32I-NEXT: or t1, s0, t6 +; RV32I-NEXT: or t2, s5, s3 +; RV32I-NEXT: or a0, a1, s1 +; RV32I-NEXT: sw s2, 48(sp) +; RV32I-NEXT: sw s2, 52(sp) +; RV32I-NEXT: sw s2, 56(sp) +; RV32I-NEXT: sw s2, 60(sp) +; RV32I-NEXT: sw s2, 32(sp) +; RV32I-NEXT: sw s2, 36(sp) +; RV32I-NEXT: sw s2, 40(sp) +; RV32I-NEXT: sw s2, 44(sp) +; RV32I-NEXT: sw a7, 16(sp) +; RV32I-NEXT: sw t0, 20(sp) +; RV32I-NEXT: sw t1, 24(sp) +; RV32I-NEXT: sw t2, 28(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a6, 12(sp) ; RV32I-NEXT: srli a1, a0, 3 ; RV32I-NEXT: andi a3, a0, 31 ; RV32I-NEXT: andi a4, a1, 28 ; RV32I-NEXT: xori a1, a3, 31 -; RV32I-NEXT: add a4, s8, a4 +; RV32I-NEXT: add a4, s4, a4 ; RV32I-NEXT: lw a3, 0(a4) ; RV32I-NEXT: lw a5, 4(a4) ; RV32I-NEXT: lw a6, 8(a4) @@ -2671,13 +2660,13 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli s5, a3, 24 ; RV32I-NEXT: srli s6, a3, 16 ; RV32I-NEXT: srli s7, a3, 8 -; RV32I-NEXT: srli s8, a1, 24 -; RV32I-NEXT: srli s9, a1, 16 ; RV32I-NEXT: sb a7, 24(a2) +; RV32I-NEXT: srli a7, a1, 24 ; RV32I-NEXT: sb t2, 25(a2) +; RV32I-NEXT: srli t2, a1, 16 ; RV32I-NEXT: sb t1, 26(a2) ; RV32I-NEXT: sb t0, 27(a2) -; RV32I-NEXT: srli a7, a1, 8 +; RV32I-NEXT: srli t0, a1, 8 ; RV32I-NEXT: sb a6, 16(a2) ; RV32I-NEXT: sb t5, 17(a2) ; RV32I-NEXT: sb t4, 18(a2) @@ -2698,27 +2687,26 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb s6, 14(a2) ; RV32I-NEXT: sb s5, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb a7, 1(a2) -; RV32I-NEXT: sb s9, 2(a2) -; RV32I-NEXT: sb s8, 3(a2) +; RV32I-NEXT: sb t0, 1(a2) +; RV32I-NEXT: sb t2, 2(a2) +; RV32I-NEXT: sb a7, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) ; RV32I-NEXT: sb a5, 6(a2) ; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 128 +; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 76(sp) 
# 4-byte Folded Reload +; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 112 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %bitOff = load i256, ptr %bitOff.ptr, align 1 From e96f7f7898790da1fe9cdc5cd3be7e3ae8eb8705 Mon Sep 17 00:00:00 2001 From: Wang Pengcheng Date: Tue, 3 Dec 2024 21:44:29 +0800 Subject: [PATCH 2/3] Test commit: add a parameter to keep reserved --- .../include/llvm/CodeGen/TargetRegisterInfo.h | 4 +- llvm/lib/CodeGen/RegisterClassInfo.cpp | 3 +- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 3 +- llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 4 +- llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp | 8 +- llvm/lib/Target/RISCV/RISCVRegisterInfo.h | 4 +- llvm/test/CodeGen/RISCV/pr69586.ll | 844 +++-- .../RISCV/rvv/fixed-vectors-masked-scatter.ll | 78 +- .../RISCV/rvv/fixed-vectors-setcc-fp-vp.ll | 2104 +++++------ .../RISCV/rvv/intrinsic-vector-match.ll | 472 +-- ...lar-shift-by-byte-multiple-legalization.ll | 3238 +++++++++-------- .../RISCV/wide-scalar-shift-legalization.ll | 646 ++-- llvm/unittests/CodeGen/MFCommon.inc | 4 +- llvm/utils/TableGen/RegisterInfoEmitter.cpp | 5 +- 14 files changed, 3813 insertions(+), 3604 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index 292fa3c94969b..eaed26e33c4eb 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -914,8 +914,10 @@ class TargetRegisterInfo : public MCRegisterInfo { /// Get the register unit pressure limit for this dimension. /// This limit must be adjusted dynamically for reserved registers. + /// If RemoveReserved is true, the target should remove reserved registers. virtual unsigned getRegPressureSetLimit(const MachineFunction &MF, - unsigned Idx) const = 0; + unsigned Idx, + bool RemoveReserved = true) const = 0; /// Get the dimensions of register pressure impacted by this register class. /// Returns a -1 terminated array of pressure set IDs. diff --git a/llvm/lib/CodeGen/RegisterClassInfo.cpp b/llvm/lib/CodeGen/RegisterClassInfo.cpp index 9312bc03bc522..0a33915ed1e40 100644 --- a/llvm/lib/CodeGen/RegisterClassInfo.cpp +++ b/llvm/lib/CodeGen/RegisterClassInfo.cpp @@ -222,7 +222,8 @@ unsigned RegisterClassInfo::computePSetLimit(unsigned Idx) const { assert(RC && "Failed to find register class"); compute(RC); unsigned NAllocatableRegs = getNumAllocatableRegs(RC); - unsigned RegPressureSetLimit = TRI->getRegPressureSetLimit(*MF, Idx); + unsigned RegPressureSetLimit = + TRI->getRegPressureSetLimit(*MF, Idx, /*RemoveReserved=*/false); // If all the regs are reserved, return raw RegPressureSetLimit. // One example is VRSAVERC in PowerPC. 
// Avoid returning zero, getRegPressureSetLimit(Idx) assumes computePSetLimit diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 049f4af4dd2f9..9883454ed7829 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3640,7 +3640,8 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, } unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, - unsigned Idx) const { + unsigned Idx, + bool RemoveReserved) const { if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 || Idx == AMDGPU::RegisterPressureSets::AGPR_32) return getRegPressureLimit(&AMDGPU::VGPR_32RegClass, diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 8e481e3ac2304..b55f5f2c418b0 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -331,8 +331,8 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; - unsigned getRegPressureSetLimit(const MachineFunction &MF, - unsigned Idx) const override; + unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx, + bool RemoveReserved = true) const override; const int *getRegUnitPressureSets(unsigned RegUnit) const override; diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index a73bd1621a739..d5a769b6c78c7 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -936,8 +936,12 @@ bool RISCVRegisterInfo::getRegAllocationHints( } unsigned RISCVRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, - unsigned Idx) const { + unsigned Idx, + bool RemoveReserved) const { if (Idx == RISCV::RegisterPressureSets::GPRAll) { + if (!RemoveReserved) + return 32; + unsigned Reserved = 0; BitVector ReservedRegs = getReservedRegs(MF); for (MCPhysReg Reg = RISCV::X0_H; Reg <= RISCV::X31_H; Reg++) @@ -946,5 +950,5 @@ unsigned RISCVRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, return 32 - Reserved; } - return RISCVGenRegisterInfo::getRegPressureSetLimit(MF, Idx); + return RISCVGenRegisterInfo::getRegPressureSetLimit(MF, Idx, RemoveReserved); } diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h index ca4934de2f52d..58f97394ec559 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h @@ -144,8 +144,8 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo { static bool isRVVRegClass(const TargetRegisterClass *RC) { return RISCVRI::isVRegClass(RC->TSFlags); } - unsigned getRegPressureSetLimit(const MachineFunction &MF, - unsigned Idx) const override; + unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx, + bool RemoveReserved = true) const override; }; } // namespace llvm diff --git a/llvm/test/CodeGen/RISCV/pr69586.ll b/llvm/test/CodeGen/RISCV/pr69586.ll index 21e64ada7061a..8e6a7add781c9 100644 --- a/llvm/test/CodeGen/RISCV/pr69586.ll +++ b/llvm/test/CodeGen/RISCV/pr69586.ll @@ -39,384 +39,388 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: slli a2, a2, 1 ; NOREMAT-NEXT: sub sp, sp, a2 ; NOREMAT-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xf0, 0x05, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 752 + 2 * vlenb -; NOREMAT-NEXT: mv a7, a0 -; NOREMAT-NEXT: li a0, 32 -; 
NOREMAT-NEXT: addi a5, a7, 512 -; NOREMAT-NEXT: addi a4, a7, 1024 -; NOREMAT-NEXT: addi a6, a7, 1536 -; NOREMAT-NEXT: li t1, 1 +; NOREMAT-NEXT: li a7, 32 +; NOREMAT-NEXT: addi a6, a0, 512 +; NOREMAT-NEXT: addi a4, a0, 1024 +; NOREMAT-NEXT: addi a5, a0, 1536 +; NOREMAT-NEXT: li t0, 1 ; NOREMAT-NEXT: li a3, 5 -; NOREMAT-NEXT: li t0, 3 +; NOREMAT-NEXT: li t1, 3 ; NOREMAT-NEXT: li a2, 7 ; NOREMAT-NEXT: lui t2, 1 -; NOREMAT-NEXT: li s5, 9 -; NOREMAT-NEXT: li s8, 11 -; NOREMAT-NEXT: lui s1, 2 -; NOREMAT-NEXT: lui t5, 3 -; NOREMAT-NEXT: lui s11, 4 -; NOREMAT-NEXT: lui ra, 5 -; NOREMAT-NEXT: lui t3, 6 -; NOREMAT-NEXT: lui s0, 7 -; NOREMAT-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; NOREMAT-NEXT: slli t4, t1, 11 -; NOREMAT-NEXT: slli t6, a3, 9 -; NOREMAT-NEXT: slli s2, t0, 10 -; NOREMAT-NEXT: slli s4, a2, 9 -; NOREMAT-NEXT: add a0, a7, t2 -; NOREMAT-NEXT: vle32.v v8, (a5) -; NOREMAT-NEXT: slli s5, s5, 9 +; NOREMAT-NEXT: li s4, 9 +; NOREMAT-NEXT: li s6, 11 +; NOREMAT-NEXT: li s9, 13 +; NOREMAT-NEXT: lui s7, 2 +; NOREMAT-NEXT: lui s1, 3 +; NOREMAT-NEXT: lui ra, 4 +; NOREMAT-NEXT: lui t3, 5 +; NOREMAT-NEXT: lui s0, 6 +; NOREMAT-NEXT: lui s3, 7 +; NOREMAT-NEXT: vsetvli zero, a7, e32, m2, ta, ma +; NOREMAT-NEXT: slli t0, t0, 11 +; NOREMAT-NEXT: sd t0, 504(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: slli t5, a3, 9 +; NOREMAT-NEXT: slli t6, t1, 10 +; NOREMAT-NEXT: slli s2, a2, 9 +; NOREMAT-NEXT: add a7, a0, t2 +; NOREMAT-NEXT: lui s11, 1 +; NOREMAT-NEXT: slli s4, s4, 9 +; NOREMAT-NEXT: slli s5, a3, 10 +; NOREMAT-NEXT: vle32.v v8, (a6) +; NOREMAT-NEXT: slli s6, s6, 9 +; NOREMAT-NEXT: slli s8, t1, 11 ; NOREMAT-NEXT: vle32.v v10, (a4) ; NOREMAT-NEXT: vle32.v v2, (a4) -; NOREMAT-NEXT: slli s6, a3, 10 -; NOREMAT-NEXT: vle32.v v0, (a6) -; NOREMAT-NEXT: vle32.v v12, (a6) -; NOREMAT-NEXT: slli s8, s8, 9 -; NOREMAT-NEXT: slli s9, t0, 11 -; NOREMAT-NEXT: vle32.v v4, (a0) -; NOREMAT-NEXT: vle32.v v20, (a0) -; NOREMAT-NEXT: add a4, a7, s1 +; NOREMAT-NEXT: slli s9, s9, 9 +; NOREMAT-NEXT: vle32.v v0, (a5) +; NOREMAT-NEXT: vle32.v v12, (a5) +; NOREMAT-NEXT: slli s10, a2, 10 +; NOREMAT-NEXT: vle32.v v4, (a7) +; NOREMAT-NEXT: vle32.v v20, (a7) +; NOREMAT-NEXT: add a4, a0, s7 ; NOREMAT-NEXT: vle32.v v6, (a4) ; NOREMAT-NEXT: vle32.v v30, (a4) -; NOREMAT-NEXT: add a4, a7, t5 +; NOREMAT-NEXT: add a4, a0, s1 ; NOREMAT-NEXT: vle32.v v28, (a4) ; NOREMAT-NEXT: vle32.v v26, (a4) -; NOREMAT-NEXT: add a4, a7, s11 +; NOREMAT-NEXT: add a4, a0, ra ; NOREMAT-NEXT: vle32.v v24, (a4) ; NOREMAT-NEXT: vle32.v v22, (a4) -; NOREMAT-NEXT: add a4, a7, ra -; NOREMAT-NEXT: vle32.v v14, (a7) +; NOREMAT-NEXT: add a4, a0, t3 +; NOREMAT-NEXT: vle32.v v14, (a0) ; NOREMAT-NEXT: vle32.v v18, (a4) ; NOREMAT-NEXT: vle32.v v16, (a4) -; NOREMAT-NEXT: add a4, a7, t3 +; NOREMAT-NEXT: add a4, a0, s0 ; NOREMAT-NEXT: sf.vc.vv 3, 0, v14, v8 ; NOREMAT-NEXT: vle32.v v14, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v10 ; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: addi a0, sp, 640 -; NOREMAT-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill -; NOREMAT-NEXT: add a4, a7, t4 +; NOREMAT-NEXT: addi a4, sp, 640 +; NOREMAT-NEXT: vs2r.v v8, (a4) # Unknown-size Folded Spill +; NOREMAT-NEXT: add a4, a0, t0 ; NOREMAT-NEXT: vle32.v v10, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v0 ; NOREMAT-NEXT: vle32.v v2, (a4) -; NOREMAT-NEXT: add a4, a7, t6 +; NOREMAT-NEXT: add a4, a0, t5 ; NOREMAT-NEXT: vle32.v v0, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v10 ; NOREMAT-NEXT: vle32.v v10, (a4) -; NOREMAT-NEXT: add a4, a7, s2 +; NOREMAT-NEXT: add a4, a0, t6 ; NOREMAT-NEXT: vle32.v v12, (a4) ; 
NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v0 ; NOREMAT-NEXT: vle32.v v2, (a4) -; NOREMAT-NEXT: add a4, a7, s4 +; NOREMAT-NEXT: add a4, a0, s2 ; NOREMAT-NEXT: vle32.v v8, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 ; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: add a4, a7, s0 +; NOREMAT-NEXT: add a4, a0, s3 ; NOREMAT-NEXT: vle32.v v0, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v8 ; NOREMAT-NEXT: vle32.v v10, (a4) -; NOREMAT-NEXT: add a4, a7, s5 +; NOREMAT-NEXT: add a4, a0, s4 ; NOREMAT-NEXT: vle32.v v8, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 ; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: add a4, a7, s6 +; NOREMAT-NEXT: add a4, a0, s5 ; NOREMAT-NEXT: vle32.v v4, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v20, v8 ; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: add a4, a7, s8 +; NOREMAT-NEXT: add a4, a0, s6 ; NOREMAT-NEXT: vle32.v v20, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 ; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: add a4, a7, s9 +; NOREMAT-NEXT: add a4, a0, s8 ; NOREMAT-NEXT: vle32.v v4, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v20 ; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: li t5, 13 -; NOREMAT-NEXT: slli a4, t5, 9 -; NOREMAT-NEXT: sd a4, 624(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a4, a7, a4 +; NOREMAT-NEXT: add a4, a0, s9 ; NOREMAT-NEXT: vle32.v v20, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 ; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: slli a4, a2, 10 -; NOREMAT-NEXT: sd a4, 616(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a4, a7, a4 +; NOREMAT-NEXT: add a4, a0, s10 ; NOREMAT-NEXT: vle32.v v4, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v20 ; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: li a6, 15 -; NOREMAT-NEXT: slli a4, a6, 9 -; NOREMAT-NEXT: sd a4, 608(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a4, a7, a4 +; NOREMAT-NEXT: li t2, 15 +; NOREMAT-NEXT: slli a4, t2, 9 +; NOREMAT-NEXT: sd a4, 624(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a4, a0, a4 ; NOREMAT-NEXT: vle32.v v2, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 -; NOREMAT-NEXT: lui t1, 8 -; NOREMAT-NEXT: add a5, a7, t1 +; NOREMAT-NEXT: lui t4, 8 +; NOREMAT-NEXT: add a5, a0, t4 ; NOREMAT-NEXT: vle32.v v20, (a5) ; NOREMAT-NEXT: vle32.v v12, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v2 ; NOREMAT-NEXT: li a4, 17 ; NOREMAT-NEXT: slli a4, a4, 9 -; NOREMAT-NEXT: li t2, 17 -; NOREMAT-NEXT: sd a4, 600(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a4, a7, a4 +; NOREMAT-NEXT: li s1, 17 +; NOREMAT-NEXT: sd a4, 616(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a4, a0, a4 ; NOREMAT-NEXT: vle32.v v8, (a4) ; NOREMAT-NEXT: vle32.v v4, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v6 ; NOREMAT-NEXT: li a5, 9 ; NOREMAT-NEXT: slli a4, a5, 10 -; NOREMAT-NEXT: sd a4, 592(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a4, a7, a4 +; NOREMAT-NEXT: sd a4, 608(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a4, a0, a4 ; NOREMAT-NEXT: vle32.v v12, (a4) ; NOREMAT-NEXT: vle32.v v6, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8 ; NOREMAT-NEXT: li a4, 19 ; NOREMAT-NEXT: slli a4, a4, 9 -; NOREMAT-NEXT: li s1, 19 -; NOREMAT-NEXT: sd a4, 584(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a4, a7, a4 +; NOREMAT-NEXT: li t1, 19 +; NOREMAT-NEXT: sd a4, 600(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a4, a0, a4 ; NOREMAT-NEXT: vle32.v v8, (a4) ; NOREMAT-NEXT: vle32.v v30, (a4) ; NOREMAT-NEXT: slli a3, a3, 11 -; NOREMAT-NEXT: sd a3, 576(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd a3, 592(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v12 -; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v 
v12, (a3) ; NOREMAT-NEXT: vle32.v v4, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8 ; NOREMAT-NEXT: li s7, 21 ; NOREMAT-NEXT: slli a3, s7, 9 -; NOREMAT-NEXT: sd a3, 568(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: sd a3, 584(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v8, (a3) ; NOREMAT-NEXT: vle32.v v6, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12 -; NOREMAT-NEXT: li a4, 11 -; NOREMAT-NEXT: slli a3, a4, 10 -; NOREMAT-NEXT: sd a3, 560(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: li a6, 11 +; NOREMAT-NEXT: slli a3, a6, 10 +; NOREMAT-NEXT: sd a3, 576(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v12, (a3) ; NOREMAT-NEXT: vle32.v v30, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v8 ; NOREMAT-NEXT: li s3, 23 -; NOREMAT-NEXT: slli s10, s3, 9 -; NOREMAT-NEXT: add a3, a7, s10 +; NOREMAT-NEXT: slli a3, s3, 9 +; NOREMAT-NEXT: sd a3, 568(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v8, (a3) ; NOREMAT-NEXT: vle32.v v4, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v12 ; NOREMAT-NEXT: li s0, 25 ; NOREMAT-NEXT: slli a3, s0, 9 -; NOREMAT-NEXT: sd a3, 552(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: sd a3, 560(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v12, (a3) ; NOREMAT-NEXT: vle32.v v6, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8 -; NOREMAT-NEXT: slli a3, t5, 10 -; NOREMAT-NEXT: sd a3, 544(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: li a7, 13 +; NOREMAT-NEXT: slli a3, a7, 10 +; NOREMAT-NEXT: sd a3, 552(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v8, (a3) ; NOREMAT-NEXT: vle32.v v30, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v28 ; NOREMAT-NEXT: li t3, 27 ; NOREMAT-NEXT: slli a3, t3, 9 -; NOREMAT-NEXT: sd a3, 536(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: sd a3, 544(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v28, (a3) ; NOREMAT-NEXT: vle32.v v4, (a3) ; NOREMAT-NEXT: slli a2, a2, 11 -; NOREMAT-NEXT: sd a2, 528(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd a2, 536(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v12 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8 ; NOREMAT-NEXT: li t0, 29 ; NOREMAT-NEXT: slli a2, t0, 9 -; NOREMAT-NEXT: sd a2, 520(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: sd a2, 528(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v28 -; NOREMAT-NEXT: slli a2, a6, 10 -; NOREMAT-NEXT: sd a2, 512(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: slli a2, t2, 10 +; NOREMAT-NEXT: sd a2, 520(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: li t2, 15 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v12 ; NOREMAT-NEXT: li a3, 31 -; NOREMAT-NEXT: slli a0, a3, 9 -; NOREMAT-NEXT: sd a0, 504(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a0, a7, a0 -; NOREMAT-NEXT: vle32.v v12, (a0) -; NOREMAT-NEXT: vle32.v v4, (a0) +; NOREMAT-NEXT: slli a2, a3, 9 +; NOREMAT-NEXT: sd a2, 512(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: vle32.v v12, (a2) 
+; NOREMAT-NEXT: vle32.v v4, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v8 -; NOREMAT-NEXT: addiw a2, s11, 512 +; NOREMAT-NEXT: addiw a2, ra, 512 ; NOREMAT-NEXT: sd a2, 496(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v28 -; NOREMAT-NEXT: slli a2, t2, 10 +; NOREMAT-NEXT: slli a2, s1, 10 ; NOREMAT-NEXT: sd a2, 488(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12 -; NOREMAT-NEXT: addiw a2, s11, 1536 +; NOREMAT-NEXT: addiw a2, ra, 1536 ; NOREMAT-NEXT: sd a2, 480(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) ; NOREMAT-NEXT: slli a2, a5, 11 ; NOREMAT-NEXT: sd a2, 472(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v24 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v24, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v8 -; NOREMAT-NEXT: addiw a2, ra, -1536 +; NOREMAT-NEXT: lui a4, 5 +; NOREMAT-NEXT: addiw a2, a4, -1536 ; NOREMAT-NEXT: sd a2, 464(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v28 -; NOREMAT-NEXT: slli a2, s1, 10 +; NOREMAT-NEXT: slli a2, t1, 10 ; NOREMAT-NEXT: sd a2, 456(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: li t1, 19 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v12 -; NOREMAT-NEXT: addiw a2, ra, -512 +; NOREMAT-NEXT: addiw a2, a4, -512 ; NOREMAT-NEXT: sd a2, 448(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v24 -; NOREMAT-NEXT: addiw a2, ra, 512 +; NOREMAT-NEXT: addiw a2, a4, 512 ; NOREMAT-NEXT: sd a2, 440(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v24, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) ; NOREMAT-NEXT: slli a2, s7, 10 ; NOREMAT-NEXT: sd a2, 432(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v8 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v26 -; NOREMAT-NEXT: addiw a2, ra, 1536 +; NOREMAT-NEXT: addiw a2, a4, 1536 ; NOREMAT-NEXT: sd a2, 424(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: vle32.v v26, (a2) -; NOREMAT-NEXT: slli a2, a4, 11 +; NOREMAT-NEXT: slli a2, a6, 11 ; NOREMAT-NEXT: sd a2, 416(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v12 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v18 -; NOREMAT-NEXT: lui a4, 6 -; NOREMAT-NEXT: addiw a2, a4, -1536 +; NOREMAT-NEXT: lui a5, 6 +; NOREMAT-NEXT: addiw a2, a5, -1536 ; NOREMAT-NEXT: sd a2, 408(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v 
v18, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: slli a2, s3, 10 ; NOREMAT-NEXT: sd a2, 400(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v16, v24 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v16, (a2) ; NOREMAT-NEXT: vle32.v v24, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8 -; NOREMAT-NEXT: addiw a2, a4, -512 +; NOREMAT-NEXT: addiw a2, a5, -512 ; NOREMAT-NEXT: sd a2, 392(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v22 -; NOREMAT-NEXT: addiw a2, a4, 512 +; NOREMAT-NEXT: addiw a2, a5, 512 ; NOREMAT-NEXT: sd a2, 384(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) ; NOREMAT-NEXT: slli a2, s0, 10 ; NOREMAT-NEXT: sd a2, 376(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v12 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: vle32.v v2, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v18 -; NOREMAT-NEXT: addiw a2, a4, 1536 +; NOREMAT-NEXT: addiw a2, a5, 1536 ; NOREMAT-NEXT: sd a2, 368(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v18, (a2) ; NOREMAT-NEXT: vle32.v v28, (a2) -; NOREMAT-NEXT: slli a2, t5, 11 +; NOREMAT-NEXT: slli a2, a7, 11 ; NOREMAT-NEXT: sd a2, 360(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v16 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v16, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v8 -; NOREMAT-NEXT: lui a5, 7 -; NOREMAT-NEXT: addiw a2, a5, -1536 +; NOREMAT-NEXT: lui a7, 7 +; NOREMAT-NEXT: addiw a2, a7, -1536 ; NOREMAT-NEXT: sd a2, 352(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v24, (a2) ; NOREMAT-NEXT: slli a2, t3, 10 ; NOREMAT-NEXT: sd a2, 344(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v14 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v14, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) -; NOREMAT-NEXT: addi a0, sp, 640 -; NOREMAT-NEXT: vl2r.v v12, (a0) # Unknown-size Folded Reload +; NOREMAT-NEXT: addi a2, sp, 640 +; NOREMAT-NEXT: vl2r.v v12, (a2) # Unknown-size Folded Reload ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v22 -; NOREMAT-NEXT: addiw a2, a5, -512 +; NOREMAT-NEXT: addiw a2, a7, -512 ; NOREMAT-NEXT: sd a2, 336(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v26 -; NOREMAT-NEXT: addiw a2, a5, 512 +; NOREMAT-NEXT: addiw a2, a7, 512 ; NOREMAT-NEXT: sd a2, 328(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) ; NOREMAT-NEXT: slli a2, t0, 10 ; NOREMAT-NEXT: sd a2, 320(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v18 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v18, (a2) ; NOREMAT-NEXT: vle32.v v2, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v16 -; NOREMAT-NEXT: addiw a2, a5, 1536 +; NOREMAT-NEXT: addiw a2, a7, 1536 ; NOREMAT-NEXT: sd a2, 312(sp) # 8-byte Folded Spill -; 
NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v16, (a2) ; NOREMAT-NEXT: vle32.v v28, (a2) -; NOREMAT-NEXT: slli a2, a6, 11 +; NOREMAT-NEXT: slli a2, t2, 11 ; NOREMAT-NEXT: sd a2, 304(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v14 -; NOREMAT-NEXT: addiw a2, t1, -1536 +; NOREMAT-NEXT: addiw a2, t4, -1536 ; NOREMAT-NEXT: sd a2, 296(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v14, (a2) ; NOREMAT-NEXT: vle32.v v24, (a2) ; NOREMAT-NEXT: slli a2, a3, 10 ; NOREMAT-NEXT: sd a2, 288(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v22 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) -; NOREMAT-NEXT: addiw a0, t1, -512 -; NOREMAT-NEXT: sd a0, 280(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a0, a7, a0 +; NOREMAT-NEXT: addiw a2, t4, -512 +; NOREMAT-NEXT: sd a2, 280(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a0, a0, a2 ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v0 ; NOREMAT-NEXT: vle32.v v12, (a0) ; NOREMAT-NEXT: vle32.v v0, (a0) @@ -431,33 +435,32 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: addi a0, a1, 1024 ; NOREMAT-NEXT: vse32.v v8, (a0) -; NOREMAT-NEXT: lui a0, 1 -; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: sd a0, 272(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add s11, a1, s11 +; NOREMAT-NEXT: sd s11, 272(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: lui a0, 2 ; NOREMAT-NEXT: add a0, a1, a0 ; NOREMAT-NEXT: sd a0, 264(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: lui a0, 3 ; NOREMAT-NEXT: add a0, a1, a0 ; NOREMAT-NEXT: sd a0, 256(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add s11, a1, s11 -; NOREMAT-NEXT: sd s11, 248(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 240(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 248(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a4, a1, a4 -; NOREMAT-NEXT: sd a4, 232(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd a4, 240(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a5, a1, a5 -; NOREMAT-NEXT: sd a5, 224(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a0, a1, t1 +; NOREMAT-NEXT: sd a5, 232(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a7, a1, a7 +; NOREMAT-NEXT: sd a7, 224(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a0, a1, t4 ; NOREMAT-NEXT: sd a0, 216(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: addiw a0, t1, 512 +; NOREMAT-NEXT: addiw a0, t4, 512 ; NOREMAT-NEXT: sd a0, 192(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: addiw a0, t1, 1024 +; NOREMAT-NEXT: addiw a0, t4, 1024 ; NOREMAT-NEXT: sd a0, 176(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: addiw a0, t1, 1536 +; NOREMAT-NEXT: addiw a0, t4, 1536 ; NOREMAT-NEXT: sd a0, 160(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: slli t2, t2, 11 -; NOREMAT-NEXT: sd t2, 128(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: slli s1, s1, 11 +; NOREMAT-NEXT: sd s1, 128(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: lui a0, 9 ; NOREMAT-NEXT: addiw a2, a0, -1536 ; NOREMAT-NEXT: sd a2, 88(sp) # 8-byte Folded Spill @@ -470,7 +473,7 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: addiw s11, a0, 512 ; NOREMAT-NEXT: addiw s7, a0, 1024 ; NOREMAT-NEXT: addiw s3, a0, 1536 -; NOREMAT-NEXT: slli s1, s1, 11 +; NOREMAT-NEXT: slli s1, t1, 11 ; NOREMAT-NEXT: lui a0, 10 ; 
NOREMAT-NEXT: addiw t2, a0, -1536 ; NOREMAT-NEXT: addiw a7, a0, -1024 @@ -478,52 +481,52 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: add a2, a1, a0 ; NOREMAT-NEXT: sd a2, 200(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: addiw a0, a0, 512 -; NOREMAT-NEXT: add a2, a1, t4 -; NOREMAT-NEXT: add a3, a1, t6 -; NOREMAT-NEXT: add a5, a1, s2 -; NOREMAT-NEXT: add a6, a1, s4 -; NOREMAT-NEXT: add t0, a1, s5 -; NOREMAT-NEXT: add t1, a1, s6 -; NOREMAT-NEXT: add t3, a1, s8 -; NOREMAT-NEXT: add t4, a1, s9 -; NOREMAT-NEXT: ld t5, 624(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add t5, a1, t5 -; NOREMAT-NEXT: ld t6, 616(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add t6, a1, t6 -; NOREMAT-NEXT: ld s0, 608(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld a2, 504(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add a2, a1, a2 +; NOREMAT-NEXT: add a3, a1, t5 +; NOREMAT-NEXT: add a5, a1, t6 +; NOREMAT-NEXT: add a6, a1, s2 +; NOREMAT-NEXT: add t0, a1, s4 +; NOREMAT-NEXT: add t1, a1, s5 +; NOREMAT-NEXT: add t3, a1, s6 +; NOREMAT-NEXT: add t4, a1, s8 +; NOREMAT-NEXT: add t5, a1, s9 +; NOREMAT-NEXT: add t6, a1, s10 +; NOREMAT-NEXT: ld s0, 624(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s0, a1, s0 -; NOREMAT-NEXT: ld s2, 600(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s2, 616(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s2, a1, s2 -; NOREMAT-NEXT: ld s4, 592(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s4, 608(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s4, a1, s4 -; NOREMAT-NEXT: ld s5, 584(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s5, 600(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s5, a1, s5 -; NOREMAT-NEXT: ld s6, 576(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s6, 592(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s6, a1, s6 -; NOREMAT-NEXT: ld s8, 568(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s8, 584(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s8, a1, s8 -; NOREMAT-NEXT: ld s9, 560(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s9, 576(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s9, a1, s9 +; NOREMAT-NEXT: ld s10, 568(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s10, a1, s10 -; NOREMAT-NEXT: ld ra, 552(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld ra, 560(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra ; NOREMAT-NEXT: sd ra, 16(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: ld ra, 544(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld ra, 552(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra ; NOREMAT-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: ld ra, 536(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld ra, 544(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra ; NOREMAT-NEXT: sd ra, 32(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: ld ra, 528(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld ra, 536(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra ; NOREMAT-NEXT: sd ra, 48(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: ld ra, 520(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld ra, 528(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra ; NOREMAT-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: ld ra, 512(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld ra, 520(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra ; NOREMAT-NEXT: sd ra, 64(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: ld ra, 504(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld ra, 512(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra ; NOREMAT-NEXT: sd ra, 80(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 496(sp) # 8-byte Folded Reload @@ -917,9 
+920,10 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: .cfi_offset s10, -96 ; REMAT-NEXT: .cfi_offset s11, -104 ; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: slli a2, a2, 3 +; REMAT-NEXT: li a3, 14 +; REMAT-NEXT: mul a2, a2, a3 ; REMAT-NEXT: sub sp, sp, a2 -; REMAT-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xa0, 0x04, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 544 + 8 * vlenb +; REMAT-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xa0, 0x04, 0x22, 0x11, 0x0e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 544 + 14 * vlenb ; REMAT-NEXT: li a4, 32 ; REMAT-NEXT: addi a5, a0, 512 ; REMAT-NEXT: addi a3, a0, 1024 @@ -956,13 +960,20 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: slli s6, s6, 9 ; REMAT-NEXT: li s7, 5 ; REMAT-NEXT: slli s7, s7, 11 +; REMAT-NEXT: li s8, 21 +; REMAT-NEXT: slli s8, s8, 9 +; REMAT-NEXT: li s9, 11 +; REMAT-NEXT: slli s9, s9, 10 +; REMAT-NEXT: li s10, 23 +; REMAT-NEXT: slli s10, s10, 9 +; REMAT-NEXT: lui s11, 3 ; REMAT-NEXT: vsetvli zero, a4, e32, m2, ta, ma ; REMAT-NEXT: vle32.v v8, (a5) -; REMAT-NEXT: li a4, 21 +; REMAT-NEXT: li a4, 25 ; REMAT-NEXT: slli a4, a4, 9 ; REMAT-NEXT: vle32.v v10, (a3) ; REMAT-NEXT: vle32.v v12, (a3) -; REMAT-NEXT: li a3, 11 +; REMAT-NEXT: li a3, 13 ; REMAT-NEXT: slli a3, a3, 10 ; REMAT-NEXT: vle32.v v14, (a2) ; REMAT-NEXT: vle32.v v16, (a2) @@ -979,7 +990,7 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: vle32.v v30, (a2) ; REMAT-NEXT: vle32.v v6, (a2) ; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: li a5, 6 +; REMAT-NEXT: li a5, 12 ; REMAT-NEXT: mul a2, a2, a5 ; REMAT-NEXT: add a2, sp, a2 ; REMAT-NEXT: addi a2, a2, 432 @@ -989,7 +1000,8 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: vle32.v v2, (a2) ; REMAT-NEXT: vle32.v v6, (a2) ; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: slli a2, a2, 2 +; REMAT-NEXT: li a5, 10 +; REMAT-NEXT: mul a2, a2, a5 ; REMAT-NEXT: add a2, sp, a2 ; REMAT-NEXT: addi a2, a2, 432 ; REMAT-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill @@ -1003,11 +1015,16 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: sf.vc.vv 3, 0, v12, v14 ; REMAT-NEXT: vle32.v v0, (a2) ; REMAT-NEXT: add a2, a0, t5 -; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: vle32.v v14, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v16, v18 -; REMAT-NEXT: vle32.v v18, (a2) +; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: csrr a2, vlenb +; REMAT-NEXT: slli a2, a2, 3 +; REMAT-NEXT: add a2, sp, a2 +; REMAT-NEXT: addi a2, a2, 432 +; REMAT-NEXT: vs2r.v v8, (a2) # Unknown-size Folded Spill ; REMAT-NEXT: add a2, a0, t6 -; REMAT-NEXT: vle32.v v16, (a2) +; REMAT-NEXT: vle32.v v18, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v20, v22 ; REMAT-NEXT: vle32.v v20, (a2) ; REMAT-NEXT: add a2, a0, s0 @@ -1017,340 +1034,383 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: add a2, a0, s1 ; REMAT-NEXT: vle32.v v26, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v28, v30 -; REMAT-NEXT: vle32.v v14, (a2) +; REMAT-NEXT: vle32.v v28, (a2) ; REMAT-NEXT: add a2, a0, s2 -; REMAT-NEXT: vle32.v v12, (a2) +; REMAT-NEXT: vle32.v v8, (a2) ; REMAT-NEXT: csrr a5, vlenb -; REMAT-NEXT: li a6, 6 +; REMAT-NEXT: li a6, 12 ; REMAT-NEXT: mul a5, a5, a6 ; REMAT-NEXT: add a5, sp, a5 ; REMAT-NEXT: addi a5, a5, 432 -; REMAT-NEXT: vl2r.v v28, (a5) # Unknown-size Folded Reload -; REMAT-NEXT: sf.vc.vv 3, 0, v28, v2 +; REMAT-NEXT: vl2r.v v12, (a5) # Unknown-size Folded Reload +; REMAT-NEXT: sf.vc.vv 3, 0, v12, v2 ; REMAT-NEXT: vle32.v v2, (a2) ; REMAT-NEXT: add a2, a0, s3 -; REMAT-NEXT: vle32.v v28, (a2) +; REMAT-NEXT: vle32.v v12, (a2) ; REMAT-NEXT: 
csrr a5, vlenb -; REMAT-NEXT: slli a5, a5, 2 +; REMAT-NEXT: li a6, 10 +; REMAT-NEXT: mul a5, a5, a6 ; REMAT-NEXT: add a5, sp, a5 ; REMAT-NEXT: addi a5, a5, 432 -; REMAT-NEXT: vl2r.v v30, (a5) # Unknown-size Folded Reload -; REMAT-NEXT: sf.vc.vv 3, 0, v30, v4 -; REMAT-NEXT: vle32.v v4, (a2) -; REMAT-NEXT: add a2, a0, s4 +; REMAT-NEXT: vl2r.v v16, (a5) # Unknown-size Folded Reload +; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4 ; REMAT-NEXT: vle32.v v30, (a2) +; REMAT-NEXT: add a2, a0, s4 +; REMAT-NEXT: vle32.v v16, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v6, v10 -; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: add a2, a0, s5 ; REMAT-NEXT: vle32.v v6, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v0, v8 -; REMAT-NEXT: vle32.v v0, (a2) +; REMAT-NEXT: add a2, a0, s5 +; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v0, v14 +; REMAT-NEXT: vle32.v v4, (a2) ; REMAT-NEXT: add a2, a0, s6 -; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v18, v16 -; REMAT-NEXT: vle32.v v18, (a2) +; REMAT-NEXT: vle32.v v14, (a2) +; REMAT-NEXT: csrr a5, vlenb +; REMAT-NEXT: slli a5, a5, 3 +; REMAT-NEXT: add a5, sp, a5 +; REMAT-NEXT: addi a5, a5, 432 +; REMAT-NEXT: vl2r.v v0, (a5) # Unknown-size Folded Reload +; REMAT-NEXT: sf.vc.vv 3, 0, v0, v18 +; REMAT-NEXT: vle32.v v0, (a2) ; REMAT-NEXT: add a2, a0, s7 -; REMAT-NEXT: vle32.v v16, (a2) +; REMAT-NEXT: vle32.v v18, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v20, v22 -; REMAT-NEXT: vle32.v v22, (a2) -; REMAT-NEXT: add a2, a0, a4 +; REMAT-NEXT: vle32.v v20, (a2) +; REMAT-NEXT: csrr a2, vlenb +; REMAT-NEXT: slli a2, a2, 3 +; REMAT-NEXT: add a2, sp, a2 +; REMAT-NEXT: addi a2, a2, 432 +; REMAT-NEXT: vs2r.v v20, (a2) # Unknown-size Folded Spill +; REMAT-NEXT: add a2, a0, s8 ; REMAT-NEXT: vle32.v v20, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v24, v26 ; REMAT-NEXT: vle32.v v24, (a2) +; REMAT-NEXT: add a2, a0, s9 +; REMAT-NEXT: vle32.v v22, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v28, v8 +; REMAT-NEXT: vle32.v v26, (a2) +; REMAT-NEXT: add a2, a0, s10 +; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v2, v12 +; REMAT-NEXT: vle32.v v12, (a2) +; REMAT-NEXT: add a2, a0, s11 +; REMAT-NEXT: vle32.v v2, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v30, v16 +; REMAT-NEXT: vle32.v v16, (a2) ; REMAT-NEXT: addi a2, sp, 432 -; REMAT-NEXT: vs2r.v v24, (a2) # Unknown-size Folded Spill +; REMAT-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill +; REMAT-NEXT: add a2, a0, a4 +; REMAT-NEXT: vle32.v v16, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v6, v10 +; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: csrr a2, vlenb +; REMAT-NEXT: slli a2, a2, 1 +; REMAT-NEXT: add a2, sp, a2 +; REMAT-NEXT: addi a2, a2, 432 +; REMAT-NEXT: vs2r.v v10, (a2) # Unknown-size Folded Spill ; REMAT-NEXT: add a2, a0, a3 -; REMAT-NEXT: vle32.v v24, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v14, v12 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: li a5, 23 +; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v4, v14 +; REMAT-NEXT: vle32.v v14, (a2) +; REMAT-NEXT: csrr a2, vlenb +; REMAT-NEXT: li a3, 12 +; REMAT-NEXT: mul a2, a2, a3 +; REMAT-NEXT: add a2, sp, a2 +; REMAT-NEXT: addi a2, a2, 432 +; REMAT-NEXT: vs2r.v v14, (a2) # Unknown-size Folded Spill +; REMAT-NEXT: li a5, 27 ; REMAT-NEXT: slli a5, a5, 9 ; REMAT-NEXT: add a2, a0, a5 -; REMAT-NEXT: vle32.v v26, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v2, v28 ; REMAT-NEXT: vle32.v v14, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v0, v18 +; REMAT-NEXT: vle32.v v18, (a2) ; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: li a3, 6 +; REMAT-NEXT: li a3, 10 ; REMAT-NEXT: mul a2, a2, a3 ; REMAT-NEXT: add a2, sp, a2 ; 
REMAT-NEXT: addi a2, a2, 432 -; REMAT-NEXT: vs2r.v v14, (a2) # Unknown-size Folded Spill -; REMAT-NEXT: lui s8, 3 -; REMAT-NEXT: add a2, a0, s8 +; REMAT-NEXT: vs2r.v v18, (a2) # Unknown-size Folded Spill +; REMAT-NEXT: li ra, 7 +; REMAT-NEXT: slli ra, ra, 11 +; REMAT-NEXT: add a2, a0, ra ; REMAT-NEXT: vle32.v v28, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v4, v30 -; REMAT-NEXT: vle32.v v14, (a2) +; REMAT-NEXT: csrr a3, vlenb +; REMAT-NEXT: slli a3, a3, 3 +; REMAT-NEXT: add a3, sp, a3 +; REMAT-NEXT: addi a3, a3, 432 +; REMAT-NEXT: vl2r.v v18, (a3) # Unknown-size Folded Reload +; REMAT-NEXT: sf.vc.vv 3, 0, v18, v20 +; REMAT-NEXT: vle32.v v18, (a2) ; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: slli a2, a2, 2 +; REMAT-NEXT: slli a2, a2, 3 ; REMAT-NEXT: add a2, sp, a2 ; REMAT-NEXT: addi a2, a2, 432 -; REMAT-NEXT: vs2r.v v14, (a2) # Unknown-size Folded Spill -; REMAT-NEXT: li s9, 25 -; REMAT-NEXT: slli s9, s9, 9 -; REMAT-NEXT: add a2, a0, s9 +; REMAT-NEXT: vs2r.v v18, (a2) # Unknown-size Folded Spill +; REMAT-NEXT: li a2, 29 +; REMAT-NEXT: slli a2, a2, 9 +; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v30, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v6 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: li s10, 13 -; REMAT-NEXT: slli s10, s10, 10 -; REMAT-NEXT: add a2, a0, s10 +; REMAT-NEXT: sf.vc.vv 3, 0, v24, v22 +; REMAT-NEXT: vle32.v v18, (a2) +; REMAT-NEXT: csrr a2, vlenb +; REMAT-NEXT: li a3, 6 +; REMAT-NEXT: mul a2, a2, a3 +; REMAT-NEXT: add a2, sp, a2 +; REMAT-NEXT: addi a2, a2, 432 +; REMAT-NEXT: vs2r.v v18, (a2) # Unknown-size Folded Spill +; REMAT-NEXT: li a2, 15 +; REMAT-NEXT: slli a2, a2, 10 +; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v6, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v0, v8 +; REMAT-NEXT: sf.vc.vv 3, 0, v26, v8 ; REMAT-NEXT: vle32.v v8, (a2) ; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: slli a2, a2, 1 +; REMAT-NEXT: slli a2, a2, 2 ; REMAT-NEXT: add a2, sp, a2 ; REMAT-NEXT: addi a2, a2, 432 ; REMAT-NEXT: vs2r.v v8, (a2) # Unknown-size Folded Spill -; REMAT-NEXT: li s11, 27 -; REMAT-NEXT: slli s11, s11, 9 -; REMAT-NEXT: add a2, a0, s11 +; REMAT-NEXT: li a2, 31 +; REMAT-NEXT: slli a2, a2, 9 +; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v4, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v18, v16 +; REMAT-NEXT: sf.vc.vv 3, 0, v12, v2 ; REMAT-NEXT: vle32.v v18, (a2) -; REMAT-NEXT: li ra, 7 -; REMAT-NEXT: slli ra, ra, 11 -; REMAT-NEXT: add a2, a0, ra +; REMAT-NEXT: lui a2, 4 +; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v2, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v22, v20 +; REMAT-NEXT: addi a3, sp, 432 +; REMAT-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload +; REMAT-NEXT: sf.vc.vv 3, 0, v8, v16 ; REMAT-NEXT: vle32.v v20, (a2) -; REMAT-NEXT: li a2, 29 -; REMAT-NEXT: slli a2, a2, 9 +; REMAT-NEXT: lui a2, 4 +; REMAT-NEXT: addiw a2, a2, 512 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v0, (a2) -; REMAT-NEXT: addi a3, sp, 432 +; REMAT-NEXT: csrr a3, vlenb +; REMAT-NEXT: slli a3, a3, 1 +; REMAT-NEXT: add a3, sp, a3 +; REMAT-NEXT: addi a3, a3, 432 ; REMAT-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v24 +; REMAT-NEXT: sf.vc.vv 3, 0, v8, v10 ; REMAT-NEXT: vle32.v v22, (a2) -; REMAT-NEXT: li a2, 15 +; REMAT-NEXT: li a2, 17 ; REMAT-NEXT: slli a2, a2, 10 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v24, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v12, v26 +; REMAT-NEXT: csrr a3, vlenb +; REMAT-NEXT: li a4, 12 +; REMAT-NEXT: mul a3, a3, a4 +; REMAT-NEXT: add a3, sp, a3 +; REMAT-NEXT: addi a3, a3, 432 +; REMAT-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload 
+; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 ; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: li a2, 31 -; REMAT-NEXT: slli a2, a2, 9 +; REMAT-NEXT: lui a2, 4 +; REMAT-NEXT: addiw a2, a2, 1536 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v26, (a2) ; REMAT-NEXT: csrr a3, vlenb -; REMAT-NEXT: li a4, 6 +; REMAT-NEXT: li a4, 10 ; REMAT-NEXT: mul a3, a3, a4 ; REMAT-NEXT: add a3, sp, a3 ; REMAT-NEXT: addi a3, a3, 432 ; REMAT-NEXT: vl2r.v v10, (a3) # Unknown-size Folded Reload ; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28 ; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: lui a2, 4 +; REMAT-NEXT: li a2, 9 +; REMAT-NEXT: slli a2, a2, 11 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v28, (a2) ; REMAT-NEXT: csrr a3, vlenb -; REMAT-NEXT: slli a3, a3, 2 +; REMAT-NEXT: slli a3, a3, 3 ; REMAT-NEXT: add a3, sp, a3 ; REMAT-NEXT: addi a3, a3, 432 ; REMAT-NEXT: vl2r.v v12, (a3) # Unknown-size Folded Reload ; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30 ; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: lui a2, 4 -; REMAT-NEXT: addiw a2, a2, 512 +; REMAT-NEXT: lui a2, 5 +; REMAT-NEXT: addiw a2, a2, -1536 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v30, (a2) +; REMAT-NEXT: csrr a3, vlenb +; REMAT-NEXT: li a4, 6 +; REMAT-NEXT: mul a3, a3, a4 +; REMAT-NEXT: add a3, sp, a3 +; REMAT-NEXT: addi a3, a3, 432 +; REMAT-NEXT: vl2r.v v14, (a3) # Unknown-size Folded Reload ; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6 ; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: li a2, 17 +; REMAT-NEXT: li a2, 19 ; REMAT-NEXT: slli a2, a2, 10 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v6, (a2) ; REMAT-NEXT: csrr a3, vlenb -; REMAT-NEXT: slli a3, a3, 1 +; REMAT-NEXT: slli a3, a3, 2 ; REMAT-NEXT: add a3, sp, a3 ; REMAT-NEXT: addi a3, a3, 432 ; REMAT-NEXT: vl2r.v v16, (a3) # Unknown-size Folded Reload ; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4 ; REMAT-NEXT: vle32.v v16, (a2) -; REMAT-NEXT: lui a2, 4 -; REMAT-NEXT: addiw a2, a2, 1536 +; REMAT-NEXT: lui a2, 5 +; REMAT-NEXT: addiw a2, a2, -512 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v4, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2 ; REMAT-NEXT: vle32.v v18, (a2) -; REMAT-NEXT: li a2, 9 -; REMAT-NEXT: slli a2, a2, 11 +; REMAT-NEXT: lui a2, 5 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v2, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0 ; REMAT-NEXT: vle32.v v20, (a2) ; REMAT-NEXT: lui a2, 5 -; REMAT-NEXT: addiw a2, a2, -1536 +; REMAT-NEXT: addiw a2, a2, 512 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v0, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24 ; REMAT-NEXT: vle32.v v22, (a2) -; REMAT-NEXT: li a2, 19 +; REMAT-NEXT: li a2, 21 ; REMAT-NEXT: slli a2, a2, 10 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v24, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26 ; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: lui a2, 5 -; REMAT-NEXT: addiw a2, a2, -512 -; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: lui s4, 5 +; REMAT-NEXT: addiw s4, s4, 1536 +; REMAT-NEXT: add a2, a0, s4 ; REMAT-NEXT: vle32.v v26, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28 ; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: lui a2, 5 +; REMAT-NEXT: li a2, 11 +; REMAT-NEXT: slli a2, a2, 11 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v28, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30 ; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: lui a2, 5 -; REMAT-NEXT: addiw a2, a2, 512 -; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: lui s3, 6 +; REMAT-NEXT: addiw s3, s3, -1536 +; REMAT-NEXT: add a2, a0, s3 ; REMAT-NEXT: vle32.v v30, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6 ; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: li a2, 21 -; REMAT-NEXT: slli a2, a2, 10 -; 
REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: li s2, 23 +; REMAT-NEXT: slli s2, s2, 10 +; REMAT-NEXT: add a2, a0, s2 ; REMAT-NEXT: vle32.v v6, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4 ; REMAT-NEXT: vle32.v v16, (a2) -; REMAT-NEXT: lui a2, 5 -; REMAT-NEXT: addiw a2, a2, 1536 +; REMAT-NEXT: lui a2, 6 +; REMAT-NEXT: addiw a2, a2, -512 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v4, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2 ; REMAT-NEXT: vle32.v v18, (a2) -; REMAT-NEXT: li a2, 11 -; REMAT-NEXT: slli a2, a2, 11 +; REMAT-NEXT: lui a2, 6 ; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: lui s1, 6 ; REMAT-NEXT: vle32.v v2, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0 ; REMAT-NEXT: vle32.v v20, (a2) -; REMAT-NEXT: lui a2, 6 -; REMAT-NEXT: addiw a2, a2, -1536 -; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: lui s0, 6 +; REMAT-NEXT: addiw s0, s0, 512 +; REMAT-NEXT: add a2, a0, s0 ; REMAT-NEXT: vle32.v v0, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24 ; REMAT-NEXT: vle32.v v22, (a2) -; REMAT-NEXT: li a2, 23 +; REMAT-NEXT: li a2, 25 ; REMAT-NEXT: slli a2, a2, 10 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v24, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26 ; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: lui a2, 6 -; REMAT-NEXT: addiw a2, a2, -512 -; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: lui t6, 6 +; REMAT-NEXT: addiw t6, t6, 1536 +; REMAT-NEXT: add a2, a0, t6 ; REMAT-NEXT: vle32.v v26, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28 ; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: lui a2, 6 -; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: lui s1, 6 +; REMAT-NEXT: li t5, 13 +; REMAT-NEXT: slli t5, t5, 11 +; REMAT-NEXT: add a2, a0, t5 ; REMAT-NEXT: vle32.v v28, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30 ; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: lui s0, 6 -; REMAT-NEXT: addiw s0, s0, 512 -; REMAT-NEXT: add a2, a0, s0 +; REMAT-NEXT: lui a2, 7 +; REMAT-NEXT: addiw a2, a2, -1536 +; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v30, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6 ; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: li a2, 25 -; REMAT-NEXT: slli a2, a2, 10 -; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: li t4, 27 +; REMAT-NEXT: slli t4, t4, 10 +; REMAT-NEXT: add a2, a0, t4 ; REMAT-NEXT: vle32.v v6, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4 ; REMAT-NEXT: vle32.v v16, (a2) -; REMAT-NEXT: lui t6, 6 -; REMAT-NEXT: addiw t6, t6, 1536 -; REMAT-NEXT: add a2, a0, t6 +; REMAT-NEXT: lui a2, 7 +; REMAT-NEXT: addiw a2, a2, -512 +; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v4, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2 ; REMAT-NEXT: vle32.v v18, (a2) -; REMAT-NEXT: li t5, 13 -; REMAT-NEXT: slli t5, t5, 11 -; REMAT-NEXT: add a2, a0, t5 +; REMAT-NEXT: lui a2, 7 +; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: lui t3, 7 ; REMAT-NEXT: vle32.v v2, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0 ; REMAT-NEXT: vle32.v v20, (a2) -; REMAT-NEXT: lui a2, 7 -; REMAT-NEXT: addiw a2, a2, -1536 -; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: lui t2, 7 +; REMAT-NEXT: addiw t2, t2, 512 +; REMAT-NEXT: add a2, a0, t2 ; REMAT-NEXT: vle32.v v0, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24 ; REMAT-NEXT: vle32.v v22, (a2) -; REMAT-NEXT: li t4, 27 -; REMAT-NEXT: slli t4, t4, 10 -; REMAT-NEXT: add a2, a0, t4 +; REMAT-NEXT: li t1, 29 +; REMAT-NEXT: slli t1, t1, 10 +; REMAT-NEXT: add a2, a0, t1 ; REMAT-NEXT: vle32.v v24, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26 ; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: lui a2, 7 -; REMAT-NEXT: addiw a2, a2, -512 -; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: lui t0, 7 +; REMAT-NEXT: addiw t0, t0, 1536 +; REMAT-NEXT: add a2, a0, t0 
; REMAT-NEXT: vle32.v v26, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28 ; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: lui a2, 7 -; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: lui t3, 7 +; REMAT-NEXT: li a7, 15 +; REMAT-NEXT: slli a7, a7, 11 +; REMAT-NEXT: add a2, a0, a7 ; REMAT-NEXT: vle32.v v28, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30 ; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: lui t2, 7 -; REMAT-NEXT: addiw t2, t2, 512 -; REMAT-NEXT: add a2, a0, t2 +; REMAT-NEXT: lui a6, 8 +; REMAT-NEXT: addiw a6, a6, -1536 +; REMAT-NEXT: add a2, a0, a6 ; REMAT-NEXT: vle32.v v30, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6 ; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: li t1, 29 -; REMAT-NEXT: slli t1, t1, 10 -; REMAT-NEXT: add a2, a0, t1 +; REMAT-NEXT: li a4, 31 +; REMAT-NEXT: slli a4, a4, 10 +; REMAT-NEXT: add a2, a0, a4 ; REMAT-NEXT: vle32.v v6, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4 ; REMAT-NEXT: vle32.v v16, (a2) -; REMAT-NEXT: lui t0, 7 -; REMAT-NEXT: addiw t0, t0, 1536 -; REMAT-NEXT: add a2, a0, t0 +; REMAT-NEXT: lui a3, 8 +; REMAT-NEXT: addiw a3, a3, -512 +; REMAT-NEXT: add a2, a0, a3 ; REMAT-NEXT: vle32.v v4, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2 ; REMAT-NEXT: vle32.v v18, (a2) -; REMAT-NEXT: li a7, 15 -; REMAT-NEXT: slli a7, a7, 11 -; REMAT-NEXT: add a2, a0, a7 -; REMAT-NEXT: vle32.v v2, (a2) +; REMAT-NEXT: lui a2, 8 +; REMAT-NEXT: add a0, a0, a2 +; REMAT-NEXT: vle32.v v2, (a0) ; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0 -; REMAT-NEXT: vle32.v v20, (a2) -; REMAT-NEXT: lui a6, 8 -; REMAT-NEXT: addiw a6, a6, -1536 -; REMAT-NEXT: add a2, a0, a6 -; REMAT-NEXT: vle32.v v0, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24 -; REMAT-NEXT: vle32.v v22, (a2) -; REMAT-NEXT: li a4, 31 -; REMAT-NEXT: slli a4, a4, 10 -; REMAT-NEXT: add a2, a0, a4 -; REMAT-NEXT: vle32.v v24, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26 -; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: lui a3, 8 -; REMAT-NEXT: addiw a3, a3, -512 -; REMAT-NEXT: add a2, a0, a3 -; REMAT-NEXT: vle32.v v26, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28 -; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: lui a2, 8 -; REMAT-NEXT: add a0, a0, a2 -; REMAT-NEXT: vle32.v v28, (a0) ; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30 ; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6 ; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4 ; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2 -; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0 -; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24 -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26 -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28 ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: addi a0, a1, 1024 ; REMAT-NEXT: vse32.v v8, (a0) @@ -1397,36 +1457,41 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: slli a0, a0, 10 ; REMAT-NEXT: add a0, a1, a0 ; REMAT-NEXT: sd a0, 336(sp) # 8-byte Folded Spill -; REMAT-NEXT: add s2, a1, s2 -; REMAT-NEXT: sd s2, 328(sp) # 8-byte Folded Spill -; REMAT-NEXT: add s3, a1, s3 -; REMAT-NEXT: sd s3, 320(sp) # 8-byte Folded Spill -; REMAT-NEXT: add s4, a1, s4 -; REMAT-NEXT: sd s4, 312(sp) # 8-byte Folded Spill +; REMAT-NEXT: li a0, 15 +; REMAT-NEXT: slli a0, a0, 9 +; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: sd a0, 328(sp) # 8-byte Folded Spill +; REMAT-NEXT: lui a0, 2 +; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: sd a0, 320(sp) # 8-byte Folded Spill +; REMAT-NEXT: li a0, 17 +; REMAT-NEXT: slli a0, a0, 9 +; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: sd a0, 312(sp) # 8-byte Folded Spill ; REMAT-NEXT: add s5, a1, s5 ; REMAT-NEXT: sd s5, 304(sp) # 8-byte Folded Spill ; REMAT-NEXT: add s6, a1, s6 ; REMAT-NEXT: sd s6, 296(sp) # 8-byte Folded Spill ; REMAT-NEXT: add s7, a1, s7 ; REMAT-NEXT: sd s7, 
288(sp) # 8-byte Folded Spill -; REMAT-NEXT: li a0, 21 +; REMAT-NEXT: add s8, a1, s8 +; REMAT-NEXT: sd s8, 280(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s9, a1, s9 +; REMAT-NEXT: sd s9, 272(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s10, a1, s10 +; REMAT-NEXT: sd s10, 264(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s11, a1, s11 +; REMAT-NEXT: sd s11, 256(sp) # 8-byte Folded Spill +; REMAT-NEXT: li a0, 25 ; REMAT-NEXT: slli a0, a0, 9 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sd a0, 280(sp) # 8-byte Folded Spill -; REMAT-NEXT: li a0, 11 +; REMAT-NEXT: sd a0, 248(sp) # 8-byte Folded Spill +; REMAT-NEXT: li a0, 13 ; REMAT-NEXT: slli a0, a0, 10 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sd a0, 272(sp) # 8-byte Folded Spill +; REMAT-NEXT: sd a0, 240(sp) # 8-byte Folded Spill ; REMAT-NEXT: add a5, a1, a5 -; REMAT-NEXT: sd a5, 264(sp) # 8-byte Folded Spill -; REMAT-NEXT: add s8, a1, s8 -; REMAT-NEXT: sd s8, 256(sp) # 8-byte Folded Spill -; REMAT-NEXT: add s9, a1, s9 -; REMAT-NEXT: sd s9, 248(sp) # 8-byte Folded Spill -; REMAT-NEXT: add s10, a1, s10 -; REMAT-NEXT: sd s10, 240(sp) # 8-byte Folded Spill -; REMAT-NEXT: add s11, a1, s11 -; REMAT-NEXT: sd s11, 232(sp) # 8-byte Folded Spill +; REMAT-NEXT: sd a5, 232(sp) # 8-byte Folded Spill ; REMAT-NEXT: add ra, a1, ra ; REMAT-NEXT: sd ra, 224(sp) # 8-byte Folded Spill ; REMAT-NEXT: li a0, 29 @@ -1483,22 +1548,16 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: slli a0, a0, 10 ; REMAT-NEXT: add a0, a1, a0 ; REMAT-NEXT: sd a0, 112(sp) # 8-byte Folded Spill -; REMAT-NEXT: lui a0, 5 -; REMAT-NEXT: addiw a0, a0, 1536 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sd a0, 104(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s4, a1, s4 +; REMAT-NEXT: sd s4, 104(sp) # 8-byte Folded Spill ; REMAT-NEXT: li a0, 11 ; REMAT-NEXT: slli a0, a0, 11 ; REMAT-NEXT: add a0, a1, a0 ; REMAT-NEXT: sd a0, 96(sp) # 8-byte Folded Spill -; REMAT-NEXT: lui a0, 6 -; REMAT-NEXT: addiw a0, a0, -1536 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sd a0, 88(sp) # 8-byte Folded Spill -; REMAT-NEXT: li a0, 23 -; REMAT-NEXT: slli a0, a0, 10 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sd a0, 80(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s3, a1, s3 +; REMAT-NEXT: sd s3, 88(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s2, a1, s2 +; REMAT-NEXT: sd s2, 80(sp) # 8-byte Folded Spill ; REMAT-NEXT: lui a0, 6 ; REMAT-NEXT: addiw a0, a0, -512 ; REMAT-NEXT: add a0, a1, a0 @@ -1795,7 +1854,8 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: csrr a0, vlenb -; REMAT-NEXT: slli a0, a0, 3 +; REMAT-NEXT: li a1, 14 +; REMAT-NEXT: mul a0, a0, a1 ; REMAT-NEXT: add sp, sp, a0 ; REMAT-NEXT: .cfi_def_cfa sp, 544 ; REMAT-NEXT: ld ra, 536(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll index 0b5856a7000dd..575a757149ebb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll @@ -5682,16 +5682,28 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; ; RV32ZVE32F-LABEL: mscatter_baseidx_v8i64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: addi sp, sp, -16 -; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV32ZVE32F-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s2, 4(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s3, 0(sp) # 
4-byte Folded Spill +; RV32ZVE32F-NEXT: addi sp, sp, -48 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 48 +; RV32ZVE32F-NEXT: sw s0, 44(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s1, 40(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s2, 36(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s3, 32(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s4, 28(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s5, 24(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s6, 20(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s7, 16(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s8, 12(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s9, 8(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 ; RV32ZVE32F-NEXT: .cfi_offset s2, -12 ; RV32ZVE32F-NEXT: .cfi_offset s3, -16 +; RV32ZVE32F-NEXT: .cfi_offset s4, -20 +; RV32ZVE32F-NEXT: .cfi_offset s5, -24 +; RV32ZVE32F-NEXT: .cfi_offset s6, -28 +; RV32ZVE32F-NEXT: .cfi_offset s7, -32 +; RV32ZVE32F-NEXT: .cfi_offset s8, -36 +; RV32ZVE32F-NEXT: .cfi_offset s9, -40 ; RV32ZVE32F-NEXT: .cfi_remember_state ; RV32ZVE32F-NEXT: lw a3, 56(a0) ; RV32ZVE32F-NEXT: lw a4, 60(a0) @@ -5703,30 +5715,30 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; RV32ZVE32F-NEXT: lw t4, 28(a0) ; RV32ZVE32F-NEXT: lw t1, 32(a0) ; RV32ZVE32F-NEXT: lw t2, 36(a0) -; RV32ZVE32F-NEXT: lw t5, 0(a2) -; RV32ZVE32F-NEXT: lw t6, 8(a2) -; RV32ZVE32F-NEXT: lw s0, 16(a2) -; RV32ZVE32F-NEXT: lw s1, 24(a2) -; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vmv.v.x v8, t5 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t6 -; RV32ZVE32F-NEXT: lw t5, 32(a2) -; RV32ZVE32F-NEXT: lw t6, 40(a2) -; RV32ZVE32F-NEXT: lw s2, 48(a2) -; RV32ZVE32F-NEXT: lw s3, 56(a2) -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s0 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s1 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t5 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t6 ; RV32ZVE32F-NEXT: lw s0, 8(a0) ; RV32ZVE32F-NEXT: lw s1, 12(a0) ; RV32ZVE32F-NEXT: lw t5, 16(a0) ; RV32ZVE32F-NEXT: lw t6, 20(a0) -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s2 +; RV32ZVE32F-NEXT: lw s2, 32(a2) +; RV32ZVE32F-NEXT: lw s3, 40(a2) +; RV32ZVE32F-NEXT: lw s4, 48(a2) +; RV32ZVE32F-NEXT: lw s5, 56(a2) +; RV32ZVE32F-NEXT: lw s6, 0(a2) +; RV32ZVE32F-NEXT: lw s7, 8(a2) +; RV32ZVE32F-NEXT: lw s8, 16(a2) +; RV32ZVE32F-NEXT: lw s9, 24(a2) +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vmv.v.x v8, s6 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a2, v0 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s7 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s8 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s9 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s2 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s3 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s4 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s5 ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 ; RV32ZVE32F-NEXT: andi s2, a2, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 @@ -5759,15 +5771,27 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; RV32ZVE32F-NEXT: sw a3, 0(a0) ; RV32ZVE32F-NEXT: sw a4, 4(a0) ; RV32ZVE32F-NEXT: .LBB51_9: # %else14 -; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s2, 4(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s3, 0(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s0, 44(sp) # 4-byte Folded Reload +; 
RV32ZVE32F-NEXT: lw s1, 40(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s2, 36(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s3, 32(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s4, 28(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s5, 24(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s6, 20(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s7, 16(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s8, 12(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s9, 8(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: .cfi_restore s0 ; RV32ZVE32F-NEXT: .cfi_restore s1 ; RV32ZVE32F-NEXT: .cfi_restore s2 ; RV32ZVE32F-NEXT: .cfi_restore s3 -; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: .cfi_restore s4 +; RV32ZVE32F-NEXT: .cfi_restore s5 +; RV32ZVE32F-NEXT: .cfi_restore s6 +; RV32ZVE32F-NEXT: .cfi_restore s7 +; RV32ZVE32F-NEXT: .cfi_restore s8 +; RV32ZVE32F-NEXT: .cfi_restore s9 +; RV32ZVE32F-NEXT: addi sp, sp, 48 ; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 0 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB51_10: # %cond.store diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll index 036fee6a13ca4..a11c02dd5e2cb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll @@ -1306,12 +1306,6 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: sb a0, 219(sp) ; ZVFHMIN32-NEXT: lh a0, 564(sp) ; ZVFHMIN32-NEXT: lh a1, 308(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 218(sp) -; ZVFHMIN32-NEXT: lh a0, 562(sp) -; ZVFHMIN32-NEXT: lh a1, 306(sp) ; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 7 ; ZVFHMIN32-NEXT: csrr a2, vlenb @@ -1364,82 +1358,86 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma ; ZVFHMIN32-NEXT: vslidedown.vi v26, v8, 15 -; ZVFHMIN32-NEXT: vslidedown.vi v28, v8, 14 -; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 13 -; ZVFHMIN32-NEXT: addi a2, sp, 848 +; ZVFHMIN32-NEXT: vslidedown.vi v20, v8, 14 +; ZVFHMIN32-NEXT: vslidedown.vi v28, v8, 13 +; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 12 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: slli a2, a2, 1 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 ; ZVFHMIN32-NEXT: vs2r.v v10, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vslidedown.vi v6, v8, 12 -; ZVFHMIN32-NEXT: vslidedown.vi v2, v8, 11 -; ZVFHMIN32-NEXT: vslidedown.vi v22, v8, 10 -; ZVFHMIN32-NEXT: vslidedown.vi v20, v8, 9 -; ZVFHMIN32-NEXT: vslidedown.vi v18, v8, 8 -; ZVFHMIN32-NEXT: vmv.x.s a3, v16 +; ZVFHMIN32-NEXT: vslidedown.vi v4, v8, 11 +; ZVFHMIN32-NEXT: vslidedown.vi v2, v8, 10 +; ZVFHMIN32-NEXT: vslidedown.vi v30, v8, 9 +; ZVFHMIN32-NEXT: vslidedown.vi v22, v8, 8 +; ZVFHMIN32-NEXT: vmv.x.s t5, v16 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 217(sp) -; ZVFHMIN32-NEXT: lh a0, 560(sp) -; ZVFHMIN32-NEXT: lh a1, 304(sp) +; ZVFHMIN32-NEXT: sb a0, 218(sp) +; ZVFHMIN32-NEXT: lh a0, 562(sp) +; ZVFHMIN32-NEXT: lh a1, 306(sp) ; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; ZVFHMIN32-NEXT: vslidedown.vi v21, v16, 7 -; ZVFHMIN32-NEXT: vslidedown.vi v3, v16, 6 -; ZVFHMIN32-NEXT: 
vslidedown.vi v19, v16, 5 +; ZVFHMIN32-NEXT: vslidedown.vi v3, v16, 7 +; ZVFHMIN32-NEXT: vslidedown.vi v31, v16, 6 +; ZVFHMIN32-NEXT: vslidedown.vi v5, v16, 5 ; ZVFHMIN32-NEXT: vslidedown.vi v23, v16, 4 ; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 3 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a4, 10 -; ZVFHMIN32-NEXT: mul a2, a2, a4 +; ZVFHMIN32-NEXT: li a3, 18 +; ZVFHMIN32-NEXT: mul a2, a2, a3 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 ; ZVFHMIN32-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 2 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: slli a2, a2, 4 +; ZVFHMIN32-NEXT: li a3, 22 +; ZVFHMIN32-NEXT: mul a2, a2, a3 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 ; ZVFHMIN32-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 1 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: slli a4, a2, 4 -; ZVFHMIN32-NEXT: sub a2, a4, a2 +; ZVFHMIN32-NEXT: li a3, 21 +; ZVFHMIN32-NEXT: mul a2, a2, a3 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 ; ZVFHMIN32-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma -; ZVFHMIN32-NEXT: vslidedown.vi v14, v16, 15 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 14 -; ZVFHMIN32-NEXT: vslidedown.vi v10, v16, 13 -; ZVFHMIN32-NEXT: vslidedown.vi v12, v16, 12 -; ZVFHMIN32-NEXT: vslidedown.vi v30, v16, 11 +; ZVFHMIN32-NEXT: vslidedown.vi v18, v16, 15 +; ZVFHMIN32-NEXT: vslidedown.vi v14, v16, 14 +; ZVFHMIN32-NEXT: vslidedown.vi v12, v16, 13 +; ZVFHMIN32-NEXT: vslidedown.vi v10, v16, 12 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 11 +; ZVFHMIN32-NEXT: vslidedown.vi v6, v16, 10 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: slli a4, a2, 4 -; ZVFHMIN32-NEXT: add a2, a4, a2 +; ZVFHMIN32-NEXT: li a3, 19 +; ZVFHMIN32-NEXT: mul a2, a2, a3 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vs2r.v v30, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vslidedown.vi v30, v16, 10 +; ZVFHMIN32-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v6, v16, 9 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a4, 11 -; ZVFHMIN32-NEXT: mul a2, a2, a4 +; ZVFHMIN32-NEXT: li a3, 14 +; ZVFHMIN32-NEXT: mul a2, a2, a3 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vs2r.v v30, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vslidedown.vi v4, v16, 9 -; ZVFHMIN32-NEXT: vslidedown.vi v30, v16, 8 +; ZVFHMIN32-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v6, v16, 8 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 216(sp) -; ZVFHMIN32-NEXT: lh a0, 558(sp) -; ZVFHMIN32-NEXT: lh a1, 302(sp) +; ZVFHMIN32-NEXT: sb a0, 217(sp) +; ZVFHMIN32-NEXT: lh a0, 560(sp) +; ZVFHMIN32-NEXT: lh a1, 304(sp) ; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; ZVFHMIN32-NEXT: vslidedown.vi v11, v0, 7 -; ZVFHMIN32-NEXT: vslidedown.vi v7, v0, 6 -; ZVFHMIN32-NEXT: vslidedown.vi v9, v0, 5 +; ZVFHMIN32-NEXT: vslidedown.vi v9, v0, 7 +; ZVFHMIN32-NEXT: vslidedown.vi v11, v0, 6 +; ZVFHMIN32-NEXT: vslidedown.vi v13, v0, 5 ; ZVFHMIN32-NEXT: vslidedown.vi v29, v0, 4 -; ZVFHMIN32-NEXT: vslidedown.vi v31, v0, 3 -; ZVFHMIN32-NEXT: vslidedown.vi v5, v0, 2 -; ZVFHMIN32-NEXT: vslidedown.vi v27, v0, 1 +; ZVFHMIN32-NEXT: vslidedown.vi v27, v0, 3 +; ZVFHMIN32-NEXT: vslidedown.vi v7, v0, 2 +; 
ZVFHMIN32-NEXT: vslidedown.vi v21, v0, 1 ; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma ; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 15 ; ZVFHMIN32-NEXT: csrr a2, vlenb @@ -1449,88 +1447,99 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 14 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: slli a2, a2, 1 +; ZVFHMIN32-NEXT: slli a2, a2, 3 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 ; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 13 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a4, 6 -; ZVFHMIN32-NEXT: mul a2, a2, a4 +; ZVFHMIN32-NEXT: li a3, 6 +; ZVFHMIN32-NEXT: mul a2, a2, a3 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 ; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 12 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: slli a2, a2, 3 +; ZVFHMIN32-NEXT: li a3, 12 +; ZVFHMIN32-NEXT: mul a2, a2, a3 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 ; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 11 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a4, 13 -; ZVFHMIN32-NEXT: mul a2, a2, a4 +; ZVFHMIN32-NEXT: li a3, 10 +; ZVFHMIN32-NEXT: mul a2, a2, a3 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 ; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 10 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a4, 19 -; ZVFHMIN32-NEXT: mul a2, a2, a4 +; ZVFHMIN32-NEXT: slli a2, a2, 4 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 ; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 9 -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a4, 21 -; ZVFHMIN32-NEXT: mul a2, a2, a4 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vslidedown.vi v0, v0, 8 +; ZVFHMIN32-NEXT: addi a2, sp, 848 +; ZVFHMIN32-NEXT: vs2r.v v0, (a2) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vmv.x.s t4, v26 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 216(sp) +; ZVFHMIN32-NEXT: lh a0, 558(sp) +; ZVFHMIN32-NEXT: lh a1, 302(sp) +; ZVFHMIN32-NEXT: vmv.x.s t3, v20 +; ZVFHMIN32-NEXT: vmv.x.s t1, v28 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 215(sp) ; ZVFHMIN32-NEXT: lh a0, 556(sp) ; ZVFHMIN32-NEXT: lh a1, 300(sp) -; ZVFHMIN32-NEXT: vmv.x.s t3, v26 -; ZVFHMIN32-NEXT: vmv.x.s t2, v28 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: slli a2, a2, 1 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vl2r.v v0, (a2) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s t2, v0 +; ZVFHMIN32-NEXT: vmv.x.s t0, v4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 214(sp) ; ZVFHMIN32-NEXT: lh a0, 554(sp) ; ZVFHMIN32-NEXT: lh a1, 298(sp) -; ZVFHMIN32-NEXT: addi a2, sp, 848 -; ZVFHMIN32-NEXT: vl2r.v v16, (a2) # Unknown-size Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s t1, v16 -; ZVFHMIN32-NEXT: vmv.x.s t0, v6 +; 
ZVFHMIN32-NEXT: vmv.x.s a7, v2 +; ZVFHMIN32-NEXT: vmv.x.s a6, v30 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 213(sp) ; ZVFHMIN32-NEXT: lh a0, 552(sp) ; ZVFHMIN32-NEXT: lh a1, 296(sp) -; ZVFHMIN32-NEXT: vmv.x.s a7, v2 -; ZVFHMIN32-NEXT: vmv.x.s a6, v22 +; ZVFHMIN32-NEXT: vmv.x.s a2, v22 +; ZVFHMIN32-NEXT: sw a2, 104(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: vmv.x.s a2, v18 +; ZVFHMIN32-NEXT: sw a2, 108(sp) # 4-byte Folded Spill ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 212(sp) ; ZVFHMIN32-NEXT: lh a0, 550(sp) ; ZVFHMIN32-NEXT: lh a1, 294(sp) -; ZVFHMIN32-NEXT: vmv.x.s a5, v20 -; ZVFHMIN32-NEXT: vmv.x.s a2, v18 -; ZVFHMIN32-NEXT: sw a2, 108(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: vmv.x.s a2, v14 +; ZVFHMIN32-NEXT: sw a2, 112(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: vmv.x.s a2, v12 +; ZVFHMIN32-NEXT: sw a2, 116(sp) # 4-byte Folded Spill ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 211(sp) ; ZVFHMIN32-NEXT: lh a0, 548(sp) ; ZVFHMIN32-NEXT: lh a1, 292(sp) -; ZVFHMIN32-NEXT: vmv.x.s a2, v14 -; ZVFHMIN32-NEXT: sw a2, 116(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: vmv.x.s a2, v10 +; ZVFHMIN32-NEXT: sw a2, 120(sp) # 4-byte Folded Spill ; ZVFHMIN32-NEXT: vmv.x.s a2, v8 ; ZVFHMIN32-NEXT: sw a2, 124(sp) # 4-byte Folded Spill ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 @@ -1539,208 +1548,204 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: sb a0, 210(sp) ; ZVFHMIN32-NEXT: lh a0, 546(sp) ; ZVFHMIN32-NEXT: lh a1, 290(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a3 -; ZVFHMIN32-NEXT: vmv.x.s a3, v24 +; ZVFHMIN32-NEXT: fmv.h.x fa5, t5 +; ZVFHMIN32-NEXT: vmv.x.s t5, v24 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa3, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa4, fa3 ; ZVFHMIN32-NEXT: sb a0, 209(sp) ; ZVFHMIN32-NEXT: lh a0, 544(sp) ; ZVFHMIN32-NEXT: lh a1, 288(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa4, t5 +; ZVFHMIN32-NEXT: feq.h t5, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a3, 192(sp) +; ZVFHMIN32-NEXT: sb t5, 192(sp) ; ZVFHMIN32-NEXT: sb a0, 208(sp) ; ZVFHMIN32-NEXT: lh a0, 738(sp) ; ZVFHMIN32-NEXT: lh a1, 482(sp) -; ZVFHMIN32-NEXT: vmv.x.s a2, v10 -; ZVFHMIN32-NEXT: sw a2, 112(sp) # 4-byte Folded Spill -; ZVFHMIN32-NEXT: vmv.x.s a2, v12 -; ZVFHMIN32-NEXT: sw a2, 120(sp) # 4-byte Folded Spill -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 177(sp) -; ZVFHMIN32-NEXT: lh a0, 736(sp) -; ZVFHMIN32-NEXT: lh a1, 480(sp) ; ZVFHMIN32-NEXT: csrr a2, vlenb ; ZVFHMIN32-NEXT: li a3, 29 ; ZVFHMIN32-NEXT: mul a2, a2, a3 ; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: lh s5, 848(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: lh s7, 848(a2) # 8-byte Folded Reload ; ZVFHMIN32-NEXT: csrr a2, vlenb ; ZVFHMIN32-NEXT: li a3, 28 ; ZVFHMIN32-NEXT: mul a2, a2, a3 ; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: lh s2, 848(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: lh s4, 848(a2) # 8-byte Folded Reload ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 176(sp) -; ZVFHMIN32-NEXT: lh a0, 734(sp) 
-; ZVFHMIN32-NEXT: lh a1, 478(sp) +; ZVFHMIN32-NEXT: sb a0, 177(sp) +; ZVFHMIN32-NEXT: lh a0, 736(sp) +; ZVFHMIN32-NEXT: lh a1, 480(sp) ; ZVFHMIN32-NEXT: csrr a2, vlenb ; ZVFHMIN32-NEXT: li a3, 27 ; ZVFHMIN32-NEXT: mul a2, a2, a3 ; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: lh s6, 848(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: lh s8, 848(a2) # 8-byte Folded Reload ; ZVFHMIN32-NEXT: csrr a2, vlenb ; ZVFHMIN32-NEXT: li a3, 26 ; ZVFHMIN32-NEXT: mul a2, a2, a3 ; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: lh s3, 848(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: lh s5, 848(a2) # 8-byte Folded Reload ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 175(sp) -; ZVFHMIN32-NEXT: lh a0, 732(sp) -; ZVFHMIN32-NEXT: lh a1, 476(sp) +; ZVFHMIN32-NEXT: sb a0, 176(sp) +; ZVFHMIN32-NEXT: lh a0, 734(sp) +; ZVFHMIN32-NEXT: lh a1, 478(sp) ; ZVFHMIN32-NEXT: csrr a2, vlenb ; ZVFHMIN32-NEXT: li a3, 25 ; ZVFHMIN32-NEXT: mul a2, a2, a3 ; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: lh s7, 848(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: lh s9, 848(a2) # 8-byte Folded Reload ; ZVFHMIN32-NEXT: csrr a2, vlenb ; ZVFHMIN32-NEXT: li a3, 24 ; ZVFHMIN32-NEXT: mul a2, a2, a3 ; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: lh s4, 848(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: lh s6, 848(a2) # 8-byte Folded Reload ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 174(sp) -; ZVFHMIN32-NEXT: lh a0, 730(sp) -; ZVFHMIN32-NEXT: lh a1, 474(sp) +; ZVFHMIN32-NEXT: sb a0, 175(sp) +; ZVFHMIN32-NEXT: lh a0, 732(sp) +; ZVFHMIN32-NEXT: lh a1, 476(sp) ; ZVFHMIN32-NEXT: csrr a2, vlenb ; ZVFHMIN32-NEXT: li a3, 23 ; ZVFHMIN32-NEXT: mul a2, a2, a3 ; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: lh s8, 848(a2) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s t4, v21 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 173(sp) -; ZVFHMIN32-NEXT: lh a0, 728(sp) -; ZVFHMIN32-NEXT: lh a1, 472(sp) -; ZVFHMIN32-NEXT: vmv.x.s t6, v3 -; ZVFHMIN32-NEXT: vmv.x.s t5, v19 +; ZVFHMIN32-NEXT: lh s3, 848(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s t5, v3 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 172(sp) -; ZVFHMIN32-NEXT: lh a0, 726(sp) -; ZVFHMIN32-NEXT: lh a1, 470(sp) -; ZVFHMIN32-NEXT: vmv.x.s s10, v11 -; ZVFHMIN32-NEXT: vmv.x.s s11, v7 +; ZVFHMIN32-NEXT: sb a0, 174(sp) +; ZVFHMIN32-NEXT: lh a0, 730(sp) +; ZVFHMIN32-NEXT: lh a1, 474(sp) +; ZVFHMIN32-NEXT: vmv.x.s s2, v31 +; ZVFHMIN32-NEXT: vmv.x.s t6, v5 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 171(sp) -; ZVFHMIN32-NEXT: lh a0, 724(sp) -; ZVFHMIN32-NEXT: lh s9, 468(sp) -; ZVFHMIN32-NEXT: vmv.x.s a4, v9 -; ZVFHMIN32-NEXT: vmv.x.s ra, v29 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, s9 +; ZVFHMIN32-NEXT: sb a0, 173(sp) +; ZVFHMIN32-NEXT: lh a1, 728(sp) +; ZVFHMIN32-NEXT: lh s10, 472(sp) +; ZVFHMIN32-NEXT: vmv.x.s a3, v9 +; ZVFHMIN32-NEXT: vmv.x.s a4, v11 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, s10 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 172(sp) +; ZVFHMIN32-NEXT: lh a1, 726(sp) +; ZVFHMIN32-NEXT: lh s10, 470(sp) +; ZVFHMIN32-NEXT: vmv.x.s a2, v13 +; ZVFHMIN32-NEXT: vmv.x.s s11, 
v29 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, s10 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 171(sp) +; ZVFHMIN32-NEXT: lh ra, 724(sp) +; ZVFHMIN32-NEXT: lh a0, 468(sp) +; ZVFHMIN32-NEXT: vmv.x.s a5, v27 +; ZVFHMIN32-NEXT: vmv.x.s s10, v7 +; ZVFHMIN32-NEXT: fmv.h.x fa5, ra +; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 170(sp) ; ZVFHMIN32-NEXT: lh a0, 722(sp) ; ZVFHMIN32-NEXT: lh a1, 466(sp) -; ZVFHMIN32-NEXT: vmv.x.s s9, v31 -; ZVFHMIN32-NEXT: vmv.x.s a3, v5 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 169(sp) -; ZVFHMIN32-NEXT: lh a0, 720(sp) -; ZVFHMIN32-NEXT: lh a1, 464(sp) -; ZVFHMIN32-NEXT: vmv.x.s a2, v27 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s5 +; ZVFHMIN32-NEXT: vmv.x.s ra, v21 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s7 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa3, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa4, fa3 -; ZVFHMIN32-NEXT: sb a0, 168(sp) -; ZVFHMIN32-NEXT: lh a0, 718(sp) -; ZVFHMIN32-NEXT: lh a1, 462(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, s2 -; ZVFHMIN32-NEXT: fmv.h.x fa3, s6 +; ZVFHMIN32-NEXT: sb a0, 169(sp) +; ZVFHMIN32-NEXT: lh a0, 720(sp) +; ZVFHMIN32-NEXT: lh a1, 464(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, s4 +; ZVFHMIN32-NEXT: fmv.h.x fa3, s8 ; ZVFHMIN32-NEXT: fmv.h.x fa2, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa1, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 -; ZVFHMIN32-NEXT: sb a0, 167(sp) -; ZVFHMIN32-NEXT: lh a0, 716(sp) -; ZVFHMIN32-NEXT: lh a1, 460(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa2, s3 -; ZVFHMIN32-NEXT: fmv.h.x fa1, s7 +; ZVFHMIN32-NEXT: sb a0, 168(sp) +; ZVFHMIN32-NEXT: lh a0, 718(sp) +; ZVFHMIN32-NEXT: lh a1, 462(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa2, s5 +; ZVFHMIN32-NEXT: fmv.h.x fa1, s9 ; ZVFHMIN32-NEXT: fmv.h.x fa0, a0 ; ZVFHMIN32-NEXT: fmv.h.x ft0, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa0, ft0 -; ZVFHMIN32-NEXT: sb a0, 166(sp) -; ZVFHMIN32-NEXT: lh a0, 714(sp) -; ZVFHMIN32-NEXT: lh a1, 458(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa0, s4 -; ZVFHMIN32-NEXT: fmv.h.x ft0, s8 +; ZVFHMIN32-NEXT: sb a0, 167(sp) +; ZVFHMIN32-NEXT: lh a0, 716(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa0, s6 +; ZVFHMIN32-NEXT: lh a1, 460(sp) +; ZVFHMIN32-NEXT: fmv.h.x ft0, a3 ; ZVFHMIN32-NEXT: fmv.h.x ft1, a0 -; ZVFHMIN32-NEXT: fmv.h.x ft2, a1 -; ZVFHMIN32-NEXT: feq.h a0, ft1, ft2 -; ZVFHMIN32-NEXT: sb a0, 165(sp) -; ZVFHMIN32-NEXT: lh a0, 712(sp) -; ZVFHMIN32-NEXT: lh a1, 456(sp) -; ZVFHMIN32-NEXT: fmv.h.x ft1, s10 -; ZVFHMIN32-NEXT: fmv.h.x ft2, s11 -; ZVFHMIN32-NEXT: fmv.h.x ft3, a0 -; ZVFHMIN32-NEXT: fmv.h.x ft4, a1 -; ZVFHMIN32-NEXT: feq.h a0, ft3, ft4 -; ZVFHMIN32-NEXT: sb a0, 164(sp) -; ZVFHMIN32-NEXT: lh a0, 710(sp) -; ZVFHMIN32-NEXT: fmv.h.x ft3, a4 -; ZVFHMIN32-NEXT: lh a1, 454(sp) -; ZVFHMIN32-NEXT: fmv.h.x ft4, ra -; ZVFHMIN32-NEXT: fmv.h.x ft5, a0 -; ZVFHMIN32-NEXT: feq.h a0, fa5, ft1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, ft0 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: feq.h a1, ft5, fa5 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a3 -; ZVFHMIN32-NEXT: sb a1, 163(sp) -; ZVFHMIN32-NEXT: lh a1, 708(sp) -; ZVFHMIN32-NEXT: fmv.h.x ft1, a2 -; ZVFHMIN32-NEXT: lh a2, 452(sp) -; ZVFHMIN32-NEXT: feq.h a3, fa0, fa5 +; ZVFHMIN32-NEXT: feq.h a1, ft1, fa5 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a4 +; ZVFHMIN32-NEXT: sb a1, 166(sp) +; ZVFHMIN32-NEXT: lh a1, 714(sp) +; ZVFHMIN32-NEXT: fmv.h.x ft0, a2 +; ZVFHMIN32-NEXT: lh a2, 458(sp) +; ZVFHMIN32-NEXT: feq.h a3, fa4, fa5 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: feq.h a1, ft0, ft1 -; ZVFHMIN32-NEXT: 
fmv.h.x fa0, a2 -; ZVFHMIN32-NEXT: feq.h a2, fa5, fa0 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s9 +; ZVFHMIN32-NEXT: feq.h a1, fa3, ft0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s3 +; ZVFHMIN32-NEXT: sb a2, 165(sp) +; ZVFHMIN32-NEXT: lh a2, 712(sp) +; ZVFHMIN32-NEXT: lh a4, 456(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, s11 +; ZVFHMIN32-NEXT: feq.h s3, fa2, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a4 +; ZVFHMIN32-NEXT: feq.h a2, fa4, fa3 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 +; ZVFHMIN32-NEXT: sb a2, 164(sp) +; ZVFHMIN32-NEXT: lh a2, 710(sp) +; ZVFHMIN32-NEXT: lh a4, 454(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa3, s10 +; ZVFHMIN32-NEXT: feq.h a5, fa1, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: fmv.h.x fa2, a4 +; ZVFHMIN32-NEXT: feq.h a2, fa4, fa2 +; ZVFHMIN32-NEXT: fmv.h.x fa4, ra +; ZVFHMIN32-NEXT: sb a2, 163(sp) +; ZVFHMIN32-NEXT: lh a2, 708(sp) +; ZVFHMIN32-NEXT: lh a4, 452(sp) +; ZVFHMIN32-NEXT: feq.h s4, fa0, fa3 +; ZVFHMIN32-NEXT: feq.h s5, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 ; ZVFHMIN32-NEXT: sb a2, 162(sp) ; ZVFHMIN32-NEXT: lh a2, 706(sp) ; ZVFHMIN32-NEXT: lh a4, 450(sp) -; ZVFHMIN32-NEXT: sb a1, 129(sp) -; ZVFHMIN32-NEXT: feq.h a1, fa1, fa5 -; ZVFHMIN32-NEXT: sb a3, 130(sp) -; ZVFHMIN32-NEXT: feq.h a3, fa2, ft4 -; ZVFHMIN32-NEXT: sb a1, 131(sp) -; ZVFHMIN32-NEXT: feq.h a1, fa4, ft2 -; ZVFHMIN32-NEXT: sb a3, 132(sp) -; ZVFHMIN32-NEXT: feq.h a3, fa3, ft3 +; ZVFHMIN32-NEXT: sb s5, 129(sp) +; ZVFHMIN32-NEXT: sb s4, 130(sp) +; ZVFHMIN32-NEXT: sb a5, 131(sp) +; ZVFHMIN32-NEXT: sb s3, 132(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 ; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN32-NEXT: sb a3, 133(sp) -; ZVFHMIN32-NEXT: sb a1, 134(sp) +; ZVFHMIN32-NEXT: sb a1, 133(sp) +; ZVFHMIN32-NEXT: sb a3, 134(sp) ; ZVFHMIN32-NEXT: sb a0, 135(sp) ; ZVFHMIN32-NEXT: sb a2, 161(sp) ; ZVFHMIN32-NEXT: lh a0, 610(sp) ; ZVFHMIN32-NEXT: lh a1, 354(sp) -; ZVFHMIN32-NEXT: vmv.x.s s4, v23 +; ZVFHMIN32-NEXT: vmv.x.s s6, v23 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 10 +; ZVFHMIN32-NEXT: li a3, 18 ; ZVFHMIN32-NEXT: mul a2, a2, a3 ; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: lh s2, 848(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: lh s5, 848(a2) # 8-byte Folded Reload ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 @@ -1748,12 +1753,13 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: lh a0, 608(sp) ; ZVFHMIN32-NEXT: lh a1, 352(sp) ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: slli a2, a2, 4 +; ZVFHMIN32-NEXT: li a3, 22 +; ZVFHMIN32-NEXT: mul a2, a2, a3 ; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: lh s5, 848(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: lh s4, 848(a2) # 8-byte Folded Reload ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: slli a3, a2, 4 -; ZVFHMIN32-NEXT: sub a2, a3, a2 +; ZVFHMIN32-NEXT: li a3, 21 +; ZVFHMIN32-NEXT: mul a2, a2, a3 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: lh s3, 848(a2) # 8-byte Folded Reload ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 @@ -1762,148 +1768,148 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: sb a0, 240(sp) ; ZVFHMIN32-NEXT: lh a0, 606(sp) ; ZVFHMIN32-NEXT: lh a1, 350(sp) -; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 7 -; 
ZVFHMIN32-NEXT: vmv.x.s s6, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa3, t5 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s2 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa4, fa2 ; ZVFHMIN32-NEXT: sb a0, 239(sp) ; ZVFHMIN32-NEXT: lh a0, 604(sp) ; ZVFHMIN32-NEXT: lh a1, 348(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 6 -; ZVFHMIN32-NEXT: vmv.x.s s7, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa4, t6 +; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 7 +; ZVFHMIN32-NEXT: fmv.h.x fa2, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa1, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN32-NEXT: sb a0, 238(sp) ; ZVFHMIN32-NEXT: lh a0, 602(sp) ; ZVFHMIN32-NEXT: lh a1, 346(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 5 -; ZVFHMIN32-NEXT: vmv.x.s s8, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: vmv.x.s a2, v8 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 6 +; ZVFHMIN32-NEXT: fmv.h.x fa2, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa1, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN32-NEXT: sb a0, 237(sp) ; ZVFHMIN32-NEXT: lh a0, 600(sp) ; ZVFHMIN32-NEXT: lh a1, 344(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 4 -; ZVFHMIN32-NEXT: vmv.x.s s9, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: vmv.x.s a3, v8 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 5 +; ZVFHMIN32-NEXT: fmv.h.x fa2, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa1, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN32-NEXT: sb a0, 236(sp) ; ZVFHMIN32-NEXT: lh a0, 598(sp) ; ZVFHMIN32-NEXT: lh a1, 342(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 3 -; ZVFHMIN32-NEXT: vmv.x.s s10, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: vmv.x.s a4, v8 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 4 +; ZVFHMIN32-NEXT: fmv.h.x fa2, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa1, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN32-NEXT: sb a0, 235(sp) ; ZVFHMIN32-NEXT: lh a0, 596(sp) ; ZVFHMIN32-NEXT: lh a1, 340(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 2 -; ZVFHMIN32-NEXT: vmv.x.s s11, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: vmv.x.s a5, v8 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 3 +; ZVFHMIN32-NEXT: fmv.h.x fa2, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa1, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN32-NEXT: sb a0, 234(sp) ; ZVFHMIN32-NEXT: lh a0, 594(sp) ; ZVFHMIN32-NEXT: lh a1, 338(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 1 -; ZVFHMIN32-NEXT: vmv.x.s ra, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: vmv.x.s t6, v8 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 2 +; ZVFHMIN32-NEXT: fmv.h.x fa2, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa1, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN32-NEXT: sb a0, 233(sp) ; ZVFHMIN32-NEXT: lh a0, 592(sp) ; ZVFHMIN32-NEXT: lh a1, 336(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, t4 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t6 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN32-NEXT: vmv.x.s s2, v8 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 1 
+; ZVFHMIN32-NEXT: fmv.h.x fa2, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa1, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN32-NEXT: sb a0, 232(sp) ; ZVFHMIN32-NEXT: lh a0, 590(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa2, a2 ; ZVFHMIN32-NEXT: lh a1, 334(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa3, t5 -; ZVFHMIN32-NEXT: fmv.h.x fa2, s4 +; ZVFHMIN32-NEXT: vmv.x.s a2, v8 ; ZVFHMIN32-NEXT: fmv.h.x fa1, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa0, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa1, fa0 +; ZVFHMIN32-NEXT: feq.h t5, fa3, fa2 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa1, fa3 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a3 ; ZVFHMIN32-NEXT: sb a0, 231(sp) ; ZVFHMIN32-NEXT: lh a0, 588(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa2, a4 ; ZVFHMIN32-NEXT: lh a1, 332(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa1, s2 -; ZVFHMIN32-NEXT: fmv.h.x fa0, s5 -; ZVFHMIN32-NEXT: fmv.h.x ft0, a0 -; ZVFHMIN32-NEXT: fmv.h.x ft1, a1 -; ZVFHMIN32-NEXT: feq.h a0, ft0, ft1 -; ZVFHMIN32-NEXT: sb a0, 230(sp) -; ZVFHMIN32-NEXT: lh a0, 586(sp) -; ZVFHMIN32-NEXT: fmv.h.x ft0, s3 -; ZVFHMIN32-NEXT: lh a1, 330(sp) -; ZVFHMIN32-NEXT: fmv.h.x ft1, s6 -; ZVFHMIN32-NEXT: fmv.h.x ft2, a0 -; ZVFHMIN32-NEXT: feq.h a0, fa5, ft1 +; ZVFHMIN32-NEXT: feq.h a3, fa5, fa3 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: feq.h a0, fa4, fa2 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s6 +; ZVFHMIN32-NEXT: sb a1, 230(sp) +; ZVFHMIN32-NEXT: lh a1, 586(sp) +; ZVFHMIN32-NEXT: lh a4, 330(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 +; ZVFHMIN32-NEXT: feq.h a5, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: feq.h a1, ft2, fa5 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s7 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s5 ; ZVFHMIN32-NEXT: sb a1, 229(sp) ; ZVFHMIN32-NEXT: lh a1, 584(sp) -; ZVFHMIN32-NEXT: fmv.h.x ft1, s8 -; ZVFHMIN32-NEXT: lh a2, 328(sp) -; ZVFHMIN32-NEXT: feq.h a3, fa4, fa5 +; ZVFHMIN32-NEXT: lh a4, 328(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, t6 +; ZVFHMIN32-NEXT: feq.h t6, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: feq.h a1, fa3, ft1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s9 -; ZVFHMIN32-NEXT: sb a2, 228(sp) -; ZVFHMIN32-NEXT: lh a2, 582(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s4 +; ZVFHMIN32-NEXT: sb a1, 228(sp) +; ZVFHMIN32-NEXT: lh a1, 582(sp) ; ZVFHMIN32-NEXT: lh a4, 326(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, s10 -; ZVFHMIN32-NEXT: feq.h t4, fa2, fa5 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a4 -; ZVFHMIN32-NEXT: feq.h a2, fa5, fa3 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s11 -; ZVFHMIN32-NEXT: fmv.h.x fa3, ra -; ZVFHMIN32-NEXT: sb a2, 227(sp) -; ZVFHMIN32-NEXT: lh a2, 580(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, s2 +; ZVFHMIN32-NEXT: feq.h s2, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s3 +; ZVFHMIN32-NEXT: sb a1, 227(sp) +; ZVFHMIN32-NEXT: lh a1, 580(sp) ; ZVFHMIN32-NEXT: lh a4, 324(sp) -; ZVFHMIN32-NEXT: feq.h t5, fa0, fa5 -; ZVFHMIN32-NEXT: feq.h t6, ft0, fa3 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a4 -; ZVFHMIN32-NEXT: feq.h a2, fa5, fa3 -; ZVFHMIN32-NEXT: sb a2, 226(sp) -; ZVFHMIN32-NEXT: lh a2, 578(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN32-NEXT: 
feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 226(sp) +; ZVFHMIN32-NEXT: lh a1, 578(sp) ; ZVFHMIN32-NEXT: lh a4, 322(sp) -; ZVFHMIN32-NEXT: sb t6, 193(sp) -; ZVFHMIN32-NEXT: feq.h t6, fa1, fa4 -; ZVFHMIN32-NEXT: sb t5, 194(sp) +; ZVFHMIN32-NEXT: sb a2, 193(sp) +; ZVFHMIN32-NEXT: sb s2, 194(sp) ; ZVFHMIN32-NEXT: sb t6, 195(sp) -; ZVFHMIN32-NEXT: sb t4, 196(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN32-NEXT: sb a5, 196(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN32-NEXT: sb a1, 197(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 197(sp) ; ZVFHMIN32-NEXT: sb a3, 198(sp) -; ZVFHMIN32-NEXT: sb a0, 199(sp) -; ZVFHMIN32-NEXT: sb a2, 225(sp) +; ZVFHMIN32-NEXT: sb t5, 199(sp) +; ZVFHMIN32-NEXT: sb a1, 225(sp) ; ZVFHMIN32-NEXT: lh a0, 766(sp) ; ZVFHMIN32-NEXT: lh a1, 510(sp) ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: slli a3, a2, 4 -; ZVFHMIN32-NEXT: add a2, a3, a2 +; ZVFHMIN32-NEXT: li a3, 19 +; ZVFHMIN32-NEXT: mul a2, a2, a3 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 ; ZVFHMIN32-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload ; ZVFHMIN32-NEXT: vmv.x.s s2, v8 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 11 +; ZVFHMIN32-NEXT: li a3, 14 ; ZVFHMIN32-NEXT: mul a2, a2, a3 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 @@ -1915,305 +1921,301 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: sb a0, 191(sp) ; ZVFHMIN32-NEXT: lh a0, 764(sp) ; ZVFHMIN32-NEXT: lh a1, 508(sp) -; ZVFHMIN32-NEXT: vmv.x.s t5, v4 -; ZVFHMIN32-NEXT: vmv.x.s t4, v30 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 190(sp) -; ZVFHMIN32-NEXT: lh a0, 762(sp) -; ZVFHMIN32-NEXT: lh a1, 506(sp) +; ZVFHMIN32-NEXT: vmv.x.s t5, v6 ; ZVFHMIN32-NEXT: csrr a2, vlenb ; ZVFHMIN32-NEXT: slli a2, a2, 2 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 ; ZVFHMIN32-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload ; ZVFHMIN32-NEXT: vmv.x.s a2, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 190(sp) +; ZVFHMIN32-NEXT: lh a0, 762(sp) +; ZVFHMIN32-NEXT: lh a1, 506(sp) ; ZVFHMIN32-NEXT: csrr a3, vlenb -; ZVFHMIN32-NEXT: slli a3, a3, 1 +; ZVFHMIN32-NEXT: slli a3, a3, 3 ; ZVFHMIN32-NEXT: add a3, sp, a3 ; ZVFHMIN32-NEXT: addi a3, a3, 848 ; ZVFHMIN32-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload ; ZVFHMIN32-NEXT: vmv.x.s a3, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 189(sp) -; ZVFHMIN32-NEXT: lh a0, 760(sp) -; ZVFHMIN32-NEXT: lh a1, 504(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, t3 ; ZVFHMIN32-NEXT: csrr a4, vlenb -; ZVFHMIN32-NEXT: li t3, 6 -; ZVFHMIN32-NEXT: mul a4, a4, t3 +; ZVFHMIN32-NEXT: li a5, 6 +; ZVFHMIN32-NEXT: mul a4, a4, a5 ; ZVFHMIN32-NEXT: add a4, sp, a4 ; ZVFHMIN32-NEXT: addi a4, a4, 848 ; ZVFHMIN32-NEXT: vl2r.v v8, (a4) # Unknown-size Folded Reload ; ZVFHMIN32-NEXT: vmv.x.s a4, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa4, fa3 -; ZVFHMIN32-NEXT: sb a0, 188(sp) -; ZVFHMIN32-NEXT: lh a0, 758(sp) -; ZVFHMIN32-NEXT: lh a1, 502(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, t2 -; ZVFHMIN32-NEXT: csrr t2, vlenb -; ZVFHMIN32-NEXT: slli t2, t2, 3 -; ZVFHMIN32-NEXT: add t2, sp, t2 -; ZVFHMIN32-NEXT: 
addi t2, t2, 848 -; ZVFHMIN32-NEXT: vl2r.v v8, (t2) # Unknown-size Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s t2, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 -; ZVFHMIN32-NEXT: sb a0, 187(sp) -; ZVFHMIN32-NEXT: lh a0, 756(sp) -; ZVFHMIN32-NEXT: lh a1, 500(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa3, t1 -; ZVFHMIN32-NEXT: csrr t1, vlenb -; ZVFHMIN32-NEXT: li t3, 13 -; ZVFHMIN32-NEXT: mul t1, t1, t3 -; ZVFHMIN32-NEXT: add t1, sp, t1 -; ZVFHMIN32-NEXT: addi t1, t1, 848 -; ZVFHMIN32-NEXT: vl2r.v v8, (t1) # Unknown-size Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s t3, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa1, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 -; ZVFHMIN32-NEXT: sb a0, 186(sp) -; ZVFHMIN32-NEXT: lh a0, 754(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa2, t0 -; ZVFHMIN32-NEXT: lh a1, 498(sp) -; ZVFHMIN32-NEXT: csrr t0, vlenb -; ZVFHMIN32-NEXT: li t1, 19 -; ZVFHMIN32-NEXT: mul t0, t0, t1 -; ZVFHMIN32-NEXT: add t0, sp, t0 -; ZVFHMIN32-NEXT: addi t0, t0, 848 -; ZVFHMIN32-NEXT: vl2r.v v8, (t0) # Unknown-size Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s s3, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa1, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 189(sp) +; ZVFHMIN32-NEXT: lh a1, 760(sp) +; ZVFHMIN32-NEXT: lh a5, 504(sp) +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: li s3, 12 +; ZVFHMIN32-NEXT: mul a0, a0, s3 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: addi a0, a0, 848 +; ZVFHMIN32-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s s5, v8 ; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: li t0, 21 -; ZVFHMIN32-NEXT: mul a0, a0, t0 +; ZVFHMIN32-NEXT: li s3, 10 +; ZVFHMIN32-NEXT: mul a0, a0, s3 ; ZVFHMIN32-NEXT: add a0, sp, a0 ; ZVFHMIN32-NEXT: addi a0, a0, 848 ; ZVFHMIN32-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload ; ZVFHMIN32-NEXT: vmv.x.s a0, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa0, a1 -; ZVFHMIN32-NEXT: feq.h a1, fa1, fa0 -; ZVFHMIN32-NEXT: fmv.h.x fa1, a2 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 188(sp) +; ZVFHMIN32-NEXT: lh a1, 758(sp) +; ZVFHMIN32-NEXT: lh a5, 502(sp) +; ZVFHMIN32-NEXT: csrr s3, vlenb +; ZVFHMIN32-NEXT: slli s3, s3, 4 +; ZVFHMIN32-NEXT: add s3, sp, s3 +; ZVFHMIN32-NEXT: addi s3, s3, 848 +; ZVFHMIN32-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s s4, v8 +; ZVFHMIN32-NEXT: vmv.x.s s3, v16 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, t4 +; ZVFHMIN32-NEXT: sb a1, 187(sp) +; ZVFHMIN32-NEXT: lh a1, 756(sp) +; ZVFHMIN32-NEXT: lh a5, 500(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: feq.h t4, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, t3 +; ZVFHMIN32-NEXT: sb a1, 186(sp) +; ZVFHMIN32-NEXT: lh a1, 754(sp) +; ZVFHMIN32-NEXT: lh a2, 498(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN32-NEXT: feq.h t3, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, t1 ; ZVFHMIN32-NEXT: sb a1, 185(sp) ; ZVFHMIN32-NEXT: lh a1, 752(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa0, a3 ; ZVFHMIN32-NEXT: lh a2, 496(sp) -; ZVFHMIN32-NEXT: feq.h t0, fa5, fa1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 +; 
ZVFHMIN32-NEXT: feq.h t1, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: feq.h t1, fa4, fa0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 ; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, t2 ; ZVFHMIN32-NEXT: sb a1, 184(sp) ; ZVFHMIN32-NEXT: lh a1, 750(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, t2 ; ZVFHMIN32-NEXT: lh a2, 494(sp) -; ZVFHMIN32-NEXT: feq.h a3, fa3, fa5 +; ZVFHMIN32-NEXT: fmv.h.x fa4, s5 +; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: feq.h a1, fa2, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, t0 +; ZVFHMIN32-NEXT: sb a1, 183(sp) +; ZVFHMIN32-NEXT: lh a1, 748(sp) +; ZVFHMIN32-NEXT: lh a2, 492(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a7 -; ZVFHMIN32-NEXT: sb a2, 183(sp) -; ZVFHMIN32-NEXT: lh a2, 748(sp) -; ZVFHMIN32-NEXT: lh a4, 492(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, t3 -; ZVFHMIN32-NEXT: feq.h a7, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 182(sp) +; ZVFHMIN32-NEXT: lh a1, 746(sp) +; ZVFHMIN32-NEXT: lh a2, 490(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, s4 +; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a6 -; ZVFHMIN32-NEXT: sb a2, 182(sp) -; ZVFHMIN32-NEXT: lh a2, 746(sp) -; ZVFHMIN32-NEXT: lh a4, 490(sp) +; ZVFHMIN32-NEXT: sb a1, 181(sp) +; ZVFHMIN32-NEXT: lh a1, 744(sp) +; ZVFHMIN32-NEXT: lh a2, 488(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa4, s3 ; ZVFHMIN32-NEXT: feq.h a6, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: lw a2, 104(sp) # 4-byte Folded Reload ; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a5 -; ZVFHMIN32-NEXT: sb a2, 181(sp) -; ZVFHMIN32-NEXT: lh a2, 744(sp) -; ZVFHMIN32-NEXT: lh a4, 488(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN32-NEXT: lw a4, 108(sp) # 4-byte Folded Reload -; ZVFHMIN32-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN32-NEXT: vmv.x.s a5, v0 +; ZVFHMIN32-NEXT: addi a2, sp, 848 +; ZVFHMIN32-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s a2, v8 ; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma ; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 15 -; ZVFHMIN32-NEXT: vmv.x.s a4, v8 -; ZVFHMIN32-NEXT: sb a2, 180(sp) -; ZVFHMIN32-NEXT: lh a2, 742(sp) -; ZVFHMIN32-NEXT: lh t2, 486(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN32-NEXT: feq.h a5, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t2 +; ZVFHMIN32-NEXT: vmv.x.s a5, v8 +; ZVFHMIN32-NEXT: sb a1, 180(sp) +; ZVFHMIN32-NEXT: lh a1, 742(sp) +; ZVFHMIN32-NEXT: lh a7, 486(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 ; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN32-NEXT: sb a2, 179(sp) -; ZVFHMIN32-NEXT: lh a2, 740(sp) -; ZVFHMIN32-NEXT: lh t2, 484(sp) -; ZVFHMIN32-NEXT: sb a1, 140(sp) -; ZVFHMIN32-NEXT: sb a3, 141(sp) -; ZVFHMIN32-NEXT: sb t1, 142(sp) -; 
ZVFHMIN32-NEXT: sb t0, 143(sp) -; ZVFHMIN32-NEXT: sb a5, 136(sp) -; ZVFHMIN32-NEXT: sb a0, 137(sp) -; ZVFHMIN32-NEXT: sb a6, 138(sp) -; ZVFHMIN32-NEXT: sb a7, 139(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t2 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a7 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 179(sp) +; ZVFHMIN32-NEXT: lh a1, 740(sp) +; ZVFHMIN32-NEXT: lh a7, 484(sp) +; ZVFHMIN32-NEXT: sb a3, 140(sp) +; ZVFHMIN32-NEXT: sb t1, 141(sp) +; ZVFHMIN32-NEXT: sb t3, 142(sp) +; ZVFHMIN32-NEXT: sb t4, 143(sp) +; ZVFHMIN32-NEXT: sb a2, 136(sp) +; ZVFHMIN32-NEXT: sb a6, 137(sp) +; ZVFHMIN32-NEXT: sb a4, 138(sp) +; ZVFHMIN32-NEXT: sb a0, 139(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a7 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 178(sp) -; ZVFHMIN32-NEXT: lh a0, 638(sp) -; ZVFHMIN32-NEXT: lh a1, 382(sp) +; ZVFHMIN32-NEXT: lh a1, 638(sp) +; ZVFHMIN32-NEXT: lh a2, 382(sp) ; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 14 -; ZVFHMIN32-NEXT: vmv.x.s t2, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 255(sp) -; ZVFHMIN32-NEXT: lh a0, 636(sp) -; ZVFHMIN32-NEXT: lh a1, 380(sp) +; ZVFHMIN32-NEXT: vmv.x.s a0, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 255(sp) +; ZVFHMIN32-NEXT: lh a1, 636(sp) +; ZVFHMIN32-NEXT: lh a2, 380(sp) ; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 13 -; ZVFHMIN32-NEXT: vmv.x.s t1, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 254(sp) -; ZVFHMIN32-NEXT: lh a0, 634(sp) -; ZVFHMIN32-NEXT: lh a1, 378(sp) +; ZVFHMIN32-NEXT: vmv.x.s t2, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 254(sp) +; ZVFHMIN32-NEXT: lh a1, 634(sp) +; ZVFHMIN32-NEXT: lh a2, 378(sp) ; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 12 -; ZVFHMIN32-NEXT: vmv.x.s t0, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 253(sp) -; ZVFHMIN32-NEXT: lh a0, 632(sp) -; ZVFHMIN32-NEXT: lh a1, 376(sp) +; ZVFHMIN32-NEXT: vmv.x.s t1, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 253(sp) +; ZVFHMIN32-NEXT: lh a1, 632(sp) +; ZVFHMIN32-NEXT: lh a2, 376(sp) ; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 11 -; ZVFHMIN32-NEXT: vmv.x.s a7, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 252(sp) -; ZVFHMIN32-NEXT: lh a0, 630(sp) -; ZVFHMIN32-NEXT: lh a1, 374(sp) +; ZVFHMIN32-NEXT: vmv.x.s t0, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 252(sp) +; ZVFHMIN32-NEXT: lh a1, 630(sp) +; ZVFHMIN32-NEXT: lh a2, 374(sp) ; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 10 -; ZVFHMIN32-NEXT: vmv.x.s a6, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 251(sp) -; ZVFHMIN32-NEXT: lh a0, 628(sp) -; ZVFHMIN32-NEXT: lh a1, 372(sp) +; ZVFHMIN32-NEXT: vmv.x.s a7, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 
+; ZVFHMIN32-NEXT: sb a1, 251(sp) +; ZVFHMIN32-NEXT: lh a1, 628(sp) +; ZVFHMIN32-NEXT: lh a2, 372(sp) ; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 9 -; ZVFHMIN32-NEXT: vmv.x.s a5, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: vmv.x.s a6, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: lw a2, 108(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN32-NEXT: sb a1, 250(sp) +; ZVFHMIN32-NEXT: lh a1, 626(sp) +; ZVFHMIN32-NEXT: lh a2, 370(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 +; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: lw a2, 112(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN32-NEXT: sb a1, 249(sp) +; ZVFHMIN32-NEXT: lh a1, 624(sp) +; ZVFHMIN32-NEXT: lh a2, 368(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 +; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: lw a1, 116(sp) # 4-byte Folded Reload ; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: sb a0, 250(sp) -; ZVFHMIN32-NEXT: lh a0, 626(sp) -; ZVFHMIN32-NEXT: lh a1, 370(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN32-NEXT: sb a0, 248(sp) +; ZVFHMIN32-NEXT: lh a0, 622(sp) +; ZVFHMIN32-NEXT: lh a1, 366(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, t2 ; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: lw a1, 124(sp) # 4-byte Folded Reload -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: sb a0, 249(sp) -; ZVFHMIN32-NEXT: lh a1, 624(sp) -; ZVFHMIN32-NEXT: lh a3, 368(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, t2 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: lw a1, 120(sp) # 4-byte Folded Reload ; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: lw a3, 112(sp) # 4-byte Folded Reload -; ZVFHMIN32-NEXT: fmv.h.x fa5, a3 -; ZVFHMIN32-NEXT: sb a1, 248(sp) -; ZVFHMIN32-NEXT: lh a1, 622(sp) -; ZVFHMIN32-NEXT: lh a3, 366(sp) +; ZVFHMIN32-NEXT: sb a0, 247(sp) +; ZVFHMIN32-NEXT: lh a0, 620(sp) +; ZVFHMIN32-NEXT: lh a1, 364(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa4, t1 -; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a5, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: lw a1, 124(sp) # 4-byte Folded Reload ; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: lw a3, 120(sp) # 4-byte Folded Reload -; ZVFHMIN32-NEXT: fmv.h.x fa5, a3 -; ZVFHMIN32-NEXT: sb a1, 247(sp) -; ZVFHMIN32-NEXT: lh a1, 620(sp) -; ZVFHMIN32-NEXT: lh a3, 364(sp) +; ZVFHMIN32-NEXT: sb a0, 246(sp) +; ZVFHMIN32-NEXT: lh a0, 618(sp) +; ZVFHMIN32-NEXT: lh a1, 362(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa4, t0 ; ZVFHMIN32-NEXT: feq.h t0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, s2 -; ZVFHMIN32-NEXT: sb a1, 246(sp) -; ZVFHMIN32-NEXT: lh a1, 618(sp) -; ZVFHMIN32-NEXT: lh a3, 362(sp) +; ZVFHMIN32-NEXT: sb a0, 245(sp) +; ZVFHMIN32-NEXT: lh a0, 616(sp) +; 
ZVFHMIN32-NEXT: lh a1, 360(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa4, a7 ; ZVFHMIN32-NEXT: feq.h a7, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, t6 -; ZVFHMIN32-NEXT: sb a1, 245(sp) -; ZVFHMIN32-NEXT: lh a1, 616(sp) -; ZVFHMIN32-NEXT: lh a3, 360(sp) +; ZVFHMIN32-NEXT: sb a0, 244(sp) +; ZVFHMIN32-NEXT: lh a0, 614(sp) +; ZVFHMIN32-NEXT: lh a1, 358(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa4, a6 ; ZVFHMIN32-NEXT: feq.h a6, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, t5 -; ZVFHMIN32-NEXT: sb a1, 244(sp) -; ZVFHMIN32-NEXT: lh a1, 614(sp) -; ZVFHMIN32-NEXT: lh a3, 358(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN32-NEXT: feq.h a5, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t4 ; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 8 -; ZVFHMIN32-NEXT: vmv.x.s a3, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN32-NEXT: sb a1, 243(sp) -; ZVFHMIN32-NEXT: lh a1, 612(sp) -; ZVFHMIN32-NEXT: lh a3, 356(sp) -; ZVFHMIN32-NEXT: sb t0, 204(sp) -; ZVFHMIN32-NEXT: sb a4, 205(sp) -; ZVFHMIN32-NEXT: sb a0, 206(sp) -; ZVFHMIN32-NEXT: sb a2, 207(sp) +; ZVFHMIN32-NEXT: vmv.x.s a1, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: sb a0, 243(sp) +; ZVFHMIN32-NEXT: lh a0, 612(sp) +; ZVFHMIN32-NEXT: lh a1, 356(sp) +; ZVFHMIN32-NEXT: sb a5, 204(sp) +; ZVFHMIN32-NEXT: sb a2, 205(sp) +; ZVFHMIN32-NEXT: sb a3, 206(sp) +; ZVFHMIN32-NEXT: sb a4, 207(sp) +; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN32-NEXT: sb a2, 200(sp) +; ZVFHMIN32-NEXT: sb a6, 201(sp) +; ZVFHMIN32-NEXT: sb a7, 202(sp) +; ZVFHMIN32-NEXT: sb t0, 203(sp) +; ZVFHMIN32-NEXT: li a2, 128 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 200(sp) -; ZVFHMIN32-NEXT: sb a5, 201(sp) -; ZVFHMIN32-NEXT: sb a6, 202(sp) -; ZVFHMIN32-NEXT: sb a7, 203(sp) -; ZVFHMIN32-NEXT: li a0, 128 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: sb a1, 242(sp) -; ZVFHMIN32-NEXT: addi a1, sp, 128 -; ZVFHMIN32-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; ZVFHMIN32-NEXT: vle8.v v8, (a1) +; ZVFHMIN32-NEXT: sb a0, 242(sp) +; ZVFHMIN32-NEXT: addi a0, sp, 128 +; ZVFHMIN32-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; ZVFHMIN32-NEXT: vle8.v v8, (a0) ; ZVFHMIN32-NEXT: vand.vi v8, v8, 1 ; ZVFHMIN32-NEXT: vmsne.vi v0, v8, 0 ; ZVFHMIN32-NEXT: addi sp, s0, -896 @@ -2440,12 +2442,6 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: sb a0, 219(sp) ; ZVFHMIN64-NEXT: lh a0, 564(sp) ; ZVFHMIN64-NEXT: lh a1, 308(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 218(sp) -; ZVFHMIN64-NEXT: lh a0, 562(sp) -; ZVFHMIN64-NEXT: lh a1, 306(sp) ; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 7 ; ZVFHMIN64-NEXT: csrr a2, vlenb @@ -2498,82 +2494,86 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: vs1r.v v10, (a2) # Unknown-size 
Folded Spill ; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma ; ZVFHMIN64-NEXT: vslidedown.vi v26, v8, 15 -; ZVFHMIN64-NEXT: vslidedown.vi v28, v8, 14 -; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 13 -; ZVFHMIN64-NEXT: addi a2, sp, 800 +; ZVFHMIN64-NEXT: vslidedown.vi v20, v8, 14 +; ZVFHMIN64-NEXT: vslidedown.vi v28, v8, 13 +; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 12 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: slli a2, a2, 1 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vs2r.v v10, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vslidedown.vi v6, v8, 12 -; ZVFHMIN64-NEXT: vslidedown.vi v2, v8, 11 -; ZVFHMIN64-NEXT: vslidedown.vi v22, v8, 10 -; ZVFHMIN64-NEXT: vslidedown.vi v20, v8, 9 -; ZVFHMIN64-NEXT: vslidedown.vi v18, v8, 8 -; ZVFHMIN64-NEXT: vmv.x.s a3, v16 +; ZVFHMIN64-NEXT: vslidedown.vi v4, v8, 11 +; ZVFHMIN64-NEXT: vslidedown.vi v2, v8, 10 +; ZVFHMIN64-NEXT: vslidedown.vi v30, v8, 9 +; ZVFHMIN64-NEXT: vslidedown.vi v22, v8, 8 +; ZVFHMIN64-NEXT: vmv.x.s t5, v16 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 217(sp) -; ZVFHMIN64-NEXT: lh a0, 560(sp) -; ZVFHMIN64-NEXT: lh a1, 304(sp) +; ZVFHMIN64-NEXT: sb a0, 218(sp) +; ZVFHMIN64-NEXT: lh a0, 562(sp) +; ZVFHMIN64-NEXT: lh a1, 306(sp) ; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; ZVFHMIN64-NEXT: vslidedown.vi v21, v16, 7 -; ZVFHMIN64-NEXT: vslidedown.vi v3, v16, 6 -; ZVFHMIN64-NEXT: vslidedown.vi v19, v16, 5 +; ZVFHMIN64-NEXT: vslidedown.vi v3, v16, 7 +; ZVFHMIN64-NEXT: vslidedown.vi v31, v16, 6 +; ZVFHMIN64-NEXT: vslidedown.vi v5, v16, 5 ; ZVFHMIN64-NEXT: vslidedown.vi v23, v16, 4 ; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 3 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a4, 10 -; ZVFHMIN64-NEXT: mul a2, a2, a4 +; ZVFHMIN64-NEXT: li a3, 18 +; ZVFHMIN64-NEXT: mul a2, a2, a3 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 2 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: slli a2, a2, 4 +; ZVFHMIN64-NEXT: li a3, 22 +; ZVFHMIN64-NEXT: mul a2, a2, a3 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 1 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: slli a4, a2, 4 -; ZVFHMIN64-NEXT: sub a2, a4, a2 +; ZVFHMIN64-NEXT: li a3, 21 +; ZVFHMIN64-NEXT: mul a2, a2, a3 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma -; ZVFHMIN64-NEXT: vslidedown.vi v14, v16, 15 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 14 -; ZVFHMIN64-NEXT: vslidedown.vi v10, v16, 13 -; ZVFHMIN64-NEXT: vslidedown.vi v12, v16, 12 -; ZVFHMIN64-NEXT: vslidedown.vi v30, v16, 11 +; ZVFHMIN64-NEXT: vslidedown.vi v18, v16, 15 +; ZVFHMIN64-NEXT: vslidedown.vi v14, v16, 14 +; ZVFHMIN64-NEXT: vslidedown.vi v12, v16, 13 +; ZVFHMIN64-NEXT: vslidedown.vi v10, v16, 12 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 11 +; ZVFHMIN64-NEXT: vslidedown.vi v6, v16, 10 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: slli a4, a2, 4 -; ZVFHMIN64-NEXT: add a2, a4, a2 +; ZVFHMIN64-NEXT: li a3, 19 +; ZVFHMIN64-NEXT: mul a2, a2, a3 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vs2r.v v30, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: 
vslidedown.vi v30, v16, 10 +; ZVFHMIN64-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v6, v16, 9 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a4, 11 -; ZVFHMIN64-NEXT: mul a2, a2, a4 +; ZVFHMIN64-NEXT: li a3, 14 +; ZVFHMIN64-NEXT: mul a2, a2, a3 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vs2r.v v30, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vslidedown.vi v4, v16, 9 -; ZVFHMIN64-NEXT: vslidedown.vi v30, v16, 8 +; ZVFHMIN64-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v6, v16, 8 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 216(sp) -; ZVFHMIN64-NEXT: lh a0, 558(sp) -; ZVFHMIN64-NEXT: lh a1, 302(sp) -; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; ZVFHMIN64-NEXT: vslidedown.vi v11, v0, 7 -; ZVFHMIN64-NEXT: vslidedown.vi v7, v0, 6 -; ZVFHMIN64-NEXT: vslidedown.vi v9, v0, 5 -; ZVFHMIN64-NEXT: vslidedown.vi v29, v0, 4 -; ZVFHMIN64-NEXT: vslidedown.vi v31, v0, 3 -; ZVFHMIN64-NEXT: vslidedown.vi v5, v0, 2 -; ZVFHMIN64-NEXT: vslidedown.vi v27, v0, 1 +; ZVFHMIN64-NEXT: sb a0, 217(sp) +; ZVFHMIN64-NEXT: lh a0, 560(sp) +; ZVFHMIN64-NEXT: lh a1, 304(sp) +; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFHMIN64-NEXT: vslidedown.vi v9, v0, 7 +; ZVFHMIN64-NEXT: vslidedown.vi v11, v0, 6 +; ZVFHMIN64-NEXT: vslidedown.vi v13, v0, 5 +; ZVFHMIN64-NEXT: vslidedown.vi v29, v0, 4 +; ZVFHMIN64-NEXT: vslidedown.vi v27, v0, 3 +; ZVFHMIN64-NEXT: vslidedown.vi v7, v0, 2 +; ZVFHMIN64-NEXT: vslidedown.vi v21, v0, 1 ; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma ; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 15 ; ZVFHMIN64-NEXT: csrr a2, vlenb @@ -2583,88 +2583,99 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 14 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: slli a2, a2, 1 +; ZVFHMIN64-NEXT: slli a2, a2, 3 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 13 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a4, 6 -; ZVFHMIN64-NEXT: mul a2, a2, a4 +; ZVFHMIN64-NEXT: li a3, 6 +; ZVFHMIN64-NEXT: mul a2, a2, a3 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 12 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: slli a2, a2, 3 +; ZVFHMIN64-NEXT: li a3, 12 +; ZVFHMIN64-NEXT: mul a2, a2, a3 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 11 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a4, 13 -; ZVFHMIN64-NEXT: mul a2, a2, a4 +; ZVFHMIN64-NEXT: li a3, 10 +; ZVFHMIN64-NEXT: mul a2, a2, a3 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 10 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a4, 19 -; ZVFHMIN64-NEXT: mul a2, a2, a4 +; ZVFHMIN64-NEXT: slli a2, a2, 4 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 9 -; 
ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a4, 21 -; ZVFHMIN64-NEXT: mul a2, a2, a4 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vslidedown.vi v0, v0, 8 +; ZVFHMIN64-NEXT: addi a2, sp, 800 +; ZVFHMIN64-NEXT: vs2r.v v0, (a2) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vmv.x.s t4, v26 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 216(sp) +; ZVFHMIN64-NEXT: lh a0, 558(sp) +; ZVFHMIN64-NEXT: lh a1, 302(sp) +; ZVFHMIN64-NEXT: vmv.x.s t3, v20 +; ZVFHMIN64-NEXT: vmv.x.s t1, v28 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 215(sp) ; ZVFHMIN64-NEXT: lh a0, 556(sp) ; ZVFHMIN64-NEXT: lh a1, 300(sp) -; ZVFHMIN64-NEXT: vmv.x.s t3, v26 -; ZVFHMIN64-NEXT: vmv.x.s t2, v28 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: slli a2, a2, 1 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vl2r.v v0, (a2) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s t2, v0 +; ZVFHMIN64-NEXT: vmv.x.s t0, v4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 214(sp) ; ZVFHMIN64-NEXT: lh a0, 554(sp) ; ZVFHMIN64-NEXT: lh a1, 298(sp) -; ZVFHMIN64-NEXT: addi a2, sp, 800 -; ZVFHMIN64-NEXT: vl2r.v v16, (a2) # Unknown-size Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s t1, v16 -; ZVFHMIN64-NEXT: vmv.x.s t0, v6 +; ZVFHMIN64-NEXT: vmv.x.s a7, v2 +; ZVFHMIN64-NEXT: vmv.x.s a6, v30 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 213(sp) ; ZVFHMIN64-NEXT: lh a0, 552(sp) ; ZVFHMIN64-NEXT: lh a1, 296(sp) -; ZVFHMIN64-NEXT: vmv.x.s a7, v2 -; ZVFHMIN64-NEXT: vmv.x.s a6, v22 +; ZVFHMIN64-NEXT: vmv.x.s a2, v22 +; ZVFHMIN64-NEXT: sd a2, 80(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: vmv.x.s a2, v18 +; ZVFHMIN64-NEXT: sd a2, 88(sp) # 8-byte Folded Spill ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 212(sp) ; ZVFHMIN64-NEXT: lh a0, 550(sp) ; ZVFHMIN64-NEXT: lh a1, 294(sp) -; ZVFHMIN64-NEXT: vmv.x.s a5, v20 -; ZVFHMIN64-NEXT: vmv.x.s a2, v18 -; ZVFHMIN64-NEXT: sd a2, 88(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: vmv.x.s a2, v14 +; ZVFHMIN64-NEXT: sd a2, 96(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: vmv.x.s a2, v12 +; ZVFHMIN64-NEXT: sd a2, 104(sp) # 8-byte Folded Spill ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 211(sp) ; ZVFHMIN64-NEXT: lh a0, 548(sp) ; ZVFHMIN64-NEXT: lh a1, 292(sp) -; ZVFHMIN64-NEXT: vmv.x.s a2, v14 -; ZVFHMIN64-NEXT: sd a2, 104(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: vmv.x.s a2, v10 +; ZVFHMIN64-NEXT: sd a2, 112(sp) # 8-byte Folded Spill ; ZVFHMIN64-NEXT: vmv.x.s a2, v8 ; ZVFHMIN64-NEXT: sd a2, 120(sp) # 8-byte Folded Spill ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 @@ -2673,208 +2684,204 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: sb a0, 210(sp) ; ZVFHMIN64-NEXT: lh a0, 546(sp) ; ZVFHMIN64-NEXT: lh a1, 290(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a3 -; ZVFHMIN64-NEXT: vmv.x.s a3, v24 +; ZVFHMIN64-NEXT: fmv.h.x fa5, t5 +; ZVFHMIN64-NEXT: vmv.x.s t5, v24 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa3, a1 
; ZVFHMIN64-NEXT: feq.h a0, fa4, fa3 ; ZVFHMIN64-NEXT: sb a0, 209(sp) ; ZVFHMIN64-NEXT: lh a0, 544(sp) ; ZVFHMIN64-NEXT: lh a1, 288(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa4, t5 +; ZVFHMIN64-NEXT: feq.h t5, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a3, 192(sp) +; ZVFHMIN64-NEXT: sb t5, 192(sp) ; ZVFHMIN64-NEXT: sb a0, 208(sp) ; ZVFHMIN64-NEXT: lh a0, 738(sp) ; ZVFHMIN64-NEXT: lh a1, 482(sp) -; ZVFHMIN64-NEXT: vmv.x.s a2, v10 -; ZVFHMIN64-NEXT: sd a2, 96(sp) # 8-byte Folded Spill -; ZVFHMIN64-NEXT: vmv.x.s a2, v12 -; ZVFHMIN64-NEXT: sd a2, 112(sp) # 8-byte Folded Spill -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 177(sp) -; ZVFHMIN64-NEXT: lh a0, 736(sp) -; ZVFHMIN64-NEXT: lh a1, 480(sp) ; ZVFHMIN64-NEXT: csrr a2, vlenb ; ZVFHMIN64-NEXT: li a3, 29 ; ZVFHMIN64-NEXT: mul a2, a2, a3 ; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: lh s5, 800(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: lh s7, 800(a2) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: csrr a2, vlenb ; ZVFHMIN64-NEXT: li a3, 28 ; ZVFHMIN64-NEXT: mul a2, a2, a3 ; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: lh s2, 800(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: lh s4, 800(a2) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 176(sp) -; ZVFHMIN64-NEXT: lh a0, 734(sp) -; ZVFHMIN64-NEXT: lh a1, 478(sp) +; ZVFHMIN64-NEXT: sb a0, 177(sp) +; ZVFHMIN64-NEXT: lh a0, 736(sp) +; ZVFHMIN64-NEXT: lh a1, 480(sp) ; ZVFHMIN64-NEXT: csrr a2, vlenb ; ZVFHMIN64-NEXT: li a3, 27 ; ZVFHMIN64-NEXT: mul a2, a2, a3 ; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: lh s6, 800(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: lh s8, 800(a2) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: csrr a2, vlenb ; ZVFHMIN64-NEXT: li a3, 26 ; ZVFHMIN64-NEXT: mul a2, a2, a3 ; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: lh s3, 800(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: lh s5, 800(a2) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 175(sp) -; ZVFHMIN64-NEXT: lh a0, 732(sp) -; ZVFHMIN64-NEXT: lh a1, 476(sp) +; ZVFHMIN64-NEXT: sb a0, 176(sp) +; ZVFHMIN64-NEXT: lh a0, 734(sp) +; ZVFHMIN64-NEXT: lh a1, 478(sp) ; ZVFHMIN64-NEXT: csrr a2, vlenb ; ZVFHMIN64-NEXT: li a3, 25 ; ZVFHMIN64-NEXT: mul a2, a2, a3 ; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: lh s7, 800(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: lh s9, 800(a2) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: csrr a2, vlenb ; ZVFHMIN64-NEXT: li a3, 24 ; ZVFHMIN64-NEXT: mul a2, a2, a3 ; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: lh s4, 800(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: lh s6, 800(a2) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 174(sp) -; ZVFHMIN64-NEXT: lh a0, 730(sp) -; ZVFHMIN64-NEXT: lh a1, 474(sp) +; ZVFHMIN64-NEXT: sb a0, 175(sp) +; ZVFHMIN64-NEXT: lh a0, 732(sp) +; ZVFHMIN64-NEXT: lh a1, 476(sp) ; ZVFHMIN64-NEXT: csrr a2, vlenb ; ZVFHMIN64-NEXT: li a3, 23 ; ZVFHMIN64-NEXT: mul a2, a2, a3 ; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: lh s8, 800(a2) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s t4, v21 -; 
ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 173(sp) -; ZVFHMIN64-NEXT: lh a0, 728(sp) -; ZVFHMIN64-NEXT: lh a1, 472(sp) -; ZVFHMIN64-NEXT: vmv.x.s t6, v3 -; ZVFHMIN64-NEXT: vmv.x.s t5, v19 +; ZVFHMIN64-NEXT: lh s3, 800(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s t5, v3 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 172(sp) -; ZVFHMIN64-NEXT: lh a0, 726(sp) -; ZVFHMIN64-NEXT: lh a1, 470(sp) -; ZVFHMIN64-NEXT: vmv.x.s s10, v11 -; ZVFHMIN64-NEXT: vmv.x.s s11, v7 +; ZVFHMIN64-NEXT: sb a0, 174(sp) +; ZVFHMIN64-NEXT: lh a0, 730(sp) +; ZVFHMIN64-NEXT: lh a1, 474(sp) +; ZVFHMIN64-NEXT: vmv.x.s s2, v31 +; ZVFHMIN64-NEXT: vmv.x.s t6, v5 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 171(sp) -; ZVFHMIN64-NEXT: lh a0, 724(sp) -; ZVFHMIN64-NEXT: lh s9, 468(sp) -; ZVFHMIN64-NEXT: vmv.x.s a4, v9 -; ZVFHMIN64-NEXT: vmv.x.s ra, v29 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, s9 +; ZVFHMIN64-NEXT: sb a0, 173(sp) +; ZVFHMIN64-NEXT: lh a1, 728(sp) +; ZVFHMIN64-NEXT: lh s10, 472(sp) +; ZVFHMIN64-NEXT: vmv.x.s a3, v9 +; ZVFHMIN64-NEXT: vmv.x.s a4, v11 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, s10 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 172(sp) +; ZVFHMIN64-NEXT: lh a1, 726(sp) +; ZVFHMIN64-NEXT: lh s10, 470(sp) +; ZVFHMIN64-NEXT: vmv.x.s a2, v13 +; ZVFHMIN64-NEXT: vmv.x.s s11, v29 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, s10 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 171(sp) +; ZVFHMIN64-NEXT: lh ra, 724(sp) +; ZVFHMIN64-NEXT: lh a0, 468(sp) +; ZVFHMIN64-NEXT: vmv.x.s a5, v27 +; ZVFHMIN64-NEXT: vmv.x.s s10, v7 +; ZVFHMIN64-NEXT: fmv.h.x fa5, ra +; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 170(sp) ; ZVFHMIN64-NEXT: lh a0, 722(sp) ; ZVFHMIN64-NEXT: lh a1, 466(sp) -; ZVFHMIN64-NEXT: vmv.x.s s9, v31 -; ZVFHMIN64-NEXT: vmv.x.s a3, v5 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 169(sp) -; ZVFHMIN64-NEXT: lh a0, 720(sp) -; ZVFHMIN64-NEXT: lh a1, 464(sp) -; ZVFHMIN64-NEXT: vmv.x.s a2, v27 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s5 +; ZVFHMIN64-NEXT: vmv.x.s ra, v21 +; ZVFHMIN64-NEXT: fmv.h.x fa5, s7 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa3, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa4, fa3 -; ZVFHMIN64-NEXT: sb a0, 168(sp) -; ZVFHMIN64-NEXT: lh a0, 718(sp) -; ZVFHMIN64-NEXT: lh a1, 462(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, s2 -; ZVFHMIN64-NEXT: fmv.h.x fa3, s6 +; ZVFHMIN64-NEXT: sb a0, 169(sp) +; ZVFHMIN64-NEXT: lh a0, 720(sp) +; ZVFHMIN64-NEXT: lh a1, 464(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, s4 +; ZVFHMIN64-NEXT: fmv.h.x fa3, s8 ; ZVFHMIN64-NEXT: fmv.h.x fa2, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa1, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 -; ZVFHMIN64-NEXT: sb a0, 167(sp) -; ZVFHMIN64-NEXT: lh a0, 716(sp) -; ZVFHMIN64-NEXT: lh a1, 460(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa2, s3 -; ZVFHMIN64-NEXT: fmv.h.x fa1, s7 +; ZVFHMIN64-NEXT: sb a0, 168(sp) +; ZVFHMIN64-NEXT: lh a0, 718(sp) +; ZVFHMIN64-NEXT: lh a1, 462(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa2, s5 +; ZVFHMIN64-NEXT: fmv.h.x fa1, s9 ; ZVFHMIN64-NEXT: fmv.h.x fa0, a0 ; ZVFHMIN64-NEXT: fmv.h.x ft0, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa0, ft0 -; ZVFHMIN64-NEXT: sb a0, 
166(sp) -; ZVFHMIN64-NEXT: lh a0, 714(sp) -; ZVFHMIN64-NEXT: lh a1, 458(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa0, s4 -; ZVFHMIN64-NEXT: fmv.h.x ft0, s8 +; ZVFHMIN64-NEXT: sb a0, 167(sp) +; ZVFHMIN64-NEXT: lh a0, 716(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa0, s6 +; ZVFHMIN64-NEXT: lh a1, 460(sp) +; ZVFHMIN64-NEXT: fmv.h.x ft0, a3 ; ZVFHMIN64-NEXT: fmv.h.x ft1, a0 -; ZVFHMIN64-NEXT: fmv.h.x ft2, a1 -; ZVFHMIN64-NEXT: feq.h a0, ft1, ft2 -; ZVFHMIN64-NEXT: sb a0, 165(sp) -; ZVFHMIN64-NEXT: lh a0, 712(sp) -; ZVFHMIN64-NEXT: lh a1, 456(sp) -; ZVFHMIN64-NEXT: fmv.h.x ft1, s10 -; ZVFHMIN64-NEXT: fmv.h.x ft2, s11 -; ZVFHMIN64-NEXT: fmv.h.x ft3, a0 -; ZVFHMIN64-NEXT: fmv.h.x ft4, a1 -; ZVFHMIN64-NEXT: feq.h a0, ft3, ft4 -; ZVFHMIN64-NEXT: sb a0, 164(sp) -; ZVFHMIN64-NEXT: lh a0, 710(sp) -; ZVFHMIN64-NEXT: fmv.h.x ft3, a4 -; ZVFHMIN64-NEXT: lh a1, 454(sp) -; ZVFHMIN64-NEXT: fmv.h.x ft4, ra -; ZVFHMIN64-NEXT: fmv.h.x ft5, a0 -; ZVFHMIN64-NEXT: feq.h a0, fa5, ft1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, ft0 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: feq.h a1, ft5, fa5 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a3 -; ZVFHMIN64-NEXT: sb a1, 163(sp) -; ZVFHMIN64-NEXT: lh a1, 708(sp) -; ZVFHMIN64-NEXT: fmv.h.x ft1, a2 -; ZVFHMIN64-NEXT: lh a2, 452(sp) -; ZVFHMIN64-NEXT: feq.h a3, fa0, fa5 +; ZVFHMIN64-NEXT: feq.h a1, ft1, fa5 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a4 +; ZVFHMIN64-NEXT: sb a1, 166(sp) +; ZVFHMIN64-NEXT: lh a1, 714(sp) +; ZVFHMIN64-NEXT: fmv.h.x ft0, a2 +; ZVFHMIN64-NEXT: lh a2, 458(sp) +; ZVFHMIN64-NEXT: feq.h a3, fa4, fa5 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: feq.h a1, ft0, ft1 -; ZVFHMIN64-NEXT: fmv.h.x fa0, a2 -; ZVFHMIN64-NEXT: feq.h a2, fa5, fa0 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s9 +; ZVFHMIN64-NEXT: feq.h a1, fa3, ft0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, s3 +; ZVFHMIN64-NEXT: sb a2, 165(sp) +; ZVFHMIN64-NEXT: lh a2, 712(sp) +; ZVFHMIN64-NEXT: lh a4, 456(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, s11 +; ZVFHMIN64-NEXT: feq.h s3, fa2, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a4 +; ZVFHMIN64-NEXT: feq.h a2, fa4, fa3 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 +; ZVFHMIN64-NEXT: sb a2, 164(sp) +; ZVFHMIN64-NEXT: lh a2, 710(sp) +; ZVFHMIN64-NEXT: lh a4, 454(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa3, s10 +; ZVFHMIN64-NEXT: feq.h a5, fa1, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN64-NEXT: fmv.h.x fa2, a4 +; ZVFHMIN64-NEXT: feq.h a2, fa4, fa2 +; ZVFHMIN64-NEXT: fmv.h.x fa4, ra +; ZVFHMIN64-NEXT: sb a2, 163(sp) +; ZVFHMIN64-NEXT: lh a2, 708(sp) +; ZVFHMIN64-NEXT: lh a4, 452(sp) +; ZVFHMIN64-NEXT: feq.h s4, fa0, fa3 +; ZVFHMIN64-NEXT: feq.h s5, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 ; ZVFHMIN64-NEXT: sb a2, 162(sp) ; ZVFHMIN64-NEXT: lh a2, 706(sp) ; ZVFHMIN64-NEXT: lh a4, 450(sp) -; ZVFHMIN64-NEXT: sb a1, 129(sp) -; ZVFHMIN64-NEXT: feq.h a1, fa1, fa5 -; ZVFHMIN64-NEXT: sb a3, 130(sp) -; ZVFHMIN64-NEXT: feq.h a3, fa2, ft4 -; ZVFHMIN64-NEXT: sb a1, 131(sp) -; ZVFHMIN64-NEXT: feq.h a1, fa4, ft2 -; ZVFHMIN64-NEXT: sb a3, 132(sp) -; ZVFHMIN64-NEXT: feq.h a3, fa3, ft3 +; ZVFHMIN64-NEXT: sb s5, 129(sp) +; ZVFHMIN64-NEXT: sb s4, 130(sp) +; ZVFHMIN64-NEXT: sb a5, 131(sp) +; ZVFHMIN64-NEXT: sb s3, 132(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 ; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN64-NEXT: sb a3, 133(sp) -; ZVFHMIN64-NEXT: sb a1, 134(sp) +; ZVFHMIN64-NEXT: sb a1, 133(sp) +; ZVFHMIN64-NEXT: sb a3, 134(sp) ; ZVFHMIN64-NEXT: sb 
a0, 135(sp) ; ZVFHMIN64-NEXT: sb a2, 161(sp) ; ZVFHMIN64-NEXT: lh a0, 610(sp) ; ZVFHMIN64-NEXT: lh a1, 354(sp) -; ZVFHMIN64-NEXT: vmv.x.s s4, v23 +; ZVFHMIN64-NEXT: vmv.x.s s6, v23 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 10 +; ZVFHMIN64-NEXT: li a3, 18 ; ZVFHMIN64-NEXT: mul a2, a2, a3 ; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: lh s2, 800(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: lh s5, 800(a2) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 @@ -2882,12 +2889,13 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: lh a0, 608(sp) ; ZVFHMIN64-NEXT: lh a1, 352(sp) ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: slli a2, a2, 4 +; ZVFHMIN64-NEXT: li a3, 22 +; ZVFHMIN64-NEXT: mul a2, a2, a3 ; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: lh s5, 800(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: lh s4, 800(a2) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: slli a3, a2, 4 -; ZVFHMIN64-NEXT: sub a2, a3, a2 +; ZVFHMIN64-NEXT: li a3, 21 +; ZVFHMIN64-NEXT: mul a2, a2, a3 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: lh s3, 800(a2) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 @@ -2896,148 +2904,148 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: sb a0, 240(sp) ; ZVFHMIN64-NEXT: lh a0, 606(sp) ; ZVFHMIN64-NEXT: lh a1, 350(sp) -; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 7 -; ZVFHMIN64-NEXT: vmv.x.s s6, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa3, t5 +; ZVFHMIN64-NEXT: fmv.h.x fa5, s2 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa2, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa4, fa2 ; ZVFHMIN64-NEXT: sb a0, 239(sp) ; ZVFHMIN64-NEXT: lh a0, 604(sp) ; ZVFHMIN64-NEXT: lh a1, 348(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 6 -; ZVFHMIN64-NEXT: vmv.x.s s7, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa4, t6 +; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 7 +; ZVFHMIN64-NEXT: fmv.h.x fa2, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa1, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN64-NEXT: sb a0, 238(sp) ; ZVFHMIN64-NEXT: lh a0, 602(sp) ; ZVFHMIN64-NEXT: lh a1, 346(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 5 -; ZVFHMIN64-NEXT: vmv.x.s s8, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: vmv.x.s a2, v8 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 6 +; ZVFHMIN64-NEXT: fmv.h.x fa2, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa1, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN64-NEXT: sb a0, 237(sp) ; ZVFHMIN64-NEXT: lh a0, 600(sp) ; ZVFHMIN64-NEXT: lh a1, 344(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 4 -; ZVFHMIN64-NEXT: vmv.x.s s9, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: vmv.x.s a3, v8 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 5 +; ZVFHMIN64-NEXT: fmv.h.x fa2, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa1, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN64-NEXT: sb a0, 236(sp) ; ZVFHMIN64-NEXT: lh a0, 598(sp) ; ZVFHMIN64-NEXT: lh a1, 342(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 3 -; ZVFHMIN64-NEXT: vmv.x.s s10, 
v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: vmv.x.s a4, v8 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 4 +; ZVFHMIN64-NEXT: fmv.h.x fa2, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa1, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN64-NEXT: sb a0, 235(sp) ; ZVFHMIN64-NEXT: lh a0, 596(sp) ; ZVFHMIN64-NEXT: lh a1, 340(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 2 -; ZVFHMIN64-NEXT: vmv.x.s s11, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: vmv.x.s a5, v8 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 3 +; ZVFHMIN64-NEXT: fmv.h.x fa2, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa1, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN64-NEXT: sb a0, 234(sp) ; ZVFHMIN64-NEXT: lh a0, 594(sp) ; ZVFHMIN64-NEXT: lh a1, 338(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 1 -; ZVFHMIN64-NEXT: vmv.x.s ra, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: vmv.x.s t6, v8 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 2 +; ZVFHMIN64-NEXT: fmv.h.x fa2, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa1, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN64-NEXT: sb a0, 233(sp) ; ZVFHMIN64-NEXT: lh a0, 592(sp) ; ZVFHMIN64-NEXT: lh a1, 336(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, t4 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t6 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN64-NEXT: vmv.x.s s2, v8 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 1 +; ZVFHMIN64-NEXT: fmv.h.x fa2, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa1, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN64-NEXT: sb a0, 232(sp) ; ZVFHMIN64-NEXT: lh a0, 590(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa2, a2 ; ZVFHMIN64-NEXT: lh a1, 334(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa3, t5 -; ZVFHMIN64-NEXT: fmv.h.x fa2, s4 +; ZVFHMIN64-NEXT: vmv.x.s a2, v8 ; ZVFHMIN64-NEXT: fmv.h.x fa1, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa0, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa1, fa0 +; ZVFHMIN64-NEXT: feq.h t5, fa3, fa2 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa1, fa3 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a3 ; ZVFHMIN64-NEXT: sb a0, 231(sp) ; ZVFHMIN64-NEXT: lh a0, 588(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa2, a4 ; ZVFHMIN64-NEXT: lh a1, 332(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa1, s2 -; ZVFHMIN64-NEXT: fmv.h.x fa0, s5 -; ZVFHMIN64-NEXT: fmv.h.x ft0, a0 -; ZVFHMIN64-NEXT: fmv.h.x ft1, a1 -; ZVFHMIN64-NEXT: feq.h a0, ft0, ft1 -; ZVFHMIN64-NEXT: sb a0, 230(sp) -; ZVFHMIN64-NEXT: lh a0, 586(sp) -; ZVFHMIN64-NEXT: fmv.h.x ft0, s3 -; ZVFHMIN64-NEXT: lh a1, 330(sp) -; ZVFHMIN64-NEXT: fmv.h.x ft1, s6 -; ZVFHMIN64-NEXT: fmv.h.x ft2, a0 -; ZVFHMIN64-NEXT: feq.h a0, fa5, ft1 +; ZVFHMIN64-NEXT: feq.h a3, fa5, fa3 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: feq.h a0, fa4, fa2 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, s6 +; ZVFHMIN64-NEXT: sb a1, 230(sp) +; ZVFHMIN64-NEXT: lh a1, 586(sp) +; ZVFHMIN64-NEXT: lh a4, 330(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 +; ZVFHMIN64-NEXT: feq.h a5, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: feq.h a1, ft2, fa5 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s7 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, s5 ; ZVFHMIN64-NEXT: sb a1, 229(sp) ; ZVFHMIN64-NEXT: lh a1, 584(sp) -; ZVFHMIN64-NEXT: fmv.h.x ft1, s8 -; ZVFHMIN64-NEXT: lh a2, 328(sp) -; ZVFHMIN64-NEXT: feq.h a3, fa4, fa5 +; ZVFHMIN64-NEXT: lh 
a4, 328(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, t6 +; ZVFHMIN64-NEXT: feq.h t6, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: feq.h a1, fa3, ft1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s9 -; ZVFHMIN64-NEXT: sb a2, 228(sp) -; ZVFHMIN64-NEXT: lh a2, 582(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, s4 +; ZVFHMIN64-NEXT: sb a1, 228(sp) +; ZVFHMIN64-NEXT: lh a1, 582(sp) ; ZVFHMIN64-NEXT: lh a4, 326(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, s10 -; ZVFHMIN64-NEXT: feq.h t4, fa2, fa5 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a4 -; ZVFHMIN64-NEXT: feq.h a2, fa5, fa3 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s11 -; ZVFHMIN64-NEXT: fmv.h.x fa3, ra -; ZVFHMIN64-NEXT: sb a2, 227(sp) -; ZVFHMIN64-NEXT: lh a2, 580(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, s2 +; ZVFHMIN64-NEXT: feq.h s2, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, s3 +; ZVFHMIN64-NEXT: sb a1, 227(sp) +; ZVFHMIN64-NEXT: lh a1, 580(sp) ; ZVFHMIN64-NEXT: lh a4, 324(sp) -; ZVFHMIN64-NEXT: feq.h t5, fa0, fa5 -; ZVFHMIN64-NEXT: feq.h t6, ft0, fa3 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a4 -; ZVFHMIN64-NEXT: feq.h a2, fa5, fa3 -; ZVFHMIN64-NEXT: sb a2, 226(sp) -; ZVFHMIN64-NEXT: lh a2, 578(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 226(sp) +; ZVFHMIN64-NEXT: lh a1, 578(sp) ; ZVFHMIN64-NEXT: lh a4, 322(sp) -; ZVFHMIN64-NEXT: sb t6, 193(sp) -; ZVFHMIN64-NEXT: feq.h t6, fa1, fa4 -; ZVFHMIN64-NEXT: sb t5, 194(sp) +; ZVFHMIN64-NEXT: sb a2, 193(sp) +; ZVFHMIN64-NEXT: sb s2, 194(sp) ; ZVFHMIN64-NEXT: sb t6, 195(sp) -; ZVFHMIN64-NEXT: sb t4, 196(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN64-NEXT: sb a5, 196(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN64-NEXT: sb a1, 197(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 197(sp) ; ZVFHMIN64-NEXT: sb a3, 198(sp) -; ZVFHMIN64-NEXT: sb a0, 199(sp) -; ZVFHMIN64-NEXT: sb a2, 225(sp) +; ZVFHMIN64-NEXT: sb t5, 199(sp) +; ZVFHMIN64-NEXT: sb a1, 225(sp) ; ZVFHMIN64-NEXT: lh a0, 766(sp) ; ZVFHMIN64-NEXT: lh a1, 510(sp) ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: slli a3, a2, 4 -; ZVFHMIN64-NEXT: add a2, a3, a2 +; ZVFHMIN64-NEXT: li a3, 19 +; ZVFHMIN64-NEXT: mul a2, a2, a3 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload ; ZVFHMIN64-NEXT: vmv.x.s s2, v8 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 11 +; ZVFHMIN64-NEXT: li a3, 14 ; ZVFHMIN64-NEXT: mul a2, a2, a3 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 @@ -3049,305 +3057,301 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: sb a0, 191(sp) ; ZVFHMIN64-NEXT: lh a0, 764(sp) ; ZVFHMIN64-NEXT: lh a1, 508(sp) -; ZVFHMIN64-NEXT: vmv.x.s t5, v4 -; ZVFHMIN64-NEXT: vmv.x.s t4, v30 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 190(sp) -; ZVFHMIN64-NEXT: lh a0, 762(sp) -; ZVFHMIN64-NEXT: lh a1, 506(sp) +; ZVFHMIN64-NEXT: vmv.x.s t5, v6 ; ZVFHMIN64-NEXT: csrr a2, vlenb ; 
ZVFHMIN64-NEXT: slli a2, a2, 2 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload ; ZVFHMIN64-NEXT: vmv.x.s a2, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 190(sp) +; ZVFHMIN64-NEXT: lh a0, 762(sp) +; ZVFHMIN64-NEXT: lh a1, 506(sp) ; ZVFHMIN64-NEXT: csrr a3, vlenb -; ZVFHMIN64-NEXT: slli a3, a3, 1 +; ZVFHMIN64-NEXT: slli a3, a3, 3 ; ZVFHMIN64-NEXT: add a3, sp, a3 ; ZVFHMIN64-NEXT: addi a3, a3, 800 ; ZVFHMIN64-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload ; ZVFHMIN64-NEXT: vmv.x.s a3, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 189(sp) -; ZVFHMIN64-NEXT: lh a0, 760(sp) -; ZVFHMIN64-NEXT: lh a1, 504(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, t3 ; ZVFHMIN64-NEXT: csrr a4, vlenb -; ZVFHMIN64-NEXT: li t3, 6 -; ZVFHMIN64-NEXT: mul a4, a4, t3 +; ZVFHMIN64-NEXT: li a5, 6 +; ZVFHMIN64-NEXT: mul a4, a4, a5 ; ZVFHMIN64-NEXT: add a4, sp, a4 ; ZVFHMIN64-NEXT: addi a4, a4, 800 ; ZVFHMIN64-NEXT: vl2r.v v8, (a4) # Unknown-size Folded Reload ; ZVFHMIN64-NEXT: vmv.x.s a4, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa4, fa3 -; ZVFHMIN64-NEXT: sb a0, 188(sp) -; ZVFHMIN64-NEXT: lh a0, 758(sp) -; ZVFHMIN64-NEXT: lh a1, 502(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, t2 -; ZVFHMIN64-NEXT: csrr t2, vlenb -; ZVFHMIN64-NEXT: slli t2, t2, 3 -; ZVFHMIN64-NEXT: add t2, sp, t2 -; ZVFHMIN64-NEXT: addi t2, t2, 800 -; ZVFHMIN64-NEXT: vl2r.v v8, (t2) # Unknown-size Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s t2, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 -; ZVFHMIN64-NEXT: sb a0, 187(sp) -; ZVFHMIN64-NEXT: lh a0, 756(sp) -; ZVFHMIN64-NEXT: lh a1, 500(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa3, t1 -; ZVFHMIN64-NEXT: csrr t1, vlenb -; ZVFHMIN64-NEXT: li t3, 13 -; ZVFHMIN64-NEXT: mul t1, t1, t3 -; ZVFHMIN64-NEXT: add t1, sp, t1 -; ZVFHMIN64-NEXT: addi t1, t1, 800 -; ZVFHMIN64-NEXT: vl2r.v v8, (t1) # Unknown-size Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s t3, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa1, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 -; ZVFHMIN64-NEXT: sb a0, 186(sp) -; ZVFHMIN64-NEXT: lh a0, 754(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa2, t0 -; ZVFHMIN64-NEXT: lh a1, 498(sp) -; ZVFHMIN64-NEXT: csrr t0, vlenb -; ZVFHMIN64-NEXT: li t1, 19 -; ZVFHMIN64-NEXT: mul t0, t0, t1 -; ZVFHMIN64-NEXT: add t0, sp, t0 -; ZVFHMIN64-NEXT: addi t0, t0, 800 -; ZVFHMIN64-NEXT: vl2r.v v8, (t0) # Unknown-size Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s s3, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa1, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 189(sp) +; ZVFHMIN64-NEXT: lh a1, 760(sp) +; ZVFHMIN64-NEXT: lh a5, 504(sp) +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: li s3, 12 +; ZVFHMIN64-NEXT: mul a0, a0, s3 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: addi a0, a0, 800 +; ZVFHMIN64-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s s5, v8 ; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: li t0, 21 -; ZVFHMIN64-NEXT: mul a0, a0, t0 +; ZVFHMIN64-NEXT: li s3, 10 +; ZVFHMIN64-NEXT: mul a0, a0, s3 ; ZVFHMIN64-NEXT: add a0, sp, a0 ; ZVFHMIN64-NEXT: addi a0, a0, 800 ; ZVFHMIN64-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload ; ZVFHMIN64-NEXT: 
vmv.x.s a0, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa0, a1 -; ZVFHMIN64-NEXT: feq.h a1, fa1, fa0 -; ZVFHMIN64-NEXT: fmv.h.x fa1, a2 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 188(sp) +; ZVFHMIN64-NEXT: lh a1, 758(sp) +; ZVFHMIN64-NEXT: lh a5, 502(sp) +; ZVFHMIN64-NEXT: csrr s3, vlenb +; ZVFHMIN64-NEXT: slli s3, s3, 4 +; ZVFHMIN64-NEXT: add s3, sp, s3 +; ZVFHMIN64-NEXT: addi s3, s3, 800 +; ZVFHMIN64-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s s4, v8 +; ZVFHMIN64-NEXT: vmv.x.s s3, v16 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, t4 +; ZVFHMIN64-NEXT: sb a1, 187(sp) +; ZVFHMIN64-NEXT: lh a1, 756(sp) +; ZVFHMIN64-NEXT: lh a5, 500(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN64-NEXT: feq.h t4, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, t3 +; ZVFHMIN64-NEXT: sb a1, 186(sp) +; ZVFHMIN64-NEXT: lh a1, 754(sp) +; ZVFHMIN64-NEXT: lh a2, 498(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN64-NEXT: feq.h t3, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, t1 ; ZVFHMIN64-NEXT: sb a1, 185(sp) ; ZVFHMIN64-NEXT: lh a1, 752(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa0, a3 ; ZVFHMIN64-NEXT: lh a2, 496(sp) -; ZVFHMIN64-NEXT: feq.h t0, fa5, fa1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN64-NEXT: feq.h t1, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: feq.h t1, fa4, fa0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 ; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, t2 ; ZVFHMIN64-NEXT: sb a1, 184(sp) ; ZVFHMIN64-NEXT: lh a1, 750(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, t2 ; ZVFHMIN64-NEXT: lh a2, 494(sp) -; ZVFHMIN64-NEXT: feq.h a3, fa3, fa5 +; ZVFHMIN64-NEXT: fmv.h.x fa4, s5 +; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: feq.h a1, fa2, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, t0 +; ZVFHMIN64-NEXT: sb a1, 183(sp) +; ZVFHMIN64-NEXT: lh a1, 748(sp) +; ZVFHMIN64-NEXT: lh a2, 492(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a7 -; ZVFHMIN64-NEXT: sb a2, 183(sp) -; ZVFHMIN64-NEXT: lh a2, 748(sp) -; ZVFHMIN64-NEXT: lh a4, 492(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, t3 -; ZVFHMIN64-NEXT: feq.h a7, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 182(sp) +; ZVFHMIN64-NEXT: lh a1, 746(sp) +; ZVFHMIN64-NEXT: lh a2, 490(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, s4 +; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a6 -; ZVFHMIN64-NEXT: sb a2, 182(sp) -; ZVFHMIN64-NEXT: lh a2, 746(sp) -; ZVFHMIN64-NEXT: lh a4, 490(sp) +; ZVFHMIN64-NEXT: sb a1, 181(sp) +; ZVFHMIN64-NEXT: lh a1, 744(sp) +; ZVFHMIN64-NEXT: lh a2, 488(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa4, s3 ; ZVFHMIN64-NEXT: feq.h a6, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: 
fmv.h.x fa4, a2 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: ld a2, 80(sp) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a5 -; ZVFHMIN64-NEXT: sb a2, 181(sp) -; ZVFHMIN64-NEXT: lh a2, 744(sp) -; ZVFHMIN64-NEXT: lh a4, 488(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN64-NEXT: ld a4, 88(sp) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN64-NEXT: vmv.x.s a5, v0 +; ZVFHMIN64-NEXT: addi a2, sp, 800 +; ZVFHMIN64-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s a2, v8 ; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma ; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 15 -; ZVFHMIN64-NEXT: vmv.x.s a4, v8 -; ZVFHMIN64-NEXT: sb a2, 180(sp) -; ZVFHMIN64-NEXT: lh a2, 742(sp) -; ZVFHMIN64-NEXT: lh t2, 486(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN64-NEXT: feq.h a5, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t2 +; ZVFHMIN64-NEXT: vmv.x.s a5, v8 +; ZVFHMIN64-NEXT: sb a1, 180(sp) +; ZVFHMIN64-NEXT: lh a1, 742(sp) +; ZVFHMIN64-NEXT: lh a7, 486(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 ; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN64-NEXT: sb a2, 179(sp) -; ZVFHMIN64-NEXT: lh a2, 740(sp) -; ZVFHMIN64-NEXT: lh t2, 484(sp) -; ZVFHMIN64-NEXT: sb a1, 140(sp) -; ZVFHMIN64-NEXT: sb a3, 141(sp) -; ZVFHMIN64-NEXT: sb t1, 142(sp) -; ZVFHMIN64-NEXT: sb t0, 143(sp) -; ZVFHMIN64-NEXT: sb a5, 136(sp) -; ZVFHMIN64-NEXT: sb a0, 137(sp) -; ZVFHMIN64-NEXT: sb a6, 138(sp) -; ZVFHMIN64-NEXT: sb a7, 139(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t2 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a7 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 179(sp) +; ZVFHMIN64-NEXT: lh a1, 740(sp) +; ZVFHMIN64-NEXT: lh a7, 484(sp) +; ZVFHMIN64-NEXT: sb a3, 140(sp) +; ZVFHMIN64-NEXT: sb t1, 141(sp) +; ZVFHMIN64-NEXT: sb t3, 142(sp) +; ZVFHMIN64-NEXT: sb t4, 143(sp) +; ZVFHMIN64-NEXT: sb a2, 136(sp) +; ZVFHMIN64-NEXT: sb a6, 137(sp) +; ZVFHMIN64-NEXT: sb a4, 138(sp) +; ZVFHMIN64-NEXT: sb a0, 139(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a7 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 178(sp) -; ZVFHMIN64-NEXT: lh a0, 638(sp) -; ZVFHMIN64-NEXT: lh a1, 382(sp) +; ZVFHMIN64-NEXT: lh a1, 638(sp) +; ZVFHMIN64-NEXT: lh a2, 382(sp) ; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 14 -; ZVFHMIN64-NEXT: vmv.x.s t2, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 255(sp) -; ZVFHMIN64-NEXT: lh a0, 636(sp) -; ZVFHMIN64-NEXT: lh a1, 380(sp) +; ZVFHMIN64-NEXT: vmv.x.s a0, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 255(sp) +; ZVFHMIN64-NEXT: lh a1, 636(sp) +; ZVFHMIN64-NEXT: lh a2, 380(sp) ; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 13 -; ZVFHMIN64-NEXT: vmv.x.s t1, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 254(sp) -; ZVFHMIN64-NEXT: lh a0, 634(sp) -; ZVFHMIN64-NEXT: lh a1, 378(sp) +; ZVFHMIN64-NEXT: vmv.x.s t2, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; 
ZVFHMIN64-NEXT: sb a1, 254(sp) +; ZVFHMIN64-NEXT: lh a1, 634(sp) +; ZVFHMIN64-NEXT: lh a2, 378(sp) ; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 12 -; ZVFHMIN64-NEXT: vmv.x.s t0, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 253(sp) -; ZVFHMIN64-NEXT: lh a0, 632(sp) -; ZVFHMIN64-NEXT: lh a1, 376(sp) +; ZVFHMIN64-NEXT: vmv.x.s t1, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 253(sp) +; ZVFHMIN64-NEXT: lh a1, 632(sp) +; ZVFHMIN64-NEXT: lh a2, 376(sp) ; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 11 -; ZVFHMIN64-NEXT: vmv.x.s a7, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 252(sp) -; ZVFHMIN64-NEXT: lh a0, 630(sp) -; ZVFHMIN64-NEXT: lh a1, 374(sp) +; ZVFHMIN64-NEXT: vmv.x.s t0, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 252(sp) +; ZVFHMIN64-NEXT: lh a1, 630(sp) +; ZVFHMIN64-NEXT: lh a2, 374(sp) ; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 10 -; ZVFHMIN64-NEXT: vmv.x.s a6, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 251(sp) -; ZVFHMIN64-NEXT: lh a0, 628(sp) -; ZVFHMIN64-NEXT: lh a1, 372(sp) +; ZVFHMIN64-NEXT: vmv.x.s a7, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 251(sp) +; ZVFHMIN64-NEXT: lh a1, 628(sp) +; ZVFHMIN64-NEXT: lh a2, 372(sp) ; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 9 -; ZVFHMIN64-NEXT: vmv.x.s a5, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: vmv.x.s a6, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: ld a2, 88(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN64-NEXT: sb a1, 250(sp) +; ZVFHMIN64-NEXT: lh a1, 626(sp) +; ZVFHMIN64-NEXT: lh a2, 370(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 +; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: ld a2, 96(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN64-NEXT: sb a1, 249(sp) +; ZVFHMIN64-NEXT: lh a1, 624(sp) +; ZVFHMIN64-NEXT: lh a2, 368(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 +; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: ld a1, 104(sp) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: sb a0, 250(sp) -; ZVFHMIN64-NEXT: lh a0, 626(sp) -; ZVFHMIN64-NEXT: lh a1, 370(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN64-NEXT: sb a0, 248(sp) +; ZVFHMIN64-NEXT: lh a0, 622(sp) +; ZVFHMIN64-NEXT: lh a1, 366(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, t2 ; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: ld a1, 120(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: ld a1, 112(sp) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: sb a0, 249(sp) -; ZVFHMIN64-NEXT: lh a1, 624(sp) -; ZVFHMIN64-NEXT: lh a3, 368(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, t2 -; ZVFHMIN64-NEXT: feq.h a0, 
fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: ld a3, 96(sp) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: fmv.h.x fa5, a3 -; ZVFHMIN64-NEXT: sb a1, 248(sp) -; ZVFHMIN64-NEXT: lh a1, 622(sp) -; ZVFHMIN64-NEXT: lh a3, 366(sp) +; ZVFHMIN64-NEXT: sb a0, 247(sp) +; ZVFHMIN64-NEXT: lh a0, 620(sp) +; ZVFHMIN64-NEXT: lh a1, 364(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa4, t1 -; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a5, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: ld a1, 120(sp) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: ld a3, 112(sp) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: fmv.h.x fa5, a3 -; ZVFHMIN64-NEXT: sb a1, 247(sp) -; ZVFHMIN64-NEXT: lh a1, 620(sp) -; ZVFHMIN64-NEXT: lh a3, 364(sp) +; ZVFHMIN64-NEXT: sb a0, 246(sp) +; ZVFHMIN64-NEXT: lh a0, 618(sp) +; ZVFHMIN64-NEXT: lh a1, 362(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa4, t0 ; ZVFHMIN64-NEXT: feq.h t0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, s2 -; ZVFHMIN64-NEXT: sb a1, 246(sp) -; ZVFHMIN64-NEXT: lh a1, 618(sp) -; ZVFHMIN64-NEXT: lh a3, 362(sp) +; ZVFHMIN64-NEXT: sb a0, 245(sp) +; ZVFHMIN64-NEXT: lh a0, 616(sp) +; ZVFHMIN64-NEXT: lh a1, 360(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa4, a7 ; ZVFHMIN64-NEXT: feq.h a7, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, t6 -; ZVFHMIN64-NEXT: sb a1, 245(sp) -; ZVFHMIN64-NEXT: lh a1, 616(sp) -; ZVFHMIN64-NEXT: lh a3, 360(sp) +; ZVFHMIN64-NEXT: sb a0, 244(sp) +; ZVFHMIN64-NEXT: lh a0, 614(sp) +; ZVFHMIN64-NEXT: lh a1, 358(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa4, a6 ; ZVFHMIN64-NEXT: feq.h a6, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, t5 -; ZVFHMIN64-NEXT: sb a1, 244(sp) -; ZVFHMIN64-NEXT: lh a1, 614(sp) -; ZVFHMIN64-NEXT: lh a3, 358(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN64-NEXT: feq.h a5, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t4 ; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 8 -; ZVFHMIN64-NEXT: vmv.x.s a3, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN64-NEXT: sb a1, 243(sp) -; ZVFHMIN64-NEXT: lh a1, 612(sp) -; ZVFHMIN64-NEXT: lh a3, 356(sp) -; ZVFHMIN64-NEXT: sb t0, 204(sp) -; ZVFHMIN64-NEXT: sb a4, 205(sp) -; ZVFHMIN64-NEXT: sb a0, 206(sp) -; ZVFHMIN64-NEXT: sb a2, 207(sp) +; ZVFHMIN64-NEXT: vmv.x.s a1, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: sb a0, 243(sp) +; ZVFHMIN64-NEXT: lh a0, 612(sp) +; ZVFHMIN64-NEXT: lh a1, 356(sp) +; ZVFHMIN64-NEXT: sb a5, 204(sp) +; ZVFHMIN64-NEXT: sb a2, 205(sp) +; ZVFHMIN64-NEXT: sb a3, 206(sp) +; ZVFHMIN64-NEXT: sb a4, 207(sp) +; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN64-NEXT: sb a2, 200(sp) +; ZVFHMIN64-NEXT: sb a6, 201(sp) +; 
ZVFHMIN64-NEXT: sb a7, 202(sp) +; ZVFHMIN64-NEXT: sb t0, 203(sp) +; ZVFHMIN64-NEXT: li a2, 128 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 200(sp) -; ZVFHMIN64-NEXT: sb a5, 201(sp) -; ZVFHMIN64-NEXT: sb a6, 202(sp) -; ZVFHMIN64-NEXT: sb a7, 203(sp) -; ZVFHMIN64-NEXT: li a0, 128 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: sb a1, 242(sp) -; ZVFHMIN64-NEXT: addi a1, sp, 128 -; ZVFHMIN64-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; ZVFHMIN64-NEXT: vle8.v v8, (a1) +; ZVFHMIN64-NEXT: sb a0, 242(sp) +; ZVFHMIN64-NEXT: addi a0, sp, 128 +; ZVFHMIN64-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; ZVFHMIN64-NEXT: vle8.v v8, (a0) ; ZVFHMIN64-NEXT: vand.vi v8, v8, 1 ; ZVFHMIN64-NEXT: vmsne.vi v0, v8, 0 ; ZVFHMIN64-NEXT: addi sp, s0, -896 diff --git a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll index dd2a8240ee253..5b272c98a1e0a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll +++ b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll @@ -507,26 +507,34 @@ define <8 x i1> @match_v8i8_v16i8(<8 x i8> %op1, <16 x i8> %op2, <8 x i1> %mask) define @match_nxv16i8_v32i8( %op1, <32 x i8> %op2, %mask) { ; RV32-LABEL: match_nxv16i8_v32i8: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: sw s0, 44(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s1, 40(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s2, 36(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s3, 32(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s4, 28(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s5, 24(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s6, 20(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s7, 16(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s8, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset s0, -4 -; RV32-NEXT: .cfi_offset s1, -8 -; RV32-NEXT: .cfi_offset s2, -12 -; RV32-NEXT: .cfi_offset s3, -16 -; RV32-NEXT: .cfi_offset s4, -20 -; RV32-NEXT: .cfi_offset s5, -24 -; RV32-NEXT: .cfi_offset s6, -28 -; RV32-NEXT: .cfi_offset s7, -32 -; RV32-NEXT: .cfi_offset s8, -36 +; RV32-NEXT: addi sp, sp, -64 +; RV32-NEXT: .cfi_def_cfa_offset 64 +; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 52(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 48(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 44(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 40(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 36(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s6, 32(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s7, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s8, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s9, 20(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s10, 16(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s11, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: .cfi_offset s0, -8 +; RV32-NEXT: .cfi_offset s1, -12 +; RV32-NEXT: .cfi_offset s2, -16 +; RV32-NEXT: .cfi_offset s3, -20 +; RV32-NEXT: .cfi_offset s4, -24 +; RV32-NEXT: .cfi_offset s5, -28 +; RV32-NEXT: .cfi_offset s6, -32 +; RV32-NEXT: .cfi_offset s7, -36 +; RV32-NEXT: .cfi_offset s8, -40 +; RV32-NEXT: .cfi_offset s9, -44 +; RV32-NEXT: .cfi_offset s10, -48 +; RV32-NEXT: .cfi_offset s11, -52 ; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: vslidedown.vi v12, v10, 1 @@ -584,43 +592,43 @@ define @match_nxv16i8_v32i8( %op1, 
<32 x i8 ; RV32-NEXT: vmv.x.s s5, v15 ; RV32-NEXT: vmv.x.s s6, v16 ; RV32-NEXT: vmv.x.s s7, v17 -; RV32-NEXT: vsetvli s8, zero, e8, m2, ta, ma +; RV32-NEXT: vmv.x.s s8, v18 +; RV32-NEXT: vmv.x.s s9, v19 +; RV32-NEXT: vmv.x.s s10, v20 +; RV32-NEXT: vmv.x.s s11, v21 +; RV32-NEXT: vsetvli ra, zero, e8, m2, ta, ma ; RV32-NEXT: vmseq.vx v12, v8, a0 -; RV32-NEXT: vmv.x.s a0, v18 +; RV32-NEXT: vmv.x.s a0, v22 ; RV32-NEXT: vmseq.vx v13, v8, s2 -; RV32-NEXT: vmv.x.s s2, v19 +; RV32-NEXT: vmv.x.s s2, v23 ; RV32-NEXT: vmseq.vx v14, v8, s3 -; RV32-NEXT: vmv.x.s s3, v20 -; RV32-NEXT: vmseq.vx v15, v8, s4 -; RV32-NEXT: vmv.x.s s4, v21 -; RV32-NEXT: vmseq.vx v16, v8, s5 -; RV32-NEXT: vmv.x.s s5, v22 -; RV32-NEXT: vmseq.vx v17, v8, s6 -; RV32-NEXT: vmv.x.s s6, v23 -; RV32-NEXT: vmseq.vx v18, v8, s7 -; RV32-NEXT: vmv.x.s s7, v11 -; RV32-NEXT: vmseq.vx v11, v8, a0 -; RV32-NEXT: vmv.x.s a0, v24 -; RV32-NEXT: vmseq.vx v19, v8, s2 -; RV32-NEXT: vmv.x.s s2, v10 +; RV32-NEXT: vmv.x.s s3, v11 +; RV32-NEXT: vmseq.vx v11, v8, s4 +; RV32-NEXT: vmv.x.s s4, v24 +; RV32-NEXT: vmseq.vx v15, v8, s5 +; RV32-NEXT: vmv.x.s s5, v10 ; RV32-NEXT: vmor.mm v10, v12, v13 +; RV32-NEXT: vmseq.vx v12, v8, s6 ; RV32-NEXT: vmor.mm v10, v10, v14 +; RV32-NEXT: vmseq.vx v13, v8, s7 +; RV32-NEXT: vmor.mm v10, v10, v11 +; RV32-NEXT: vmseq.vx v11, v8, s8 ; RV32-NEXT: vmor.mm v10, v10, v15 -; RV32-NEXT: vmor.mm v10, v10, v16 -; RV32-NEXT: vmor.mm v10, v10, v17 -; RV32-NEXT: vmseq.vx v12, v8, s3 -; RV32-NEXT: vmor.mm v10, v10, v18 -; RV32-NEXT: vmseq.vx v13, v8, s4 +; RV32-NEXT: vmseq.vx v14, v8, s9 +; RV32-NEXT: vmor.mm v10, v10, v12 +; RV32-NEXT: vmseq.vx v12, v8, s10 +; RV32-NEXT: vmor.mm v10, v10, v13 +; RV32-NEXT: vmseq.vx v13, v8, s11 ; RV32-NEXT: vmor.mm v10, v10, v11 -; RV32-NEXT: vmseq.vx v11, v8, s5 -; RV32-NEXT: vmor.mm v10, v10, v19 -; RV32-NEXT: vmseq.vx v14, v8, s6 +; RV32-NEXT: vmseq.vx v11, v8, a0 +; RV32-NEXT: vmor.mm v10, v10, v14 +; RV32-NEXT: vmseq.vx v14, v8, s2 ; RV32-NEXT: vmor.mm v10, v10, v12 -; RV32-NEXT: vmseq.vx v12, v8, s7 +; RV32-NEXT: vmseq.vx v12, v8, s3 ; RV32-NEXT: vmor.mm v10, v10, v13 -; RV32-NEXT: vmseq.vx v13, v8, a0 +; RV32-NEXT: vmseq.vx v13, v8, s4 ; RV32-NEXT: vmor.mm v10, v10, v11 -; RV32-NEXT: vmseq.vx v11, v8, s2 +; RV32-NEXT: vmseq.vx v11, v8, s5 ; RV32-NEXT: vmor.mm v10, v10, v14 ; RV32-NEXT: vmseq.vx v14, v8, a1 ; RV32-NEXT: vmor.mm v10, v10, v12 @@ -658,15 +666,20 @@ define @match_nxv16i8_v32i8( %op1, <32 x i8 ; RV32-NEXT: vmseq.vx v11, v8, s1 ; RV32-NEXT: vmor.mm v8, v10, v11 ; RV32-NEXT: vmand.mm v0, v8, v0 -; RV32-NEXT: lw s0, 44(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s1, 40(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s2, 36(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s3, 32(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s4, 28(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s5, 24(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s6, 20(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s7, 16(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s8, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 52(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 48(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s3, 44(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s4, 40(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s5, 36(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s6, 32(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s7, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s8, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s9, 20(sp) # 
4-byte Folded Reload +; RV32-NEXT: lw s10, 16(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s11, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: .cfi_restore ra ; RV32-NEXT: .cfi_restore s0 ; RV32-NEXT: .cfi_restore s1 ; RV32-NEXT: .cfi_restore s2 @@ -676,32 +689,43 @@ define @match_nxv16i8_v32i8( %op1, <32 x i8 ; RV32-NEXT: .cfi_restore s6 ; RV32-NEXT: .cfi_restore s7 ; RV32-NEXT: .cfi_restore s8 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: .cfi_restore s9 +; RV32-NEXT: .cfi_restore s10 +; RV32-NEXT: .cfi_restore s11 +; RV32-NEXT: addi sp, sp, 64 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: match_nxv16i8_v32i8: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -80 -; RV64-NEXT: .cfi_def_cfa_offset 80 -; RV64-NEXT: sd s0, 72(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s1, 64(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s2, 56(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s3, 48(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s4, 40(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s5, 32(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s6, 24(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s7, 16(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s8, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset s0, -8 -; RV64-NEXT: .cfi_offset s1, -16 -; RV64-NEXT: .cfi_offset s2, -24 -; RV64-NEXT: .cfi_offset s3, -32 -; RV64-NEXT: .cfi_offset s4, -40 -; RV64-NEXT: .cfi_offset s5, -48 -; RV64-NEXT: .cfi_offset s6, -56 -; RV64-NEXT: .cfi_offset s7, -64 -; RV64-NEXT: .cfi_offset s8, -72 +; RV64-NEXT: addi sp, sp, -112 +; RV64-NEXT: .cfi_def_cfa_offset 112 +; RV64-NEXT: sd ra, 104(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 96(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 88(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s2, 80(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s3, 72(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s4, 64(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s5, 56(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s6, 48(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s7, 40(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s8, 32(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s9, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s10, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s11, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: .cfi_offset s0, -16 +; RV64-NEXT: .cfi_offset s1, -24 +; RV64-NEXT: .cfi_offset s2, -32 +; RV64-NEXT: .cfi_offset s3, -40 +; RV64-NEXT: .cfi_offset s4, -48 +; RV64-NEXT: .cfi_offset s5, -56 +; RV64-NEXT: .cfi_offset s6, -64 +; RV64-NEXT: .cfi_offset s7, -72 +; RV64-NEXT: .cfi_offset s8, -80 +; RV64-NEXT: .cfi_offset s9, -88 +; RV64-NEXT: .cfi_offset s10, -96 +; RV64-NEXT: .cfi_offset s11, -104 ; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: vslidedown.vi v12, v10, 1 @@ -759,43 +783,43 @@ define @match_nxv16i8_v32i8( %op1, <32 x i8 ; RV64-NEXT: vmv.x.s s5, v15 ; RV64-NEXT: vmv.x.s s6, v16 ; RV64-NEXT: vmv.x.s s7, v17 -; RV64-NEXT: vsetvli s8, zero, e8, m2, ta, ma +; RV64-NEXT: vmv.x.s s8, v18 +; RV64-NEXT: vmv.x.s s9, v19 +; RV64-NEXT: vmv.x.s s10, v20 +; RV64-NEXT: vmv.x.s s11, v21 +; RV64-NEXT: vsetvli ra, zero, e8, m2, ta, ma ; RV64-NEXT: vmseq.vx v12, v8, a0 -; RV64-NEXT: vmv.x.s a0, v18 +; RV64-NEXT: vmv.x.s a0, v22 ; RV64-NEXT: vmseq.vx v13, v8, s2 -; RV64-NEXT: vmv.x.s s2, v19 +; RV64-NEXT: vmv.x.s s2, v23 ; RV64-NEXT: vmseq.vx v14, v8, s3 -; RV64-NEXT: vmv.x.s s3, v20 -; RV64-NEXT: vmseq.vx v15, v8, s4 -; RV64-NEXT: vmv.x.s s4, v21 -; RV64-NEXT: vmseq.vx v16, v8, s5 -; RV64-NEXT: vmv.x.s s5, v22 -; RV64-NEXT: vmseq.vx v17, v8, s6 -; RV64-NEXT: 
vmv.x.s s6, v23 -; RV64-NEXT: vmseq.vx v18, v8, s7 -; RV64-NEXT: vmv.x.s s7, v11 -; RV64-NEXT: vmseq.vx v11, v8, a0 -; RV64-NEXT: vmv.x.s a0, v24 -; RV64-NEXT: vmseq.vx v19, v8, s2 -; RV64-NEXT: vmv.x.s s2, v10 +; RV64-NEXT: vmv.x.s s3, v11 +; RV64-NEXT: vmseq.vx v11, v8, s4 +; RV64-NEXT: vmv.x.s s4, v24 +; RV64-NEXT: vmseq.vx v15, v8, s5 +; RV64-NEXT: vmv.x.s s5, v10 ; RV64-NEXT: vmor.mm v10, v12, v13 +; RV64-NEXT: vmseq.vx v12, v8, s6 ; RV64-NEXT: vmor.mm v10, v10, v14 +; RV64-NEXT: vmseq.vx v13, v8, s7 +; RV64-NEXT: vmor.mm v10, v10, v11 +; RV64-NEXT: vmseq.vx v11, v8, s8 ; RV64-NEXT: vmor.mm v10, v10, v15 -; RV64-NEXT: vmor.mm v10, v10, v16 -; RV64-NEXT: vmor.mm v10, v10, v17 -; RV64-NEXT: vmseq.vx v12, v8, s3 -; RV64-NEXT: vmor.mm v10, v10, v18 -; RV64-NEXT: vmseq.vx v13, v8, s4 +; RV64-NEXT: vmseq.vx v14, v8, s9 +; RV64-NEXT: vmor.mm v10, v10, v12 +; RV64-NEXT: vmseq.vx v12, v8, s10 +; RV64-NEXT: vmor.mm v10, v10, v13 +; RV64-NEXT: vmseq.vx v13, v8, s11 ; RV64-NEXT: vmor.mm v10, v10, v11 -; RV64-NEXT: vmseq.vx v11, v8, s5 -; RV64-NEXT: vmor.mm v10, v10, v19 -; RV64-NEXT: vmseq.vx v14, v8, s6 +; RV64-NEXT: vmseq.vx v11, v8, a0 +; RV64-NEXT: vmor.mm v10, v10, v14 +; RV64-NEXT: vmseq.vx v14, v8, s2 ; RV64-NEXT: vmor.mm v10, v10, v12 -; RV64-NEXT: vmseq.vx v12, v8, s7 +; RV64-NEXT: vmseq.vx v12, v8, s3 ; RV64-NEXT: vmor.mm v10, v10, v13 -; RV64-NEXT: vmseq.vx v13, v8, a0 +; RV64-NEXT: vmseq.vx v13, v8, s4 ; RV64-NEXT: vmor.mm v10, v10, v11 -; RV64-NEXT: vmseq.vx v11, v8, s2 +; RV64-NEXT: vmseq.vx v11, v8, s5 ; RV64-NEXT: vmor.mm v10, v10, v14 ; RV64-NEXT: vmseq.vx v14, v8, a1 ; RV64-NEXT: vmor.mm v10, v10, v12 @@ -833,15 +857,20 @@ define @match_nxv16i8_v32i8( %op1, <32 x i8 ; RV64-NEXT: vmseq.vx v11, v8, s1 ; RV64-NEXT: vmor.mm v8, v10, v11 ; RV64-NEXT: vmand.mm v0, v8, v0 -; RV64-NEXT: ld s0, 72(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s1, 64(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s2, 56(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s3, 48(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s4, 40(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s5, 32(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s6, 24(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s7, 16(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s8, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: ld ra, 104(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 96(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 88(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 80(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s3, 72(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s4, 64(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s5, 56(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s6, 48(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s7, 40(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s8, 32(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s9, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s10, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s11, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: .cfi_restore ra ; RV64-NEXT: .cfi_restore s0 ; RV64-NEXT: .cfi_restore s1 ; RV64-NEXT: .cfi_restore s2 @@ -851,7 +880,10 @@ define @match_nxv16i8_v32i8( %op1, <32 x i8 ; RV64-NEXT: .cfi_restore s6 ; RV64-NEXT: .cfi_restore s7 ; RV64-NEXT: .cfi_restore s8 -; RV64-NEXT: addi sp, sp, 80 +; RV64-NEXT: .cfi_restore s9 +; RV64-NEXT: .cfi_restore s10 +; RV64-NEXT: .cfi_restore s11 +; RV64-NEXT: addi sp, sp, 112 ; RV64-NEXT: .cfi_def_cfa_offset 0 ; RV64-NEXT: ret %r = tail call @llvm.experimental.vector.match( %op1, <32 x i8> %op2, %mask) @@ -861,16 +893,20 @@ define @match_nxv16i8_v32i8( %op1, <32 x i8 define 
<16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask) { ; RV32-LABEL: match_v16i8_v32i8: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; RV32-NEXT: sw s0, 28(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s1, 24(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s2, 20(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s3, 16(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s4, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s5, 8(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s6, 4(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s7, 0(sp) # 4-byte Folded Spill +; RV32-NEXT: addi sp, sp, -48 +; RV32-NEXT: .cfi_def_cfa_offset 48 +; RV32-NEXT: sw s0, 44(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 40(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 36(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 32(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s6, 20(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s7, 16(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s8, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s9, 8(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s10, 4(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s11, 0(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset s0, -4 ; RV32-NEXT: .cfi_offset s1, -8 ; RV32-NEXT: .cfi_offset s2, -12 @@ -879,6 +915,10 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m ; RV32-NEXT: .cfi_offset s5, -24 ; RV32-NEXT: .cfi_offset s6, -28 ; RV32-NEXT: .cfi_offset s7, -32 +; RV32-NEXT: .cfi_offset s8, -36 +; RV32-NEXT: .cfi_offset s9, -40 +; RV32-NEXT: .cfi_offset s10, -44 +; RV32-NEXT: .cfi_offset s11, -48 ; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: vslidedown.vi v9, v10, 1 @@ -936,42 +976,42 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m ; RV32-NEXT: vmv.x.s s5, v14 ; RV32-NEXT: vmv.x.s s6, v15 ; RV32-NEXT: vmv.x.s s7, v16 +; RV32-NEXT: vmv.x.s s8, v17 +; RV32-NEXT: vmv.x.s s9, v18 +; RV32-NEXT: vmv.x.s s10, v19 +; RV32-NEXT: vmv.x.s s11, v20 ; RV32-NEXT: vmseq.vx v9, v8, a0 -; RV32-NEXT: vmv.x.s a0, v17 +; RV32-NEXT: vmv.x.s a0, v21 ; RV32-NEXT: vmseq.vx v12, v8, s2 -; RV32-NEXT: vmv.x.s s2, v18 +; RV32-NEXT: vmv.x.s s2, v22 ; RV32-NEXT: vmseq.vx v13, v8, s3 -; RV32-NEXT: vmv.x.s s3, v19 -; RV32-NEXT: vmseq.vx v14, v8, s4 -; RV32-NEXT: vmv.x.s s4, v20 -; RV32-NEXT: vmseq.vx v15, v8, s5 -; RV32-NEXT: vmv.x.s s5, v21 -; RV32-NEXT: vmseq.vx v16, v8, s6 -; RV32-NEXT: vmv.x.s s6, v22 -; RV32-NEXT: vmseq.vx v17, v8, s7 -; RV32-NEXT: vmv.x.s s7, v11 -; RV32-NEXT: vmseq.vx v11, v8, a0 -; RV32-NEXT: vmv.x.s a0, v23 -; RV32-NEXT: vmseq.vx v18, v8, s2 -; RV32-NEXT: vmv.x.s s2, v10 +; RV32-NEXT: vmv.x.s s3, v11 +; RV32-NEXT: vmseq.vx v11, v8, s4 +; RV32-NEXT: vmv.x.s s4, v23 +; RV32-NEXT: vmseq.vx v14, v8, s5 +; RV32-NEXT: vmv.x.s s5, v10 ; RV32-NEXT: vmor.mm v9, v9, v12 +; RV32-NEXT: vmseq.vx v10, v8, s6 ; RV32-NEXT: vmor.mm v9, v9, v13 +; RV32-NEXT: vmseq.vx v12, v8, s7 +; RV32-NEXT: vmor.mm v9, v9, v11 +; RV32-NEXT: vmseq.vx v11, v8, s8 ; RV32-NEXT: vmor.mm v9, v9, v14 -; RV32-NEXT: vmor.mm v9, v9, v15 -; RV32-NEXT: vmor.mm v9, v9, v16 -; RV32-NEXT: vmseq.vx v10, v8, s3 -; RV32-NEXT: vmor.mm v9, v9, v17 -; RV32-NEXT: vmseq.vx v12, v8, s4 +; RV32-NEXT: vmseq.vx v13, v8, s9 +; RV32-NEXT: vmor.mm v9, v9, v10 +; RV32-NEXT: vmseq.vx v10, v8, s10 +; RV32-NEXT: vmor.mm v9, v9, v12 +; RV32-NEXT: vmseq.vx v12, v8, s11 ; RV32-NEXT: vmor.mm v9, v9, v11 -; 
RV32-NEXT: vmseq.vx v11, v8, s5 -; RV32-NEXT: vmor.mm v9, v9, v18 -; RV32-NEXT: vmseq.vx v13, v8, s6 +; RV32-NEXT: vmseq.vx v11, v8, a0 +; RV32-NEXT: vmor.mm v9, v9, v13 +; RV32-NEXT: vmseq.vx v13, v8, s2 ; RV32-NEXT: vmor.mm v9, v9, v10 -; RV32-NEXT: vmseq.vx v10, v8, s7 +; RV32-NEXT: vmseq.vx v10, v8, s3 ; RV32-NEXT: vmor.mm v9, v9, v12 -; RV32-NEXT: vmseq.vx v12, v8, a0 +; RV32-NEXT: vmseq.vx v12, v8, s4 ; RV32-NEXT: vmor.mm v9, v9, v11 -; RV32-NEXT: vmseq.vx v11, v8, s2 +; RV32-NEXT: vmseq.vx v11, v8, s5 ; RV32-NEXT: vmor.mm v9, v9, v13 ; RV32-NEXT: vmseq.vx v13, v8, a1 ; RV32-NEXT: vmor.mm v9, v9, v10 @@ -1009,14 +1049,18 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m ; RV32-NEXT: vmseq.vx v8, v8, s1 ; RV32-NEXT: vmor.mm v8, v9, v8 ; RV32-NEXT: vmand.mm v0, v8, v0 -; RV32-NEXT: lw s0, 28(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s1, 24(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s2, 20(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s3, 16(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s4, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s5, 8(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s6, 4(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s7, 0(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 44(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 40(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 36(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s3, 32(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s4, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s5, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s6, 20(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s7, 16(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s8, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s9, 8(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s10, 4(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s11, 0(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore s0 ; RV32-NEXT: .cfi_restore s1 ; RV32-NEXT: .cfi_restore s2 @@ -1025,22 +1069,30 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m ; RV32-NEXT: .cfi_restore s5 ; RV32-NEXT: .cfi_restore s6 ; RV32-NEXT: .cfi_restore s7 -; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_restore s8 +; RV32-NEXT: .cfi_restore s9 +; RV32-NEXT: .cfi_restore s10 +; RV32-NEXT: .cfi_restore s11 +; RV32-NEXT: addi sp, sp, 48 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: match_v16i8_v32i8: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -64 -; RV64-NEXT: .cfi_def_cfa_offset 64 -; RV64-NEXT: sd s0, 56(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s1, 48(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s2, 40(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s3, 32(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s4, 24(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s5, 16(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s6, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s7, 0(sp) # 8-byte Folded Spill +; RV64-NEXT: addi sp, sp, -96 +; RV64-NEXT: .cfi_def_cfa_offset 96 +; RV64-NEXT: sd s0, 88(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 80(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s2, 72(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s3, 64(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s4, 56(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s5, 48(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s6, 40(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s7, 32(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s8, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s9, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s10, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s11, 0(sp) # 8-byte Folded Spill ; RV64-NEXT: 
.cfi_offset s0, -8 ; RV64-NEXT: .cfi_offset s1, -16 ; RV64-NEXT: .cfi_offset s2, -24 @@ -1049,6 +1101,10 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m ; RV64-NEXT: .cfi_offset s5, -48 ; RV64-NEXT: .cfi_offset s6, -56 ; RV64-NEXT: .cfi_offset s7, -64 +; RV64-NEXT: .cfi_offset s8, -72 +; RV64-NEXT: .cfi_offset s9, -80 +; RV64-NEXT: .cfi_offset s10, -88 +; RV64-NEXT: .cfi_offset s11, -96 ; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: vslidedown.vi v9, v10, 1 @@ -1106,42 +1162,42 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m ; RV64-NEXT: vmv.x.s s5, v14 ; RV64-NEXT: vmv.x.s s6, v15 ; RV64-NEXT: vmv.x.s s7, v16 +; RV64-NEXT: vmv.x.s s8, v17 +; RV64-NEXT: vmv.x.s s9, v18 +; RV64-NEXT: vmv.x.s s10, v19 +; RV64-NEXT: vmv.x.s s11, v20 ; RV64-NEXT: vmseq.vx v9, v8, a0 -; RV64-NEXT: vmv.x.s a0, v17 +; RV64-NEXT: vmv.x.s a0, v21 ; RV64-NEXT: vmseq.vx v12, v8, s2 -; RV64-NEXT: vmv.x.s s2, v18 +; RV64-NEXT: vmv.x.s s2, v22 ; RV64-NEXT: vmseq.vx v13, v8, s3 -; RV64-NEXT: vmv.x.s s3, v19 -; RV64-NEXT: vmseq.vx v14, v8, s4 -; RV64-NEXT: vmv.x.s s4, v20 -; RV64-NEXT: vmseq.vx v15, v8, s5 -; RV64-NEXT: vmv.x.s s5, v21 -; RV64-NEXT: vmseq.vx v16, v8, s6 -; RV64-NEXT: vmv.x.s s6, v22 -; RV64-NEXT: vmseq.vx v17, v8, s7 -; RV64-NEXT: vmv.x.s s7, v11 -; RV64-NEXT: vmseq.vx v11, v8, a0 -; RV64-NEXT: vmv.x.s a0, v23 -; RV64-NEXT: vmseq.vx v18, v8, s2 -; RV64-NEXT: vmv.x.s s2, v10 +; RV64-NEXT: vmv.x.s s3, v11 +; RV64-NEXT: vmseq.vx v11, v8, s4 +; RV64-NEXT: vmv.x.s s4, v23 +; RV64-NEXT: vmseq.vx v14, v8, s5 +; RV64-NEXT: vmv.x.s s5, v10 ; RV64-NEXT: vmor.mm v9, v9, v12 +; RV64-NEXT: vmseq.vx v10, v8, s6 ; RV64-NEXT: vmor.mm v9, v9, v13 +; RV64-NEXT: vmseq.vx v12, v8, s7 +; RV64-NEXT: vmor.mm v9, v9, v11 +; RV64-NEXT: vmseq.vx v11, v8, s8 ; RV64-NEXT: vmor.mm v9, v9, v14 -; RV64-NEXT: vmor.mm v9, v9, v15 -; RV64-NEXT: vmor.mm v9, v9, v16 -; RV64-NEXT: vmseq.vx v10, v8, s3 -; RV64-NEXT: vmor.mm v9, v9, v17 -; RV64-NEXT: vmseq.vx v12, v8, s4 +; RV64-NEXT: vmseq.vx v13, v8, s9 +; RV64-NEXT: vmor.mm v9, v9, v10 +; RV64-NEXT: vmseq.vx v10, v8, s10 +; RV64-NEXT: vmor.mm v9, v9, v12 +; RV64-NEXT: vmseq.vx v12, v8, s11 ; RV64-NEXT: vmor.mm v9, v9, v11 -; RV64-NEXT: vmseq.vx v11, v8, s5 -; RV64-NEXT: vmor.mm v9, v9, v18 -; RV64-NEXT: vmseq.vx v13, v8, s6 +; RV64-NEXT: vmseq.vx v11, v8, a0 +; RV64-NEXT: vmor.mm v9, v9, v13 +; RV64-NEXT: vmseq.vx v13, v8, s2 ; RV64-NEXT: vmor.mm v9, v9, v10 -; RV64-NEXT: vmseq.vx v10, v8, s7 +; RV64-NEXT: vmseq.vx v10, v8, s3 ; RV64-NEXT: vmor.mm v9, v9, v12 -; RV64-NEXT: vmseq.vx v12, v8, a0 +; RV64-NEXT: vmseq.vx v12, v8, s4 ; RV64-NEXT: vmor.mm v9, v9, v11 -; RV64-NEXT: vmseq.vx v11, v8, s2 +; RV64-NEXT: vmseq.vx v11, v8, s5 ; RV64-NEXT: vmor.mm v9, v9, v13 ; RV64-NEXT: vmseq.vx v13, v8, a1 ; RV64-NEXT: vmor.mm v9, v9, v10 @@ -1179,14 +1235,18 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m ; RV64-NEXT: vmseq.vx v8, v8, s1 ; RV64-NEXT: vmor.mm v8, v9, v8 ; RV64-NEXT: vmand.mm v0, v8, v0 -; RV64-NEXT: ld s0, 56(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s1, 48(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s2, 40(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s3, 32(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s4, 24(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s5, 16(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s6, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s7, 0(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 88(sp) # 8-byte Folded Reload +; 
RV64-NEXT: ld s1, 80(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 72(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s3, 64(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s4, 56(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s5, 48(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s6, 40(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s7, 32(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s8, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s9, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s10, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s11, 0(sp) # 8-byte Folded Reload ; RV64-NEXT: .cfi_restore s0 ; RV64-NEXT: .cfi_restore s1 ; RV64-NEXT: .cfi_restore s2 @@ -1195,7 +1255,11 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m ; RV64-NEXT: .cfi_restore s5 ; RV64-NEXT: .cfi_restore s6 ; RV64-NEXT: .cfi_restore s7 -; RV64-NEXT: addi sp, sp, 64 +; RV64-NEXT: .cfi_restore s8 +; RV64-NEXT: .cfi_restore s9 +; RV64-NEXT: .cfi_restore s10 +; RV64-NEXT: .cfi_restore s11 +; RV64-NEXT: addi sp, sp, 96 ; RV64-NEXT: .cfi_def_cfa_offset 0 ; RV64-NEXT: ret %r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask) diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll index 22e6f23d4d6e6..123048d996360 100644 --- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -2203,136 +2203,139 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: lshr_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -112 -; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: addi sp, sp, -128 +; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu s1, 0(a0) ; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu a7, 4(a0) -; RV32I-NEXT: lbu t0, 5(a0) -; RV32I-NEXT: lbu t1, 6(a0) -; RV32I-NEXT: lbu t2, 7(a0) -; RV32I-NEXT: lbu t3, 8(a0) -; RV32I-NEXT: lbu t4, 9(a0) -; RV32I-NEXT: lbu t5, 10(a0) -; RV32I-NEXT: lbu t6, 11(a0) -; RV32I-NEXT: lbu s0, 12(a0) -; RV32I-NEXT: lbu s1, 13(a0) -; RV32I-NEXT: lbu s2, 14(a0) -; RV32I-NEXT: lbu s3, 15(a0) -; 
RV32I-NEXT: lbu s4, 16(a0) -; RV32I-NEXT: lbu s5, 17(a0) -; RV32I-NEXT: lbu s6, 18(a0) -; RV32I-NEXT: lbu s7, 19(a0) +; RV32I-NEXT: lbu t1, 4(a0) +; RV32I-NEXT: lbu t3, 5(a0) +; RV32I-NEXT: lbu t4, 6(a0) +; RV32I-NEXT: lbu s0, 7(a0) +; RV32I-NEXT: lbu t2, 8(a0) +; RV32I-NEXT: lbu s3, 9(a0) +; RV32I-NEXT: lbu s6, 10(a0) +; RV32I-NEXT: lbu s8, 11(a0) +; RV32I-NEXT: lbu s9, 12(a0) +; RV32I-NEXT: lbu s10, 13(a0) +; RV32I-NEXT: lbu s4, 14(a0) +; RV32I-NEXT: lbu s7, 15(a0) +; RV32I-NEXT: lbu s5, 16(a0) +; RV32I-NEXT: lbu s11, 17(a0) +; RV32I-NEXT: lbu ra, 18(a0) +; RV32I-NEXT: lbu a3, 19(a0) +; RV32I-NEXT: lbu t5, 20(a0) +; RV32I-NEXT: lbu t6, 21(a0) +; RV32I-NEXT: lbu a7, 22(a0) +; RV32I-NEXT: lbu t0, 23(a0) ; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: slli t3, t3, 8 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli s0, s0, 24 +; RV32I-NEXT: or a4, a4, s1 +; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a6, t2, t1 -; RV32I-NEXT: lbu s8, 20(a0) -; RV32I-NEXT: lbu s9, 21(a0) -; RV32I-NEXT: lbu s10, 22(a0) -; RV32I-NEXT: lbu s11, 23(a0) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: slli t5, t5, 16 -; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: slli s1, s1, 8 -; RV32I-NEXT: slli s2, s2, 16 -; RV32I-NEXT: slli s3, s3, 24 -; RV32I-NEXT: or a7, t4, t3 -; RV32I-NEXT: or t0, t6, t5 -; RV32I-NEXT: or t1, s1, s0 -; RV32I-NEXT: or t2, s3, s2 -; RV32I-NEXT: lbu t6, 24(a0) +; RV32I-NEXT: or a5, t3, t1 +; RV32I-NEXT: or a6, s0, t4 +; RV32I-NEXT: lbu t1, 24(a0) ; RV32I-NEXT: lbu s0, 25(a0) ; RV32I-NEXT: lbu s1, 26(a0) ; RV32I-NEXT: lbu s2, 27(a0) -; RV32I-NEXT: slli s5, s5, 8 +; RV32I-NEXT: slli s3, s3, 8 ; RV32I-NEXT: slli s6, s6, 16 -; RV32I-NEXT: slli s7, s7, 24 -; RV32I-NEXT: slli s9, s9, 8 -; RV32I-NEXT: or t3, s5, s4 -; RV32I-NEXT: or t4, s7, s6 -; RV32I-NEXT: or t5, s9, s8 +; RV32I-NEXT: slli s8, s8, 24 +; RV32I-NEXT: slli s10, s10, 8 +; RV32I-NEXT: or t2, s3, t2 +; RV32I-NEXT: or t3, s8, s6 +; RV32I-NEXT: or t4, s10, s9 ; RV32I-NEXT: lbu s3, 28(a0) -; RV32I-NEXT: lbu s4, 29(a0) -; RV32I-NEXT: lbu s5, 30(a0) -; RV32I-NEXT: lbu s6, 31(a0) -; RV32I-NEXT: slli s10, s10, 16 -; RV32I-NEXT: slli s11, s11, 24 -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli s2, s2, 24 -; RV32I-NEXT: or a0, s11, s10 -; RV32I-NEXT: or t6, s0, t6 -; RV32I-NEXT: or s0, s2, s1 -; RV32I-NEXT: lbu s1, 0(a1) -; RV32I-NEXT: lbu s2, 1(a1) -; RV32I-NEXT: lbu s7, 2(a1) +; RV32I-NEXT: lbu s6, 29(a0) +; RV32I-NEXT: lbu s8, 30(a0) +; RV32I-NEXT: lbu s9, 31(a0) +; RV32I-NEXT: slli s4, s4, 16 +; RV32I-NEXT: slli s7, s7, 24 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: slli ra, ra, 16 +; RV32I-NEXT: slli a3, a3, 24 +; RV32I-NEXT: or a0, s7, s4 +; RV32I-NEXT: or s4, s11, s5 +; RV32I-NEXT: or s5, a3, ra +; RV32I-NEXT: lbu a3, 0(a1) +; RV32I-NEXT: lbu s7, 1(a1) +; RV32I-NEXT: lbu s10, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: sw zero, 48(sp) -; RV32I-NEXT: sw zero, 52(sp) ; RV32I-NEXT: sw zero, 56(sp) ; RV32I-NEXT: sw zero, 60(sp) -; RV32I-NEXT: sw zero, 32(sp) -; RV32I-NEXT: sw zero, 36(sp) +; RV32I-NEXT: sw zero, 64(sp) +; RV32I-NEXT: sw zero, 68(sp) ; RV32I-NEXT: sw zero, 40(sp) ; RV32I-NEXT: sw zero, 44(sp) -; RV32I-NEXT: slli s4, s4, 8 -; RV32I-NEXT: or s3, s4, s3 -; RV32I-NEXT: mv s4, sp -; RV32I-NEXT: slli s5, s5, 16 -; RV32I-NEXT: slli s6, s6, 24 -; 
RV32I-NEXT: slli s2, s2, 8 -; RV32I-NEXT: slli s7, s7, 16 +; RV32I-NEXT: sw zero, 48(sp) +; RV32I-NEXT: sw zero, 52(sp) +; RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: or t5, t6, t5 +; RV32I-NEXT: addi t6, sp, 8 +; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: slli s1, s1, 16 +; RV32I-NEXT: slli s2, s2, 24 +; RV32I-NEXT: slli s6, s6, 8 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s10, s10, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or s5, s6, s5 -; RV32I-NEXT: or s1, s2, s1 -; RV32I-NEXT: or a1, a1, s7 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a6, t2, t1 -; RV32I-NEXT: or a7, t4, t3 -; RV32I-NEXT: or a0, a0, t5 -; RV32I-NEXT: or t0, s0, t6 -; RV32I-NEXT: or t1, s5, s3 -; RV32I-NEXT: or a1, a1, s1 -; RV32I-NEXT: sw a7, 16(sp) +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: or t0, s0, t1 +; RV32I-NEXT: or t1, s2, s1 +; RV32I-NEXT: or s0, s6, s3 +; RV32I-NEXT: or s1, s9, s8 +; RV32I-NEXT: or a3, s7, a3 +; RV32I-NEXT: or a1, a1, s10 +; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a4, a4, s2 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a6, t3, t2 +; RV32I-NEXT: or a0, a0, t4 +; RV32I-NEXT: or t2, s5, s4 +; RV32I-NEXT: or a7, a7, t5 +; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: sw t2, 24(sp) +; RV32I-NEXT: sw a7, 28(sp) +; RV32I-NEXT: sw t0, 32(sp) +; RV32I-NEXT: sw s0, 36(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a6, 16(sp) ; RV32I-NEXT: sw a0, 20(sp) -; RV32I-NEXT: sw t0, 24(sp) -; RV32I-NEXT: sw t1, 28(sp) -; RV32I-NEXT: sw a3, 0(sp) -; RV32I-NEXT: sw a4, 4(sp) -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a6, 12(sp) ; RV32I-NEXT: slli t1, a1, 3 ; RV32I-NEXT: andi a1, a1, 28 -; RV32I-NEXT: add a1, s4, a1 +; RV32I-NEXT: add a1, t6, a1 ; RV32I-NEXT: andi a0, t1, 24 -; RV32I-NEXT: xori a7, a0, 31 +; RV32I-NEXT: xori t0, a0, 31 ; RV32I-NEXT: lw a3, 0(a1) ; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: lw a5, 8(a1) ; RV32I-NEXT: lw a6, 12(a1) -; RV32I-NEXT: lw t0, 16(a1) +; RV32I-NEXT: lw a7, 16(a1) ; RV32I-NEXT: lw t2, 20(a1) ; RV32I-NEXT: lw t3, 24(a1) ; RV32I-NEXT: lw t4, 28(a1) @@ -2341,33 +2344,33 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srl a1, a3, t1 ; RV32I-NEXT: slli t6, a4, 1 ; RV32I-NEXT: srl a3, a6, t1 -; RV32I-NEXT: slli s0, t0, 1 +; RV32I-NEXT: slli s0, a7, 1 ; RV32I-NEXT: srl a4, a5, t1 ; RV32I-NEXT: slli s1, a6, 1 ; RV32I-NEXT: srl a5, t2, t1 ; RV32I-NEXT: slli s2, t3, 1 -; RV32I-NEXT: srl a6, t0, t1 +; RV32I-NEXT: srl a6, a7, t1 ; RV32I-NEXT: slli t2, t2, 1 -; RV32I-NEXT: srl t0, t3, t1 +; RV32I-NEXT: srl a7, t3, t1 ; RV32I-NEXT: slli t3, t4, 1 ; RV32I-NEXT: srl t1, t4, t1 -; RV32I-NEXT: sll t4, t5, a7 -; RV32I-NEXT: sll t5, t6, a7 -; RV32I-NEXT: sll t6, s0, a7 -; RV32I-NEXT: sll s0, s1, a7 -; RV32I-NEXT: sll s1, s2, a7 -; RV32I-NEXT: sll t2, t2, a7 -; RV32I-NEXT: sll t3, t3, a7 +; RV32I-NEXT: sll t4, t5, t0 +; RV32I-NEXT: sll t5, t6, t0 +; RV32I-NEXT: sll t6, s0, t0 +; RV32I-NEXT: sll s0, s1, t0 +; RV32I-NEXT: sll s1, s2, t0 +; RV32I-NEXT: sll t2, t2, t0 +; RV32I-NEXT: sll t3, t3, t0 ; RV32I-NEXT: srli s2, t1, 24 ; RV32I-NEXT: srli s3, t1, 16 ; RV32I-NEXT: srli s4, t1, 8 -; RV32I-NEXT: or a7, a0, t4 +; RV32I-NEXT: or t0, a0, t4 ; RV32I-NEXT: or t4, a1, t5 ; RV32I-NEXT: or t5, a3, t6 ; RV32I-NEXT: or s0, a4, s0 ; RV32I-NEXT: or s1, a5, 
s1 ; RV32I-NEXT: or t2, a6, t2 -; RV32I-NEXT: or t3, t0, t3 +; RV32I-NEXT: or t3, a7, t3 ; RV32I-NEXT: sb t1, 28(a2) ; RV32I-NEXT: sb s4, 29(a2) ; RV32I-NEXT: sb s3, 30(a2) @@ -2384,23 +2387,23 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli s6, s0, 24 ; RV32I-NEXT: srli s7, s0, 16 ; RV32I-NEXT: srli s0, s0, 8 -; RV32I-NEXT: sb t0, 24(a2) -; RV32I-NEXT: srli t0, t5, 24 -; RV32I-NEXT: sb t3, 25(a2) -; RV32I-NEXT: srli t3, t5, 16 +; RV32I-NEXT: srli s8, t5, 24 +; RV32I-NEXT: srli s9, t5, 16 ; RV32I-NEXT: srli t5, t5, 8 +; RV32I-NEXT: srli s10, t4, 24 +; RV32I-NEXT: srli s11, t4, 16 +; RV32I-NEXT: srli t4, t4, 8 +; RV32I-NEXT: sb a7, 24(a2) +; RV32I-NEXT: sb t3, 25(a2) ; RV32I-NEXT: sb t6, 26(a2) -; RV32I-NEXT: srli t6, t4, 24 ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli t1, t4, 16 -; RV32I-NEXT: srli t4, t4, 8 +; RV32I-NEXT: srli a7, t0, 24 ; RV32I-NEXT: sb a6, 16(a2) -; RV32I-NEXT: srli a6, a7, 24 ; RV32I-NEXT: sb t2, 17(a2) ; RV32I-NEXT: sb s3, 18(a2) ; RV32I-NEXT: sb s2, 19(a2) -; RV32I-NEXT: srli t2, a7, 16 -; RV32I-NEXT: srli a7, a7, 8 +; RV32I-NEXT: srli a6, t0, 16 +; RV32I-NEXT: srli t0, t0, 8 ; RV32I-NEXT: sb a5, 20(a2) ; RV32I-NEXT: sb s1, 21(a2) ; RV32I-NEXT: sb s5, 22(a2) @@ -2411,29 +2414,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb s6, 11(a2) ; RV32I-NEXT: sb a3, 12(a2) ; RV32I-NEXT: sb t5, 13(a2) -; RV32I-NEXT: sb t3, 14(a2) -; RV32I-NEXT: sb t0, 15(a2) +; RV32I-NEXT: sb s9, 14(a2) +; RV32I-NEXT: sb s8, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) ; RV32I-NEXT: sb t4, 1(a2) -; RV32I-NEXT: sb t1, 2(a2) -; RV32I-NEXT: sb t6, 3(a2) +; RV32I-NEXT: sb s11, 2(a2) +; RV32I-NEXT: sb s10, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: sb a7, 5(a2) -; RV32I-NEXT: sb t2, 6(a2) -; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 112 +; RV32I-NEXT: sb t0, 5(a2) +; RV32I-NEXT: sb a6, 6(a2) +; RV32I-NEXT: sb a7, 7(a2) +; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 128 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %byteOff = load i256, ptr %byteOff.ptr, align 1 @@ -2678,128 +2682,132 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; ; RV32I-LABEL: lshr_32bytes_wordOff: ; 
RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -112 -; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: lbu a4, 1(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu a7, 4(a0) -; RV32I-NEXT: lbu t0, 5(a0) -; RV32I-NEXT: lbu t1, 6(a0) -; RV32I-NEXT: lbu t2, 7(a0) -; RV32I-NEXT: lbu t3, 8(a0) -; RV32I-NEXT: lbu t4, 9(a0) -; RV32I-NEXT: lbu t5, 10(a0) -; RV32I-NEXT: lbu t6, 11(a0) -; RV32I-NEXT: lbu s2, 12(a0) -; RV32I-NEXT: lbu s3, 13(a0) -; RV32I-NEXT: lbu s4, 14(a0) -; RV32I-NEXT: lbu s5, 15(a0) -; RV32I-NEXT: lbu s6, 16(a0) -; RV32I-NEXT: lbu s7, 17(a0) -; RV32I-NEXT: lbu s8, 18(a0) -; RV32I-NEXT: lbu s9, 19(a0) -; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a6, t2, t1 -; RV32I-NEXT: lbu s10, 20(a0) -; RV32I-NEXT: lbu s11, 21(a0) +; RV32I-NEXT: addi sp, sp, -128 +; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: lbu a5, 0(a0) +; RV32I-NEXT: lbu a7, 1(a0) +; RV32I-NEXT: lbu t0, 2(a0) +; RV32I-NEXT: lbu t1, 3(a0) +; RV32I-NEXT: lbu s2, 4(a0) +; RV32I-NEXT: lbu s4, 5(a0) +; RV32I-NEXT: lbu s5, 6(a0) +; RV32I-NEXT: lbu s6, 7(a0) +; RV32I-NEXT: lbu s3, 8(a0) +; RV32I-NEXT: lbu s9, 9(a0) +; RV32I-NEXT: lbu s10, 10(a0) +; RV32I-NEXT: lbu s11, 11(a0) +; RV32I-NEXT: lbu ra, 12(a0) +; RV32I-NEXT: lbu a1, 13(a0) +; RV32I-NEXT: lbu t4, 14(a0) +; RV32I-NEXT: lbu t6, 15(a0) +; RV32I-NEXT: lbu a4, 16(a0) +; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a6, 17(a0) +; RV32I-NEXT: lbu t2, 18(a0) +; RV32I-NEXT: lbu t3, 19(a0) +; RV32I-NEXT: lbu a4, 20(a0) +; RV32I-NEXT: lbu t5, 21(a0) ; RV32I-NEXT: lbu s0, 22(a0) ; RV32I-NEXT: lbu s1, 23(a0) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: slli t5, t5, 16 -; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: slli s3, s3, 8 -; RV32I-NEXT: slli s4, s4, 16 -; RV32I-NEXT: slli s5, s5, 24 -; RV32I-NEXT: or a7, t4, t3 -; RV32I-NEXT: or t0, t6, t5 -; RV32I-NEXT: or t1, s3, s2 -; RV32I-NEXT: or t2, s5, s4 -; RV32I-NEXT: lbu t3, 24(a0) -; RV32I-NEXT: lbu s2, 25(a0) -; RV32I-NEXT: lbu s3, 26(a0) -; RV32I-NEXT: lbu s4, 27(a0) -; RV32I-NEXT: slli s7, 
s7, 8 -; RV32I-NEXT: slli s8, s8, 16 -; RV32I-NEXT: slli s9, s9, 24 -; RV32I-NEXT: slli s11, s11, 8 -; RV32I-NEXT: or t4, s7, s6 -; RV32I-NEXT: or t5, s9, s8 -; RV32I-NEXT: or t6, s11, s10 -; RV32I-NEXT: lbu s5, 28(a0) -; RV32I-NEXT: lbu s6, 29(a0) -; RV32I-NEXT: lbu s7, 30(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli t1, t1, 24 +; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: slli s5, s5, 16 +; RV32I-NEXT: slli s6, s6, 24 +; RV32I-NEXT: or a5, a7, a5 +; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: or t0, s4, s2 +; RV32I-NEXT: or t1, s6, s5 +; RV32I-NEXT: lbu s2, 24(a0) +; RV32I-NEXT: lbu s6, 25(a0) +; RV32I-NEXT: lbu s7, 26(a0) +; RV32I-NEXT: lbu s8, 27(a0) +; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: slli s11, s11, 24 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or s3, s9, s3 +; RV32I-NEXT: or s4, s11, s10 +; RV32I-NEXT: or s5, a1, ra +; RV32I-NEXT: lbu s9, 28(a0) +; RV32I-NEXT: lbu a1, 29(a0) +; RV32I-NEXT: lbu s10, 30(a0) ; RV32I-NEXT: lbu a0, 31(a0) -; RV32I-NEXT: lbu a1, 0(a1) -; RV32I-NEXT: sw zero, 48(sp) -; RV32I-NEXT: sw zero, 52(sp) +; RV32I-NEXT: lbu a3, 0(a3) ; RV32I-NEXT: sw zero, 56(sp) ; RV32I-NEXT: sw zero, 60(sp) -; RV32I-NEXT: sw zero, 32(sp) -; RV32I-NEXT: sw zero, 36(sp) +; RV32I-NEXT: sw zero, 64(sp) +; RV32I-NEXT: sw zero, 68(sp) ; RV32I-NEXT: sw zero, 40(sp) ; RV32I-NEXT: sw zero, 44(sp) +; RV32I-NEXT: sw zero, 48(sp) +; RV32I-NEXT: sw zero, 52(sp) +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: or t4, t6, t4 +; RV32I-NEXT: addi t6, sp, 8 +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t3, t3, 24 +; RV32I-NEXT: slli t5, t5, 8 ; RV32I-NEXT: slli s0, s0, 16 ; RV32I-NEXT: slli s1, s1, 24 -; RV32I-NEXT: or s0, s1, s0 -; RV32I-NEXT: mv s1, sp -; RV32I-NEXT: slli s2, s2, 8 -; RV32I-NEXT: slli s3, s3, 16 -; RV32I-NEXT: slli s4, s4, 24 ; RV32I-NEXT: slli s6, s6, 8 ; RV32I-NEXT: slli s7, s7, 16 +; RV32I-NEXT: slli s8, s8, 24 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: slli s10, s10, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: slli a1, a1, 2 -; RV32I-NEXT: or t3, s2, t3 -; RV32I-NEXT: or s2, s4, s3 -; RV32I-NEXT: or s3, s6, s5 -; RV32I-NEXT: or a0, a0, s7 -; RV32I-NEXT: andi a1, a1, 28 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a6, t2, t1 -; RV32I-NEXT: or a7, t5, t4 -; RV32I-NEXT: or t0, s0, t6 -; RV32I-NEXT: or t1, s2, t3 -; RV32I-NEXT: or a0, a0, s3 -; RV32I-NEXT: add s1, s1, a1 -; RV32I-NEXT: sw a7, 16(sp) -; RV32I-NEXT: sw t0, 20(sp) -; RV32I-NEXT: sw t1, 24(sp) -; RV32I-NEXT: sw a0, 28(sp) -; RV32I-NEXT: sw a3, 0(sp) -; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: slli a3, a3, 2 +; RV32I-NEXT: lw s11, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a6, a6, s11 +; RV32I-NEXT: or t2, t3, t2 +; RV32I-NEXT: or a4, t5, a4 +; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: or t3, s6, s2 +; RV32I-NEXT: or t5, s8, s7 +; RV32I-NEXT: or a1, a1, s9 +; RV32I-NEXT: or a0, a0, s10 +; RV32I-NEXT: andi a3, a3, 28 +; RV32I-NEXT: or a5, a7, a5 +; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: or t0, s4, s3 +; RV32I-NEXT: or t1, t4, s5 +; RV32I-NEXT: or a6, t2, a6 +; RV32I-NEXT: or a4, s0, a4 +; RV32I-NEXT: or t2, t5, t3 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: add t6, t6, a3 +; RV32I-NEXT: sw a6, 24(sp) +; RV32I-NEXT: sw a4, 28(sp) +; RV32I-NEXT: sw t2, 32(sp) +; RV32I-NEXT: sw a0, 36(sp) ; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a6, 12(sp) -; RV32I-NEXT: lw a6, 16(s1) -; RV32I-NEXT: lw a5, 20(s1) -; 
RV32I-NEXT: lw a7, 24(s1) -; RV32I-NEXT: lw a1, 0(s1) -; RV32I-NEXT: lw a0, 4(s1) -; RV32I-NEXT: lw a4, 8(s1) -; RV32I-NEXT: lw a3, 12(s1) -; RV32I-NEXT: lw t0, 28(s1) +; RV32I-NEXT: sw a7, 12(sp) +; RV32I-NEXT: sw t0, 16(sp) +; RV32I-NEXT: sw t1, 20(sp) +; RV32I-NEXT: lw a6, 16(t6) +; RV32I-NEXT: lw a5, 20(t6) +; RV32I-NEXT: lw a7, 24(t6) +; RV32I-NEXT: lw a1, 0(t6) +; RV32I-NEXT: lw a0, 4(t6) +; RV32I-NEXT: lw a4, 8(t6) +; RV32I-NEXT: lw a3, 12(t6) +; RV32I-NEXT: lw t0, 28(t6) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 ; RV32I-NEXT: srli t3, a7, 8 @@ -2814,21 +2822,21 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: srli s5, a5, 8 ; RV32I-NEXT: srli s6, a4, 24 ; RV32I-NEXT: srli s7, a4, 16 +; RV32I-NEXT: srli s8, a4, 8 +; RV32I-NEXT: srli s9, a3, 24 +; RV32I-NEXT: srli s10, a3, 16 +; RV32I-NEXT: srli s11, a3, 8 ; RV32I-NEXT: sb a7, 24(a2) -; RV32I-NEXT: srli a7, a4, 8 +; RV32I-NEXT: srli a7, a1, 24 ; RV32I-NEXT: sb t3, 25(a2) -; RV32I-NEXT: srli t3, a3, 24 ; RV32I-NEXT: sb t2, 26(a2) -; RV32I-NEXT: srli t2, a3, 16 ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli t1, a3, 8 +; RV32I-NEXT: srli t1, a1, 16 ; RV32I-NEXT: sb t0, 28(a2) -; RV32I-NEXT: srli t0, a1, 24 ; RV32I-NEXT: sb t6, 29(a2) -; RV32I-NEXT: srli t6, a1, 16 ; RV32I-NEXT: sb t5, 30(a2) ; RV32I-NEXT: sb t4, 31(a2) -; RV32I-NEXT: srli t4, a1, 8 +; RV32I-NEXT: srli t0, a1, 8 ; RV32I-NEXT: sb a6, 16(a2) ; RV32I-NEXT: sb s2, 17(a2) ; RV32I-NEXT: sb s1, 18(a2) @@ -2840,35 +2848,36 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: sb s3, 23(a2) ; RV32I-NEXT: srli a5, a0, 16 ; RV32I-NEXT: sb a4, 8(a2) -; RV32I-NEXT: sb a7, 9(a2) +; RV32I-NEXT: sb s8, 9(a2) ; RV32I-NEXT: sb s7, 10(a2) ; RV32I-NEXT: sb s6, 11(a2) ; RV32I-NEXT: srli a4, a0, 8 ; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb t1, 13(a2) -; RV32I-NEXT: sb t2, 14(a2) -; RV32I-NEXT: sb t3, 15(a2) +; RV32I-NEXT: sb s11, 13(a2) +; RV32I-NEXT: sb s10, 14(a2) +; RV32I-NEXT: sb s9, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb t4, 1(a2) -; RV32I-NEXT: sb t6, 2(a2) -; RV32I-NEXT: sb t0, 3(a2) +; RV32I-NEXT: sb t0, 1(a2) +; RV32I-NEXT: sb t1, 2(a2) +; RV32I-NEXT: sb a7, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) ; RV32I-NEXT: sb a5, 6(a2) ; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 112 +; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 
84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 128 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %wordOff = load i256, ptr %wordOff.ptr, align 1 @@ -2894,111 +2903,111 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: lbu a4, 1(a0) -; RV64I-NEXT: lbu a5, 2(a0) -; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: lbu a7, 4(a0) -; RV64I-NEXT: lbu t0, 5(a0) -; RV64I-NEXT: lbu t1, 6(a0) -; RV64I-NEXT: lbu t2, 7(a0) -; RV64I-NEXT: lbu t3, 8(a0) -; RV64I-NEXT: lbu t4, 9(a0) -; RV64I-NEXT: lbu t5, 10(a0) -; RV64I-NEXT: lbu t6, 11(a0) -; RV64I-NEXT: lbu s0, 12(a0) -; RV64I-NEXT: lbu s1, 13(a0) -; RV64I-NEXT: lbu s2, 14(a0) -; RV64I-NEXT: lbu s3, 15(a0) -; RV64I-NEXT: lbu s4, 16(a0) -; RV64I-NEXT: lbu s5, 17(a0) -; RV64I-NEXT: lbu s6, 18(a0) -; RV64I-NEXT: lbu s7, 19(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: slli t1, t1, 16 -; RV64I-NEXT: slli t2, t2, 24 -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: or a5, t0, a7 -; RV64I-NEXT: or a6, t2, t1 -; RV64I-NEXT: lbu s8, 20(a0) -; RV64I-NEXT: lbu s9, 21(a0) -; RV64I-NEXT: lbu s10, 22(a0) -; RV64I-NEXT: lbu s11, 23(a0) -; RV64I-NEXT: slli t4, t4, 8 -; RV64I-NEXT: slli t5, t5, 16 -; RV64I-NEXT: slli t6, t6, 24 -; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: slli s2, s2, 16 +; RV64I-NEXT: lbu a5, 0(a0) +; RV64I-NEXT: lbu a7, 1(a0) +; RV64I-NEXT: lbu t2, 2(a0) +; RV64I-NEXT: lbu s3, 3(a0) +; RV64I-NEXT: lbu t0, 4(a0) +; RV64I-NEXT: lbu s8, 5(a0) +; RV64I-NEXT: lbu s9, 6(a0) +; RV64I-NEXT: lbu s10, 7(a0) +; RV64I-NEXT: lbu s2, 8(a0) +; RV64I-NEXT: lbu s4, 9(a0) +; RV64I-NEXT: lbu s5, 10(a0) +; RV64I-NEXT: lbu s6, 11(a0) +; RV64I-NEXT: lbu s7, 12(a0) +; RV64I-NEXT: lbu s11, 13(a0) +; RV64I-NEXT: lbu t1, 14(a0) +; RV64I-NEXT: lbu t3, 15(a0) +; RV64I-NEXT: lbu a3, 16(a0) +; RV64I-NEXT: lbu a6, 17(a0) +; RV64I-NEXT: lbu t4, 18(a0) +; RV64I-NEXT: lbu t5, 19(a0) +; RV64I-NEXT: lbu a4, 20(a0) +; RV64I-NEXT: lbu t6, 21(a0) +; RV64I-NEXT: lbu s0, 22(a0) +; RV64I-NEXT: lbu s1, 23(a0) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli s3, s3, 24 -; RV64I-NEXT: or a7, t4, t3 -; RV64I-NEXT: or t0, t6, t5 -; RV64I-NEXT: or t1, s1, s0 -; RV64I-NEXT: or t2, s3, s2 -; RV64I-NEXT: lbu t3, 24(a0) -; RV64I-NEXT: lbu t4, 25(a0) -; RV64I-NEXT: lbu t5, 26(a0) -; RV64I-NEXT: lbu t6, 27(a0) -; RV64I-NEXT: slli s5, s5, 8 -; RV64I-NEXT: slli s6, s6, 16 -; RV64I-NEXT: slli s7, s7, 24 -; RV64I-NEXT: slli s9, s9, 8 -; RV64I-NEXT: or s0, s5, s4 -; RV64I-NEXT: or s1, s7, s6 -; RV64I-NEXT: or s2, s9, s8 -; RV64I-NEXT: lbu s3, 28(a0) -; RV64I-NEXT: lbu s4, 29(a0) -; RV64I-NEXT: lbu s5, 30(a0) +; RV64I-NEXT: slli s8, s8, 8 +; RV64I-NEXT: slli s9, s9, 16 +; RV64I-NEXT: slli s10, s10, 24 +; RV64I-NEXT: or a5, a7, a5 +; RV64I-NEXT: or a7, s3, t2 +; RV64I-NEXT: or t0, s8, t0 +; RV64I-NEXT: or t2, s10, s9 +; RV64I-NEXT: lbu s3, 24(a0) +; RV64I-NEXT: lbu s8, 25(a0) +; RV64I-NEXT: lbu s9, 26(a0) +; RV64I-NEXT: lbu s10, 27(a0) +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: slli s6, s6, 24 +; RV64I-NEXT: slli s11, s11, 8 +; RV64I-NEXT: or s2, s4, s2 +; RV64I-NEXT: or s4, s6, s5 +; 
RV64I-NEXT: or s5, s11, s7 +; RV64I-NEXT: lbu s6, 28(a0) +; RV64I-NEXT: lbu s7, 29(a0) +; RV64I-NEXT: lbu s11, 30(a0) ; RV64I-NEXT: lbu a0, 31(a0) ; RV64I-NEXT: lbu a1, 0(a1) ; RV64I-NEXT: sd zero, 32(sp) ; RV64I-NEXT: sd zero, 40(sp) ; RV64I-NEXT: sd zero, 48(sp) ; RV64I-NEXT: sd zero, 56(sp) -; RV64I-NEXT: slli s10, s10, 16 -; RV64I-NEXT: slli s11, s11, 24 -; RV64I-NEXT: or s6, s11, s10 -; RV64I-NEXT: mv s7, sp -; RV64I-NEXT: slli t4, t4, 8 -; RV64I-NEXT: slli t5, t5, 16 -; RV64I-NEXT: slli t6, t6, 24 -; RV64I-NEXT: slli s4, s4, 8 -; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t3, t3, 24 +; RV64I-NEXT: or t1, t3, t1 +; RV64I-NEXT: mv t3, sp +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: slli t4, t4, 16 +; RV64I-NEXT: slli t5, t5, 24 +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: slli s0, s0, 16 +; RV64I-NEXT: slli s1, s1, 24 +; RV64I-NEXT: slli s8, s8, 8 +; RV64I-NEXT: slli s9, s9, 16 +; RV64I-NEXT: slli s10, s10, 24 +; RV64I-NEXT: slli s7, s7, 8 +; RV64I-NEXT: slli s11, s11, 16 ; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: slli a1, a1, 3 -; RV64I-NEXT: or t3, t4, t3 -; RV64I-NEXT: or t4, t6, t5 -; RV64I-NEXT: or t5, s4, s3 -; RV64I-NEXT: or a0, a0, s5 -; RV64I-NEXT: andi a1, a1, 24 -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: or a5, t0, a7 -; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: or a3, a6, a3 +; RV64I-NEXT: or a6, t5, t4 +; RV64I-NEXT: or a4, t6, a4 ; RV64I-NEXT: or s0, s1, s0 -; RV64I-NEXT: or a7, s6, s2 -; RV64I-NEXT: or t0, t4, t3 -; RV64I-NEXT: or a0, a0, t5 -; RV64I-NEXT: add s7, s7, a1 -; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: or t4, s8, s3 +; RV64I-NEXT: or t5, s10, s9 +; RV64I-NEXT: or t6, s7, s6 +; RV64I-NEXT: or a0, a0, s11 +; RV64I-NEXT: andi a1, a1, 24 +; RV64I-NEXT: or a5, a7, a5 +; RV64I-NEXT: or a7, t2, t0 +; RV64I-NEXT: or t0, s4, s2 +; RV64I-NEXT: or t1, t1, s5 +; RV64I-NEXT: or a3, a6, a3 +; RV64I-NEXT: or a4, s0, a4 +; RV64I-NEXT: or a6, t5, t4 +; RV64I-NEXT: or a0, a0, t6 +; RV64I-NEXT: add t3, t3, a1 ; RV64I-NEXT: slli a7, a7, 32 +; RV64I-NEXT: slli t1, t1, 32 +; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: or a1, a7, a5 +; RV64I-NEXT: or a5, t1, t0 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: or a1, a6, a5 -; RV64I-NEXT: or a4, a7, s0 -; RV64I-NEXT: or a0, a0, t0 -; RV64I-NEXT: sd a3, 0(sp) -; RV64I-NEXT: sd a1, 8(sp) -; RV64I-NEXT: sd a4, 16(sp) +; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: sd a1, 0(sp) +; RV64I-NEXT: sd a5, 8(sp) +; RV64I-NEXT: sd a3, 16(sp) ; RV64I-NEXT: sd a0, 24(sp) -; RV64I-NEXT: ld a4, 16(s7) -; RV64I-NEXT: ld a0, 8(s7) -; RV64I-NEXT: ld a1, 0(s7) -; RV64I-NEXT: ld a3, 24(s7) +; RV64I-NEXT: ld a4, 16(t3) +; RV64I-NEXT: ld a0, 8(t3) +; RV64I-NEXT: ld a1, 0(t3) +; RV64I-NEXT: ld a3, 24(t3) ; RV64I-NEXT: srli a5, a4, 56 ; RV64I-NEXT: srli a6, a4, 48 ; RV64I-NEXT: srli a7, a4, 40 @@ -3017,25 +3026,25 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV64I-NEXT: srli s5, a1, 48 ; RV64I-NEXT: srli s6, a1, 40 ; RV64I-NEXT: srli s7, a1, 32 +; RV64I-NEXT: srli s8, a1, 24 +; RV64I-NEXT: srli s9, a1, 16 +; RV64I-NEXT: srli s10, a1, 8 +; RV64I-NEXT: srli s11, a0, 56 ; RV64I-NEXT: sb t0, 20(a2) -; RV64I-NEXT: srli t0, a1, 24 ; RV64I-NEXT: sb a7, 21(a2) -; RV64I-NEXT: srli a7, a1, 16 ; RV64I-NEXT: sb a6, 22(a2) -; RV64I-NEXT: srli a6, a1, 8 ; RV64I-NEXT: sb a5, 23(a2) -; RV64I-NEXT: srli a5, a0, 56 +; RV64I-NEXT: srli a5, a0, 48 ; RV64I-NEXT: sb a4, 16(a2) -; RV64I-NEXT: srli a4, a0, 48 ; RV64I-NEXT: 
sb t3, 17(a2) ; RV64I-NEXT: sb t2, 18(a2) ; RV64I-NEXT: sb t1, 19(a2) -; RV64I-NEXT: srli t1, a0, 40 +; RV64I-NEXT: srli a4, a0, 40 ; RV64I-NEXT: sb s0, 28(a2) ; RV64I-NEXT: sb t6, 29(a2) ; RV64I-NEXT: sb t5, 30(a2) ; RV64I-NEXT: sb t4, 31(a2) -; RV64I-NEXT: srli t2, a0, 32 +; RV64I-NEXT: srli a6, a0, 32 ; RV64I-NEXT: sb a3, 24(a2) ; RV64I-NEXT: sb s3, 25(a2) ; RV64I-NEXT: sb s2, 26(a2) @@ -3045,19 +3054,19 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV64I-NEXT: sb s6, 5(a2) ; RV64I-NEXT: sb s5, 6(a2) ; RV64I-NEXT: sb s4, 7(a2) -; RV64I-NEXT: srli t3, a0, 16 +; RV64I-NEXT: srli a7, a0, 16 ; RV64I-NEXT: sb a1, 0(a2) -; RV64I-NEXT: sb a6, 1(a2) -; RV64I-NEXT: sb a7, 2(a2) -; RV64I-NEXT: sb t0, 3(a2) +; RV64I-NEXT: sb s10, 1(a2) +; RV64I-NEXT: sb s9, 2(a2) +; RV64I-NEXT: sb s8, 3(a2) ; RV64I-NEXT: srli a1, a0, 8 -; RV64I-NEXT: sb t2, 12(a2) -; RV64I-NEXT: sb t1, 13(a2) -; RV64I-NEXT: sb a4, 14(a2) -; RV64I-NEXT: sb a5, 15(a2) +; RV64I-NEXT: sb a6, 12(a2) +; RV64I-NEXT: sb a4, 13(a2) +; RV64I-NEXT: sb a5, 14(a2) +; RV64I-NEXT: sb s11, 15(a2) ; RV64I-NEXT: sb a0, 8(a2) ; RV64I-NEXT: sb a1, 9(a2) -; RV64I-NEXT: sb t3, 10(a2) +; RV64I-NEXT: sb a7, 10(a2) ; RV64I-NEXT: sb a3, 11(a2) ; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload @@ -3076,128 +3085,132 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; ; RV32I-LABEL: lshr_32bytes_dwordOff: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -112 -; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: lbu a4, 1(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu a7, 4(a0) -; RV32I-NEXT: lbu t0, 5(a0) -; RV32I-NEXT: lbu t1, 6(a0) -; RV32I-NEXT: lbu t2, 7(a0) -; RV32I-NEXT: lbu t3, 8(a0) -; RV32I-NEXT: lbu t4, 9(a0) -; RV32I-NEXT: lbu t5, 10(a0) -; RV32I-NEXT: lbu t6, 11(a0) -; RV32I-NEXT: lbu s2, 12(a0) -; RV32I-NEXT: lbu s3, 13(a0) -; RV32I-NEXT: lbu s4, 14(a0) -; RV32I-NEXT: lbu s5, 15(a0) -; RV32I-NEXT: lbu s6, 16(a0) -; RV32I-NEXT: lbu s7, 17(a0) -; RV32I-NEXT: lbu s8, 18(a0) -; RV32I-NEXT: lbu s9, 19(a0) -; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a6, t2, t1 -; RV32I-NEXT: lbu s10, 20(a0) -; RV32I-NEXT: lbu s11, 21(a0) +; RV32I-NEXT: addi sp, sp, -128 +; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill 
+; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: lbu a5, 0(a0) +; RV32I-NEXT: lbu a7, 1(a0) +; RV32I-NEXT: lbu t0, 2(a0) +; RV32I-NEXT: lbu t1, 3(a0) +; RV32I-NEXT: lbu s2, 4(a0) +; RV32I-NEXT: lbu s4, 5(a0) +; RV32I-NEXT: lbu s5, 6(a0) +; RV32I-NEXT: lbu s6, 7(a0) +; RV32I-NEXT: lbu s3, 8(a0) +; RV32I-NEXT: lbu s9, 9(a0) +; RV32I-NEXT: lbu s10, 10(a0) +; RV32I-NEXT: lbu s11, 11(a0) +; RV32I-NEXT: lbu ra, 12(a0) +; RV32I-NEXT: lbu a1, 13(a0) +; RV32I-NEXT: lbu t4, 14(a0) +; RV32I-NEXT: lbu t6, 15(a0) +; RV32I-NEXT: lbu a4, 16(a0) +; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a6, 17(a0) +; RV32I-NEXT: lbu t2, 18(a0) +; RV32I-NEXT: lbu t3, 19(a0) +; RV32I-NEXT: lbu a4, 20(a0) +; RV32I-NEXT: lbu t5, 21(a0) ; RV32I-NEXT: lbu s0, 22(a0) ; RV32I-NEXT: lbu s1, 23(a0) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: slli t5, t5, 16 -; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: slli s3, s3, 8 -; RV32I-NEXT: slli s4, s4, 16 -; RV32I-NEXT: slli s5, s5, 24 -; RV32I-NEXT: or a7, t4, t3 -; RV32I-NEXT: or t0, t6, t5 -; RV32I-NEXT: or t1, s3, s2 -; RV32I-NEXT: or t2, s5, s4 -; RV32I-NEXT: lbu t3, 24(a0) -; RV32I-NEXT: lbu s2, 25(a0) -; RV32I-NEXT: lbu s3, 26(a0) -; RV32I-NEXT: lbu s4, 27(a0) -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: slli s8, s8, 16 -; RV32I-NEXT: slli s9, s9, 24 -; RV32I-NEXT: slli s11, s11, 8 -; RV32I-NEXT: or t4, s7, s6 -; RV32I-NEXT: or t5, s9, s8 -; RV32I-NEXT: or t6, s11, s10 -; RV32I-NEXT: lbu s5, 28(a0) -; RV32I-NEXT: lbu s6, 29(a0) -; RV32I-NEXT: lbu s7, 30(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli t1, t1, 24 +; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: slli s5, s5, 16 +; RV32I-NEXT: slli s6, s6, 24 +; RV32I-NEXT: or a5, a7, a5 +; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: or t0, s4, s2 +; RV32I-NEXT: or t1, s6, s5 +; RV32I-NEXT: lbu s2, 24(a0) +; RV32I-NEXT: lbu s6, 25(a0) +; RV32I-NEXT: lbu s7, 26(a0) +; RV32I-NEXT: lbu s8, 27(a0) +; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: slli s11, s11, 24 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or s3, s9, s3 +; RV32I-NEXT: or s4, s11, s10 +; RV32I-NEXT: or s5, a1, ra +; RV32I-NEXT: lbu s9, 28(a0) +; RV32I-NEXT: lbu a1, 29(a0) +; RV32I-NEXT: lbu s10, 30(a0) ; RV32I-NEXT: lbu a0, 31(a0) -; RV32I-NEXT: lbu a1, 0(a1) -; RV32I-NEXT: sw zero, 48(sp) -; RV32I-NEXT: sw zero, 52(sp) +; RV32I-NEXT: lbu a3, 0(a3) ; RV32I-NEXT: sw zero, 56(sp) ; RV32I-NEXT: sw zero, 60(sp) -; RV32I-NEXT: sw zero, 32(sp) -; RV32I-NEXT: sw zero, 36(sp) +; RV32I-NEXT: sw zero, 64(sp) +; RV32I-NEXT: sw zero, 68(sp) ; RV32I-NEXT: sw zero, 40(sp) ; RV32I-NEXT: sw zero, 44(sp) +; RV32I-NEXT: sw zero, 48(sp) +; RV32I-NEXT: sw zero, 52(sp) +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: or t4, t6, t4 +; RV32I-NEXT: addi t6, sp, 8 +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t3, t3, 24 +; RV32I-NEXT: slli t5, t5, 8 ; RV32I-NEXT: slli s0, s0, 16 ; RV32I-NEXT: slli s1, s1, 24 -; RV32I-NEXT: or s0, s1, s0 -; RV32I-NEXT: mv s1, sp -; RV32I-NEXT: slli s2, s2, 8 -; RV32I-NEXT: slli s3, s3, 16 -; RV32I-NEXT: slli s4, s4, 24 ; RV32I-NEXT: slli s6, s6, 8 ; RV32I-NEXT: slli s7, s7, 16 +; RV32I-NEXT: slli s8, s8, 24 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: slli s10, s10, 16 ; 
RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: slli a1, a1, 3 -; RV32I-NEXT: or t3, s2, t3 -; RV32I-NEXT: or s2, s4, s3 -; RV32I-NEXT: or s3, s6, s5 -; RV32I-NEXT: or a0, a0, s7 -; RV32I-NEXT: andi a1, a1, 24 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a6, t2, t1 -; RV32I-NEXT: or a7, t5, t4 -; RV32I-NEXT: or t0, s0, t6 -; RV32I-NEXT: or t1, s2, t3 -; RV32I-NEXT: or a0, a0, s3 -; RV32I-NEXT: add s1, s1, a1 -; RV32I-NEXT: sw a7, 16(sp) -; RV32I-NEXT: sw t0, 20(sp) -; RV32I-NEXT: sw t1, 24(sp) -; RV32I-NEXT: sw a0, 28(sp) -; RV32I-NEXT: sw a3, 0(sp) -; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: slli a3, a3, 3 +; RV32I-NEXT: lw s11, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a6, a6, s11 +; RV32I-NEXT: or t2, t3, t2 +; RV32I-NEXT: or a4, t5, a4 +; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: or t3, s6, s2 +; RV32I-NEXT: or t5, s8, s7 +; RV32I-NEXT: or a1, a1, s9 +; RV32I-NEXT: or a0, a0, s10 +; RV32I-NEXT: andi a3, a3, 24 +; RV32I-NEXT: or a5, a7, a5 +; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: or t0, s4, s3 +; RV32I-NEXT: or t1, t4, s5 +; RV32I-NEXT: or a6, t2, a6 +; RV32I-NEXT: or a4, s0, a4 +; RV32I-NEXT: or t2, t5, t3 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: add t6, t6, a3 +; RV32I-NEXT: sw a6, 24(sp) +; RV32I-NEXT: sw a4, 28(sp) +; RV32I-NEXT: sw t2, 32(sp) +; RV32I-NEXT: sw a0, 36(sp) ; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a6, 12(sp) -; RV32I-NEXT: lw a6, 16(s1) -; RV32I-NEXT: lw a5, 20(s1) -; RV32I-NEXT: lw a7, 24(s1) -; RV32I-NEXT: lw a1, 0(s1) -; RV32I-NEXT: lw a0, 4(s1) -; RV32I-NEXT: lw a4, 8(s1) -; RV32I-NEXT: lw a3, 12(s1) -; RV32I-NEXT: lw t0, 28(s1) +; RV32I-NEXT: sw a7, 12(sp) +; RV32I-NEXT: sw t0, 16(sp) +; RV32I-NEXT: sw t1, 20(sp) +; RV32I-NEXT: lw a6, 16(t6) +; RV32I-NEXT: lw a5, 20(t6) +; RV32I-NEXT: lw a7, 24(t6) +; RV32I-NEXT: lw a1, 0(t6) +; RV32I-NEXT: lw a0, 4(t6) +; RV32I-NEXT: lw a4, 8(t6) +; RV32I-NEXT: lw a3, 12(t6) +; RV32I-NEXT: lw t0, 28(t6) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 ; RV32I-NEXT: srli t3, a7, 8 @@ -3212,21 +3225,21 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: srli s5, a5, 8 ; RV32I-NEXT: srli s6, a4, 24 ; RV32I-NEXT: srli s7, a4, 16 +; RV32I-NEXT: srli s8, a4, 8 +; RV32I-NEXT: srli s9, a3, 24 +; RV32I-NEXT: srli s10, a3, 16 +; RV32I-NEXT: srli s11, a3, 8 ; RV32I-NEXT: sb a7, 24(a2) -; RV32I-NEXT: srli a7, a4, 8 +; RV32I-NEXT: srli a7, a1, 24 ; RV32I-NEXT: sb t3, 25(a2) -; RV32I-NEXT: srli t3, a3, 24 ; RV32I-NEXT: sb t2, 26(a2) -; RV32I-NEXT: srli t2, a3, 16 ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli t1, a3, 8 +; RV32I-NEXT: srli t1, a1, 16 ; RV32I-NEXT: sb t0, 28(a2) -; RV32I-NEXT: srli t0, a1, 24 ; RV32I-NEXT: sb t6, 29(a2) -; RV32I-NEXT: srli t6, a1, 16 ; RV32I-NEXT: sb t5, 30(a2) ; RV32I-NEXT: sb t4, 31(a2) -; RV32I-NEXT: srli t4, a1, 8 +; RV32I-NEXT: srli t0, a1, 8 ; RV32I-NEXT: sb a6, 16(a2) ; RV32I-NEXT: sb s2, 17(a2) ; RV32I-NEXT: sb s1, 18(a2) @@ -3238,35 +3251,36 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: sb s3, 23(a2) ; RV32I-NEXT: srli a5, a0, 16 ; RV32I-NEXT: sb a4, 8(a2) -; RV32I-NEXT: sb a7, 9(a2) +; RV32I-NEXT: sb s8, 9(a2) ; RV32I-NEXT: sb s7, 10(a2) ; RV32I-NEXT: sb s6, 11(a2) ; RV32I-NEXT: srli a4, a0, 8 ; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb t1, 13(a2) -; RV32I-NEXT: sb t2, 14(a2) -; RV32I-NEXT: sb t3, 15(a2) +; RV32I-NEXT: sb s11, 13(a2) +; RV32I-NEXT: sb s10, 14(a2) +; RV32I-NEXT: sb s9, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; 
RV32I-NEXT: sb t4, 1(a2) -; RV32I-NEXT: sb t6, 2(a2) -; RV32I-NEXT: sb t0, 3(a2) +; RV32I-NEXT: sb t0, 1(a2) +; RV32I-NEXT: sb t1, 2(a2) +; RV32I-NEXT: sb a7, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) ; RV32I-NEXT: sb a5, 6(a2) ; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 112 +; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 128 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %dwordOff = load i256, ptr %dwordOff.ptr, align 1 @@ -3510,129 +3524,132 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: shl_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -112 -; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: addi sp, sp, -128 +; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu s1, 0(a0) ; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu a7, 4(a0) -; RV32I-NEXT: lbu t0, 5(a0) -; RV32I-NEXT: lbu t1, 6(a0) -; RV32I-NEXT: lbu t2, 7(a0) -; RV32I-NEXT: lbu t3, 8(a0) 
-; RV32I-NEXT: lbu t4, 9(a0) -; RV32I-NEXT: lbu t5, 10(a0) -; RV32I-NEXT: lbu t6, 11(a0) -; RV32I-NEXT: lbu s0, 12(a0) -; RV32I-NEXT: lbu s1, 13(a0) -; RV32I-NEXT: lbu s2, 14(a0) -; RV32I-NEXT: lbu s3, 15(a0) -; RV32I-NEXT: lbu s4, 16(a0) -; RV32I-NEXT: lbu s5, 17(a0) -; RV32I-NEXT: lbu s6, 18(a0) -; RV32I-NEXT: lbu s7, 19(a0) +; RV32I-NEXT: lbu t1, 4(a0) +; RV32I-NEXT: lbu t3, 5(a0) +; RV32I-NEXT: lbu t4, 6(a0) +; RV32I-NEXT: lbu s0, 7(a0) +; RV32I-NEXT: lbu t2, 8(a0) +; RV32I-NEXT: lbu s3, 9(a0) +; RV32I-NEXT: lbu s6, 10(a0) +; RV32I-NEXT: lbu s8, 11(a0) +; RV32I-NEXT: lbu s9, 12(a0) +; RV32I-NEXT: lbu s10, 13(a0) +; RV32I-NEXT: lbu s4, 14(a0) +; RV32I-NEXT: lbu s7, 15(a0) +; RV32I-NEXT: lbu s5, 16(a0) +; RV32I-NEXT: lbu s11, 17(a0) +; RV32I-NEXT: lbu ra, 18(a0) +; RV32I-NEXT: lbu a3, 19(a0) +; RV32I-NEXT: lbu t5, 20(a0) +; RV32I-NEXT: lbu t6, 21(a0) +; RV32I-NEXT: lbu a7, 22(a0) +; RV32I-NEXT: lbu t0, 23(a0) ; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: slli t3, t3, 8 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli s0, s0, 24 +; RV32I-NEXT: or a4, a4, s1 +; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a6, t2, t1 -; RV32I-NEXT: lbu s8, 20(a0) -; RV32I-NEXT: lbu s9, 21(a0) -; RV32I-NEXT: lbu s10, 22(a0) -; RV32I-NEXT: lbu s11, 23(a0) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: slli t5, t5, 16 -; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: slli s1, s1, 8 -; RV32I-NEXT: slli s2, s2, 16 -; RV32I-NEXT: slli s3, s3, 24 -; RV32I-NEXT: or a7, t4, t3 -; RV32I-NEXT: or t0, t6, t5 -; RV32I-NEXT: or t1, s1, s0 -; RV32I-NEXT: or t2, s3, s2 -; RV32I-NEXT: lbu t6, 24(a0) +; RV32I-NEXT: or a5, t3, t1 +; RV32I-NEXT: or a6, s0, t4 +; RV32I-NEXT: lbu t1, 24(a0) ; RV32I-NEXT: lbu s0, 25(a0) ; RV32I-NEXT: lbu s1, 26(a0) ; RV32I-NEXT: lbu s2, 27(a0) -; RV32I-NEXT: slli s5, s5, 8 +; RV32I-NEXT: slli s3, s3, 8 ; RV32I-NEXT: slli s6, s6, 16 -; RV32I-NEXT: slli s7, s7, 24 -; RV32I-NEXT: slli s9, s9, 8 -; RV32I-NEXT: or t3, s5, s4 -; RV32I-NEXT: or t4, s7, s6 -; RV32I-NEXT: or t5, s9, s8 +; RV32I-NEXT: slli s8, s8, 24 +; RV32I-NEXT: slli s10, s10, 8 +; RV32I-NEXT: or t2, s3, t2 +; RV32I-NEXT: or t3, s8, s6 +; RV32I-NEXT: or t4, s10, s9 ; RV32I-NEXT: lbu s3, 28(a0) -; RV32I-NEXT: lbu s4, 29(a0) -; RV32I-NEXT: lbu s5, 30(a0) -; RV32I-NEXT: lbu s6, 31(a0) -; RV32I-NEXT: slli s10, s10, 16 -; RV32I-NEXT: slli s11, s11, 24 -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli s2, s2, 24 -; RV32I-NEXT: or a0, s11, s10 -; RV32I-NEXT: or t6, s0, t6 -; RV32I-NEXT: or s0, s2, s1 -; RV32I-NEXT: lbu s1, 0(a1) -; RV32I-NEXT: lbu s2, 1(a1) -; RV32I-NEXT: lbu s7, 2(a1) +; RV32I-NEXT: lbu s6, 29(a0) +; RV32I-NEXT: lbu s8, 30(a0) +; RV32I-NEXT: lbu s9, 31(a0) +; RV32I-NEXT: slli s4, s4, 16 +; RV32I-NEXT: slli s7, s7, 24 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: slli ra, ra, 16 +; RV32I-NEXT: slli a3, a3, 24 +; RV32I-NEXT: or a0, s7, s4 +; RV32I-NEXT: or s4, s11, s5 +; RV32I-NEXT: or s5, a3, ra +; RV32I-NEXT: lbu a3, 0(a1) +; RV32I-NEXT: lbu s7, 1(a1) +; RV32I-NEXT: lbu s10, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: sw zero, 16(sp) -; RV32I-NEXT: sw zero, 20(sp) ; RV32I-NEXT: sw zero, 24(sp) ; RV32I-NEXT: sw zero, 28(sp) -; RV32I-NEXT: sw zero, 0(sp) -; RV32I-NEXT: sw zero, 4(sp) +; RV32I-NEXT: sw zero, 32(sp) +; RV32I-NEXT: sw zero, 36(sp) 
; RV32I-NEXT: sw zero, 8(sp) ; RV32I-NEXT: sw zero, 12(sp) -; RV32I-NEXT: slli s4, s4, 8 -; RV32I-NEXT: or s3, s4, s3 -; RV32I-NEXT: addi s4, sp, 32 -; RV32I-NEXT: slli s5, s5, 16 -; RV32I-NEXT: slli s6, s6, 24 -; RV32I-NEXT: slli s2, s2, 8 -; RV32I-NEXT: slli s7, s7, 16 +; RV32I-NEXT: sw zero, 16(sp) +; RV32I-NEXT: sw zero, 20(sp) +; RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: or t5, t6, t5 +; RV32I-NEXT: addi t6, sp, 40 +; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: slli s1, s1, 16 +; RV32I-NEXT: slli s2, s2, 24 +; RV32I-NEXT: slli s6, s6, 8 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s10, s10, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or s5, s6, s5 -; RV32I-NEXT: or s1, s2, s1 -; RV32I-NEXT: or a1, a1, s7 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a6, t2, t1 -; RV32I-NEXT: or a7, t4, t3 -; RV32I-NEXT: or a0, a0, t5 -; RV32I-NEXT: or t0, s0, t6 -; RV32I-NEXT: or t1, s5, s3 -; RV32I-NEXT: or a1, a1, s1 -; RV32I-NEXT: sw a7, 48(sp) +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: or t0, s0, t1 +; RV32I-NEXT: or t1, s2, s1 +; RV32I-NEXT: or s0, s6, s3 +; RV32I-NEXT: or s1, s9, s8 +; RV32I-NEXT: or a3, s7, a3 +; RV32I-NEXT: or a1, a1, s10 +; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a4, a4, s2 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a6, t3, t2 +; RV32I-NEXT: or a0, a0, t4 +; RV32I-NEXT: or t2, s5, s4 +; RV32I-NEXT: or a7, a7, t5 +; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: sw t2, 56(sp) +; RV32I-NEXT: sw a7, 60(sp) +; RV32I-NEXT: sw t0, 64(sp) +; RV32I-NEXT: sw s0, 68(sp) +; RV32I-NEXT: sw a4, 40(sp) +; RV32I-NEXT: sw a5, 44(sp) +; RV32I-NEXT: sw a6, 48(sp) ; RV32I-NEXT: sw a0, 52(sp) -; RV32I-NEXT: sw t0, 56(sp) -; RV32I-NEXT: sw t1, 60(sp) -; RV32I-NEXT: sw a3, 32(sp) -; RV32I-NEXT: sw a4, 36(sp) -; RV32I-NEXT: sw a5, 40(sp) -; RV32I-NEXT: sw a6, 44(sp) ; RV32I-NEXT: slli a3, a1, 3 ; RV32I-NEXT: andi a1, a1, 28 -; RV32I-NEXT: sub a1, s4, a1 +; RV32I-NEXT: sub a1, t6, a1 ; RV32I-NEXT: andi a0, a3, 24 ; RV32I-NEXT: xori a0, a0, 31 ; RV32I-NEXT: lw a4, 0(a1) @@ -3647,10 +3664,10 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli t4, a4, 1 ; RV32I-NEXT: sll t5, a7, a3 ; RV32I-NEXT: srli t6, a6, 1 -; RV32I-NEXT: sll a6, a6, a3 +; RV32I-NEXT: sll s0, a6, a3 ; RV32I-NEXT: srli a5, a5, 1 -; RV32I-NEXT: sll s0, t1, a3 -; RV32I-NEXT: srli s1, t0, 1 +; RV32I-NEXT: sll s1, t1, a3 +; RV32I-NEXT: srli a6, t0, 1 ; RV32I-NEXT: sll s2, t0, a3 ; RV32I-NEXT: srli a7, a7, 1 ; RV32I-NEXT: sll s3, a1, a3 @@ -3658,56 +3675,56 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sll s4, t2, a3 ; RV32I-NEXT: srli t0, t1, 1 ; RV32I-NEXT: sll s5, a4, a3 -; RV32I-NEXT: srl t4, t4, a0 -; RV32I-NEXT: srl a4, t6, a0 -; RV32I-NEXT: srl t1, a5, a0 -; RV32I-NEXT: srl t6, s1, a0 -; RV32I-NEXT: srl s1, a7, a0 -; RV32I-NEXT: srl s6, a1, a0 -; RV32I-NEXT: srl s7, t0, a0 -; RV32I-NEXT: srli t2, s4, 24 -; RV32I-NEXT: srli t0, s3, 24 +; RV32I-NEXT: srl t2, t4, a0 +; RV32I-NEXT: srl t4, t6, a0 +; RV32I-NEXT: srl t6, a5, a0 +; RV32I-NEXT: srl s6, a6, a0 +; RV32I-NEXT: srl s7, a7, a0 +; RV32I-NEXT: srl s8, a1, a0 +; RV32I-NEXT: srl s9, t0, a0 +; RV32I-NEXT: srli t1, s4, 24 +; RV32I-NEXT: srli a7, s3, 24 ; RV32I-NEXT: srli a5, s2, 24 -; RV32I-NEXT: srli a3, s0, 24 -; 
RV32I-NEXT: srli a1, a6, 24 +; RV32I-NEXT: srli a3, s1, 24 +; RV32I-NEXT: srli a1, s0, 24 ; RV32I-NEXT: srli a0, t5, 24 -; RV32I-NEXT: srli s8, s5, 24 -; RV32I-NEXT: or a4, t5, a4 -; RV32I-NEXT: srli t5, s5, 16 -; RV32I-NEXT: or t1, a6, t1 -; RV32I-NEXT: srli s9, s5, 8 -; RV32I-NEXT: or a7, t3, t4 -; RV32I-NEXT: srli a6, t3, 24 -; RV32I-NEXT: or t3, s0, t6 -; RV32I-NEXT: or t4, s2, s1 -; RV32I-NEXT: or t6, s3, s6 -; RV32I-NEXT: or s0, s4, s7 +; RV32I-NEXT: srli s10, s5, 24 +; RV32I-NEXT: srli s11, s5, 16 +; RV32I-NEXT: srli ra, s5, 8 +; RV32I-NEXT: srli a4, t3, 24 +; RV32I-NEXT: or a6, t3, t2 +; RV32I-NEXT: or t0, t5, t4 +; RV32I-NEXT: or t2, s0, t6 +; RV32I-NEXT: or t3, s1, s6 +; RV32I-NEXT: or t4, s2, s7 +; RV32I-NEXT: or t5, s3, s8 +; RV32I-NEXT: or t6, s4, s9 ; RV32I-NEXT: sb s5, 0(a2) -; RV32I-NEXT: sb s9, 1(a2) -; RV32I-NEXT: sb t5, 2(a2) -; RV32I-NEXT: sb s8, 3(a2) -; RV32I-NEXT: srli t5, s0, 16 -; RV32I-NEXT: srli s1, s0, 8 -; RV32I-NEXT: srli s2, t6, 16 -; RV32I-NEXT: srli s3, t6, 8 +; RV32I-NEXT: sb ra, 1(a2) +; RV32I-NEXT: sb s11, 2(a2) +; RV32I-NEXT: sb s10, 3(a2) +; RV32I-NEXT: srli s0, t6, 16 +; RV32I-NEXT: srli s1, t6, 8 +; RV32I-NEXT: srli s2, t5, 16 +; RV32I-NEXT: srli s3, t5, 8 ; RV32I-NEXT: srli s4, t4, 16 ; RV32I-NEXT: srli s5, t4, 8 ; RV32I-NEXT: srli s6, t3, 16 ; RV32I-NEXT: srli s7, t3, 8 -; RV32I-NEXT: sb s0, 24(a2) -; RV32I-NEXT: srli s0, t1, 16 +; RV32I-NEXT: srli s8, t2, 16 +; RV32I-NEXT: srli s9, t2, 8 +; RV32I-NEXT: srli s10, t0, 16 +; RV32I-NEXT: srli s11, t0, 8 +; RV32I-NEXT: sb t6, 24(a2) ; RV32I-NEXT: sb s1, 25(a2) -; RV32I-NEXT: srli s1, t1, 8 -; RV32I-NEXT: sb t5, 26(a2) -; RV32I-NEXT: srli t5, a4, 16 -; RV32I-NEXT: sb t2, 27(a2) -; RV32I-NEXT: srli t2, a4, 8 -; RV32I-NEXT: sb t6, 28(a2) -; RV32I-NEXT: srli t6, a7, 16 +; RV32I-NEXT: sb s0, 26(a2) +; RV32I-NEXT: sb t1, 27(a2) +; RV32I-NEXT: srli t1, a6, 16 +; RV32I-NEXT: sb t5, 28(a2) ; RV32I-NEXT: sb s3, 29(a2) ; RV32I-NEXT: sb s2, 30(a2) -; RV32I-NEXT: sb t0, 31(a2) -; RV32I-NEXT: srli t0, a7, 8 +; RV32I-NEXT: sb a7, 31(a2) +; RV32I-NEXT: srli a7, a6, 8 ; RV32I-NEXT: sb t4, 16(a2) ; RV32I-NEXT: sb s5, 17(a2) ; RV32I-NEXT: sb s4, 18(a2) @@ -3716,31 +3733,32 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb s7, 21(a2) ; RV32I-NEXT: sb s6, 22(a2) ; RV32I-NEXT: sb a3, 23(a2) -; RV32I-NEXT: sb t1, 8(a2) -; RV32I-NEXT: sb s1, 9(a2) -; RV32I-NEXT: sb s0, 10(a2) +; RV32I-NEXT: sb t2, 8(a2) +; RV32I-NEXT: sb s9, 9(a2) +; RV32I-NEXT: sb s8, 10(a2) ; RV32I-NEXT: sb a1, 11(a2) -; RV32I-NEXT: sb a4, 12(a2) -; RV32I-NEXT: sb t2, 13(a2) -; RV32I-NEXT: sb t5, 14(a2) +; RV32I-NEXT: sb t0, 12(a2) +; RV32I-NEXT: sb s11, 13(a2) +; RV32I-NEXT: sb s10, 14(a2) ; RV32I-NEXT: sb a0, 15(a2) -; RV32I-NEXT: sb a7, 4(a2) -; RV32I-NEXT: sb t0, 5(a2) -; RV32I-NEXT: sb t6, 6(a2) -; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 112 +; RV32I-NEXT: sb a6, 4(a2) +; 
RV32I-NEXT: sb a7, 5(a2) +; RV32I-NEXT: sb t1, 6(a2) +; RV32I-NEXT: sb a4, 7(a2) +; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 128 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %byteOff = load i256, ptr %byteOff.ptr, align 1 @@ -3985,128 +4003,132 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; ; RV32I-LABEL: shl_32bytes_wordOff: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -112 -; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: lbu a4, 1(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu a7, 4(a0) -; RV32I-NEXT: lbu t0, 5(a0) -; RV32I-NEXT: lbu t1, 6(a0) -; RV32I-NEXT: lbu t2, 7(a0) -; RV32I-NEXT: lbu t3, 8(a0) -; RV32I-NEXT: lbu t4, 9(a0) -; RV32I-NEXT: lbu t5, 10(a0) -; RV32I-NEXT: lbu t6, 11(a0) -; RV32I-NEXT: lbu s2, 12(a0) -; RV32I-NEXT: lbu s3, 13(a0) -; RV32I-NEXT: lbu s4, 14(a0) -; RV32I-NEXT: lbu s5, 15(a0) -; RV32I-NEXT: lbu s6, 16(a0) -; RV32I-NEXT: lbu s7, 17(a0) -; RV32I-NEXT: lbu s8, 18(a0) -; RV32I-NEXT: lbu s9, 19(a0) -; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a6, t2, t1 -; RV32I-NEXT: lbu s10, 20(a0) -; RV32I-NEXT: lbu s11, 21(a0) +; RV32I-NEXT: addi sp, sp, -128 +; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: lbu a5, 0(a0) +; RV32I-NEXT: lbu a7, 1(a0) +; RV32I-NEXT: lbu t0, 2(a0) +; RV32I-NEXT: lbu t1, 3(a0) +; RV32I-NEXT: lbu s2, 4(a0) +; 
RV32I-NEXT: lbu s4, 5(a0) +; RV32I-NEXT: lbu s5, 6(a0) +; RV32I-NEXT: lbu s6, 7(a0) +; RV32I-NEXT: lbu s3, 8(a0) +; RV32I-NEXT: lbu s9, 9(a0) +; RV32I-NEXT: lbu s10, 10(a0) +; RV32I-NEXT: lbu s11, 11(a0) +; RV32I-NEXT: lbu ra, 12(a0) +; RV32I-NEXT: lbu a1, 13(a0) +; RV32I-NEXT: lbu t4, 14(a0) +; RV32I-NEXT: lbu t6, 15(a0) +; RV32I-NEXT: lbu a4, 16(a0) +; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a6, 17(a0) +; RV32I-NEXT: lbu t2, 18(a0) +; RV32I-NEXT: lbu t3, 19(a0) +; RV32I-NEXT: lbu a4, 20(a0) +; RV32I-NEXT: lbu t5, 21(a0) ; RV32I-NEXT: lbu s0, 22(a0) ; RV32I-NEXT: lbu s1, 23(a0) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: slli t5, t5, 16 -; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: slli s3, s3, 8 -; RV32I-NEXT: slli s4, s4, 16 -; RV32I-NEXT: slli s5, s5, 24 -; RV32I-NEXT: or a7, t4, t3 -; RV32I-NEXT: or t0, t6, t5 -; RV32I-NEXT: or t1, s3, s2 -; RV32I-NEXT: or t2, s5, s4 -; RV32I-NEXT: lbu t3, 24(a0) -; RV32I-NEXT: lbu s2, 25(a0) -; RV32I-NEXT: lbu s3, 26(a0) -; RV32I-NEXT: lbu s4, 27(a0) -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: slli s8, s8, 16 -; RV32I-NEXT: slli s9, s9, 24 -; RV32I-NEXT: slli s11, s11, 8 -; RV32I-NEXT: or t4, s7, s6 -; RV32I-NEXT: or t5, s9, s8 -; RV32I-NEXT: or t6, s11, s10 -; RV32I-NEXT: lbu s5, 28(a0) -; RV32I-NEXT: lbu s6, 29(a0) -; RV32I-NEXT: lbu s7, 30(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli t1, t1, 24 +; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: slli s5, s5, 16 +; RV32I-NEXT: slli s6, s6, 24 +; RV32I-NEXT: or a5, a7, a5 +; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: or t0, s4, s2 +; RV32I-NEXT: or t1, s6, s5 +; RV32I-NEXT: lbu s2, 24(a0) +; RV32I-NEXT: lbu s6, 25(a0) +; RV32I-NEXT: lbu s7, 26(a0) +; RV32I-NEXT: lbu s8, 27(a0) +; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: slli s11, s11, 24 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or s3, s9, s3 +; RV32I-NEXT: or s4, s11, s10 +; RV32I-NEXT: or s5, a1, ra +; RV32I-NEXT: lbu s9, 28(a0) +; RV32I-NEXT: lbu a1, 29(a0) +; RV32I-NEXT: lbu s10, 30(a0) ; RV32I-NEXT: lbu a0, 31(a0) -; RV32I-NEXT: lbu a1, 0(a1) -; RV32I-NEXT: sw zero, 16(sp) -; RV32I-NEXT: sw zero, 20(sp) +; RV32I-NEXT: lbu a3, 0(a3) ; RV32I-NEXT: sw zero, 24(sp) ; RV32I-NEXT: sw zero, 28(sp) -; RV32I-NEXT: sw zero, 0(sp) -; RV32I-NEXT: sw zero, 4(sp) +; RV32I-NEXT: sw zero, 32(sp) +; RV32I-NEXT: sw zero, 36(sp) ; RV32I-NEXT: sw zero, 8(sp) ; RV32I-NEXT: sw zero, 12(sp) +; RV32I-NEXT: sw zero, 16(sp) +; RV32I-NEXT: sw zero, 20(sp) +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: or t4, t6, t4 +; RV32I-NEXT: addi t6, sp, 40 +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t3, t3, 24 +; RV32I-NEXT: slli t5, t5, 8 ; RV32I-NEXT: slli s0, s0, 16 ; RV32I-NEXT: slli s1, s1, 24 -; RV32I-NEXT: or s0, s1, s0 -; RV32I-NEXT: addi s1, sp, 32 -; RV32I-NEXT: slli s2, s2, 8 -; RV32I-NEXT: slli s3, s3, 16 -; RV32I-NEXT: slli s4, s4, 24 ; RV32I-NEXT: slli s6, s6, 8 ; RV32I-NEXT: slli s7, s7, 16 +; RV32I-NEXT: slli s8, s8, 24 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: slli s10, s10, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: slli a1, a1, 2 -; RV32I-NEXT: or t3, s2, t3 -; RV32I-NEXT: or s2, s4, s3 -; RV32I-NEXT: or s3, s6, s5 -; RV32I-NEXT: or a0, a0, s7 -; RV32I-NEXT: andi a1, a1, 28 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a6, t2, t1 -; RV32I-NEXT: or a7, t5, t4 -; RV32I-NEXT: or t0, s0, t6 -; RV32I-NEXT: or t1, s2, t3 -; RV32I-NEXT: or a0, 
a0, s3 -; RV32I-NEXT: sub s1, s1, a1 -; RV32I-NEXT: sw a7, 48(sp) -; RV32I-NEXT: sw t0, 52(sp) -; RV32I-NEXT: sw t1, 56(sp) -; RV32I-NEXT: sw a0, 60(sp) -; RV32I-NEXT: sw a3, 32(sp) -; RV32I-NEXT: sw a4, 36(sp) +; RV32I-NEXT: slli a3, a3, 2 +; RV32I-NEXT: lw s11, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a6, a6, s11 +; RV32I-NEXT: or t2, t3, t2 +; RV32I-NEXT: or a4, t5, a4 +; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: or t3, s6, s2 +; RV32I-NEXT: or t5, s8, s7 +; RV32I-NEXT: or a1, a1, s9 +; RV32I-NEXT: or a0, a0, s10 +; RV32I-NEXT: andi a3, a3, 28 +; RV32I-NEXT: or a5, a7, a5 +; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: or t0, s4, s3 +; RV32I-NEXT: or t1, t4, s5 +; RV32I-NEXT: or a6, t2, a6 +; RV32I-NEXT: or a4, s0, a4 +; RV32I-NEXT: or t2, t5, t3 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: sub t3, t6, a3 +; RV32I-NEXT: sw a6, 56(sp) +; RV32I-NEXT: sw a4, 60(sp) +; RV32I-NEXT: sw t2, 64(sp) +; RV32I-NEXT: sw a0, 68(sp) ; RV32I-NEXT: sw a5, 40(sp) -; RV32I-NEXT: sw a6, 44(sp) -; RV32I-NEXT: lw a6, 16(s1) -; RV32I-NEXT: lw a5, 20(s1) -; RV32I-NEXT: lw a7, 24(s1) -; RV32I-NEXT: lw a1, 0(s1) -; RV32I-NEXT: lw a0, 4(s1) -; RV32I-NEXT: lw a4, 8(s1) -; RV32I-NEXT: lw a3, 12(s1) -; RV32I-NEXT: lw t0, 28(s1) +; RV32I-NEXT: sw a7, 44(sp) +; RV32I-NEXT: sw t0, 48(sp) +; RV32I-NEXT: sw t1, 52(sp) +; RV32I-NEXT: lw a6, 16(t3) +; RV32I-NEXT: lw a5, 20(t3) +; RV32I-NEXT: lw a7, 24(t3) +; RV32I-NEXT: lw a1, 0(t3) +; RV32I-NEXT: lw a0, 4(t3) +; RV32I-NEXT: lw a4, 8(t3) +; RV32I-NEXT: lw a3, 12(t3) +; RV32I-NEXT: lw t0, 28(t3) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 ; RV32I-NEXT: srli t3, a7, 8 @@ -4121,21 +4143,21 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: srli s5, a5, 8 ; RV32I-NEXT: srli s6, a4, 24 ; RV32I-NEXT: srli s7, a4, 16 +; RV32I-NEXT: srli s8, a4, 8 +; RV32I-NEXT: srli s9, a3, 24 +; RV32I-NEXT: srli s10, a3, 16 +; RV32I-NEXT: srli s11, a3, 8 ; RV32I-NEXT: sb a7, 24(a2) -; RV32I-NEXT: srli a7, a4, 8 +; RV32I-NEXT: srli a7, a1, 24 ; RV32I-NEXT: sb t3, 25(a2) -; RV32I-NEXT: srli t3, a3, 24 ; RV32I-NEXT: sb t2, 26(a2) -; RV32I-NEXT: srli t2, a3, 16 ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli t1, a3, 8 +; RV32I-NEXT: srli t1, a1, 16 ; RV32I-NEXT: sb t0, 28(a2) -; RV32I-NEXT: srli t0, a1, 24 ; RV32I-NEXT: sb t6, 29(a2) -; RV32I-NEXT: srli t6, a1, 16 ; RV32I-NEXT: sb t5, 30(a2) ; RV32I-NEXT: sb t4, 31(a2) -; RV32I-NEXT: srli t4, a1, 8 +; RV32I-NEXT: srli t0, a1, 8 ; RV32I-NEXT: sb a6, 16(a2) ; RV32I-NEXT: sb s2, 17(a2) ; RV32I-NEXT: sb s1, 18(a2) @@ -4147,35 +4169,36 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: sb s3, 23(a2) ; RV32I-NEXT: srli a5, a0, 16 ; RV32I-NEXT: sb a4, 8(a2) -; RV32I-NEXT: sb a7, 9(a2) +; RV32I-NEXT: sb s8, 9(a2) ; RV32I-NEXT: sb s7, 10(a2) ; RV32I-NEXT: sb s6, 11(a2) ; RV32I-NEXT: srli a4, a0, 8 ; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb t1, 13(a2) -; RV32I-NEXT: sb t2, 14(a2) -; RV32I-NEXT: sb t3, 15(a2) +; RV32I-NEXT: sb s11, 13(a2) +; RV32I-NEXT: sb s10, 14(a2) +; RV32I-NEXT: sb s9, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb t4, 1(a2) -; RV32I-NEXT: sb t6, 2(a2) -; RV32I-NEXT: sb t0, 3(a2) +; RV32I-NEXT: sb t0, 1(a2) +; RV32I-NEXT: sb t1, 2(a2) +; RV32I-NEXT: sb a7, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) ; RV32I-NEXT: sb a5, 6(a2) ; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded 
Reload -; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 112 +; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 128 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %wordOff = load i256, ptr %wordOff.ptr, align 1 @@ -4201,111 +4224,111 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: lbu a4, 1(a0) -; RV64I-NEXT: lbu a5, 2(a0) -; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: lbu a7, 4(a0) -; RV64I-NEXT: lbu t0, 5(a0) -; RV64I-NEXT: lbu t1, 6(a0) -; RV64I-NEXT: lbu t2, 7(a0) -; RV64I-NEXT: lbu t3, 8(a0) -; RV64I-NEXT: lbu t4, 9(a0) -; RV64I-NEXT: lbu t5, 10(a0) -; RV64I-NEXT: lbu t6, 11(a0) -; RV64I-NEXT: lbu s0, 12(a0) -; RV64I-NEXT: lbu s1, 13(a0) -; RV64I-NEXT: lbu s2, 14(a0) -; RV64I-NEXT: lbu s3, 15(a0) -; RV64I-NEXT: lbu s4, 16(a0) -; RV64I-NEXT: lbu s5, 17(a0) -; RV64I-NEXT: lbu s6, 18(a0) -; RV64I-NEXT: lbu s7, 19(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: slli t1, t1, 16 -; RV64I-NEXT: slli t2, t2, 24 -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: or a5, t0, a7 -; RV64I-NEXT: or a6, t2, t1 -; RV64I-NEXT: lbu s8, 20(a0) -; RV64I-NEXT: lbu s9, 21(a0) -; RV64I-NEXT: lbu s10, 22(a0) -; RV64I-NEXT: lbu s11, 23(a0) -; RV64I-NEXT: slli t4, t4, 8 -; RV64I-NEXT: slli t5, t5, 16 -; RV64I-NEXT: slli t6, t6, 24 -; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: slli s2, s2, 16 +; RV64I-NEXT: lbu a5, 0(a0) +; RV64I-NEXT: lbu a7, 1(a0) +; RV64I-NEXT: lbu t2, 2(a0) +; RV64I-NEXT: lbu s3, 3(a0) +; RV64I-NEXT: lbu t0, 4(a0) +; RV64I-NEXT: lbu s8, 5(a0) +; RV64I-NEXT: lbu s9, 6(a0) +; RV64I-NEXT: lbu s10, 7(a0) +; RV64I-NEXT: lbu s2, 8(a0) +; RV64I-NEXT: lbu s4, 9(a0) +; RV64I-NEXT: lbu s5, 10(a0) +; RV64I-NEXT: lbu s6, 11(a0) +; RV64I-NEXT: lbu s7, 12(a0) +; RV64I-NEXT: lbu s11, 13(a0) +; RV64I-NEXT: lbu t1, 14(a0) +; RV64I-NEXT: lbu t3, 15(a0) +; RV64I-NEXT: lbu a3, 16(a0) +; RV64I-NEXT: lbu a6, 17(a0) +; RV64I-NEXT: lbu t4, 18(a0) +; RV64I-NEXT: lbu t5, 19(a0) +; RV64I-NEXT: lbu a4, 20(a0) +; RV64I-NEXT: lbu t6, 21(a0) +; RV64I-NEXT: lbu s0, 22(a0) +; RV64I-NEXT: lbu s1, 23(a0) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli s3, 
s3, 24 -; RV64I-NEXT: or a7, t4, t3 -; RV64I-NEXT: or t0, t6, t5 -; RV64I-NEXT: or t1, s1, s0 -; RV64I-NEXT: or t2, s3, s2 -; RV64I-NEXT: lbu t3, 24(a0) -; RV64I-NEXT: lbu t4, 25(a0) -; RV64I-NEXT: lbu t5, 26(a0) -; RV64I-NEXT: lbu t6, 27(a0) -; RV64I-NEXT: slli s5, s5, 8 -; RV64I-NEXT: slli s6, s6, 16 -; RV64I-NEXT: slli s7, s7, 24 -; RV64I-NEXT: slli s9, s9, 8 -; RV64I-NEXT: or s0, s5, s4 -; RV64I-NEXT: or s1, s7, s6 -; RV64I-NEXT: or s2, s9, s8 -; RV64I-NEXT: lbu s3, 28(a0) -; RV64I-NEXT: lbu s4, 29(a0) -; RV64I-NEXT: lbu s5, 30(a0) +; RV64I-NEXT: slli s8, s8, 8 +; RV64I-NEXT: slli s9, s9, 16 +; RV64I-NEXT: slli s10, s10, 24 +; RV64I-NEXT: or a5, a7, a5 +; RV64I-NEXT: or a7, s3, t2 +; RV64I-NEXT: or t0, s8, t0 +; RV64I-NEXT: or t2, s10, s9 +; RV64I-NEXT: lbu s3, 24(a0) +; RV64I-NEXT: lbu s8, 25(a0) +; RV64I-NEXT: lbu s9, 26(a0) +; RV64I-NEXT: lbu s10, 27(a0) +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: slli s6, s6, 24 +; RV64I-NEXT: slli s11, s11, 8 +; RV64I-NEXT: or s2, s4, s2 +; RV64I-NEXT: or s4, s6, s5 +; RV64I-NEXT: or s5, s11, s7 +; RV64I-NEXT: lbu s6, 28(a0) +; RV64I-NEXT: lbu s7, 29(a0) +; RV64I-NEXT: lbu s11, 30(a0) ; RV64I-NEXT: lbu a0, 31(a0) ; RV64I-NEXT: lbu a1, 0(a1) ; RV64I-NEXT: sd zero, 0(sp) ; RV64I-NEXT: sd zero, 8(sp) ; RV64I-NEXT: sd zero, 16(sp) ; RV64I-NEXT: sd zero, 24(sp) -; RV64I-NEXT: slli s10, s10, 16 -; RV64I-NEXT: slli s11, s11, 24 -; RV64I-NEXT: or s6, s11, s10 -; RV64I-NEXT: addi s7, sp, 32 -; RV64I-NEXT: slli t4, t4, 8 -; RV64I-NEXT: slli t5, t5, 16 -; RV64I-NEXT: slli t6, t6, 24 -; RV64I-NEXT: slli s4, s4, 8 -; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t3, t3, 24 +; RV64I-NEXT: or t1, t3, t1 +; RV64I-NEXT: addi t3, sp, 32 +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: slli t4, t4, 16 +; RV64I-NEXT: slli t5, t5, 24 +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: slli s0, s0, 16 +; RV64I-NEXT: slli s1, s1, 24 +; RV64I-NEXT: slli s8, s8, 8 +; RV64I-NEXT: slli s9, s9, 16 +; RV64I-NEXT: slli s10, s10, 24 +; RV64I-NEXT: slli s7, s7, 8 +; RV64I-NEXT: slli s11, s11, 16 ; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: slli a1, a1, 3 -; RV64I-NEXT: or t3, t4, t3 -; RV64I-NEXT: or t4, t6, t5 -; RV64I-NEXT: or t5, s4, s3 -; RV64I-NEXT: or a0, a0, s5 -; RV64I-NEXT: andi a1, a1, 24 -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: or a5, t0, a7 -; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: or a3, a6, a3 +; RV64I-NEXT: or a6, t5, t4 +; RV64I-NEXT: or a4, t6, a4 ; RV64I-NEXT: or s0, s1, s0 -; RV64I-NEXT: or a7, s6, s2 -; RV64I-NEXT: or t0, t4, t3 -; RV64I-NEXT: or a0, a0, t5 -; RV64I-NEXT: sub t1, s7, a1 -; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: or t4, s8, s3 +; RV64I-NEXT: or t5, s10, s9 +; RV64I-NEXT: or t6, s7, s6 +; RV64I-NEXT: or a0, a0, s11 +; RV64I-NEXT: andi a1, a1, 24 +; RV64I-NEXT: or a5, a7, a5 +; RV64I-NEXT: or a7, t2, t0 +; RV64I-NEXT: or t0, s4, s2 +; RV64I-NEXT: or t1, t1, s5 +; RV64I-NEXT: or a3, a6, a3 +; RV64I-NEXT: or a4, s0, a4 +; RV64I-NEXT: or a6, t5, t4 +; RV64I-NEXT: or a0, a0, t6 +; RV64I-NEXT: sub t2, t3, a1 ; RV64I-NEXT: slli a7, a7, 32 +; RV64I-NEXT: slli t1, t1, 32 +; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: or a1, a7, a5 +; RV64I-NEXT: or a5, t1, t0 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: or a1, a6, a5 -; RV64I-NEXT: or a4, a7, s0 -; RV64I-NEXT: or a0, a0, t0 -; RV64I-NEXT: sd a3, 32(sp) -; RV64I-NEXT: sd a1, 40(sp) -; RV64I-NEXT: sd a4, 48(sp) +; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: sd 
a1, 32(sp) +; RV64I-NEXT: sd a5, 40(sp) +; RV64I-NEXT: sd a3, 48(sp) ; RV64I-NEXT: sd a0, 56(sp) -; RV64I-NEXT: ld a4, 16(t1) -; RV64I-NEXT: ld a0, 8(t1) -; RV64I-NEXT: ld a1, 0(t1) -; RV64I-NEXT: ld a3, 24(t1) +; RV64I-NEXT: ld a4, 16(t2) +; RV64I-NEXT: ld a0, 8(t2) +; RV64I-NEXT: ld a1, 0(t2) +; RV64I-NEXT: ld a3, 24(t2) ; RV64I-NEXT: srli a5, a4, 56 ; RV64I-NEXT: srli a6, a4, 48 ; RV64I-NEXT: srli a7, a4, 40 @@ -4324,25 +4347,25 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV64I-NEXT: srli s5, a1, 48 ; RV64I-NEXT: srli s6, a1, 40 ; RV64I-NEXT: srli s7, a1, 32 +; RV64I-NEXT: srli s8, a1, 24 +; RV64I-NEXT: srli s9, a1, 16 +; RV64I-NEXT: srli s10, a1, 8 +; RV64I-NEXT: srli s11, a0, 56 ; RV64I-NEXT: sb t0, 20(a2) -; RV64I-NEXT: srli t0, a1, 24 ; RV64I-NEXT: sb a7, 21(a2) -; RV64I-NEXT: srli a7, a1, 16 ; RV64I-NEXT: sb a6, 22(a2) -; RV64I-NEXT: srli a6, a1, 8 ; RV64I-NEXT: sb a5, 23(a2) -; RV64I-NEXT: srli a5, a0, 56 +; RV64I-NEXT: srli a5, a0, 48 ; RV64I-NEXT: sb a4, 16(a2) -; RV64I-NEXT: srli a4, a0, 48 ; RV64I-NEXT: sb t3, 17(a2) ; RV64I-NEXT: sb t2, 18(a2) ; RV64I-NEXT: sb t1, 19(a2) -; RV64I-NEXT: srli t1, a0, 40 +; RV64I-NEXT: srli a4, a0, 40 ; RV64I-NEXT: sb s0, 28(a2) ; RV64I-NEXT: sb t6, 29(a2) ; RV64I-NEXT: sb t5, 30(a2) ; RV64I-NEXT: sb t4, 31(a2) -; RV64I-NEXT: srli t2, a0, 32 +; RV64I-NEXT: srli a6, a0, 32 ; RV64I-NEXT: sb a3, 24(a2) ; RV64I-NEXT: sb s3, 25(a2) ; RV64I-NEXT: sb s2, 26(a2) @@ -4352,19 +4375,19 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV64I-NEXT: sb s6, 5(a2) ; RV64I-NEXT: sb s5, 6(a2) ; RV64I-NEXT: sb s4, 7(a2) -; RV64I-NEXT: srli t3, a0, 16 +; RV64I-NEXT: srli a7, a0, 16 ; RV64I-NEXT: sb a1, 0(a2) -; RV64I-NEXT: sb a6, 1(a2) -; RV64I-NEXT: sb a7, 2(a2) -; RV64I-NEXT: sb t0, 3(a2) +; RV64I-NEXT: sb s10, 1(a2) +; RV64I-NEXT: sb s9, 2(a2) +; RV64I-NEXT: sb s8, 3(a2) ; RV64I-NEXT: srli a1, a0, 8 -; RV64I-NEXT: sb t2, 12(a2) -; RV64I-NEXT: sb t1, 13(a2) -; RV64I-NEXT: sb a4, 14(a2) -; RV64I-NEXT: sb a5, 15(a2) +; RV64I-NEXT: sb a6, 12(a2) +; RV64I-NEXT: sb a4, 13(a2) +; RV64I-NEXT: sb a5, 14(a2) +; RV64I-NEXT: sb s11, 15(a2) ; RV64I-NEXT: sb a0, 8(a2) ; RV64I-NEXT: sb a1, 9(a2) -; RV64I-NEXT: sb t3, 10(a2) +; RV64I-NEXT: sb a7, 10(a2) ; RV64I-NEXT: sb a3, 11(a2) ; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload @@ -4383,128 +4406,132 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; ; RV32I-LABEL: shl_32bytes_dwordOff: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -112 -; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: lbu a4, 1(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu a7, 4(a0) -; RV32I-NEXT: lbu t0, 5(a0) -; RV32I-NEXT: lbu t1, 6(a0) -; RV32I-NEXT: lbu t2, 7(a0) -; RV32I-NEXT: lbu t3, 8(a0) -; RV32I-NEXT: lbu t4, 9(a0) -; RV32I-NEXT: lbu t5, 10(a0) -; 
RV32I-NEXT: lbu t6, 11(a0) -; RV32I-NEXT: lbu s2, 12(a0) -; RV32I-NEXT: lbu s3, 13(a0) -; RV32I-NEXT: lbu s4, 14(a0) -; RV32I-NEXT: lbu s5, 15(a0) -; RV32I-NEXT: lbu s6, 16(a0) -; RV32I-NEXT: lbu s7, 17(a0) -; RV32I-NEXT: lbu s8, 18(a0) -; RV32I-NEXT: lbu s9, 19(a0) -; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a6, t2, t1 -; RV32I-NEXT: lbu s10, 20(a0) -; RV32I-NEXT: lbu s11, 21(a0) +; RV32I-NEXT: addi sp, sp, -128 +; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: lbu a5, 0(a0) +; RV32I-NEXT: lbu a7, 1(a0) +; RV32I-NEXT: lbu t0, 2(a0) +; RV32I-NEXT: lbu t1, 3(a0) +; RV32I-NEXT: lbu s2, 4(a0) +; RV32I-NEXT: lbu s4, 5(a0) +; RV32I-NEXT: lbu s5, 6(a0) +; RV32I-NEXT: lbu s6, 7(a0) +; RV32I-NEXT: lbu s3, 8(a0) +; RV32I-NEXT: lbu s9, 9(a0) +; RV32I-NEXT: lbu s10, 10(a0) +; RV32I-NEXT: lbu s11, 11(a0) +; RV32I-NEXT: lbu ra, 12(a0) +; RV32I-NEXT: lbu a1, 13(a0) +; RV32I-NEXT: lbu t4, 14(a0) +; RV32I-NEXT: lbu t6, 15(a0) +; RV32I-NEXT: lbu a4, 16(a0) +; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a6, 17(a0) +; RV32I-NEXT: lbu t2, 18(a0) +; RV32I-NEXT: lbu t3, 19(a0) +; RV32I-NEXT: lbu a4, 20(a0) +; RV32I-NEXT: lbu t5, 21(a0) ; RV32I-NEXT: lbu s0, 22(a0) ; RV32I-NEXT: lbu s1, 23(a0) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: slli t5, t5, 16 -; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: slli s3, s3, 8 -; RV32I-NEXT: slli s4, s4, 16 -; RV32I-NEXT: slli s5, s5, 24 -; RV32I-NEXT: or a7, t4, t3 -; RV32I-NEXT: or t0, t6, t5 -; RV32I-NEXT: or t1, s3, s2 -; RV32I-NEXT: or t2, s5, s4 -; RV32I-NEXT: lbu t3, 24(a0) -; RV32I-NEXT: lbu s2, 25(a0) -; RV32I-NEXT: lbu s3, 26(a0) -; RV32I-NEXT: lbu s4, 27(a0) -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: slli s8, s8, 16 -; RV32I-NEXT: slli s9, s9, 24 -; RV32I-NEXT: slli s11, s11, 8 -; RV32I-NEXT: or t4, s7, s6 -; RV32I-NEXT: or t5, s9, s8 -; RV32I-NEXT: or t6, s11, s10 -; RV32I-NEXT: lbu s5, 28(a0) -; RV32I-NEXT: lbu s6, 29(a0) -; RV32I-NEXT: lbu s7, 30(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli t1, t1, 24 +; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: slli s5, s5, 16 +; RV32I-NEXT: slli s6, s6, 24 +; RV32I-NEXT: or a5, a7, a5 +; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: or t0, s4, s2 +; RV32I-NEXT: or t1, s6, s5 +; RV32I-NEXT: lbu s2, 24(a0) +; RV32I-NEXT: lbu s6, 25(a0) +; RV32I-NEXT: lbu s7, 26(a0) +; RV32I-NEXT: lbu s8, 27(a0) +; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: slli s11, s11, 24 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or s3, s9, s3 +; RV32I-NEXT: or s4, s11, s10 +; RV32I-NEXT: or s5, a1, ra +; RV32I-NEXT: lbu s9, 28(a0) +; RV32I-NEXT: lbu a1, 29(a0) +; RV32I-NEXT: 
lbu s10, 30(a0) ; RV32I-NEXT: lbu a0, 31(a0) -; RV32I-NEXT: lbu a1, 0(a1) -; RV32I-NEXT: sw zero, 16(sp) -; RV32I-NEXT: sw zero, 20(sp) +; RV32I-NEXT: lbu a3, 0(a3) ; RV32I-NEXT: sw zero, 24(sp) ; RV32I-NEXT: sw zero, 28(sp) -; RV32I-NEXT: sw zero, 0(sp) -; RV32I-NEXT: sw zero, 4(sp) +; RV32I-NEXT: sw zero, 32(sp) +; RV32I-NEXT: sw zero, 36(sp) ; RV32I-NEXT: sw zero, 8(sp) ; RV32I-NEXT: sw zero, 12(sp) +; RV32I-NEXT: sw zero, 16(sp) +; RV32I-NEXT: sw zero, 20(sp) +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: or t4, t6, t4 +; RV32I-NEXT: addi t6, sp, 40 +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t3, t3, 24 +; RV32I-NEXT: slli t5, t5, 8 ; RV32I-NEXT: slli s0, s0, 16 ; RV32I-NEXT: slli s1, s1, 24 -; RV32I-NEXT: or s0, s1, s0 -; RV32I-NEXT: addi s1, sp, 32 -; RV32I-NEXT: slli s2, s2, 8 -; RV32I-NEXT: slli s3, s3, 16 -; RV32I-NEXT: slli s4, s4, 24 ; RV32I-NEXT: slli s6, s6, 8 ; RV32I-NEXT: slli s7, s7, 16 +; RV32I-NEXT: slli s8, s8, 24 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: slli s10, s10, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: slli a1, a1, 3 -; RV32I-NEXT: or t3, s2, t3 -; RV32I-NEXT: or s2, s4, s3 -; RV32I-NEXT: or s3, s6, s5 -; RV32I-NEXT: or a0, a0, s7 -; RV32I-NEXT: andi a1, a1, 24 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a6, t2, t1 -; RV32I-NEXT: or a7, t5, t4 -; RV32I-NEXT: or t0, s0, t6 -; RV32I-NEXT: or t1, s2, t3 -; RV32I-NEXT: or a0, a0, s3 -; RV32I-NEXT: sub s1, s1, a1 -; RV32I-NEXT: sw a7, 48(sp) -; RV32I-NEXT: sw t0, 52(sp) -; RV32I-NEXT: sw t1, 56(sp) -; RV32I-NEXT: sw a0, 60(sp) -; RV32I-NEXT: sw a3, 32(sp) -; RV32I-NEXT: sw a4, 36(sp) +; RV32I-NEXT: slli a3, a3, 3 +; RV32I-NEXT: lw s11, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a6, a6, s11 +; RV32I-NEXT: or t2, t3, t2 +; RV32I-NEXT: or a4, t5, a4 +; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: or t3, s6, s2 +; RV32I-NEXT: or t5, s8, s7 +; RV32I-NEXT: or a1, a1, s9 +; RV32I-NEXT: or a0, a0, s10 +; RV32I-NEXT: andi a3, a3, 24 +; RV32I-NEXT: or a5, a7, a5 +; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: or t0, s4, s3 +; RV32I-NEXT: or t1, t4, s5 +; RV32I-NEXT: or a6, t2, a6 +; RV32I-NEXT: or a4, s0, a4 +; RV32I-NEXT: or t2, t5, t3 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: sub t3, t6, a3 +; RV32I-NEXT: sw a6, 56(sp) +; RV32I-NEXT: sw a4, 60(sp) +; RV32I-NEXT: sw t2, 64(sp) +; RV32I-NEXT: sw a0, 68(sp) ; RV32I-NEXT: sw a5, 40(sp) -; RV32I-NEXT: sw a6, 44(sp) -; RV32I-NEXT: lw a6, 16(s1) -; RV32I-NEXT: lw a5, 20(s1) -; RV32I-NEXT: lw a7, 24(s1) -; RV32I-NEXT: lw a1, 0(s1) -; RV32I-NEXT: lw a0, 4(s1) -; RV32I-NEXT: lw a4, 8(s1) -; RV32I-NEXT: lw a3, 12(s1) -; RV32I-NEXT: lw t0, 28(s1) +; RV32I-NEXT: sw a7, 44(sp) +; RV32I-NEXT: sw t0, 48(sp) +; RV32I-NEXT: sw t1, 52(sp) +; RV32I-NEXT: lw a6, 16(t3) +; RV32I-NEXT: lw a5, 20(t3) +; RV32I-NEXT: lw a7, 24(t3) +; RV32I-NEXT: lw a1, 0(t3) +; RV32I-NEXT: lw a0, 4(t3) +; RV32I-NEXT: lw a4, 8(t3) +; RV32I-NEXT: lw a3, 12(t3) +; RV32I-NEXT: lw t0, 28(t3) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 ; RV32I-NEXT: srli t3, a7, 8 @@ -4519,21 +4546,21 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV32I-NEXT: srli s5, a5, 8 ; RV32I-NEXT: srli s6, a4, 24 ; RV32I-NEXT: srli s7, a4, 16 +; RV32I-NEXT: srli s8, a4, 8 +; RV32I-NEXT: srli s9, a3, 24 +; RV32I-NEXT: srli s10, a3, 16 +; RV32I-NEXT: srli s11, a3, 8 ; RV32I-NEXT: sb a7, 24(a2) -; RV32I-NEXT: srli a7, a4, 8 +; RV32I-NEXT: srli a7, a1, 24 ; 
RV32I-NEXT: sb t3, 25(a2) -; RV32I-NEXT: srli t3, a3, 24 ; RV32I-NEXT: sb t2, 26(a2) -; RV32I-NEXT: srli t2, a3, 16 ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli t1, a3, 8 +; RV32I-NEXT: srli t1, a1, 16 ; RV32I-NEXT: sb t0, 28(a2) -; RV32I-NEXT: srli t0, a1, 24 ; RV32I-NEXT: sb t6, 29(a2) -; RV32I-NEXT: srli t6, a1, 16 ; RV32I-NEXT: sb t5, 30(a2) ; RV32I-NEXT: sb t4, 31(a2) -; RV32I-NEXT: srli t4, a1, 8 +; RV32I-NEXT: srli t0, a1, 8 ; RV32I-NEXT: sb a6, 16(a2) ; RV32I-NEXT: sb s2, 17(a2) ; RV32I-NEXT: sb s1, 18(a2) @@ -4545,35 +4572,36 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV32I-NEXT: sb s3, 23(a2) ; RV32I-NEXT: srli a5, a0, 16 ; RV32I-NEXT: sb a4, 8(a2) -; RV32I-NEXT: sb a7, 9(a2) +; RV32I-NEXT: sb s8, 9(a2) ; RV32I-NEXT: sb s7, 10(a2) ; RV32I-NEXT: sb s6, 11(a2) ; RV32I-NEXT: srli a4, a0, 8 ; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb t1, 13(a2) -; RV32I-NEXT: sb t2, 14(a2) -; RV32I-NEXT: sb t3, 15(a2) +; RV32I-NEXT: sb s11, 13(a2) +; RV32I-NEXT: sb s10, 14(a2) +; RV32I-NEXT: sb s9, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb t4, 1(a2) -; RV32I-NEXT: sb t6, 2(a2) -; RV32I-NEXT: sb t0, 3(a2) +; RV32I-NEXT: sb t0, 1(a2) +; RV32I-NEXT: sb t1, 2(a2) +; RV32I-NEXT: sb a7, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) ; RV32I-NEXT: sb a5, 6(a2) ; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 112 +; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 128 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %dwordOff = load i256, ptr %dwordOff.ptr, align 1 @@ -4818,137 +4846,140 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: ashr_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -112 -; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill -; 
RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: addi sp, sp, -128 +; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu t6, 0(a0) ; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu a7, 4(a0) -; RV32I-NEXT: lbu t0, 5(a0) -; RV32I-NEXT: lbu t1, 6(a0) -; RV32I-NEXT: lbu t2, 7(a0) -; RV32I-NEXT: lbu t3, 8(a0) -; RV32I-NEXT: lbu t4, 9(a0) -; RV32I-NEXT: lbu t5, 10(a0) -; RV32I-NEXT: lbu t6, 11(a0) -; RV32I-NEXT: lbu s0, 12(a0) -; RV32I-NEXT: lbu s1, 13(a0) -; RV32I-NEXT: lbu s2, 14(a0) -; RV32I-NEXT: lbu s3, 15(a0) -; RV32I-NEXT: lbu s4, 16(a0) -; RV32I-NEXT: lbu s5, 17(a0) -; RV32I-NEXT: lbu s6, 18(a0) -; RV32I-NEXT: lbu s7, 19(a0) +; RV32I-NEXT: lbu t1, 4(a0) +; RV32I-NEXT: lbu t3, 5(a0) +; RV32I-NEXT: lbu t4, 6(a0) +; RV32I-NEXT: lbu t5, 7(a0) +; RV32I-NEXT: lbu t2, 8(a0) +; RV32I-NEXT: lbu s1, 9(a0) +; RV32I-NEXT: lbu s7, 10(a0) +; RV32I-NEXT: lbu s8, 11(a0) +; RV32I-NEXT: lbu s9, 12(a0) +; RV32I-NEXT: lbu s10, 13(a0) +; RV32I-NEXT: lbu s4, 14(a0) +; RV32I-NEXT: lbu s6, 15(a0) +; RV32I-NEXT: lbu s5, 16(a0) +; RV32I-NEXT: lbu s11, 17(a0) +; RV32I-NEXT: lbu ra, 18(a0) +; RV32I-NEXT: lbu a3, 19(a0) +; RV32I-NEXT: lbu s2, 20(a0) +; RV32I-NEXT: lbu s3, 21(a0) +; RV32I-NEXT: lbu a7, 22(a0) +; RV32I-NEXT: lbu t0, 23(a0) ; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: slli t3, t3, 8 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli t5, t5, 24 +; RV32I-NEXT: or a4, a4, t6 +; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a6, t2, t1 -; RV32I-NEXT: lbu s8, 20(a0) -; RV32I-NEXT: lbu s9, 21(a0) -; RV32I-NEXT: lbu s10, 22(a0) -; RV32I-NEXT: lbu s11, 23(a0) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: slli t5, t5, 16 -; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: or a5, t3, t1 +; RV32I-NEXT: or a6, t5, t4 +; RV32I-NEXT: lbu t1, 24(a0) +; RV32I-NEXT: lbu t5, 25(a0) +; RV32I-NEXT: lbu t6, 26(a0) +; RV32I-NEXT: lbu s0, 27(a0) ; RV32I-NEXT: slli s1, s1, 8 -; RV32I-NEXT: slli s2, s2, 16 -; RV32I-NEXT: slli s3, s3, 24 -; RV32I-NEXT: or a7, t4, t3 -; RV32I-NEXT: or t0, t6, t5 -; RV32I-NEXT: or t1, s1, s0 -; RV32I-NEXT: or t2, s3, s2 -; RV32I-NEXT: lbu t6, 24(a0) -; RV32I-NEXT: lbu s0, 25(a0) -; RV32I-NEXT: lbu s1, 26(a0) -; RV32I-NEXT: lbu s2, 27(a0) -; RV32I-NEXT: slli s5, s5, 8 -; RV32I-NEXT: slli s6, s6, 16 -; RV32I-NEXT: slli s7, s7, 24 -; RV32I-NEXT: slli s9, s9, 8 -; RV32I-NEXT: or t3, s5, s4 -; RV32I-NEXT: or t4, s7, s6 -; RV32I-NEXT: or t5, s9, s8 -; RV32I-NEXT: lbu s3, 28(a0) -; RV32I-NEXT: lbu s4, 29(a0) -; RV32I-NEXT: lbu s5, 30(a0) -; RV32I-NEXT: lbu a0, 31(a0) -; RV32I-NEXT: slli 
s10, s10, 16 -; RV32I-NEXT: slli s11, s11, 24 -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli s2, s2, 24 -; RV32I-NEXT: or s6, s11, s10 -; RV32I-NEXT: or t6, s0, t6 -; RV32I-NEXT: or s0, s2, s1 -; RV32I-NEXT: lbu s1, 0(a1) -; RV32I-NEXT: lbu s2, 1(a1) -; RV32I-NEXT: lbu s7, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli s4, s4, 8 -; RV32I-NEXT: or s3, s4, s3 -; RV32I-NEXT: mv s4, sp -; RV32I-NEXT: slli s5, s5, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: slli s2, s2, 8 ; RV32I-NEXT: slli s7, s7, 16 +; RV32I-NEXT: slli s8, s8, 24 +; RV32I-NEXT: slli s10, s10, 8 +; RV32I-NEXT: or t2, s1, t2 +; RV32I-NEXT: or t3, s8, s7 +; RV32I-NEXT: or t4, s10, s9 +; RV32I-NEXT: lbu s1, 28(a0) +; RV32I-NEXT: lbu s7, 29(a0) +; RV32I-NEXT: lbu s8, 30(a0) +; RV32I-NEXT: lbu s9, 31(a0) +; RV32I-NEXT: slli s4, s4, 16 +; RV32I-NEXT: slli s6, s6, 24 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: slli ra, ra, 16 +; RV32I-NEXT: slli a3, a3, 24 +; RV32I-NEXT: or a0, s6, s4 +; RV32I-NEXT: or s4, s11, s5 +; RV32I-NEXT: or s5, a3, ra +; RV32I-NEXT: lbu a3, 0(a1) +; RV32I-NEXT: lbu s6, 1(a1) +; RV32I-NEXT: lbu s10, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: or s2, s3, s2 +; RV32I-NEXT: addi s3, sp, 8 +; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: slli t5, t5, 8 +; RV32I-NEXT: slli t6, t6, 16 +; RV32I-NEXT: slli s0, s0, 24 +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: slli s6, s6, 8 +; RV32I-NEXT: slli s10, s10, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or s5, a0, s5 -; RV32I-NEXT: or s1, s2, s1 -; RV32I-NEXT: or a1, a1, s7 -; RV32I-NEXT: srai a0, a0, 31 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a6, t2, t1 -; RV32I-NEXT: or a7, t4, t3 -; RV32I-NEXT: or t0, s6, t5 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: or t0, t5, t1 ; RV32I-NEXT: or t1, s0, t6 -; RV32I-NEXT: or t2, s5, s3 -; RV32I-NEXT: or a1, a1, s1 -; RV32I-NEXT: sw a0, 48(sp) -; RV32I-NEXT: sw a0, 52(sp) -; RV32I-NEXT: sw a0, 56(sp) -; RV32I-NEXT: sw a0, 60(sp) -; RV32I-NEXT: sw a0, 32(sp) -; RV32I-NEXT: sw a0, 36(sp) -; RV32I-NEXT: sw a0, 40(sp) -; RV32I-NEXT: sw a0, 44(sp) -; RV32I-NEXT: sw a7, 16(sp) -; RV32I-NEXT: sw t0, 20(sp) -; RV32I-NEXT: sw t1, 24(sp) -; RV32I-NEXT: sw t2, 28(sp) -; RV32I-NEXT: sw a3, 0(sp) -; RV32I-NEXT: sw a4, 4(sp) -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a6, 12(sp) +; RV32I-NEXT: or t5, s7, s1 +; RV32I-NEXT: or t6, s9, s8 +; RV32I-NEXT: or a3, s6, a3 +; RV32I-NEXT: or a1, a1, s10 +; RV32I-NEXT: srai s0, s9, 31 +; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a4, a4, s1 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a6, t3, t2 +; RV32I-NEXT: or a0, a0, t4 +; RV32I-NEXT: or t2, s5, s4 +; RV32I-NEXT: or a7, a7, s2 +; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: or t1, t6, t5 +; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: sw s0, 56(sp) +; RV32I-NEXT: sw s0, 60(sp) +; RV32I-NEXT: sw s0, 64(sp) +; RV32I-NEXT: sw s0, 68(sp) +; RV32I-NEXT: sw s0, 40(sp) +; RV32I-NEXT: sw s0, 44(sp) +; RV32I-NEXT: sw s0, 48(sp) +; RV32I-NEXT: sw s0, 52(sp) +; RV32I-NEXT: sw t2, 24(sp) +; RV32I-NEXT: sw a7, 28(sp) +; RV32I-NEXT: sw t0, 32(sp) +; RV32I-NEXT: sw t1, 36(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a6, 16(sp) +; RV32I-NEXT: sw a0, 20(sp) ; RV32I-NEXT: slli t1, a1, 3 ; RV32I-NEXT: andi a1, a1, 28 -; RV32I-NEXT: add a1, s4, a1 +; RV32I-NEXT: add a1, s3, a1 ; 
RV32I-NEXT: andi a0, t1, 24 -; RV32I-NEXT: xori a7, a0, 31 +; RV32I-NEXT: xori t0, a0, 31 ; RV32I-NEXT: lw a3, 0(a1) ; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: lw a5, 8(a1) ; RV32I-NEXT: lw a6, 12(a1) -; RV32I-NEXT: lw t0, 16(a1) +; RV32I-NEXT: lw a7, 16(a1) ; RV32I-NEXT: lw t2, 20(a1) ; RV32I-NEXT: lw t3, 24(a1) ; RV32I-NEXT: lw t4, 28(a1) @@ -4957,33 +4988,33 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srl a1, a3, t1 ; RV32I-NEXT: slli t6, a4, 1 ; RV32I-NEXT: srl a3, a6, t1 -; RV32I-NEXT: slli s0, t0, 1 +; RV32I-NEXT: slli s0, a7, 1 ; RV32I-NEXT: srl a4, a5, t1 ; RV32I-NEXT: slli s1, a6, 1 ; RV32I-NEXT: srl a5, t2, t1 ; RV32I-NEXT: slli s2, t3, 1 -; RV32I-NEXT: srl a6, t0, t1 +; RV32I-NEXT: srl a6, a7, t1 ; RV32I-NEXT: slli t2, t2, 1 -; RV32I-NEXT: srl t0, t3, t1 +; RV32I-NEXT: srl a7, t3, t1 ; RV32I-NEXT: slli t3, t4, 1 ; RV32I-NEXT: sra t1, t4, t1 -; RV32I-NEXT: sll t4, t5, a7 -; RV32I-NEXT: sll t5, t6, a7 -; RV32I-NEXT: sll t6, s0, a7 -; RV32I-NEXT: sll s0, s1, a7 -; RV32I-NEXT: sll s1, s2, a7 -; RV32I-NEXT: sll t2, t2, a7 -; RV32I-NEXT: sll t3, t3, a7 +; RV32I-NEXT: sll t4, t5, t0 +; RV32I-NEXT: sll t5, t6, t0 +; RV32I-NEXT: sll t6, s0, t0 +; RV32I-NEXT: sll s0, s1, t0 +; RV32I-NEXT: sll s1, s2, t0 +; RV32I-NEXT: sll t2, t2, t0 +; RV32I-NEXT: sll t3, t3, t0 ; RV32I-NEXT: srli s2, t1, 24 ; RV32I-NEXT: srli s3, t1, 16 ; RV32I-NEXT: srli s4, t1, 8 -; RV32I-NEXT: or a7, a0, t4 +; RV32I-NEXT: or t0, a0, t4 ; RV32I-NEXT: or t4, a1, t5 ; RV32I-NEXT: or t5, a3, t6 ; RV32I-NEXT: or s0, a4, s0 ; RV32I-NEXT: or s1, a5, s1 ; RV32I-NEXT: or t2, a6, t2 -; RV32I-NEXT: or t3, t0, t3 +; RV32I-NEXT: or t3, a7, t3 ; RV32I-NEXT: sb t1, 28(a2) ; RV32I-NEXT: sb s4, 29(a2) ; RV32I-NEXT: sb s3, 30(a2) @@ -5000,23 +5031,23 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli s6, s0, 24 ; RV32I-NEXT: srli s7, s0, 16 ; RV32I-NEXT: srli s0, s0, 8 -; RV32I-NEXT: sb t0, 24(a2) -; RV32I-NEXT: srli t0, t5, 24 -; RV32I-NEXT: sb t3, 25(a2) -; RV32I-NEXT: srli t3, t5, 16 +; RV32I-NEXT: srli s8, t5, 24 +; RV32I-NEXT: srli s9, t5, 16 ; RV32I-NEXT: srli t5, t5, 8 +; RV32I-NEXT: srli s10, t4, 24 +; RV32I-NEXT: srli s11, t4, 16 +; RV32I-NEXT: srli t4, t4, 8 +; RV32I-NEXT: sb a7, 24(a2) +; RV32I-NEXT: sb t3, 25(a2) ; RV32I-NEXT: sb t6, 26(a2) -; RV32I-NEXT: srli t6, t4, 24 ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli t1, t4, 16 -; RV32I-NEXT: srli t4, t4, 8 +; RV32I-NEXT: srli a7, t0, 24 ; RV32I-NEXT: sb a6, 16(a2) -; RV32I-NEXT: srli a6, a7, 24 ; RV32I-NEXT: sb t2, 17(a2) ; RV32I-NEXT: sb s3, 18(a2) ; RV32I-NEXT: sb s2, 19(a2) -; RV32I-NEXT: srli t2, a7, 16 -; RV32I-NEXT: srli a7, a7, 8 +; RV32I-NEXT: srli a6, t0, 16 +; RV32I-NEXT: srli t0, t0, 8 ; RV32I-NEXT: sb a5, 20(a2) ; RV32I-NEXT: sb s1, 21(a2) ; RV32I-NEXT: sb s5, 22(a2) @@ -5027,29 +5058,30 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb s6, 11(a2) ; RV32I-NEXT: sb a3, 12(a2) ; RV32I-NEXT: sb t5, 13(a2) -; RV32I-NEXT: sb t3, 14(a2) -; RV32I-NEXT: sb t0, 15(a2) +; RV32I-NEXT: sb s9, 14(a2) +; RV32I-NEXT: sb s8, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) ; RV32I-NEXT: sb t4, 1(a2) -; RV32I-NEXT: sb t1, 2(a2) -; RV32I-NEXT: sb t6, 3(a2) +; RV32I-NEXT: sb s11, 2(a2) +; RV32I-NEXT: sb s10, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: sb a7, 5(a2) -; RV32I-NEXT: sb t2, 6(a2) -; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw 
s2, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 112 +; RV32I-NEXT: sb t0, 5(a2) +; RV32I-NEXT: sb a6, 6(a2) +; RV32I-NEXT: sb a7, 7(a2) +; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 128 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %byteOff = load i256, ptr %byteOff.ptr, align 1 @@ -5295,129 +5327,130 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; ; RV32I-LABEL: ashr_32bytes_wordOff: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -112 -; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: lbu a4, 1(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu a7, 4(a0) -; RV32I-NEXT: lbu t0, 5(a0) -; RV32I-NEXT: lbu t1, 6(a0) -; RV32I-NEXT: lbu t2, 7(a0) -; RV32I-NEXT: lbu t3, 8(a0) -; RV32I-NEXT: lbu t4, 9(a0) -; RV32I-NEXT: lbu t5, 10(a0) -; RV32I-NEXT: lbu t6, 11(a0) -; RV32I-NEXT: lbu s0, 12(a0) -; RV32I-NEXT: lbu s1, 13(a0) -; RV32I-NEXT: lbu s2, 14(a0) -; RV32I-NEXT: lbu s3, 15(a0) -; RV32I-NEXT: lbu s4, 16(a0) -; RV32I-NEXT: lbu s5, 17(a0) -; RV32I-NEXT: lbu s6, 18(a0) -; RV32I-NEXT: lbu s7, 19(a0) -; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: addi sp, sp, -128 +; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill +; 
RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a5, 0(a0) +; RV32I-NEXT: lbu a6, 1(a0) +; RV32I-NEXT: lbu a7, 2(a0) +; RV32I-NEXT: lbu t1, 3(a0) +; RV32I-NEXT: lbu s0, 4(a0) +; RV32I-NEXT: lbu s2, 5(a0) +; RV32I-NEXT: lbu s3, 6(a0) +; RV32I-NEXT: lbu s6, 7(a0) +; RV32I-NEXT: lbu s1, 8(a0) +; RV32I-NEXT: lbu s7, 9(a0) +; RV32I-NEXT: lbu s8, 10(a0) +; RV32I-NEXT: lbu s9, 11(a0) +; RV32I-NEXT: lbu s10, 12(a0) +; RV32I-NEXT: lbu s11, 13(a0) +; RV32I-NEXT: lbu s4, 14(a0) +; RV32I-NEXT: lbu s5, 15(a0) +; RV32I-NEXT: lbu a3, 16(a0) +; RV32I-NEXT: lbu t0, 17(a0) +; RV32I-NEXT: lbu t2, 18(a0) +; RV32I-NEXT: lbu t3, 19(a0) +; RV32I-NEXT: lbu a4, 20(a0) +; RV32I-NEXT: lbu t4, 21(a0) +; RV32I-NEXT: lbu t5, 22(a0) +; RV32I-NEXT: lbu t6, 23(a0) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t1, t1, 24 +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s3, s3, 16 +; RV32I-NEXT: slli s6, s6, 24 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a6, t1, a7 +; RV32I-NEXT: or a7, s2, s0 +; RV32I-NEXT: or t1, s6, s3 +; RV32I-NEXT: lbu s0, 24(a0) +; RV32I-NEXT: lbu s6, 25(a0) +; RV32I-NEXT: lbu ra, 26(a0) +; RV32I-NEXT: lbu s2, 27(a0) +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: or s1, s7, s1 +; RV32I-NEXT: or s7, s9, s8 +; RV32I-NEXT: or s3, s11, s10 +; RV32I-NEXT: lbu s8, 28(a0) +; RV32I-NEXT: lbu s9, 29(a0) +; RV32I-NEXT: lbu s10, 30(a0) +; RV32I-NEXT: lbu a0, 31(a0) +; RV32I-NEXT: lbu a1, 0(a1) +; RV32I-NEXT: slli s4, s4, 16 +; RV32I-NEXT: slli s5, s5, 24 +; RV32I-NEXT: or s4, s5, s4 +; RV32I-NEXT: addi s5, sp, 8 ; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a6, t2, t1 -; RV32I-NEXT: lbu s8, 20(a0) -; RV32I-NEXT: lbu s9, 21(a0) -; RV32I-NEXT: lbu s10, 22(a0) -; RV32I-NEXT: lbu s11, 23(a0) +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t3, t3, 24 ; RV32I-NEXT: slli t4, t4, 8 ; RV32I-NEXT: slli t5, t5, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: slli s1, s1, 8 -; RV32I-NEXT: slli s2, s2, 16 -; RV32I-NEXT: slli s3, s3, 24 -; RV32I-NEXT: or a7, t4, t3 -; RV32I-NEXT: or t0, t6, t5 -; RV32I-NEXT: or t1, s1, s0 -; RV32I-NEXT: or t2, s3, s2 -; RV32I-NEXT: lbu t3, 24(a0) -; RV32I-NEXT: lbu t5, 25(a0) -; RV32I-NEXT: lbu t6, 26(a0) -; RV32I-NEXT: lbu s0, 27(a0) -; RV32I-NEXT: slli s5, s5, 8 -; RV32I-NEXT: slli s6, s6, 16 -; RV32I-NEXT: slli s7, s7, 24 +; RV32I-NEXT: slli s6, s6, 8 +; RV32I-NEXT: slli ra, ra, 16 +; RV32I-NEXT: slli s2, s2, 24 ; RV32I-NEXT: slli s9, s9, 8 -; RV32I-NEXT: or t4, s5, s4 -; RV32I-NEXT: or s1, s7, s6 -; RV32I-NEXT: or s2, s9, s8 -; RV32I-NEXT: lbu s3, 28(a0) -; RV32I-NEXT: lbu s4, 29(a0) -; RV32I-NEXT: lbu s5, 30(a0) -; RV32I-NEXT: lbu a0, 31(a0) -; RV32I-NEXT: lbu a1, 0(a1) ; RV32I-NEXT: slli s10, s10, 16 -; RV32I-NEXT: slli s11, s11, 24 -; RV32I-NEXT: or s6, s11, s10 -; RV32I-NEXT: mv s7, sp -; RV32I-NEXT: slli t5, t5, 8 -; RV32I-NEXT: slli t6, t6, 16 -; RV32I-NEXT: slli s0, s0, 24 -; RV32I-NEXT: slli s4, s4, 8 -; RV32I-NEXT: slli s5, s5, 16 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: slli a1, a1, 2 -; RV32I-NEXT: or t3, t5, t3 -; RV32I-NEXT: or t5, s0, t6 -; RV32I-NEXT: or t6, s4, s3 -; RV32I-NEXT: or s0, a0, s5 +; RV32I-NEXT: or a3, t0, a3 +; RV32I-NEXT: or t0, t3, t2 +; RV32I-NEXT: or a4, t4, a4 +; RV32I-NEXT: or t2, t6, t5 +; RV32I-NEXT: 
or t3, s6, s0 +; RV32I-NEXT: or t4, s2, ra +; RV32I-NEXT: or t5, s9, s8 +; RV32I-NEXT: or t6, a0, s10 ; RV32I-NEXT: srai a0, a0, 31 ; RV32I-NEXT: andi a1, a1, 28 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a6, t2, t1 -; RV32I-NEXT: or a7, s1, t4 -; RV32I-NEXT: or t0, s6, s2 -; RV32I-NEXT: or t1, t5, t3 -; RV32I-NEXT: or t2, s0, t6 -; RV32I-NEXT: sw a0, 48(sp) -; RV32I-NEXT: sw a0, 52(sp) +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a6, t1, a7 +; RV32I-NEXT: or a7, s7, s1 +; RV32I-NEXT: or t1, s4, s3 +; RV32I-NEXT: or a3, t0, a3 +; RV32I-NEXT: or a4, t2, a4 +; RV32I-NEXT: or t0, t4, t3 +; RV32I-NEXT: or t2, t6, t5 ; RV32I-NEXT: sw a0, 56(sp) ; RV32I-NEXT: sw a0, 60(sp) -; RV32I-NEXT: sw a0, 32(sp) -; RV32I-NEXT: sw a0, 36(sp) +; RV32I-NEXT: sw a0, 64(sp) +; RV32I-NEXT: sw a0, 68(sp) ; RV32I-NEXT: sw a0, 40(sp) ; RV32I-NEXT: sw a0, 44(sp) -; RV32I-NEXT: add s7, s7, a1 -; RV32I-NEXT: sw a7, 16(sp) -; RV32I-NEXT: sw t0, 20(sp) -; RV32I-NEXT: sw t1, 24(sp) -; RV32I-NEXT: sw t2, 28(sp) -; RV32I-NEXT: sw a3, 0(sp) -; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a0, 48(sp) +; RV32I-NEXT: sw a0, 52(sp) +; RV32I-NEXT: add s5, s5, a1 +; RV32I-NEXT: sw a3, 24(sp) +; RV32I-NEXT: sw a4, 28(sp) +; RV32I-NEXT: sw t0, 32(sp) +; RV32I-NEXT: sw t2, 36(sp) ; RV32I-NEXT: sw a5, 8(sp) ; RV32I-NEXT: sw a6, 12(sp) -; RV32I-NEXT: lw a6, 16(s7) -; RV32I-NEXT: lw a5, 20(s7) -; RV32I-NEXT: lw a7, 24(s7) -; RV32I-NEXT: lw a1, 0(s7) -; RV32I-NEXT: lw a0, 4(s7) -; RV32I-NEXT: lw a4, 8(s7) -; RV32I-NEXT: lw a3, 12(s7) -; RV32I-NEXT: lw t0, 28(s7) +; RV32I-NEXT: sw a7, 16(sp) +; RV32I-NEXT: sw t1, 20(sp) +; RV32I-NEXT: lw a6, 16(s5) +; RV32I-NEXT: lw a5, 20(s5) +; RV32I-NEXT: lw a7, 24(s5) +; RV32I-NEXT: lw a1, 0(s5) +; RV32I-NEXT: lw a0, 4(s5) +; RV32I-NEXT: lw a4, 8(s5) +; RV32I-NEXT: lw a3, 12(s5) +; RV32I-NEXT: lw t0, 28(s5) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 ; RV32I-NEXT: srli t3, a7, 8 @@ -5432,21 +5465,21 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: srli s5, a5, 8 ; RV32I-NEXT: srli s6, a4, 24 ; RV32I-NEXT: srli s7, a4, 16 +; RV32I-NEXT: srli s8, a4, 8 +; RV32I-NEXT: srli s9, a3, 24 +; RV32I-NEXT: srli s10, a3, 16 +; RV32I-NEXT: srli s11, a3, 8 ; RV32I-NEXT: sb a7, 24(a2) -; RV32I-NEXT: srli a7, a4, 8 +; RV32I-NEXT: srli a7, a1, 24 ; RV32I-NEXT: sb t3, 25(a2) -; RV32I-NEXT: srli t3, a3, 24 ; RV32I-NEXT: sb t2, 26(a2) -; RV32I-NEXT: srli t2, a3, 16 ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli t1, a3, 8 +; RV32I-NEXT: srli t1, a1, 16 ; RV32I-NEXT: sb t0, 28(a2) -; RV32I-NEXT: srli t0, a1, 24 ; RV32I-NEXT: sb t6, 29(a2) -; RV32I-NEXT: srli t6, a1, 16 ; RV32I-NEXT: sb t5, 30(a2) ; RV32I-NEXT: sb t4, 31(a2) -; RV32I-NEXT: srli t4, a1, 8 +; RV32I-NEXT: srli t0, a1, 8 ; RV32I-NEXT: sb a6, 16(a2) ; RV32I-NEXT: sb s2, 17(a2) ; RV32I-NEXT: sb s1, 18(a2) @@ -5458,35 +5491,36 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: sb s3, 23(a2) ; RV32I-NEXT: srli a5, a0, 16 ; RV32I-NEXT: sb a4, 8(a2) -; RV32I-NEXT: sb a7, 9(a2) +; RV32I-NEXT: sb s8, 9(a2) ; RV32I-NEXT: sb s7, 10(a2) ; RV32I-NEXT: sb s6, 11(a2) ; RV32I-NEXT: srli a4, a0, 8 ; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb t1, 13(a2) -; RV32I-NEXT: sb t2, 14(a2) -; RV32I-NEXT: sb t3, 15(a2) +; RV32I-NEXT: sb s11, 13(a2) +; RV32I-NEXT: sb s10, 14(a2) +; RV32I-NEXT: sb s9, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb t4, 1(a2) -; RV32I-NEXT: sb t6, 2(a2) -; RV32I-NEXT: sb t0, 
3(a2) +; RV32I-NEXT: sb t0, 1(a2) +; RV32I-NEXT: sb t1, 2(a2) +; RV32I-NEXT: sb a7, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) ; RV32I-NEXT: sb a5, 6(a2) ; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 112 +; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 128 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %wordOff = load i256, ptr %wordOff.ptr, align 1 @@ -5512,112 +5546,112 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: lbu a4, 1(a0) -; RV64I-NEXT: lbu a5, 2(a0) -; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: lbu a7, 4(a0) -; RV64I-NEXT: lbu t0, 5(a0) -; RV64I-NEXT: lbu t1, 6(a0) -; RV64I-NEXT: lbu t2, 7(a0) -; RV64I-NEXT: lbu t3, 8(a0) -; RV64I-NEXT: lbu t4, 9(a0) -; RV64I-NEXT: lbu t5, 10(a0) -; RV64I-NEXT: lbu t6, 11(a0) -; RV64I-NEXT: lbu s0, 12(a0) -; RV64I-NEXT: lbu s1, 13(a0) -; RV64I-NEXT: lbu s2, 14(a0) -; RV64I-NEXT: lbu s3, 15(a0) -; RV64I-NEXT: lbu s4, 16(a0) -; RV64I-NEXT: lbu s5, 17(a0) -; RV64I-NEXT: lbu s6, 18(a0) -; RV64I-NEXT: lbu s7, 19(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: lbu a5, 0(a0) +; RV64I-NEXT: lbu a7, 1(a0) +; RV64I-NEXT: lbu t1, 2(a0) +; RV64I-NEXT: lbu s3, 3(a0) +; RV64I-NEXT: lbu t0, 4(a0) +; RV64I-NEXT: lbu s8, 5(a0) +; RV64I-NEXT: lbu s9, 6(a0) +; RV64I-NEXT: lbu s10, 7(a0) +; RV64I-NEXT: lbu s2, 8(a0) +; RV64I-NEXT: lbu s4, 9(a0) +; RV64I-NEXT: lbu s5, 10(a0) +; RV64I-NEXT: lbu s6, 11(a0) +; RV64I-NEXT: lbu s7, 12(a0) +; RV64I-NEXT: lbu s11, 13(a0) +; RV64I-NEXT: lbu t4, 14(a0) +; RV64I-NEXT: lbu t5, 15(a0) +; RV64I-NEXT: lbu a3, 16(a0) +; RV64I-NEXT: lbu a6, 17(a0) +; RV64I-NEXT: lbu t2, 18(a0) +; RV64I-NEXT: lbu t3, 19(a0) +; RV64I-NEXT: lbu a4, 20(a0) +; RV64I-NEXT: lbu t6, 21(a0) +; RV64I-NEXT: lbu s0, 22(a0) +; RV64I-NEXT: lbu s1, 23(a0) +; RV64I-NEXT: slli a7, a7, 8 ; RV64I-NEXT: slli t1, t1, 16 -; RV64I-NEXT: slli t2, t2, 24 -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: or a5, t0, a7 -; 
RV64I-NEXT: or a6, t2, t1 -; RV64I-NEXT: lbu s8, 20(a0) -; RV64I-NEXT: lbu s9, 21(a0) -; RV64I-NEXT: lbu s10, 22(a0) -; RV64I-NEXT: lbu s11, 23(a0) -; RV64I-NEXT: slli t4, t4, 8 -; RV64I-NEXT: slli t5, t5, 16 -; RV64I-NEXT: slli t6, t6, 24 -; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: slli s2, s2, 16 ; RV64I-NEXT: slli s3, s3, 24 -; RV64I-NEXT: or a7, t4, t3 -; RV64I-NEXT: or t0, t6, t5 -; RV64I-NEXT: or t1, s1, s0 -; RV64I-NEXT: or t2, s3, s2 -; RV64I-NEXT: lbu t3, 24(a0) -; RV64I-NEXT: lbu t4, 25(a0) -; RV64I-NEXT: lbu t5, 26(a0) -; RV64I-NEXT: lbu t6, 27(a0) -; RV64I-NEXT: slli s5, s5, 8 -; RV64I-NEXT: slli s6, s6, 16 -; RV64I-NEXT: slli s7, s7, 24 -; RV64I-NEXT: slli s9, s9, 8 -; RV64I-NEXT: or s0, s5, s4 -; RV64I-NEXT: or s1, s7, s6 -; RV64I-NEXT: or s2, s9, s8 -; RV64I-NEXT: lbu s3, 28(a0) -; RV64I-NEXT: lbu s4, 29(a0) -; RV64I-NEXT: lbu s5, 30(a0) -; RV64I-NEXT: lbu a0, 31(a0) -; RV64I-NEXT: lbu a1, 0(a1) -; RV64I-NEXT: slli s10, s10, 16 -; RV64I-NEXT: slli s11, s11, 24 -; RV64I-NEXT: or s6, s11, s10 -; RV64I-NEXT: mv s7, sp -; RV64I-NEXT: slli t4, t4, 8 -; RV64I-NEXT: slli t5, t5, 16 -; RV64I-NEXT: slli t6, t6, 24 +; RV64I-NEXT: slli s8, s8, 8 +; RV64I-NEXT: slli s9, s9, 16 +; RV64I-NEXT: slli s10, s10, 24 +; RV64I-NEXT: or a5, a7, a5 +; RV64I-NEXT: or a7, s3, t1 +; RV64I-NEXT: or t0, s8, t0 +; RV64I-NEXT: or t1, s10, s9 +; RV64I-NEXT: lbu s3, 24(a0) +; RV64I-NEXT: lbu s8, 25(a0) +; RV64I-NEXT: lbu s9, 26(a0) +; RV64I-NEXT: lbu s10, 27(a0) ; RV64I-NEXT: slli s4, s4, 8 ; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: slli s6, s6, 24 +; RV64I-NEXT: slli s11, s11, 8 +; RV64I-NEXT: or s2, s4, s2 +; RV64I-NEXT: or s4, s6, s5 +; RV64I-NEXT: or s5, s11, s7 +; RV64I-NEXT: lbu s6, 28(a0) +; RV64I-NEXT: lbu s7, 29(a0) +; RV64I-NEXT: lbu s11, 30(a0) +; RV64I-NEXT: lbu a0, 31(a0) +; RV64I-NEXT: lbu a1, 0(a1) +; RV64I-NEXT: slli t4, t4, 16 +; RV64I-NEXT: slli t5, t5, 24 +; RV64I-NEXT: or t4, t5, t4 +; RV64I-NEXT: mv t5, sp +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: slli t3, t3, 24 +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: slli s0, s0, 16 +; RV64I-NEXT: slli s1, s1, 24 +; RV64I-NEXT: slli s8, s8, 8 +; RV64I-NEXT: slli s9, s9, 16 +; RV64I-NEXT: slli s10, s10, 24 +; RV64I-NEXT: slli s7, s7, 8 +; RV64I-NEXT: slli s11, s11, 16 ; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: slli a1, a1, 3 -; RV64I-NEXT: or t3, t4, t3 -; RV64I-NEXT: or t4, t6, t5 -; RV64I-NEXT: or t5, s4, s3 -; RV64I-NEXT: or a0, a0, s5 -; RV64I-NEXT: andi a1, a1, 24 -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: or a5, t0, a7 -; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: or a3, a6, a3 +; RV64I-NEXT: or a6, t3, t2 +; RV64I-NEXT: or a4, t6, a4 ; RV64I-NEXT: or s0, s1, s0 -; RV64I-NEXT: or a7, s6, s2 -; RV64I-NEXT: or t0, t4, t3 -; RV64I-NEXT: or a0, a0, t5 -; RV64I-NEXT: add s7, s7, a1 -; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: or t2, s8, s3 +; RV64I-NEXT: or t3, s10, s9 +; RV64I-NEXT: or t6, s7, s6 +; RV64I-NEXT: or a0, a0, s11 +; RV64I-NEXT: andi a1, a1, 24 +; RV64I-NEXT: or a5, a7, a5 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or t0, s4, s2 +; RV64I-NEXT: or t1, t4, s5 +; RV64I-NEXT: or a3, a6, a3 +; RV64I-NEXT: or a4, s0, a4 +; RV64I-NEXT: or a6, t3, t2 +; RV64I-NEXT: or a0, a0, t6 +; RV64I-NEXT: add t5, t5, a1 ; RV64I-NEXT: slli a7, a7, 32 +; RV64I-NEXT: slli t1, t1, 32 +; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a1, a0, 32 ; RV64I-NEXT: sraiw a0, a0, 31 +; RV64I-NEXT: or a5, a7, a5 +; RV64I-NEXT: or a7, t1, t0 ; RV64I-NEXT: or a3, a4, a3 
-; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: or a5, a7, s0 -; RV64I-NEXT: or a1, a1, t0 +; RV64I-NEXT: or a1, a1, a6 ; RV64I-NEXT: sd a0, 32(sp) ; RV64I-NEXT: sd a0, 40(sp) ; RV64I-NEXT: sd a0, 48(sp) ; RV64I-NEXT: sd a0, 56(sp) -; RV64I-NEXT: sd a3, 0(sp) -; RV64I-NEXT: sd a4, 8(sp) -; RV64I-NEXT: sd a5, 16(sp) +; RV64I-NEXT: sd a5, 0(sp) +; RV64I-NEXT: sd a7, 8(sp) +; RV64I-NEXT: sd a3, 16(sp) ; RV64I-NEXT: sd a1, 24(sp) -; RV64I-NEXT: ld a4, 16(s7) -; RV64I-NEXT: ld a0, 8(s7) -; RV64I-NEXT: ld a1, 0(s7) -; RV64I-NEXT: ld a3, 24(s7) +; RV64I-NEXT: ld a4, 16(t5) +; RV64I-NEXT: ld a0, 8(t5) +; RV64I-NEXT: ld a1, 0(t5) +; RV64I-NEXT: ld a3, 24(t5) ; RV64I-NEXT: srli a5, a4, 56 ; RV64I-NEXT: srli a6, a4, 48 ; RV64I-NEXT: srli a7, a4, 40 @@ -5636,25 +5670,25 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV64I-NEXT: srli s5, a1, 48 ; RV64I-NEXT: srli s6, a1, 40 ; RV64I-NEXT: srli s7, a1, 32 +; RV64I-NEXT: srli s8, a1, 24 +; RV64I-NEXT: srli s9, a1, 16 +; RV64I-NEXT: srli s10, a1, 8 +; RV64I-NEXT: srli s11, a0, 56 ; RV64I-NEXT: sb t0, 20(a2) -; RV64I-NEXT: srli t0, a1, 24 ; RV64I-NEXT: sb a7, 21(a2) -; RV64I-NEXT: srli a7, a1, 16 ; RV64I-NEXT: sb a6, 22(a2) -; RV64I-NEXT: srli a6, a1, 8 ; RV64I-NEXT: sb a5, 23(a2) -; RV64I-NEXT: srli a5, a0, 56 +; RV64I-NEXT: srli a5, a0, 48 ; RV64I-NEXT: sb a4, 16(a2) -; RV64I-NEXT: srli a4, a0, 48 ; RV64I-NEXT: sb t3, 17(a2) ; RV64I-NEXT: sb t2, 18(a2) ; RV64I-NEXT: sb t1, 19(a2) -; RV64I-NEXT: srli t1, a0, 40 +; RV64I-NEXT: srli a4, a0, 40 ; RV64I-NEXT: sb s0, 28(a2) ; RV64I-NEXT: sb t6, 29(a2) ; RV64I-NEXT: sb t5, 30(a2) ; RV64I-NEXT: sb t4, 31(a2) -; RV64I-NEXT: srli t2, a0, 32 +; RV64I-NEXT: srli a6, a0, 32 ; RV64I-NEXT: sb a3, 24(a2) ; RV64I-NEXT: sb s3, 25(a2) ; RV64I-NEXT: sb s2, 26(a2) @@ -5664,19 +5698,19 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV64I-NEXT: sb s6, 5(a2) ; RV64I-NEXT: sb s5, 6(a2) ; RV64I-NEXT: sb s4, 7(a2) -; RV64I-NEXT: srli t3, a0, 16 +; RV64I-NEXT: srli a7, a0, 16 ; RV64I-NEXT: sb a1, 0(a2) -; RV64I-NEXT: sb a6, 1(a2) -; RV64I-NEXT: sb a7, 2(a2) -; RV64I-NEXT: sb t0, 3(a2) +; RV64I-NEXT: sb s10, 1(a2) +; RV64I-NEXT: sb s9, 2(a2) +; RV64I-NEXT: sb s8, 3(a2) ; RV64I-NEXT: srli a1, a0, 8 -; RV64I-NEXT: sb t2, 12(a2) -; RV64I-NEXT: sb t1, 13(a2) -; RV64I-NEXT: sb a4, 14(a2) -; RV64I-NEXT: sb a5, 15(a2) +; RV64I-NEXT: sb a6, 12(a2) +; RV64I-NEXT: sb a4, 13(a2) +; RV64I-NEXT: sb a5, 14(a2) +; RV64I-NEXT: sb s11, 15(a2) ; RV64I-NEXT: sb a0, 8(a2) ; RV64I-NEXT: sb a1, 9(a2) -; RV64I-NEXT: sb t3, 10(a2) +; RV64I-NEXT: sb a7, 10(a2) ; RV64I-NEXT: sb a3, 11(a2) ; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload @@ -5695,129 +5729,130 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; ; RV32I-LABEL: ashr_32bytes_dwordOff: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -112 -; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded 
Spill -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: lbu a4, 1(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu a7, 4(a0) -; RV32I-NEXT: lbu t0, 5(a0) -; RV32I-NEXT: lbu t1, 6(a0) -; RV32I-NEXT: lbu t2, 7(a0) -; RV32I-NEXT: lbu t3, 8(a0) -; RV32I-NEXT: lbu t4, 9(a0) -; RV32I-NEXT: lbu t5, 10(a0) -; RV32I-NEXT: lbu t6, 11(a0) -; RV32I-NEXT: lbu s0, 12(a0) -; RV32I-NEXT: lbu s1, 13(a0) -; RV32I-NEXT: lbu s2, 14(a0) -; RV32I-NEXT: lbu s3, 15(a0) -; RV32I-NEXT: lbu s4, 16(a0) -; RV32I-NEXT: lbu s5, 17(a0) -; RV32I-NEXT: lbu s6, 18(a0) -; RV32I-NEXT: lbu s7, 19(a0) -; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: addi sp, sp, -128 +; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a5, 0(a0) +; RV32I-NEXT: lbu a6, 1(a0) +; RV32I-NEXT: lbu a7, 2(a0) +; RV32I-NEXT: lbu t1, 3(a0) +; RV32I-NEXT: lbu s0, 4(a0) +; RV32I-NEXT: lbu s2, 5(a0) +; RV32I-NEXT: lbu s3, 6(a0) +; RV32I-NEXT: lbu s6, 7(a0) +; RV32I-NEXT: lbu s1, 8(a0) +; RV32I-NEXT: lbu s7, 9(a0) +; RV32I-NEXT: lbu s8, 10(a0) +; RV32I-NEXT: lbu s9, 11(a0) +; RV32I-NEXT: lbu s10, 12(a0) +; RV32I-NEXT: lbu s11, 13(a0) +; RV32I-NEXT: lbu s4, 14(a0) +; RV32I-NEXT: lbu s5, 15(a0) +; RV32I-NEXT: lbu a3, 16(a0) +; RV32I-NEXT: lbu t0, 17(a0) +; RV32I-NEXT: lbu t2, 18(a0) +; RV32I-NEXT: lbu t3, 19(a0) +; RV32I-NEXT: lbu a4, 20(a0) +; RV32I-NEXT: lbu t4, 21(a0) +; RV32I-NEXT: lbu t5, 22(a0) +; RV32I-NEXT: lbu t6, 23(a0) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t1, t1, 24 +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s3, s3, 16 +; RV32I-NEXT: slli s6, s6, 24 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a6, t1, a7 +; RV32I-NEXT: or a7, s2, s0 +; RV32I-NEXT: or t1, s6, s3 +; RV32I-NEXT: lbu s0, 24(a0) +; RV32I-NEXT: lbu s6, 25(a0) +; RV32I-NEXT: lbu ra, 26(a0) +; RV32I-NEXT: lbu s2, 27(a0) +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: or s1, s7, s1 +; RV32I-NEXT: or s7, s9, s8 +; RV32I-NEXT: or s3, s11, s10 +; RV32I-NEXT: lbu s8, 28(a0) +; RV32I-NEXT: lbu s9, 29(a0) +; RV32I-NEXT: lbu s10, 30(a0) +; RV32I-NEXT: lbu a0, 31(a0) +; RV32I-NEXT: lbu a1, 0(a1) +; RV32I-NEXT: slli s4, s4, 16 +; RV32I-NEXT: slli s5, s5, 24 +; RV32I-NEXT: or s4, s5, s4 +; RV32I-NEXT: addi s5, sp, 8 ; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a6, t2, t1 -; RV32I-NEXT: lbu s8, 20(a0) -; RV32I-NEXT: lbu s9, 21(a0) -; RV32I-NEXT: lbu s10, 22(a0) -; RV32I-NEXT: lbu s11, 23(a0) +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t3, t3, 24 ; RV32I-NEXT: slli t4, t4, 8 ; RV32I-NEXT: slli t5, t5, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: slli s1, s1, 8 -; RV32I-NEXT: slli s2, s2, 16 -; 
RV32I-NEXT: slli s3, s3, 24 -; RV32I-NEXT: or a7, t4, t3 -; RV32I-NEXT: or t0, t6, t5 -; RV32I-NEXT: or t1, s1, s0 -; RV32I-NEXT: or t2, s3, s2 -; RV32I-NEXT: lbu t3, 24(a0) -; RV32I-NEXT: lbu t5, 25(a0) -; RV32I-NEXT: lbu t6, 26(a0) -; RV32I-NEXT: lbu s0, 27(a0) -; RV32I-NEXT: slli s5, s5, 8 -; RV32I-NEXT: slli s6, s6, 16 -; RV32I-NEXT: slli s7, s7, 24 +; RV32I-NEXT: slli s6, s6, 8 +; RV32I-NEXT: slli ra, ra, 16 +; RV32I-NEXT: slli s2, s2, 24 ; RV32I-NEXT: slli s9, s9, 8 -; RV32I-NEXT: or t4, s5, s4 -; RV32I-NEXT: or s1, s7, s6 -; RV32I-NEXT: or s2, s9, s8 -; RV32I-NEXT: lbu s3, 28(a0) -; RV32I-NEXT: lbu s4, 29(a0) -; RV32I-NEXT: lbu s5, 30(a0) -; RV32I-NEXT: lbu a0, 31(a0) -; RV32I-NEXT: lbu a1, 0(a1) ; RV32I-NEXT: slli s10, s10, 16 -; RV32I-NEXT: slli s11, s11, 24 -; RV32I-NEXT: or s6, s11, s10 -; RV32I-NEXT: mv s7, sp -; RV32I-NEXT: slli t5, t5, 8 -; RV32I-NEXT: slli t6, t6, 16 -; RV32I-NEXT: slli s0, s0, 24 -; RV32I-NEXT: slli s4, s4, 8 -; RV32I-NEXT: slli s5, s5, 16 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: slli a1, a1, 3 -; RV32I-NEXT: or t3, t5, t3 -; RV32I-NEXT: or t5, s0, t6 -; RV32I-NEXT: or t6, s4, s3 -; RV32I-NEXT: or s0, a0, s5 +; RV32I-NEXT: or a3, t0, a3 +; RV32I-NEXT: or t0, t3, t2 +; RV32I-NEXT: or a4, t4, a4 +; RV32I-NEXT: or t2, t6, t5 +; RV32I-NEXT: or t3, s6, s0 +; RV32I-NEXT: or t4, s2, ra +; RV32I-NEXT: or t5, s9, s8 +; RV32I-NEXT: or t6, a0, s10 ; RV32I-NEXT: srai a0, a0, 31 ; RV32I-NEXT: andi a1, a1, 24 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a6, t2, t1 -; RV32I-NEXT: or a7, s1, t4 -; RV32I-NEXT: or t0, s6, s2 -; RV32I-NEXT: or t1, t5, t3 -; RV32I-NEXT: or t2, s0, t6 -; RV32I-NEXT: sw a0, 48(sp) -; RV32I-NEXT: sw a0, 52(sp) +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a6, t1, a7 +; RV32I-NEXT: or a7, s7, s1 +; RV32I-NEXT: or t1, s4, s3 +; RV32I-NEXT: or a3, t0, a3 +; RV32I-NEXT: or a4, t2, a4 +; RV32I-NEXT: or t0, t4, t3 +; RV32I-NEXT: or t2, t6, t5 ; RV32I-NEXT: sw a0, 56(sp) ; RV32I-NEXT: sw a0, 60(sp) -; RV32I-NEXT: sw a0, 32(sp) -; RV32I-NEXT: sw a0, 36(sp) +; RV32I-NEXT: sw a0, 64(sp) +; RV32I-NEXT: sw a0, 68(sp) ; RV32I-NEXT: sw a0, 40(sp) ; RV32I-NEXT: sw a0, 44(sp) -; RV32I-NEXT: add s7, s7, a1 -; RV32I-NEXT: sw a7, 16(sp) -; RV32I-NEXT: sw t0, 20(sp) -; RV32I-NEXT: sw t1, 24(sp) -; RV32I-NEXT: sw t2, 28(sp) -; RV32I-NEXT: sw a3, 0(sp) -; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a0, 48(sp) +; RV32I-NEXT: sw a0, 52(sp) +; RV32I-NEXT: add s5, s5, a1 +; RV32I-NEXT: sw a3, 24(sp) +; RV32I-NEXT: sw a4, 28(sp) +; RV32I-NEXT: sw t0, 32(sp) +; RV32I-NEXT: sw t2, 36(sp) ; RV32I-NEXT: sw a5, 8(sp) ; RV32I-NEXT: sw a6, 12(sp) -; RV32I-NEXT: lw a6, 16(s7) -; RV32I-NEXT: lw a5, 20(s7) -; RV32I-NEXT: lw a7, 24(s7) -; RV32I-NEXT: lw a1, 0(s7) -; RV32I-NEXT: lw a0, 4(s7) -; RV32I-NEXT: lw a4, 8(s7) -; RV32I-NEXT: lw a3, 12(s7) -; RV32I-NEXT: lw t0, 28(s7) +; RV32I-NEXT: sw a7, 16(sp) +; RV32I-NEXT: sw t1, 20(sp) +; RV32I-NEXT: lw a6, 16(s5) +; RV32I-NEXT: lw a5, 20(s5) +; RV32I-NEXT: lw a7, 24(s5) +; RV32I-NEXT: lw a1, 0(s5) +; RV32I-NEXT: lw a0, 4(s5) +; RV32I-NEXT: lw a4, 8(s5) +; RV32I-NEXT: lw a3, 12(s5) +; RV32I-NEXT: lw t0, 28(s5) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 ; RV32I-NEXT: srli t3, a7, 8 @@ -5832,21 +5867,21 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: srli s5, a5, 8 ; RV32I-NEXT: srli s6, a4, 24 ; RV32I-NEXT: srli s7, a4, 16 +; RV32I-NEXT: srli s8, a4, 8 +; RV32I-NEXT: srli s9, a3, 24 +; RV32I-NEXT: srli s10, a3, 
16 +; RV32I-NEXT: srli s11, a3, 8 ; RV32I-NEXT: sb a7, 24(a2) -; RV32I-NEXT: srli a7, a4, 8 +; RV32I-NEXT: srli a7, a1, 24 ; RV32I-NEXT: sb t3, 25(a2) -; RV32I-NEXT: srli t3, a3, 24 ; RV32I-NEXT: sb t2, 26(a2) -; RV32I-NEXT: srli t2, a3, 16 ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli t1, a3, 8 +; RV32I-NEXT: srli t1, a1, 16 ; RV32I-NEXT: sb t0, 28(a2) -; RV32I-NEXT: srli t0, a1, 24 ; RV32I-NEXT: sb t6, 29(a2) -; RV32I-NEXT: srli t6, a1, 16 ; RV32I-NEXT: sb t5, 30(a2) ; RV32I-NEXT: sb t4, 31(a2) -; RV32I-NEXT: srli t4, a1, 8 +; RV32I-NEXT: srli t0, a1, 8 ; RV32I-NEXT: sb a6, 16(a2) ; RV32I-NEXT: sb s2, 17(a2) ; RV32I-NEXT: sb s1, 18(a2) @@ -5858,35 +5893,36 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: sb s3, 23(a2) ; RV32I-NEXT: srli a5, a0, 16 ; RV32I-NEXT: sb a4, 8(a2) -; RV32I-NEXT: sb a7, 9(a2) +; RV32I-NEXT: sb s8, 9(a2) ; RV32I-NEXT: sb s7, 10(a2) ; RV32I-NEXT: sb s6, 11(a2) ; RV32I-NEXT: srli a4, a0, 8 ; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb t1, 13(a2) -; RV32I-NEXT: sb t2, 14(a2) -; RV32I-NEXT: sb t3, 15(a2) +; RV32I-NEXT: sb s11, 13(a2) +; RV32I-NEXT: sb s10, 14(a2) +; RV32I-NEXT: sb s9, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb t4, 1(a2) -; RV32I-NEXT: sb t6, 2(a2) -; RV32I-NEXT: sb t0, 3(a2) +; RV32I-NEXT: sb t0, 1(a2) +; RV32I-NEXT: sb t1, 2(a2) +; RV32I-NEXT: sb a7, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) ; RV32I-NEXT: sb a5, 6(a2) ; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 112 +; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 128 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %dwordOff = load i256, ptr %dwordOff.ptr, align 1 diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll index b8952d2cb2b29..b2c130c2d7c10 100644 --- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll @@ -1530,24 +1530,25 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: lshr_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -112 -; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded 
Spill -; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: addi sp, sp, -128 +; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill ; RV32I-NEXT: lbu a3, 0(a0) ; RV32I-NEXT: lbu a4, 1(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu a6, 2(a0) +; RV32I-NEXT: lbu a7, 3(a0) +; RV32I-NEXT: lbu a5, 4(a0) ; RV32I-NEXT: lbu t0, 5(a0) ; RV32I-NEXT: lbu t1, 6(a0) ; RV32I-NEXT: lbu t2, 7(a0) @@ -1556,105 +1557,107 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu t5, 10(a0) ; RV32I-NEXT: lbu t6, 11(a0) ; RV32I-NEXT: lbu s0, 12(a0) -; RV32I-NEXT: lbu s1, 13(a0) -; RV32I-NEXT: lbu s2, 14(a0) -; RV32I-NEXT: lbu s3, 15(a0) -; RV32I-NEXT: lbu s4, 16(a0) -; RV32I-NEXT: lbu s5, 17(a0) -; RV32I-NEXT: lbu s6, 18(a0) -; RV32I-NEXT: lbu s7, 19(a0) +; RV32I-NEXT: lbu s2, 13(a0) +; RV32I-NEXT: lbu s4, 14(a0) +; RV32I-NEXT: lbu s5, 15(a0) +; RV32I-NEXT: lbu s6, 16(a0) +; RV32I-NEXT: lbu s7, 17(a0) +; RV32I-NEXT: lbu s8, 18(a0) +; RV32I-NEXT: lbu s9, 19(a0) ; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: or a4, a7, a6 +; RV32I-NEXT: lbu s10, 20(a0) +; RV32I-NEXT: lbu s11, 21(a0) +; RV32I-NEXT: lbu ra, 22(a0) +; RV32I-NEXT: lbu a3, 23(a0) ; RV32I-NEXT: slli t0, t0, 8 ; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a6, t2, t1 -; RV32I-NEXT: lbu s8, 20(a0) -; RV32I-NEXT: lbu s9, 21(a0) -; RV32I-NEXT: lbu s10, 22(a0) -; RV32I-NEXT: lbu s11, 23(a0) ; RV32I-NEXT: slli t4, t4, 8 ; RV32I-NEXT: slli t5, t5, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: slli s1, s1, 8 -; RV32I-NEXT: slli s2, s2, 16 -; RV32I-NEXT: slli s3, s3, 24 +; RV32I-NEXT: or a5, t0, a5 +; RV32I-NEXT: or a6, t2, t1 ; RV32I-NEXT: or a7, t4, t3 ; RV32I-NEXT: or t0, t6, t5 -; RV32I-NEXT: or t1, s1, s0 -; RV32I-NEXT: or t2, s3, s2 -; RV32I-NEXT: lbu t6, 24(a0) -; RV32I-NEXT: lbu s0, 25(a0) -; RV32I-NEXT: lbu s1, 26(a0) -; RV32I-NEXT: lbu s2, 27(a0) -; RV32I-NEXT: slli s5, s5, 8 -; RV32I-NEXT: slli s6, s6, 16 -; RV32I-NEXT: slli s7, s7, 24 -; RV32I-NEXT: slli s9, s9, 8 -; RV32I-NEXT: or t3, s5, s4 -; RV32I-NEXT: or t4, s7, s6 -; RV32I-NEXT: or t5, s9, s8 -; 
RV32I-NEXT: lbu s3, 28(a0) +; RV32I-NEXT: lbu s1, 24(a0) +; RV32I-NEXT: lbu s3, 25(a0) +; RV32I-NEXT: lbu t4, 26(a0) +; RV32I-NEXT: lbu t5, 27(a0) +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s4, s4, 16 +; RV32I-NEXT: slli s5, s5, 24 +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: or t1, s2, s0 +; RV32I-NEXT: or t2, s5, s4 +; RV32I-NEXT: or t3, s7, s6 +; RV32I-NEXT: lbu t6, 28(a0) ; RV32I-NEXT: lbu s4, 29(a0) ; RV32I-NEXT: lbu s5, 30(a0) ; RV32I-NEXT: lbu s6, 31(a0) -; RV32I-NEXT: slli s10, s10, 16 -; RV32I-NEXT: slli s11, s11, 24 -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli s2, s2, 24 -; RV32I-NEXT: or a0, s11, s10 -; RV32I-NEXT: or t6, s0, t6 -; RV32I-NEXT: or s0, s2, s1 -; RV32I-NEXT: lbu s1, 0(a1) -; RV32I-NEXT: lbu s2, 1(a1) -; RV32I-NEXT: lbu s7, 2(a1) +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: slli ra, ra, 16 +; RV32I-NEXT: slli a3, a3, 24 +; RV32I-NEXT: or a0, s9, s8 +; RV32I-NEXT: or s0, s11, s10 +; RV32I-NEXT: or s2, a3, ra +; RV32I-NEXT: lbu a3, 0(a1) +; RV32I-NEXT: lbu s7, 1(a1) +; RV32I-NEXT: lbu s8, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: sw zero, 48(sp) -; RV32I-NEXT: sw zero, 52(sp) ; RV32I-NEXT: sw zero, 56(sp) ; RV32I-NEXT: sw zero, 60(sp) -; RV32I-NEXT: sw zero, 32(sp) -; RV32I-NEXT: sw zero, 36(sp) +; RV32I-NEXT: sw zero, 64(sp) +; RV32I-NEXT: sw zero, 68(sp) ; RV32I-NEXT: sw zero, 40(sp) ; RV32I-NEXT: sw zero, 44(sp) +; RV32I-NEXT: sw zero, 48(sp) +; RV32I-NEXT: sw zero, 52(sp) +; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: or s1, s3, s1 +; RV32I-NEXT: addi s3, sp, 8 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli t5, t5, 24 ; RV32I-NEXT: slli s4, s4, 8 -; RV32I-NEXT: or s3, s4, s3 -; RV32I-NEXT: mv s4, sp ; RV32I-NEXT: slli s5, s5, 16 ; RV32I-NEXT: slli s6, s6, 24 -; RV32I-NEXT: slli s2, s2, 8 -; RV32I-NEXT: slli s7, s7, 16 +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s8, s8, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or s5, s6, s5 -; RV32I-NEXT: or s1, s2, s1 -; RV32I-NEXT: or a1, a1, s7 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a6, t2, t1 -; RV32I-NEXT: or a7, t4, t3 -; RV32I-NEXT: or t0, a0, t5 -; RV32I-NEXT: or t1, s0, t6 -; RV32I-NEXT: or t2, s5, s3 -; RV32I-NEXT: or a0, a1, s1 -; RV32I-NEXT: sw a7, 16(sp) -; RV32I-NEXT: sw t0, 20(sp) -; RV32I-NEXT: sw t1, 24(sp) -; RV32I-NEXT: sw t2, 28(sp) -; RV32I-NEXT: sw a3, 0(sp) -; RV32I-NEXT: sw a4, 4(sp) -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a6, 12(sp) +; RV32I-NEXT: or t4, t5, t4 +; RV32I-NEXT: or t5, s4, t6 +; RV32I-NEXT: or t6, s6, s5 +; RV32I-NEXT: or a3, s7, a3 +; RV32I-NEXT: or a1, a1, s8 +; RV32I-NEXT: lw s4, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a4, a4, s4 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a7, t2, t1 +; RV32I-NEXT: or t0, a0, t3 +; RV32I-NEXT: or t1, s2, s0 +; RV32I-NEXT: or t2, t4, s1 +; RV32I-NEXT: or t3, t6, t5 +; RV32I-NEXT: or a0, a1, a3 +; RV32I-NEXT: sw t0, 24(sp) +; RV32I-NEXT: sw t1, 28(sp) +; RV32I-NEXT: sw t2, 32(sp) +; RV32I-NEXT: sw t3, 36(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a6, 16(sp) +; RV32I-NEXT: sw a7, 20(sp) ; RV32I-NEXT: srli a1, a0, 3 ; RV32I-NEXT: andi a3, a0, 31 ; RV32I-NEXT: andi a4, a1, 28 ; RV32I-NEXT: xori a1, a3, 31 -; RV32I-NEXT: add a4, s4, a4 +; RV32I-NEXT: add a4, s3, a4 ; RV32I-NEXT: lw a3, 0(a4) ; RV32I-NEXT: lw a5, 4(a4) ; RV32I-NEXT: lw a6, 8(a4) @@ -1714,13 +1717,13 @@ define void 
@lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli s5, a3, 24 ; RV32I-NEXT: srli s6, a3, 16 ; RV32I-NEXT: srli s7, a3, 8 +; RV32I-NEXT: srli s8, a1, 24 +; RV32I-NEXT: srli s9, a1, 16 ; RV32I-NEXT: sb a7, 24(a2) -; RV32I-NEXT: srli a7, a1, 24 ; RV32I-NEXT: sb t2, 25(a2) -; RV32I-NEXT: srli t2, a1, 16 ; RV32I-NEXT: sb t1, 26(a2) ; RV32I-NEXT: sb t0, 27(a2) -; RV32I-NEXT: srli t0, a1, 8 +; RV32I-NEXT: srli a7, a1, 8 ; RV32I-NEXT: sb a6, 16(a2) ; RV32I-NEXT: sb t5, 17(a2) ; RV32I-NEXT: sb t4, 18(a2) @@ -1741,26 +1744,27 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb s6, 14(a2) ; RV32I-NEXT: sb s5, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb t0, 1(a2) -; RV32I-NEXT: sb t2, 2(a2) -; RV32I-NEXT: sb a7, 3(a2) +; RV32I-NEXT: sb a7, 1(a2) +; RV32I-NEXT: sb s9, 2(a2) +; RV32I-NEXT: sb s8, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) ; RV32I-NEXT: sb a5, 6(a2) ; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 112 +; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 128 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %bitOff = load i256, ptr %bitOff.ptr, align 1 @@ -2002,24 +2006,25 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: shl_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -112 -; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: addi sp, sp, -128 +; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 108(sp) # 
4-byte Folded Spill +; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill ; RV32I-NEXT: lbu a3, 0(a0) ; RV32I-NEXT: lbu a4, 1(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu a6, 2(a0) +; RV32I-NEXT: lbu a7, 3(a0) +; RV32I-NEXT: lbu a5, 4(a0) ; RV32I-NEXT: lbu t0, 5(a0) ; RV32I-NEXT: lbu t1, 6(a0) ; RV32I-NEXT: lbu t2, 7(a0) @@ -2028,105 +2033,107 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu t5, 10(a0) ; RV32I-NEXT: lbu t6, 11(a0) ; RV32I-NEXT: lbu s0, 12(a0) -; RV32I-NEXT: lbu s1, 13(a0) -; RV32I-NEXT: lbu s2, 14(a0) -; RV32I-NEXT: lbu s3, 15(a0) -; RV32I-NEXT: lbu s4, 16(a0) -; RV32I-NEXT: lbu s5, 17(a0) -; RV32I-NEXT: lbu s6, 18(a0) -; RV32I-NEXT: lbu s7, 19(a0) +; RV32I-NEXT: lbu s2, 13(a0) +; RV32I-NEXT: lbu s4, 14(a0) +; RV32I-NEXT: lbu s5, 15(a0) +; RV32I-NEXT: lbu s6, 16(a0) +; RV32I-NEXT: lbu s7, 17(a0) +; RV32I-NEXT: lbu s8, 18(a0) +; RV32I-NEXT: lbu s9, 19(a0) ; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: or a4, a7, a6 +; RV32I-NEXT: lbu s10, 20(a0) +; RV32I-NEXT: lbu s11, 21(a0) +; RV32I-NEXT: lbu ra, 22(a0) +; RV32I-NEXT: lbu a3, 23(a0) ; RV32I-NEXT: slli t0, t0, 8 ; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a6, t2, t1 -; RV32I-NEXT: lbu s8, 20(a0) -; RV32I-NEXT: lbu s9, 21(a0) -; RV32I-NEXT: lbu s10, 22(a0) -; RV32I-NEXT: lbu s11, 23(a0) ; RV32I-NEXT: slli t4, t4, 8 ; RV32I-NEXT: slli t5, t5, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: slli s1, s1, 8 -; RV32I-NEXT: slli s2, s2, 16 -; RV32I-NEXT: slli s3, s3, 24 +; RV32I-NEXT: or a5, t0, a5 +; RV32I-NEXT: or a6, t2, t1 ; RV32I-NEXT: or a7, t4, t3 ; RV32I-NEXT: or t0, t6, t5 -; RV32I-NEXT: or t1, s1, s0 -; RV32I-NEXT: or t2, s3, s2 -; RV32I-NEXT: lbu t6, 24(a0) -; RV32I-NEXT: lbu s0, 25(a0) -; RV32I-NEXT: lbu s1, 26(a0) -; RV32I-NEXT: lbu s2, 27(a0) -; RV32I-NEXT: slli s5, s5, 8 -; RV32I-NEXT: slli s6, s6, 16 -; RV32I-NEXT: slli s7, s7, 24 -; RV32I-NEXT: slli s9, s9, 8 -; RV32I-NEXT: or t3, s5, s4 -; RV32I-NEXT: or t4, s7, s6 -; RV32I-NEXT: or t5, s9, s8 -; RV32I-NEXT: lbu s3, 28(a0) +; RV32I-NEXT: lbu s1, 24(a0) +; RV32I-NEXT: lbu s3, 25(a0) +; RV32I-NEXT: lbu t4, 26(a0) +; RV32I-NEXT: lbu t5, 27(a0) +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s4, s4, 16 +; RV32I-NEXT: slli s5, s5, 24 +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: or t1, s2, s0 +; RV32I-NEXT: or t2, s5, s4 +; RV32I-NEXT: or t3, s7, s6 +; RV32I-NEXT: lbu t6, 28(a0) ; RV32I-NEXT: lbu s4, 29(a0) ; RV32I-NEXT: lbu s5, 30(a0) ; RV32I-NEXT: lbu s6, 31(a0) -; RV32I-NEXT: slli s10, s10, 16 -; RV32I-NEXT: slli s11, s11, 24 -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli s2, s2, 24 -; RV32I-NEXT: or a0, s11, s10 -; RV32I-NEXT: or t6, s0, t6 -; RV32I-NEXT: or s0, s2, s1 -; RV32I-NEXT: lbu s1, 0(a1) -; RV32I-NEXT: lbu s2, 1(a1) -; RV32I-NEXT: lbu s7, 
2(a1) +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: slli ra, ra, 16 +; RV32I-NEXT: slli a3, a3, 24 +; RV32I-NEXT: or a0, s9, s8 +; RV32I-NEXT: or s0, s11, s10 +; RV32I-NEXT: or s2, a3, ra +; RV32I-NEXT: lbu a3, 0(a1) +; RV32I-NEXT: lbu s7, 1(a1) +; RV32I-NEXT: lbu s8, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: sw zero, 16(sp) -; RV32I-NEXT: sw zero, 20(sp) ; RV32I-NEXT: sw zero, 24(sp) ; RV32I-NEXT: sw zero, 28(sp) -; RV32I-NEXT: sw zero, 0(sp) -; RV32I-NEXT: sw zero, 4(sp) +; RV32I-NEXT: sw zero, 32(sp) +; RV32I-NEXT: sw zero, 36(sp) ; RV32I-NEXT: sw zero, 8(sp) ; RV32I-NEXT: sw zero, 12(sp) +; RV32I-NEXT: sw zero, 16(sp) +; RV32I-NEXT: sw zero, 20(sp) +; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: or s1, s3, s1 +; RV32I-NEXT: addi s3, sp, 40 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli t5, t5, 24 ; RV32I-NEXT: slli s4, s4, 8 -; RV32I-NEXT: or s3, s4, s3 -; RV32I-NEXT: addi s4, sp, 32 ; RV32I-NEXT: slli s5, s5, 16 ; RV32I-NEXT: slli s6, s6, 24 -; RV32I-NEXT: slli s2, s2, 8 -; RV32I-NEXT: slli s7, s7, 16 +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s8, s8, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or s5, s6, s5 -; RV32I-NEXT: or s1, s2, s1 -; RV32I-NEXT: or a1, a1, s7 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a6, t2, t1 -; RV32I-NEXT: or a7, t4, t3 -; RV32I-NEXT: or t0, a0, t5 -; RV32I-NEXT: or t1, s0, t6 -; RV32I-NEXT: or t2, s5, s3 -; RV32I-NEXT: or a0, a1, s1 -; RV32I-NEXT: sw a7, 48(sp) -; RV32I-NEXT: sw t0, 52(sp) -; RV32I-NEXT: sw t1, 56(sp) -; RV32I-NEXT: sw t2, 60(sp) -; RV32I-NEXT: sw a3, 32(sp) -; RV32I-NEXT: sw a4, 36(sp) -; RV32I-NEXT: sw a5, 40(sp) -; RV32I-NEXT: sw a6, 44(sp) +; RV32I-NEXT: or t4, t5, t4 +; RV32I-NEXT: or t5, s4, t6 +; RV32I-NEXT: or t6, s6, s5 +; RV32I-NEXT: or a3, s7, a3 +; RV32I-NEXT: or a1, a1, s8 +; RV32I-NEXT: lw s4, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a4, a4, s4 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a7, t2, t1 +; RV32I-NEXT: or t0, a0, t3 +; RV32I-NEXT: or t1, s2, s0 +; RV32I-NEXT: or t2, t4, s1 +; RV32I-NEXT: or t3, t6, t5 +; RV32I-NEXT: or a0, a1, a3 +; RV32I-NEXT: sw t0, 56(sp) +; RV32I-NEXT: sw t1, 60(sp) +; RV32I-NEXT: sw t2, 64(sp) +; RV32I-NEXT: sw t3, 68(sp) +; RV32I-NEXT: sw a4, 40(sp) +; RV32I-NEXT: sw a5, 44(sp) +; RV32I-NEXT: sw a6, 48(sp) +; RV32I-NEXT: sw a7, 52(sp) ; RV32I-NEXT: srli a1, a0, 3 ; RV32I-NEXT: andi a3, a0, 31 ; RV32I-NEXT: andi a4, a1, 28 ; RV32I-NEXT: xori a1, a3, 31 -; RV32I-NEXT: sub a3, s4, a4 +; RV32I-NEXT: sub a3, s3, a4 ; RV32I-NEXT: lw a4, 0(a3) ; RV32I-NEXT: lw a5, 4(a3) ; RV32I-NEXT: lw a6, 8(a3) @@ -2186,13 +2193,13 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli s5, a3, 24 ; RV32I-NEXT: srli s6, a3, 16 ; RV32I-NEXT: srli s7, a3, 8 +; RV32I-NEXT: srli s8, a1, 24 +; RV32I-NEXT: srli s9, a1, 16 ; RV32I-NEXT: sb a7, 24(a2) -; RV32I-NEXT: srli a7, a1, 24 ; RV32I-NEXT: sb t2, 25(a2) -; RV32I-NEXT: srli t2, a1, 16 ; RV32I-NEXT: sb t1, 26(a2) ; RV32I-NEXT: sb t0, 27(a2) -; RV32I-NEXT: srli t0, a1, 8 +; RV32I-NEXT: srli a7, a1, 8 ; RV32I-NEXT: sb a6, 28(a2) ; RV32I-NEXT: sb t5, 29(a2) ; RV32I-NEXT: sb t4, 30(a2) @@ -2213,26 +2220,27 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb s6, 10(a2) ; RV32I-NEXT: sb s5, 11(a2) ; RV32I-NEXT: sb a1, 12(a2) -; RV32I-NEXT: sb t0, 13(a2) -; RV32I-NEXT: sb t2, 14(a2) -; RV32I-NEXT: sb a7, 15(a2) 
+; RV32I-NEXT: sb a7, 13(a2) +; RV32I-NEXT: sb s9, 14(a2) +; RV32I-NEXT: sb s8, 15(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) ; RV32I-NEXT: sb a5, 6(a2) ; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 112 +; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 128 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %bitOff = load i256, ptr %bitOff.ptr, align 1 @@ -2475,24 +2483,25 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: ashr_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -112 -; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: addi sp, sp, -128 +; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill ; RV32I-NEXT: lbu a3, 0(a0) ; RV32I-NEXT: lbu a4, 1(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu a6, 2(a0) +; RV32I-NEXT: lbu a7, 3(a0) +; RV32I-NEXT: lbu a5, 4(a0) ; RV32I-NEXT: lbu t0, 5(a0) ; RV32I-NEXT: lbu t1, 6(a0) ; RV32I-NEXT: lbu t2, 7(a0) @@ -2509,98 +2518,100 @@ define void @ashr_32bytes(ptr 
%src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu s6, 18(a0) ; RV32I-NEXT: lbu s7, 19(a0) ; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a7, a7, 24 ; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: or a4, a7, a6 ; RV32I-NEXT: lbu s8, 20(a0) ; RV32I-NEXT: lbu s9, 21(a0) ; RV32I-NEXT: lbu s10, 22(a0) ; RV32I-NEXT: lbu s11, 23(a0) +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 ; RV32I-NEXT: slli t4, t4, 8 ; RV32I-NEXT: slli t5, t5, 16 ; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: or a5, t0, a5 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, t6, t5 +; RV32I-NEXT: lbu ra, 24(a0) +; RV32I-NEXT: lbu a3, 25(a0) +; RV32I-NEXT: lbu t4, 26(a0) +; RV32I-NEXT: lbu t5, 27(a0) ; RV32I-NEXT: slli s1, s1, 8 ; RV32I-NEXT: slli s2, s2, 16 ; RV32I-NEXT: slli s3, s3, 24 -; RV32I-NEXT: or a7, t4, t3 -; RV32I-NEXT: or t0, t6, t5 +; RV32I-NEXT: slli s5, s5, 8 ; RV32I-NEXT: or t1, s1, s0 ; RV32I-NEXT: or t2, s3, s2 -; RV32I-NEXT: lbu t6, 24(a0) -; RV32I-NEXT: lbu s0, 25(a0) -; RV32I-NEXT: lbu s1, 26(a0) -; RV32I-NEXT: lbu s2, 27(a0) -; RV32I-NEXT: slli s5, s5, 8 +; RV32I-NEXT: or t3, s5, s4 +; RV32I-NEXT: lbu t6, 28(a0) +; RV32I-NEXT: lbu s0, 29(a0) +; RV32I-NEXT: lbu s1, 30(a0) +; RV32I-NEXT: lbu a0, 31(a0) ; RV32I-NEXT: slli s6, s6, 16 ; RV32I-NEXT: slli s7, s7, 24 ; RV32I-NEXT: slli s9, s9, 8 -; RV32I-NEXT: or t3, s5, s4 -; RV32I-NEXT: or t4, s7, s6 -; RV32I-NEXT: or t5, s9, s8 -; RV32I-NEXT: lbu s3, 28(a0) -; RV32I-NEXT: lbu s4, 29(a0) -; RV32I-NEXT: lbu s5, 30(a0) -; RV32I-NEXT: lbu a0, 31(a0) ; RV32I-NEXT: slli s10, s10, 16 ; RV32I-NEXT: slli s11, s11, 24 -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli s2, s2, 24 -; RV32I-NEXT: or s6, s11, s10 -; RV32I-NEXT: or t6, s0, t6 -; RV32I-NEXT: or s0, s2, s1 -; RV32I-NEXT: lbu s1, 0(a1) -; RV32I-NEXT: lbu s2, 1(a1) +; RV32I-NEXT: or s2, s7, s6 +; RV32I-NEXT: or s3, s9, s8 +; RV32I-NEXT: or s4, s11, s10 +; RV32I-NEXT: lbu s5, 0(a1) +; RV32I-NEXT: lbu s6, 1(a1) ; RV32I-NEXT: lbu s7, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli s4, s4, 8 -; RV32I-NEXT: or s3, s4, s3 -; RV32I-NEXT: mv s4, sp -; RV32I-NEXT: slli s5, s5, 16 +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, ra +; RV32I-NEXT: addi s8, sp, 8 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli t5, t5, 24 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: slli s1, s1, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s6, s6, 8 ; RV32I-NEXT: slli s7, s7, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or s5, a0, s5 -; RV32I-NEXT: or s1, s2, s1 +; RV32I-NEXT: or t4, t5, t4 +; RV32I-NEXT: or t5, s0, t6 +; RV32I-NEXT: or s1, a0, s1 +; RV32I-NEXT: or t6, s6, s5 ; RV32I-NEXT: or a1, a1, s7 -; RV32I-NEXT: srai s2, a0, 31 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a6, t2, t1 -; RV32I-NEXT: or a7, t4, t3 -; RV32I-NEXT: or t0, s6, t5 -; RV32I-NEXT: or t1, s0, t6 -; RV32I-NEXT: or t2, s5, s3 -; RV32I-NEXT: or a0, a1, s1 -; RV32I-NEXT: sw s2, 48(sp) -; RV32I-NEXT: sw s2, 52(sp) -; RV32I-NEXT: sw s2, 56(sp) -; RV32I-NEXT: sw s2, 60(sp) -; RV32I-NEXT: sw s2, 32(sp) -; RV32I-NEXT: sw s2, 
36(sp) -; RV32I-NEXT: sw s2, 40(sp) -; RV32I-NEXT: sw s2, 44(sp) -; RV32I-NEXT: sw a7, 16(sp) -; RV32I-NEXT: sw t0, 20(sp) -; RV32I-NEXT: sw t1, 24(sp) -; RV32I-NEXT: sw t2, 28(sp) -; RV32I-NEXT: sw a3, 0(sp) -; RV32I-NEXT: sw a4, 4(sp) -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a6, 12(sp) +; RV32I-NEXT: srai s0, a0, 31 +; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a4, a4, a0 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a7, t2, t1 +; RV32I-NEXT: or t0, s2, t3 +; RV32I-NEXT: or t1, s4, s3 +; RV32I-NEXT: or a3, t4, a3 +; RV32I-NEXT: or t2, s1, t5 +; RV32I-NEXT: or a0, a1, t6 +; RV32I-NEXT: sw s0, 56(sp) +; RV32I-NEXT: sw s0, 60(sp) +; RV32I-NEXT: sw s0, 64(sp) +; RV32I-NEXT: sw s0, 68(sp) +; RV32I-NEXT: sw s0, 40(sp) +; RV32I-NEXT: sw s0, 44(sp) +; RV32I-NEXT: sw s0, 48(sp) +; RV32I-NEXT: sw s0, 52(sp) +; RV32I-NEXT: sw t0, 24(sp) +; RV32I-NEXT: sw t1, 28(sp) +; RV32I-NEXT: sw a3, 32(sp) +; RV32I-NEXT: sw t2, 36(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a6, 16(sp) +; RV32I-NEXT: sw a7, 20(sp) ; RV32I-NEXT: srli a1, a0, 3 ; RV32I-NEXT: andi a3, a0, 31 ; RV32I-NEXT: andi a4, a1, 28 ; RV32I-NEXT: xori a1, a3, 31 -; RV32I-NEXT: add a4, s4, a4 +; RV32I-NEXT: add a4, s8, a4 ; RV32I-NEXT: lw a3, 0(a4) ; RV32I-NEXT: lw a5, 4(a4) ; RV32I-NEXT: lw a6, 8(a4) @@ -2660,13 +2671,13 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli s5, a3, 24 ; RV32I-NEXT: srli s6, a3, 16 ; RV32I-NEXT: srli s7, a3, 8 +; RV32I-NEXT: srli s8, a1, 24 +; RV32I-NEXT: srli s9, a1, 16 ; RV32I-NEXT: sb a7, 24(a2) -; RV32I-NEXT: srli a7, a1, 24 ; RV32I-NEXT: sb t2, 25(a2) -; RV32I-NEXT: srli t2, a1, 16 ; RV32I-NEXT: sb t1, 26(a2) ; RV32I-NEXT: sb t0, 27(a2) -; RV32I-NEXT: srli t0, a1, 8 +; RV32I-NEXT: srli a7, a1, 8 ; RV32I-NEXT: sb a6, 16(a2) ; RV32I-NEXT: sb t5, 17(a2) ; RV32I-NEXT: sb t4, 18(a2) @@ -2687,26 +2698,27 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb s6, 14(a2) ; RV32I-NEXT: sb s5, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb t0, 1(a2) -; RV32I-NEXT: sb t2, 2(a2) -; RV32I-NEXT: sb a7, 3(a2) +; RV32I-NEXT: sb a7, 1(a2) +; RV32I-NEXT: sb s9, 2(a2) +; RV32I-NEXT: sb s8, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) ; RV32I-NEXT: sb a5, 6(a2) ; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 112 +; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 
88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 128 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %bitOff = load i256, ptr %bitOff.ptr, align 1 diff --git a/llvm/unittests/CodeGen/MFCommon.inc b/llvm/unittests/CodeGen/MFCommon.inc index 67759bd5c4632..2c4b1f36ffd23 100644 --- a/llvm/unittests/CodeGen/MFCommon.inc +++ b/llvm/unittests/CodeGen/MFCommon.inc @@ -50,8 +50,8 @@ public: const char *getRegPressureSetName(unsigned Idx) const override { return "bogus"; } - unsigned getRegPressureSetLimit(const MachineFunction &MF, - unsigned Idx) const override { + unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx, + bool RemoveReserved) const override { return 0; } const int * diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp index a6f87119aca5b..674925c1b2acd 100644 --- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp +++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp @@ -275,7 +275,8 @@ void RegisterInfoEmitter::EmitRegUnitPressure(raw_ostream &OS, OS << "// Get the register unit pressure limit for this dimension.\n" << "// This limit must be adjusted dynamically for reserved registers.\n" << "unsigned " << ClassName << "::\n" - << "getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const " + << "getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx, bool " + "RemoveReserved) const " "{\n" << " static const " << getMinimalTypeForRange(MaxRegUnitWeight, 32) << " PressureLimitTable[] = {\n"; @@ -1130,7 +1131,7 @@ void RegisterInfoEmitter::runTargetHeader(raw_ostream &OS) { << " unsigned getNumRegPressureSets() const override;\n" << " const char *getRegPressureSetName(unsigned Idx) const override;\n" << " unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned " - "Idx) const override;\n" + "Idx, bool RemoveReserved = true) const override;\n" << " const int *getRegClassPressureSets(" << "const TargetRegisterClass *RC) const override;\n" << " const int *getRegUnitPressureSets(" From 6b0461f0b6b90dcd983cf288220879d6c087e99d Mon Sep 17 00:00:00 2001 From: Wang Pengcheng Date: Tue, 3 Dec 2024 21:47:30 +0800 Subject: [PATCH 3/3] Revert "Test commit: add a parameter to keep reserved" This reverts commit e96f7f7898790da1fe9cdc5cd3be7e3ae8eb8705. 
--- .../include/llvm/CodeGen/TargetRegisterInfo.h | 4 +- llvm/lib/CodeGen/RegisterClassInfo.cpp | 3 +- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 3 +- llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 4 +- llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp | 8 +- llvm/lib/Target/RISCV/RISCVRegisterInfo.h | 4 +- llvm/test/CodeGen/RISCV/pr69586.ll | 844 ++--- .../RISCV/rvv/fixed-vectors-masked-scatter.ll | 78 +- .../RISCV/rvv/fixed-vectors-setcc-fp-vp.ll | 2104 ++++++----- .../RISCV/rvv/intrinsic-vector-match.ll | 472 ++- ...lar-shift-by-byte-multiple-legalization.ll | 3242 ++++++++--------- .../RISCV/wide-scalar-shift-legalization.ll | 646 ++-- llvm/unittests/CodeGen/MFCommon.inc | 4 +- llvm/utils/TableGen/RegisterInfoEmitter.cpp | 5 +- 14 files changed, 3606 insertions(+), 3815 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index eaed26e33c4eb..292fa3c94969b 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -914,10 +914,8 @@ class TargetRegisterInfo : public MCRegisterInfo { /// Get the register unit pressure limit for this dimension. /// This limit must be adjusted dynamically for reserved registers. - /// If RemoveReserved is true, the target should remove reserved registers. virtual unsigned getRegPressureSetLimit(const MachineFunction &MF, - unsigned Idx, - bool RemoveReserved = true) const = 0; + unsigned Idx) const = 0; /// Get the dimensions of register pressure impacted by this register class. /// Returns a -1 terminated array of pressure set IDs. diff --git a/llvm/lib/CodeGen/RegisterClassInfo.cpp b/llvm/lib/CodeGen/RegisterClassInfo.cpp index 0a33915ed1e40..9312bc03bc522 100644 --- a/llvm/lib/CodeGen/RegisterClassInfo.cpp +++ b/llvm/lib/CodeGen/RegisterClassInfo.cpp @@ -222,8 +222,7 @@ unsigned RegisterClassInfo::computePSetLimit(unsigned Idx) const { assert(RC && "Failed to find register class"); compute(RC); unsigned NAllocatableRegs = getNumAllocatableRegs(RC); - unsigned RegPressureSetLimit = - TRI->getRegPressureSetLimit(*MF, Idx, /*RemoveReserved=*/false); + unsigned RegPressureSetLimit = TRI->getRegPressureSetLimit(*MF, Idx); // If all the regs are reserved, return raw RegPressureSetLimit. // One example is VRSAVERC in PowerPC. 
// Avoid returning zero, getRegPressureSetLimit(Idx) assumes computePSetLimit diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 9883454ed7829..049f4af4dd2f9 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3640,8 +3640,7 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, } unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, - unsigned Idx, - bool RemoveReserved) const { + unsigned Idx) const { if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 || Idx == AMDGPU::RegisterPressureSets::AGPR_32) return getRegPressureLimit(&AMDGPU::VGPR_32RegClass, diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index b55f5f2c418b0..8e481e3ac2304 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -331,8 +331,8 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; - unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx, - bool RemoveReserved = true) const override; + unsigned getRegPressureSetLimit(const MachineFunction &MF, + unsigned Idx) const override; const int *getRegUnitPressureSets(unsigned RegUnit) const override; diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index d5a769b6c78c7..a73bd1621a739 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -936,12 +936,8 @@ bool RISCVRegisterInfo::getRegAllocationHints( } unsigned RISCVRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, - unsigned Idx, - bool RemoveReserved) const { + unsigned Idx) const { if (Idx == RISCV::RegisterPressureSets::GPRAll) { - if (!RemoveReserved) - return 32; - unsigned Reserved = 0; BitVector ReservedRegs = getReservedRegs(MF); for (MCPhysReg Reg = RISCV::X0_H; Reg <= RISCV::X31_H; Reg++) @@ -950,5 +946,5 @@ unsigned RISCVRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, return 32 - Reserved; } - return RISCVGenRegisterInfo::getRegPressureSetLimit(MF, Idx, RemoveReserved); + return RISCVGenRegisterInfo::getRegPressureSetLimit(MF, Idx); } diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h index 58f97394ec559..ca4934de2f52d 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h @@ -144,8 +144,8 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo { static bool isRVVRegClass(const TargetRegisterClass *RC) { return RISCVRI::isVRegClass(RC->TSFlags); } - unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx, - bool RemoveReserved = true) const override; + unsigned getRegPressureSetLimit(const MachineFunction &MF, + unsigned Idx) const override; }; } // namespace llvm diff --git a/llvm/test/CodeGen/RISCV/pr69586.ll b/llvm/test/CodeGen/RISCV/pr69586.ll index 8e6a7add781c9..21e64ada7061a 100644 --- a/llvm/test/CodeGen/RISCV/pr69586.ll +++ b/llvm/test/CodeGen/RISCV/pr69586.ll @@ -39,388 +39,384 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: slli a2, a2, 1 ; NOREMAT-NEXT: sub sp, sp, a2 ; NOREMAT-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xf0, 0x05, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 752 + 2 * vlenb -; NOREMAT-NEXT: li a7, 32 -; NOREMAT-NEXT: addi a6, a0, 512 -; 
NOREMAT-NEXT: addi a4, a0, 1024 -; NOREMAT-NEXT: addi a5, a0, 1536 -; NOREMAT-NEXT: li t0, 1 +; NOREMAT-NEXT: mv a7, a0 +; NOREMAT-NEXT: li a0, 32 +; NOREMAT-NEXT: addi a5, a7, 512 +; NOREMAT-NEXT: addi a4, a7, 1024 +; NOREMAT-NEXT: addi a6, a7, 1536 +; NOREMAT-NEXT: li t1, 1 ; NOREMAT-NEXT: li a3, 5 -; NOREMAT-NEXT: li t1, 3 +; NOREMAT-NEXT: li t0, 3 ; NOREMAT-NEXT: li a2, 7 ; NOREMAT-NEXT: lui t2, 1 -; NOREMAT-NEXT: li s4, 9 -; NOREMAT-NEXT: li s6, 11 -; NOREMAT-NEXT: li s9, 13 -; NOREMAT-NEXT: lui s7, 2 -; NOREMAT-NEXT: lui s1, 3 -; NOREMAT-NEXT: lui ra, 4 -; NOREMAT-NEXT: lui t3, 5 -; NOREMAT-NEXT: lui s0, 6 -; NOREMAT-NEXT: lui s3, 7 -; NOREMAT-NEXT: vsetvli zero, a7, e32, m2, ta, ma -; NOREMAT-NEXT: slli t0, t0, 11 -; NOREMAT-NEXT: sd t0, 504(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: slli t5, a3, 9 -; NOREMAT-NEXT: slli t6, t1, 10 -; NOREMAT-NEXT: slli s2, a2, 9 -; NOREMAT-NEXT: add a7, a0, t2 -; NOREMAT-NEXT: lui s11, 1 -; NOREMAT-NEXT: slli s4, s4, 9 -; NOREMAT-NEXT: slli s5, a3, 10 -; NOREMAT-NEXT: vle32.v v8, (a6) -; NOREMAT-NEXT: slli s6, s6, 9 -; NOREMAT-NEXT: slli s8, t1, 11 +; NOREMAT-NEXT: li s5, 9 +; NOREMAT-NEXT: li s8, 11 +; NOREMAT-NEXT: lui s1, 2 +; NOREMAT-NEXT: lui t5, 3 +; NOREMAT-NEXT: lui s11, 4 +; NOREMAT-NEXT: lui ra, 5 +; NOREMAT-NEXT: lui t3, 6 +; NOREMAT-NEXT: lui s0, 7 +; NOREMAT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOREMAT-NEXT: slli t4, t1, 11 +; NOREMAT-NEXT: slli t6, a3, 9 +; NOREMAT-NEXT: slli s2, t0, 10 +; NOREMAT-NEXT: slli s4, a2, 9 +; NOREMAT-NEXT: add a0, a7, t2 +; NOREMAT-NEXT: vle32.v v8, (a5) +; NOREMAT-NEXT: slli s5, s5, 9 ; NOREMAT-NEXT: vle32.v v10, (a4) ; NOREMAT-NEXT: vle32.v v2, (a4) -; NOREMAT-NEXT: slli s9, s9, 9 -; NOREMAT-NEXT: vle32.v v0, (a5) -; NOREMAT-NEXT: vle32.v v12, (a5) -; NOREMAT-NEXT: slli s10, a2, 10 -; NOREMAT-NEXT: vle32.v v4, (a7) -; NOREMAT-NEXT: vle32.v v20, (a7) -; NOREMAT-NEXT: add a4, a0, s7 +; NOREMAT-NEXT: slli s6, a3, 10 +; NOREMAT-NEXT: vle32.v v0, (a6) +; NOREMAT-NEXT: vle32.v v12, (a6) +; NOREMAT-NEXT: slli s8, s8, 9 +; NOREMAT-NEXT: slli s9, t0, 11 +; NOREMAT-NEXT: vle32.v v4, (a0) +; NOREMAT-NEXT: vle32.v v20, (a0) +; NOREMAT-NEXT: add a4, a7, s1 ; NOREMAT-NEXT: vle32.v v6, (a4) ; NOREMAT-NEXT: vle32.v v30, (a4) -; NOREMAT-NEXT: add a4, a0, s1 +; NOREMAT-NEXT: add a4, a7, t5 ; NOREMAT-NEXT: vle32.v v28, (a4) ; NOREMAT-NEXT: vle32.v v26, (a4) -; NOREMAT-NEXT: add a4, a0, ra +; NOREMAT-NEXT: add a4, a7, s11 ; NOREMAT-NEXT: vle32.v v24, (a4) ; NOREMAT-NEXT: vle32.v v22, (a4) -; NOREMAT-NEXT: add a4, a0, t3 -; NOREMAT-NEXT: vle32.v v14, (a0) +; NOREMAT-NEXT: add a4, a7, ra +; NOREMAT-NEXT: vle32.v v14, (a7) ; NOREMAT-NEXT: vle32.v v18, (a4) ; NOREMAT-NEXT: vle32.v v16, (a4) -; NOREMAT-NEXT: add a4, a0, s0 +; NOREMAT-NEXT: add a4, a7, t3 ; NOREMAT-NEXT: sf.vc.vv 3, 0, v14, v8 ; NOREMAT-NEXT: vle32.v v14, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v10 ; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: addi a4, sp, 640 -; NOREMAT-NEXT: vs2r.v v8, (a4) # Unknown-size Folded Spill -; NOREMAT-NEXT: add a4, a0, t0 +; NOREMAT-NEXT: addi a0, sp, 640 +; NOREMAT-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill +; NOREMAT-NEXT: add a4, a7, t4 ; NOREMAT-NEXT: vle32.v v10, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v0 ; NOREMAT-NEXT: vle32.v v2, (a4) -; NOREMAT-NEXT: add a4, a0, t5 +; NOREMAT-NEXT: add a4, a7, t6 ; NOREMAT-NEXT: vle32.v v0, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v10 ; NOREMAT-NEXT: vle32.v v10, (a4) -; NOREMAT-NEXT: add a4, a0, t6 +; NOREMAT-NEXT: add a4, a7, s2 ; NOREMAT-NEXT: vle32.v v12, (a4) ; 
NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v0 ; NOREMAT-NEXT: vle32.v v2, (a4) -; NOREMAT-NEXT: add a4, a0, s2 +; NOREMAT-NEXT: add a4, a7, s4 ; NOREMAT-NEXT: vle32.v v8, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 ; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: add a4, a0, s3 +; NOREMAT-NEXT: add a4, a7, s0 ; NOREMAT-NEXT: vle32.v v0, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v8 ; NOREMAT-NEXT: vle32.v v10, (a4) -; NOREMAT-NEXT: add a4, a0, s4 +; NOREMAT-NEXT: add a4, a7, s5 ; NOREMAT-NEXT: vle32.v v8, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 ; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: add a4, a0, s5 +; NOREMAT-NEXT: add a4, a7, s6 ; NOREMAT-NEXT: vle32.v v4, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v20, v8 ; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: add a4, a0, s6 +; NOREMAT-NEXT: add a4, a7, s8 ; NOREMAT-NEXT: vle32.v v20, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 ; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: add a4, a0, s8 +; NOREMAT-NEXT: add a4, a7, s9 ; NOREMAT-NEXT: vle32.v v4, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v20 ; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: add a4, a0, s9 +; NOREMAT-NEXT: li t5, 13 +; NOREMAT-NEXT: slli a4, t5, 9 +; NOREMAT-NEXT: sd a4, 624(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a4, a7, a4 ; NOREMAT-NEXT: vle32.v v20, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 ; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: add a4, a0, s10 +; NOREMAT-NEXT: slli a4, a2, 10 +; NOREMAT-NEXT: sd a4, 616(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a4, a7, a4 ; NOREMAT-NEXT: vle32.v v4, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v20 ; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: li t2, 15 -; NOREMAT-NEXT: slli a4, t2, 9 -; NOREMAT-NEXT: sd a4, 624(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a4, a0, a4 +; NOREMAT-NEXT: li a6, 15 +; NOREMAT-NEXT: slli a4, a6, 9 +; NOREMAT-NEXT: sd a4, 608(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a4, a7, a4 ; NOREMAT-NEXT: vle32.v v2, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 -; NOREMAT-NEXT: lui t4, 8 -; NOREMAT-NEXT: add a5, a0, t4 +; NOREMAT-NEXT: lui t1, 8 +; NOREMAT-NEXT: add a5, a7, t1 ; NOREMAT-NEXT: vle32.v v20, (a5) ; NOREMAT-NEXT: vle32.v v12, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v2 ; NOREMAT-NEXT: li a4, 17 ; NOREMAT-NEXT: slli a4, a4, 9 -; NOREMAT-NEXT: li s1, 17 -; NOREMAT-NEXT: sd a4, 616(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a4, a0, a4 +; NOREMAT-NEXT: li t2, 17 +; NOREMAT-NEXT: sd a4, 600(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a4, a7, a4 ; NOREMAT-NEXT: vle32.v v8, (a4) ; NOREMAT-NEXT: vle32.v v4, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v6 ; NOREMAT-NEXT: li a5, 9 ; NOREMAT-NEXT: slli a4, a5, 10 -; NOREMAT-NEXT: sd a4, 608(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a4, a0, a4 +; NOREMAT-NEXT: sd a4, 592(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a4, a7, a4 ; NOREMAT-NEXT: vle32.v v12, (a4) ; NOREMAT-NEXT: vle32.v v6, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8 ; NOREMAT-NEXT: li a4, 19 ; NOREMAT-NEXT: slli a4, a4, 9 -; NOREMAT-NEXT: li t1, 19 -; NOREMAT-NEXT: sd a4, 600(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a4, a0, a4 +; NOREMAT-NEXT: li s1, 19 +; NOREMAT-NEXT: sd a4, 584(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a4, a7, a4 ; NOREMAT-NEXT: vle32.v v8, (a4) ; NOREMAT-NEXT: vle32.v v30, (a4) ; NOREMAT-NEXT: slli a3, a3, 11 -; NOREMAT-NEXT: sd a3, 592(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd a3, 576(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v12 -; NOREMAT-NEXT: add a3, a0, a3 +; NOREMAT-NEXT: add a3, a7, a3 ; NOREMAT-NEXT: vle32.v 
v12, (a3) ; NOREMAT-NEXT: vle32.v v4, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8 ; NOREMAT-NEXT: li s7, 21 ; NOREMAT-NEXT: slli a3, s7, 9 -; NOREMAT-NEXT: sd a3, 584(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a0, a3 +; NOREMAT-NEXT: sd a3, 568(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a3, a7, a3 ; NOREMAT-NEXT: vle32.v v8, (a3) ; NOREMAT-NEXT: vle32.v v6, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12 -; NOREMAT-NEXT: li a6, 11 -; NOREMAT-NEXT: slli a3, a6, 10 -; NOREMAT-NEXT: sd a3, 576(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a0, a3 +; NOREMAT-NEXT: li a4, 11 +; NOREMAT-NEXT: slli a3, a4, 10 +; NOREMAT-NEXT: sd a3, 560(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a3, a7, a3 ; NOREMAT-NEXT: vle32.v v12, (a3) ; NOREMAT-NEXT: vle32.v v30, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v8 ; NOREMAT-NEXT: li s3, 23 -; NOREMAT-NEXT: slli a3, s3, 9 -; NOREMAT-NEXT: sd a3, 568(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a0, a3 +; NOREMAT-NEXT: slli s10, s3, 9 +; NOREMAT-NEXT: add a3, a7, s10 ; NOREMAT-NEXT: vle32.v v8, (a3) ; NOREMAT-NEXT: vle32.v v4, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v12 ; NOREMAT-NEXT: li s0, 25 ; NOREMAT-NEXT: slli a3, s0, 9 -; NOREMAT-NEXT: sd a3, 560(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a0, a3 +; NOREMAT-NEXT: sd a3, 552(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a3, a7, a3 ; NOREMAT-NEXT: vle32.v v12, (a3) ; NOREMAT-NEXT: vle32.v v6, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8 -; NOREMAT-NEXT: li a7, 13 -; NOREMAT-NEXT: slli a3, a7, 10 -; NOREMAT-NEXT: sd a3, 552(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a0, a3 +; NOREMAT-NEXT: slli a3, t5, 10 +; NOREMAT-NEXT: sd a3, 544(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a3, a7, a3 ; NOREMAT-NEXT: vle32.v v8, (a3) ; NOREMAT-NEXT: vle32.v v30, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v28 ; NOREMAT-NEXT: li t3, 27 ; NOREMAT-NEXT: slli a3, t3, 9 -; NOREMAT-NEXT: sd a3, 544(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a0, a3 +; NOREMAT-NEXT: sd a3, 536(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a3, a7, a3 ; NOREMAT-NEXT: vle32.v v28, (a3) ; NOREMAT-NEXT: vle32.v v4, (a3) ; NOREMAT-NEXT: slli a2, a2, 11 -; NOREMAT-NEXT: sd a2, 536(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd a2, 528(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v12 -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8 ; NOREMAT-NEXT: li t0, 29 ; NOREMAT-NEXT: slli a2, t0, 9 -; NOREMAT-NEXT: sd a2, 528(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: sd a2, 520(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v28 -; NOREMAT-NEXT: slli a2, t2, 10 -; NOREMAT-NEXT: sd a2, 520(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: li t2, 15 -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: slli a2, a6, 10 +; NOREMAT-NEXT: sd a2, 512(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v12 ; NOREMAT-NEXT: li a3, 31 -; NOREMAT-NEXT: slli a2, a3, 9 -; NOREMAT-NEXT: sd a2, 512(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a0, a2 -; NOREMAT-NEXT: vle32.v v12, (a2) -; NOREMAT-NEXT: vle32.v v4, (a2) +; NOREMAT-NEXT: slli a0, a3, 9 +; NOREMAT-NEXT: sd a0, 504(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a0, a7, a0 +; NOREMAT-NEXT: vle32.v v12, (a0) 
+; NOREMAT-NEXT: vle32.v v4, (a0) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v8 -; NOREMAT-NEXT: addiw a2, ra, 512 +; NOREMAT-NEXT: addiw a2, s11, 512 ; NOREMAT-NEXT: sd a2, 496(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v28 -; NOREMAT-NEXT: slli a2, s1, 10 +; NOREMAT-NEXT: slli a2, t2, 10 ; NOREMAT-NEXT: sd a2, 488(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12 -; NOREMAT-NEXT: addiw a2, ra, 1536 +; NOREMAT-NEXT: addiw a2, s11, 1536 ; NOREMAT-NEXT: sd a2, 480(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) ; NOREMAT-NEXT: slli a2, a5, 11 ; NOREMAT-NEXT: sd a2, 472(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v24 -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v24, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v8 -; NOREMAT-NEXT: lui a4, 5 -; NOREMAT-NEXT: addiw a2, a4, -1536 +; NOREMAT-NEXT: addiw a2, ra, -1536 ; NOREMAT-NEXT: sd a2, 464(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v28 -; NOREMAT-NEXT: slli a2, t1, 10 +; NOREMAT-NEXT: slli a2, s1, 10 ; NOREMAT-NEXT: sd a2, 456(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: li t1, 19 -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v12 -; NOREMAT-NEXT: addiw a2, a4, -512 +; NOREMAT-NEXT: addiw a2, ra, -512 ; NOREMAT-NEXT: sd a2, 448(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v24 -; NOREMAT-NEXT: addiw a2, a4, 512 +; NOREMAT-NEXT: addiw a2, ra, 512 ; NOREMAT-NEXT: sd a2, 440(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v24, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) ; NOREMAT-NEXT: slli a2, s7, 10 ; NOREMAT-NEXT: sd a2, 432(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v8 -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v26 -; NOREMAT-NEXT: addiw a2, a4, 1536 +; NOREMAT-NEXT: addiw a2, ra, 1536 ; NOREMAT-NEXT: sd a2, 424(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: vle32.v v26, (a2) -; NOREMAT-NEXT: slli a2, a6, 11 +; NOREMAT-NEXT: slli a2, a4, 11 ; NOREMAT-NEXT: sd a2, 416(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v12 -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v18 -; NOREMAT-NEXT: lui a5, 6 -; NOREMAT-NEXT: addiw a2, a5, -1536 +; NOREMAT-NEXT: lui a4, 6 +; NOREMAT-NEXT: addiw a2, a4, -1536 ; NOREMAT-NEXT: sd a2, 408(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v 
v18, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: slli a2, s3, 10 ; NOREMAT-NEXT: sd a2, 400(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v16, v24 -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v16, (a2) ; NOREMAT-NEXT: vle32.v v24, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8 -; NOREMAT-NEXT: addiw a2, a5, -512 +; NOREMAT-NEXT: addiw a2, a4, -512 ; NOREMAT-NEXT: sd a2, 392(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v22 -; NOREMAT-NEXT: addiw a2, a5, 512 +; NOREMAT-NEXT: addiw a2, a4, 512 ; NOREMAT-NEXT: sd a2, 384(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) ; NOREMAT-NEXT: slli a2, s0, 10 ; NOREMAT-NEXT: sd a2, 376(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v12 -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: vle32.v v2, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v18 -; NOREMAT-NEXT: addiw a2, a5, 1536 +; NOREMAT-NEXT: addiw a2, a4, 1536 ; NOREMAT-NEXT: sd a2, 368(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v18, (a2) ; NOREMAT-NEXT: vle32.v v28, (a2) -; NOREMAT-NEXT: slli a2, a7, 11 +; NOREMAT-NEXT: slli a2, t5, 11 ; NOREMAT-NEXT: sd a2, 360(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v16 -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v16, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v8 -; NOREMAT-NEXT: lui a7, 7 -; NOREMAT-NEXT: addiw a2, a7, -1536 +; NOREMAT-NEXT: lui a5, 7 +; NOREMAT-NEXT: addiw a2, a5, -1536 ; NOREMAT-NEXT: sd a2, 352(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v24, (a2) ; NOREMAT-NEXT: slli a2, t3, 10 ; NOREMAT-NEXT: sd a2, 344(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v14 -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v14, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) -; NOREMAT-NEXT: addi a2, sp, 640 -; NOREMAT-NEXT: vl2r.v v12, (a2) # Unknown-size Folded Reload +; NOREMAT-NEXT: addi a0, sp, 640 +; NOREMAT-NEXT: vl2r.v v12, (a0) # Unknown-size Folded Reload ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v22 -; NOREMAT-NEXT: addiw a2, a7, -512 +; NOREMAT-NEXT: addiw a2, a5, -512 ; NOREMAT-NEXT: sd a2, 336(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v26 -; NOREMAT-NEXT: addiw a2, a7, 512 +; NOREMAT-NEXT: addiw a2, a5, 512 ; NOREMAT-NEXT: sd a2, 328(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) ; NOREMAT-NEXT: slli a2, t0, 10 ; NOREMAT-NEXT: sd a2, 320(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v18 -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v18, (a2) ; NOREMAT-NEXT: vle32.v v2, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v16 -; NOREMAT-NEXT: addiw a2, a7, 1536 +; NOREMAT-NEXT: addiw a2, a5, 1536 ; NOREMAT-NEXT: sd a2, 312(sp) # 8-byte Folded Spill -; 
NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v16, (a2) ; NOREMAT-NEXT: vle32.v v28, (a2) -; NOREMAT-NEXT: slli a2, t2, 11 +; NOREMAT-NEXT: slli a2, a6, 11 ; NOREMAT-NEXT: sd a2, 304(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8 -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v14 -; NOREMAT-NEXT: addiw a2, t4, -1536 +; NOREMAT-NEXT: addiw a2, t1, -1536 ; NOREMAT-NEXT: sd a2, 296(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v14, (a2) ; NOREMAT-NEXT: vle32.v v24, (a2) ; NOREMAT-NEXT: slli a2, a3, 10 ; NOREMAT-NEXT: sd a2, 288(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v22 -; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) -; NOREMAT-NEXT: addiw a2, t4, -512 -; NOREMAT-NEXT: sd a2, 280(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a0, a0, a2 +; NOREMAT-NEXT: addiw a0, t1, -512 +; NOREMAT-NEXT: sd a0, 280(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a0, a7, a0 ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v0 ; NOREMAT-NEXT: vle32.v v12, (a0) ; NOREMAT-NEXT: vle32.v v0, (a0) @@ -435,32 +431,33 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: addi a0, a1, 1024 ; NOREMAT-NEXT: vse32.v v8, (a0) -; NOREMAT-NEXT: add s11, a1, s11 -; NOREMAT-NEXT: sd s11, 272(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: lui a0, 1 +; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: sd a0, 272(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: lui a0, 2 ; NOREMAT-NEXT: add a0, a1, a0 ; NOREMAT-NEXT: sd a0, 264(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: lui a0, 3 ; NOREMAT-NEXT: add a0, a1, a0 ; NOREMAT-NEXT: sd a0, 256(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add s11, a1, s11 +; NOREMAT-NEXT: sd s11, 248(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 248(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 240(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a4, a1, a4 -; NOREMAT-NEXT: sd a4, 240(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd a4, 232(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a5, a1, a5 -; NOREMAT-NEXT: sd a5, 232(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a7, a1, a7 -; NOREMAT-NEXT: sd a7, 224(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a0, a1, t4 +; NOREMAT-NEXT: sd a5, 224(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a0, a1, t1 ; NOREMAT-NEXT: sd a0, 216(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: addiw a0, t4, 512 +; NOREMAT-NEXT: addiw a0, t1, 512 ; NOREMAT-NEXT: sd a0, 192(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: addiw a0, t4, 1024 +; NOREMAT-NEXT: addiw a0, t1, 1024 ; NOREMAT-NEXT: sd a0, 176(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: addiw a0, t4, 1536 +; NOREMAT-NEXT: addiw a0, t1, 1536 ; NOREMAT-NEXT: sd a0, 160(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: slli s1, s1, 11 -; NOREMAT-NEXT: sd s1, 128(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: slli t2, t2, 11 +; NOREMAT-NEXT: sd t2, 128(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: lui a0, 9 ; NOREMAT-NEXT: addiw a2, a0, -1536 ; NOREMAT-NEXT: sd a2, 88(sp) # 8-byte Folded Spill @@ -473,7 +470,7 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: addiw s11, a0, 512 ; NOREMAT-NEXT: addiw s7, a0, 1024 ; NOREMAT-NEXT: addiw s3, a0, 1536 -; NOREMAT-NEXT: slli s1, t1, 11 +; NOREMAT-NEXT: slli s1, s1, 11 ; NOREMAT-NEXT: lui a0, 10 ; 
NOREMAT-NEXT: addiw t2, a0, -1536 ; NOREMAT-NEXT: addiw a7, a0, -1024 @@ -481,52 +478,52 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: add a2, a1, a0 ; NOREMAT-NEXT: sd a2, 200(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: addiw a0, a0, 512 -; NOREMAT-NEXT: ld a2, 504(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a2, a1, a2 -; NOREMAT-NEXT: add a3, a1, t5 -; NOREMAT-NEXT: add a5, a1, t6 -; NOREMAT-NEXT: add a6, a1, s2 -; NOREMAT-NEXT: add t0, a1, s4 -; NOREMAT-NEXT: add t1, a1, s5 -; NOREMAT-NEXT: add t3, a1, s6 -; NOREMAT-NEXT: add t4, a1, s8 -; NOREMAT-NEXT: add t5, a1, s9 -; NOREMAT-NEXT: add t6, a1, s10 -; NOREMAT-NEXT: ld s0, 624(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add a2, a1, t4 +; NOREMAT-NEXT: add a3, a1, t6 +; NOREMAT-NEXT: add a5, a1, s2 +; NOREMAT-NEXT: add a6, a1, s4 +; NOREMAT-NEXT: add t0, a1, s5 +; NOREMAT-NEXT: add t1, a1, s6 +; NOREMAT-NEXT: add t3, a1, s8 +; NOREMAT-NEXT: add t4, a1, s9 +; NOREMAT-NEXT: ld t5, 624(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add t5, a1, t5 +; NOREMAT-NEXT: ld t6, 616(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add t6, a1, t6 +; NOREMAT-NEXT: ld s0, 608(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s0, a1, s0 -; NOREMAT-NEXT: ld s2, 616(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s2, 600(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s2, a1, s2 -; NOREMAT-NEXT: ld s4, 608(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s4, 592(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s4, a1, s4 -; NOREMAT-NEXT: ld s5, 600(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s5, 584(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s5, a1, s5 -; NOREMAT-NEXT: ld s6, 592(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s6, 576(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s6, a1, s6 -; NOREMAT-NEXT: ld s8, 584(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s8, 568(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s8, a1, s8 -; NOREMAT-NEXT: ld s9, 576(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s9, 560(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s9, a1, s9 -; NOREMAT-NEXT: ld s10, 568(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s10, a1, s10 -; NOREMAT-NEXT: ld ra, 560(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld ra, 552(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra ; NOREMAT-NEXT: sd ra, 16(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: ld ra, 552(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld ra, 544(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra ; NOREMAT-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: ld ra, 544(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld ra, 536(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra ; NOREMAT-NEXT: sd ra, 32(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: ld ra, 536(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld ra, 528(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra ; NOREMAT-NEXT: sd ra, 48(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: ld ra, 528(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld ra, 520(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra ; NOREMAT-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: ld ra, 520(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld ra, 512(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra ; NOREMAT-NEXT: sd ra, 64(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: ld ra, 512(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld ra, 504(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra ; NOREMAT-NEXT: sd ra, 80(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 496(sp) # 8-byte Folded Reload @@ -920,10 
+917,9 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: .cfi_offset s10, -96 ; REMAT-NEXT: .cfi_offset s11, -104 ; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: li a3, 14 -; REMAT-NEXT: mul a2, a2, a3 +; REMAT-NEXT: slli a2, a2, 3 ; REMAT-NEXT: sub sp, sp, a2 -; REMAT-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xa0, 0x04, 0x22, 0x11, 0x0e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 544 + 14 * vlenb +; REMAT-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xa0, 0x04, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 544 + 8 * vlenb ; REMAT-NEXT: li a4, 32 ; REMAT-NEXT: addi a5, a0, 512 ; REMAT-NEXT: addi a3, a0, 1024 @@ -960,20 +956,13 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: slli s6, s6, 9 ; REMAT-NEXT: li s7, 5 ; REMAT-NEXT: slli s7, s7, 11 -; REMAT-NEXT: li s8, 21 -; REMAT-NEXT: slli s8, s8, 9 -; REMAT-NEXT: li s9, 11 -; REMAT-NEXT: slli s9, s9, 10 -; REMAT-NEXT: li s10, 23 -; REMAT-NEXT: slli s10, s10, 9 -; REMAT-NEXT: lui s11, 3 ; REMAT-NEXT: vsetvli zero, a4, e32, m2, ta, ma ; REMAT-NEXT: vle32.v v8, (a5) -; REMAT-NEXT: li a4, 25 +; REMAT-NEXT: li a4, 21 ; REMAT-NEXT: slli a4, a4, 9 ; REMAT-NEXT: vle32.v v10, (a3) ; REMAT-NEXT: vle32.v v12, (a3) -; REMAT-NEXT: li a3, 13 +; REMAT-NEXT: li a3, 11 ; REMAT-NEXT: slli a3, a3, 10 ; REMAT-NEXT: vle32.v v14, (a2) ; REMAT-NEXT: vle32.v v16, (a2) @@ -990,7 +979,7 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: vle32.v v30, (a2) ; REMAT-NEXT: vle32.v v6, (a2) ; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: li a5, 12 +; REMAT-NEXT: li a5, 6 ; REMAT-NEXT: mul a2, a2, a5 ; REMAT-NEXT: add a2, sp, a2 ; REMAT-NEXT: addi a2, a2, 432 @@ -1000,8 +989,7 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: vle32.v v2, (a2) ; REMAT-NEXT: vle32.v v6, (a2) ; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: li a5, 10 -; REMAT-NEXT: mul a2, a2, a5 +; REMAT-NEXT: slli a2, a2, 2 ; REMAT-NEXT: add a2, sp, a2 ; REMAT-NEXT: addi a2, a2, 432 ; REMAT-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill @@ -1015,16 +1003,11 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: sf.vc.vv 3, 0, v12, v14 ; REMAT-NEXT: vle32.v v0, (a2) ; REMAT-NEXT: add a2, a0, t5 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v16, v18 ; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: slli a2, a2, 3 -; REMAT-NEXT: add a2, sp, a2 -; REMAT-NEXT: addi a2, a2, 432 -; REMAT-NEXT: vs2r.v v8, (a2) # Unknown-size Folded Spill -; REMAT-NEXT: add a2, a0, t6 +; REMAT-NEXT: sf.vc.vv 3, 0, v16, v18 ; REMAT-NEXT: vle32.v v18, (a2) +; REMAT-NEXT: add a2, a0, t6 +; REMAT-NEXT: vle32.v v16, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v20, v22 ; REMAT-NEXT: vle32.v v20, (a2) ; REMAT-NEXT: add a2, a0, s0 @@ -1034,383 +1017,340 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: add a2, a0, s1 ; REMAT-NEXT: vle32.v v26, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v28, v30 -; REMAT-NEXT: vle32.v v28, (a2) +; REMAT-NEXT: vle32.v v14, (a2) ; REMAT-NEXT: add a2, a0, s2 -; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: vle32.v v12, (a2) ; REMAT-NEXT: csrr a5, vlenb -; REMAT-NEXT: li a6, 12 +; REMAT-NEXT: li a6, 6 ; REMAT-NEXT: mul a5, a5, a6 ; REMAT-NEXT: add a5, sp, a5 ; REMAT-NEXT: addi a5, a5, 432 -; REMAT-NEXT: vl2r.v v12, (a5) # Unknown-size Folded Reload -; REMAT-NEXT: sf.vc.vv 3, 0, v12, v2 +; REMAT-NEXT: vl2r.v v28, (a5) # Unknown-size Folded Reload +; REMAT-NEXT: sf.vc.vv 3, 0, v28, v2 ; REMAT-NEXT: vle32.v v2, (a2) ; REMAT-NEXT: add a2, a0, s3 -; REMAT-NEXT: vle32.v v12, (a2) +; REMAT-NEXT: vle32.v v28, (a2) ; REMAT-NEXT: 
csrr a5, vlenb -; REMAT-NEXT: li a6, 10 -; REMAT-NEXT: mul a5, a5, a6 +; REMAT-NEXT: slli a5, a5, 2 ; REMAT-NEXT: add a5, sp, a5 ; REMAT-NEXT: addi a5, a5, 432 -; REMAT-NEXT: vl2r.v v16, (a5) # Unknown-size Folded Reload -; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4 -; REMAT-NEXT: vle32.v v30, (a2) +; REMAT-NEXT: vl2r.v v30, (a5) # Unknown-size Folded Reload +; REMAT-NEXT: sf.vc.vv 3, 0, v30, v4 +; REMAT-NEXT: vle32.v v4, (a2) ; REMAT-NEXT: add a2, a0, s4 -; REMAT-NEXT: vle32.v v16, (a2) +; REMAT-NEXT: vle32.v v30, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v6, v10 -; REMAT-NEXT: vle32.v v6, (a2) -; REMAT-NEXT: add a2, a0, s5 ; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v0, v14 -; REMAT-NEXT: vle32.v v4, (a2) -; REMAT-NEXT: add a2, a0, s6 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: csrr a5, vlenb -; REMAT-NEXT: slli a5, a5, 3 -; REMAT-NEXT: add a5, sp, a5 -; REMAT-NEXT: addi a5, a5, 432 -; REMAT-NEXT: vl2r.v v0, (a5) # Unknown-size Folded Reload -; REMAT-NEXT: sf.vc.vv 3, 0, v0, v18 +; REMAT-NEXT: add a2, a0, s5 +; REMAT-NEXT: vle32.v v6, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v0, v8 ; REMAT-NEXT: vle32.v v0, (a2) -; REMAT-NEXT: add a2, a0, s7 +; REMAT-NEXT: add a2, a0, s6 +; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v18, v16 ; REMAT-NEXT: vle32.v v18, (a2) +; REMAT-NEXT: add a2, a0, s7 +; REMAT-NEXT: vle32.v v16, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v20, v22 -; REMAT-NEXT: vle32.v v20, (a2) -; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: slli a2, a2, 3 -; REMAT-NEXT: add a2, sp, a2 -; REMAT-NEXT: addi a2, a2, 432 -; REMAT-NEXT: vs2r.v v20, (a2) # Unknown-size Folded Spill -; REMAT-NEXT: add a2, a0, s8 +; REMAT-NEXT: vle32.v v22, (a2) +; REMAT-NEXT: add a2, a0, a4 ; REMAT-NEXT: vle32.v v20, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v24, v26 ; REMAT-NEXT: vle32.v v24, (a2) -; REMAT-NEXT: add a2, a0, s9 -; REMAT-NEXT: vle32.v v22, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v28, v8 -; REMAT-NEXT: vle32.v v26, (a2) -; REMAT-NEXT: add a2, a0, s10 -; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v2, v12 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: add a2, a0, s11 -; REMAT-NEXT: vle32.v v2, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v30, v16 -; REMAT-NEXT: vle32.v v16, (a2) ; REMAT-NEXT: addi a2, sp, 432 -; REMAT-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill -; REMAT-NEXT: add a2, a0, a4 -; REMAT-NEXT: vle32.v v16, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v6, v10 -; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: slli a2, a2, 1 -; REMAT-NEXT: add a2, sp, a2 -; REMAT-NEXT: addi a2, a2, 432 -; REMAT-NEXT: vs2r.v v10, (a2) # Unknown-size Folded Spill +; REMAT-NEXT: vs2r.v v24, (a2) # Unknown-size Folded Spill ; REMAT-NEXT: add a2, a0, a3 -; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v4, v14 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: li a3, 12 -; REMAT-NEXT: mul a2, a2, a3 -; REMAT-NEXT: add a2, sp, a2 -; REMAT-NEXT: addi a2, a2, 432 -; REMAT-NEXT: vs2r.v v14, (a2) # Unknown-size Folded Spill -; REMAT-NEXT: li a5, 27 +; REMAT-NEXT: vle32.v v24, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v14, v12 +; REMAT-NEXT: vle32.v v12, (a2) +; REMAT-NEXT: li a5, 23 ; REMAT-NEXT: slli a5, a5, 9 ; REMAT-NEXT: add a2, a0, a5 +; REMAT-NEXT: vle32.v v26, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v2, v28 ; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v0, v18 -; REMAT-NEXT: vle32.v v18, (a2) ; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: li a3, 10 +; REMAT-NEXT: li a3, 6 ; REMAT-NEXT: mul a2, a2, a3 ; REMAT-NEXT: add a2, sp, a2 ; REMAT-NEXT: 
addi a2, a2, 432 -; REMAT-NEXT: vs2r.v v18, (a2) # Unknown-size Folded Spill -; REMAT-NEXT: li ra, 7 -; REMAT-NEXT: slli ra, ra, 11 -; REMAT-NEXT: add a2, a0, ra +; REMAT-NEXT: vs2r.v v14, (a2) # Unknown-size Folded Spill +; REMAT-NEXT: lui s8, 3 +; REMAT-NEXT: add a2, a0, s8 ; REMAT-NEXT: vle32.v v28, (a2) -; REMAT-NEXT: csrr a3, vlenb -; REMAT-NEXT: slli a3, a3, 3 -; REMAT-NEXT: add a3, sp, a3 -; REMAT-NEXT: addi a3, a3, 432 -; REMAT-NEXT: vl2r.v v18, (a3) # Unknown-size Folded Reload -; REMAT-NEXT: sf.vc.vv 3, 0, v18, v20 -; REMAT-NEXT: vle32.v v18, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v4, v30 +; REMAT-NEXT: vle32.v v14, (a2) ; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: slli a2, a2, 3 +; REMAT-NEXT: slli a2, a2, 2 ; REMAT-NEXT: add a2, sp, a2 ; REMAT-NEXT: addi a2, a2, 432 -; REMAT-NEXT: vs2r.v v18, (a2) # Unknown-size Folded Spill -; REMAT-NEXT: li a2, 29 -; REMAT-NEXT: slli a2, a2, 9 -; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: vs2r.v v14, (a2) # Unknown-size Folded Spill +; REMAT-NEXT: li s9, 25 +; REMAT-NEXT: slli s9, s9, 9 +; REMAT-NEXT: add a2, a0, s9 ; REMAT-NEXT: vle32.v v30, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v24, v22 -; REMAT-NEXT: vle32.v v18, (a2) -; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: li a3, 6 -; REMAT-NEXT: mul a2, a2, a3 -; REMAT-NEXT: add a2, sp, a2 -; REMAT-NEXT: addi a2, a2, 432 -; REMAT-NEXT: vs2r.v v18, (a2) # Unknown-size Folded Spill -; REMAT-NEXT: li a2, 15 -; REMAT-NEXT: slli a2, a2, 10 -; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: sf.vc.vv 3, 0, v10, v6 +; REMAT-NEXT: vle32.v v14, (a2) +; REMAT-NEXT: li s10, 13 +; REMAT-NEXT: slli s10, s10, 10 +; REMAT-NEXT: add a2, a0, s10 ; REMAT-NEXT: vle32.v v6, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v26, v8 +; REMAT-NEXT: sf.vc.vv 3, 0, v0, v8 ; REMAT-NEXT: vle32.v v8, (a2) ; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: slli a2, a2, 2 +; REMAT-NEXT: slli a2, a2, 1 ; REMAT-NEXT: add a2, sp, a2 ; REMAT-NEXT: addi a2, a2, 432 ; REMAT-NEXT: vs2r.v v8, (a2) # Unknown-size Folded Spill -; REMAT-NEXT: li a2, 31 -; REMAT-NEXT: slli a2, a2, 9 -; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: li s11, 27 +; REMAT-NEXT: slli s11, s11, 9 +; REMAT-NEXT: add a2, a0, s11 ; REMAT-NEXT: vle32.v v4, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v12, v2 +; REMAT-NEXT: sf.vc.vv 3, 0, v18, v16 ; REMAT-NEXT: vle32.v v18, (a2) -; REMAT-NEXT: lui a2, 4 -; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: li ra, 7 +; REMAT-NEXT: slli ra, ra, 11 +; REMAT-NEXT: add a2, a0, ra ; REMAT-NEXT: vle32.v v2, (a2) -; REMAT-NEXT: addi a3, sp, 432 -; REMAT-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v16 +; REMAT-NEXT: sf.vc.vv 3, 0, v22, v20 ; REMAT-NEXT: vle32.v v20, (a2) -; REMAT-NEXT: lui a2, 4 -; REMAT-NEXT: addiw a2, a2, 512 +; REMAT-NEXT: li a2, 29 +; REMAT-NEXT: slli a2, a2, 9 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v0, (a2) -; REMAT-NEXT: csrr a3, vlenb -; REMAT-NEXT: slli a3, a3, 1 -; REMAT-NEXT: add a3, sp, a3 -; REMAT-NEXT: addi a3, a3, 432 +; REMAT-NEXT: addi a3, sp, 432 ; REMAT-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v10 +; REMAT-NEXT: sf.vc.vv 3, 0, v8, v24 ; REMAT-NEXT: vle32.v v22, (a2) -; REMAT-NEXT: li a2, 17 +; REMAT-NEXT: li a2, 15 ; REMAT-NEXT: slli a2, a2, 10 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v24, (a2) -; REMAT-NEXT: csrr a3, vlenb -; REMAT-NEXT: li a4, 12 -; REMAT-NEXT: mul a3, a3, a4 -; REMAT-NEXT: add a3, sp, a3 -; REMAT-NEXT: addi a3, a3, 432 -; REMAT-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 +; 
REMAT-NEXT: sf.vc.vv 3, 0, v12, v26 ; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: lui a2, 4 -; REMAT-NEXT: addiw a2, a2, 1536 +; REMAT-NEXT: li a2, 31 +; REMAT-NEXT: slli a2, a2, 9 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v26, (a2) ; REMAT-NEXT: csrr a3, vlenb -; REMAT-NEXT: li a4, 10 +; REMAT-NEXT: li a4, 6 ; REMAT-NEXT: mul a3, a3, a4 ; REMAT-NEXT: add a3, sp, a3 ; REMAT-NEXT: addi a3, a3, 432 ; REMAT-NEXT: vl2r.v v10, (a3) # Unknown-size Folded Reload ; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28 ; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: li a2, 9 -; REMAT-NEXT: slli a2, a2, 11 +; REMAT-NEXT: lui a2, 4 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v28, (a2) ; REMAT-NEXT: csrr a3, vlenb -; REMAT-NEXT: slli a3, a3, 3 +; REMAT-NEXT: slli a3, a3, 2 ; REMAT-NEXT: add a3, sp, a3 ; REMAT-NEXT: addi a3, a3, 432 ; REMAT-NEXT: vl2r.v v12, (a3) # Unknown-size Folded Reload ; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30 ; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: lui a2, 5 -; REMAT-NEXT: addiw a2, a2, -1536 +; REMAT-NEXT: lui a2, 4 +; REMAT-NEXT: addiw a2, a2, 512 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v30, (a2) -; REMAT-NEXT: csrr a3, vlenb -; REMAT-NEXT: li a4, 6 -; REMAT-NEXT: mul a3, a3, a4 -; REMAT-NEXT: add a3, sp, a3 -; REMAT-NEXT: addi a3, a3, 432 -; REMAT-NEXT: vl2r.v v14, (a3) # Unknown-size Folded Reload ; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6 ; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: li a2, 19 +; REMAT-NEXT: li a2, 17 ; REMAT-NEXT: slli a2, a2, 10 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v6, (a2) ; REMAT-NEXT: csrr a3, vlenb -; REMAT-NEXT: slli a3, a3, 2 +; REMAT-NEXT: slli a3, a3, 1 ; REMAT-NEXT: add a3, sp, a3 ; REMAT-NEXT: addi a3, a3, 432 ; REMAT-NEXT: vl2r.v v16, (a3) # Unknown-size Folded Reload ; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4 ; REMAT-NEXT: vle32.v v16, (a2) -; REMAT-NEXT: lui a2, 5 -; REMAT-NEXT: addiw a2, a2, -512 +; REMAT-NEXT: lui a2, 4 +; REMAT-NEXT: addiw a2, a2, 1536 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v4, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2 ; REMAT-NEXT: vle32.v v18, (a2) -; REMAT-NEXT: lui a2, 5 +; REMAT-NEXT: li a2, 9 +; REMAT-NEXT: slli a2, a2, 11 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v2, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0 ; REMAT-NEXT: vle32.v v20, (a2) ; REMAT-NEXT: lui a2, 5 -; REMAT-NEXT: addiw a2, a2, 512 +; REMAT-NEXT: addiw a2, a2, -1536 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v0, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24 ; REMAT-NEXT: vle32.v v22, (a2) -; REMAT-NEXT: li a2, 21 +; REMAT-NEXT: li a2, 19 ; REMAT-NEXT: slli a2, a2, 10 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v24, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26 ; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: lui s4, 5 -; REMAT-NEXT: addiw s4, s4, 1536 -; REMAT-NEXT: add a2, a0, s4 +; REMAT-NEXT: lui a2, 5 +; REMAT-NEXT: addiw a2, a2, -512 +; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v26, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28 ; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: li a2, 11 -; REMAT-NEXT: slli a2, a2, 11 +; REMAT-NEXT: lui a2, 5 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v28, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30 ; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: lui s3, 6 -; REMAT-NEXT: addiw s3, s3, -1536 -; REMAT-NEXT: add a2, a0, s3 +; REMAT-NEXT: lui a2, 5 +; REMAT-NEXT: addiw a2, a2, 512 +; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v30, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6 ; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: li s2, 23 -; REMAT-NEXT: slli s2, s2, 10 -; 
REMAT-NEXT: add a2, a0, s2 +; REMAT-NEXT: li a2, 21 +; REMAT-NEXT: slli a2, a2, 10 +; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v6, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4 ; REMAT-NEXT: vle32.v v16, (a2) -; REMAT-NEXT: lui a2, 6 -; REMAT-NEXT: addiw a2, a2, -512 +; REMAT-NEXT: lui a2, 5 +; REMAT-NEXT: addiw a2, a2, 1536 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v4, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2 ; REMAT-NEXT: vle32.v v18, (a2) -; REMAT-NEXT: lui a2, 6 +; REMAT-NEXT: li a2, 11 +; REMAT-NEXT: slli a2, a2, 11 ; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: lui s1, 6 ; REMAT-NEXT: vle32.v v2, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0 ; REMAT-NEXT: vle32.v v20, (a2) -; REMAT-NEXT: lui s0, 6 -; REMAT-NEXT: addiw s0, s0, 512 -; REMAT-NEXT: add a2, a0, s0 +; REMAT-NEXT: lui a2, 6 +; REMAT-NEXT: addiw a2, a2, -1536 +; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v0, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24 ; REMAT-NEXT: vle32.v v22, (a2) -; REMAT-NEXT: li a2, 25 +; REMAT-NEXT: li a2, 23 ; REMAT-NEXT: slli a2, a2, 10 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v24, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26 ; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: lui t6, 6 -; REMAT-NEXT: addiw t6, t6, 1536 -; REMAT-NEXT: add a2, a0, t6 +; REMAT-NEXT: lui a2, 6 +; REMAT-NEXT: addiw a2, a2, -512 +; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v26, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28 ; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: li t5, 13 -; REMAT-NEXT: slli t5, t5, 11 -; REMAT-NEXT: add a2, a0, t5 +; REMAT-NEXT: lui a2, 6 +; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: lui s1, 6 ; REMAT-NEXT: vle32.v v28, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30 ; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: lui a2, 7 -; REMAT-NEXT: addiw a2, a2, -1536 -; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: lui s0, 6 +; REMAT-NEXT: addiw s0, s0, 512 +; REMAT-NEXT: add a2, a0, s0 ; REMAT-NEXT: vle32.v v30, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6 ; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: li t4, 27 -; REMAT-NEXT: slli t4, t4, 10 -; REMAT-NEXT: add a2, a0, t4 +; REMAT-NEXT: li a2, 25 +; REMAT-NEXT: slli a2, a2, 10 +; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v6, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4 ; REMAT-NEXT: vle32.v v16, (a2) -; REMAT-NEXT: lui a2, 7 -; REMAT-NEXT: addiw a2, a2, -512 -; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: lui t6, 6 +; REMAT-NEXT: addiw t6, t6, 1536 +; REMAT-NEXT: add a2, a0, t6 ; REMAT-NEXT: vle32.v v4, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2 ; REMAT-NEXT: vle32.v v18, (a2) -; REMAT-NEXT: lui a2, 7 -; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: lui t3, 7 +; REMAT-NEXT: li t5, 13 +; REMAT-NEXT: slli t5, t5, 11 +; REMAT-NEXT: add a2, a0, t5 ; REMAT-NEXT: vle32.v v2, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0 ; REMAT-NEXT: vle32.v v20, (a2) -; REMAT-NEXT: lui t2, 7 -; REMAT-NEXT: addiw t2, t2, 512 -; REMAT-NEXT: add a2, a0, t2 +; REMAT-NEXT: lui a2, 7 +; REMAT-NEXT: addiw a2, a2, -1536 +; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v0, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24 ; REMAT-NEXT: vle32.v v22, (a2) -; REMAT-NEXT: li t1, 29 -; REMAT-NEXT: slli t1, t1, 10 -; REMAT-NEXT: add a2, a0, t1 +; REMAT-NEXT: li t4, 27 +; REMAT-NEXT: slli t4, t4, 10 +; REMAT-NEXT: add a2, a0, t4 ; REMAT-NEXT: vle32.v v24, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26 ; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: lui t0, 7 -; REMAT-NEXT: addiw t0, t0, 1536 -; REMAT-NEXT: add a2, a0, t0 +; REMAT-NEXT: lui a2, 7 +; REMAT-NEXT: addiw a2, a2, -512 +; REMAT-NEXT: add a2, a0, a2 
; REMAT-NEXT: vle32.v v26, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28 ; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: li a7, 15 -; REMAT-NEXT: slli a7, a7, 11 -; REMAT-NEXT: add a2, a0, a7 +; REMAT-NEXT: lui a2, 7 +; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: lui t3, 7 ; REMAT-NEXT: vle32.v v28, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30 ; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: lui a6, 8 -; REMAT-NEXT: addiw a6, a6, -1536 -; REMAT-NEXT: add a2, a0, a6 +; REMAT-NEXT: lui t2, 7 +; REMAT-NEXT: addiw t2, t2, 512 +; REMAT-NEXT: add a2, a0, t2 ; REMAT-NEXT: vle32.v v30, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6 ; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: li a4, 31 -; REMAT-NEXT: slli a4, a4, 10 -; REMAT-NEXT: add a2, a0, a4 +; REMAT-NEXT: li t1, 29 +; REMAT-NEXT: slli t1, t1, 10 +; REMAT-NEXT: add a2, a0, t1 ; REMAT-NEXT: vle32.v v6, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4 ; REMAT-NEXT: vle32.v v16, (a2) -; REMAT-NEXT: lui a3, 8 -; REMAT-NEXT: addiw a3, a3, -512 -; REMAT-NEXT: add a2, a0, a3 +; REMAT-NEXT: lui t0, 7 +; REMAT-NEXT: addiw t0, t0, 1536 +; REMAT-NEXT: add a2, a0, t0 ; REMAT-NEXT: vle32.v v4, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2 ; REMAT-NEXT: vle32.v v18, (a2) -; REMAT-NEXT: lui a2, 8 -; REMAT-NEXT: add a0, a0, a2 -; REMAT-NEXT: vle32.v v2, (a0) +; REMAT-NEXT: li a7, 15 +; REMAT-NEXT: slli a7, a7, 11 +; REMAT-NEXT: add a2, a0, a7 +; REMAT-NEXT: vle32.v v2, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0 +; REMAT-NEXT: vle32.v v20, (a2) +; REMAT-NEXT: lui a6, 8 +; REMAT-NEXT: addiw a6, a6, -1536 +; REMAT-NEXT: add a2, a0, a6 +; REMAT-NEXT: vle32.v v0, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24 +; REMAT-NEXT: vle32.v v22, (a2) +; REMAT-NEXT: li a4, 31 +; REMAT-NEXT: slli a4, a4, 10 +; REMAT-NEXT: add a2, a0, a4 +; REMAT-NEXT: vle32.v v24, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26 +; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: lui a3, 8 +; REMAT-NEXT: addiw a3, a3, -512 +; REMAT-NEXT: add a2, a0, a3 +; REMAT-NEXT: vle32.v v26, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28 +; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: lui a2, 8 +; REMAT-NEXT: add a0, a0, a2 +; REMAT-NEXT: vle32.v v28, (a0) ; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30 ; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6 ; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4 ; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2 +; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0 +; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24 +; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26 +; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28 ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: addi a0, a1, 1024 ; REMAT-NEXT: vse32.v v8, (a0) @@ -1457,41 +1397,36 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: slli a0, a0, 10 ; REMAT-NEXT: add a0, a1, a0 ; REMAT-NEXT: sd a0, 336(sp) # 8-byte Folded Spill -; REMAT-NEXT: li a0, 15 -; REMAT-NEXT: slli a0, a0, 9 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sd a0, 328(sp) # 8-byte Folded Spill -; REMAT-NEXT: lui a0, 2 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sd a0, 320(sp) # 8-byte Folded Spill -; REMAT-NEXT: li a0, 17 -; REMAT-NEXT: slli a0, a0, 9 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sd a0, 312(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s2, a1, s2 +; REMAT-NEXT: sd s2, 328(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s3, a1, s3 +; REMAT-NEXT: sd s3, 320(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s4, a1, s4 +; REMAT-NEXT: sd s4, 312(sp) # 8-byte Folded Spill ; REMAT-NEXT: add s5, a1, s5 ; REMAT-NEXT: sd s5, 304(sp) # 8-byte Folded Spill ; REMAT-NEXT: add s6, a1, s6 ; REMAT-NEXT: sd s6, 296(sp) # 8-byte Folded Spill ; REMAT-NEXT: add s7, a1, s7 ; REMAT-NEXT: sd s7, 
288(sp) # 8-byte Folded Spill -; REMAT-NEXT: add s8, a1, s8 -; REMAT-NEXT: sd s8, 280(sp) # 8-byte Folded Spill -; REMAT-NEXT: add s9, a1, s9 -; REMAT-NEXT: sd s9, 272(sp) # 8-byte Folded Spill -; REMAT-NEXT: add s10, a1, s10 -; REMAT-NEXT: sd s10, 264(sp) # 8-byte Folded Spill -; REMAT-NEXT: add s11, a1, s11 -; REMAT-NEXT: sd s11, 256(sp) # 8-byte Folded Spill -; REMAT-NEXT: li a0, 25 +; REMAT-NEXT: li a0, 21 ; REMAT-NEXT: slli a0, a0, 9 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sd a0, 248(sp) # 8-byte Folded Spill -; REMAT-NEXT: li a0, 13 +; REMAT-NEXT: sd a0, 280(sp) # 8-byte Folded Spill +; REMAT-NEXT: li a0, 11 ; REMAT-NEXT: slli a0, a0, 10 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sd a0, 240(sp) # 8-byte Folded Spill +; REMAT-NEXT: sd a0, 272(sp) # 8-byte Folded Spill ; REMAT-NEXT: add a5, a1, a5 -; REMAT-NEXT: sd a5, 232(sp) # 8-byte Folded Spill +; REMAT-NEXT: sd a5, 264(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s8, a1, s8 +; REMAT-NEXT: sd s8, 256(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s9, a1, s9 +; REMAT-NEXT: sd s9, 248(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s10, a1, s10 +; REMAT-NEXT: sd s10, 240(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s11, a1, s11 +; REMAT-NEXT: sd s11, 232(sp) # 8-byte Folded Spill ; REMAT-NEXT: add ra, a1, ra ; REMAT-NEXT: sd ra, 224(sp) # 8-byte Folded Spill ; REMAT-NEXT: li a0, 29 @@ -1548,16 +1483,22 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: slli a0, a0, 10 ; REMAT-NEXT: add a0, a1, a0 ; REMAT-NEXT: sd a0, 112(sp) # 8-byte Folded Spill -; REMAT-NEXT: add s4, a1, s4 -; REMAT-NEXT: sd s4, 104(sp) # 8-byte Folded Spill +; REMAT-NEXT: lui a0, 5 +; REMAT-NEXT: addiw a0, a0, 1536 +; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: sd a0, 104(sp) # 8-byte Folded Spill ; REMAT-NEXT: li a0, 11 ; REMAT-NEXT: slli a0, a0, 11 ; REMAT-NEXT: add a0, a1, a0 ; REMAT-NEXT: sd a0, 96(sp) # 8-byte Folded Spill -; REMAT-NEXT: add s3, a1, s3 -; REMAT-NEXT: sd s3, 88(sp) # 8-byte Folded Spill -; REMAT-NEXT: add s2, a1, s2 -; REMAT-NEXT: sd s2, 80(sp) # 8-byte Folded Spill +; REMAT-NEXT: lui a0, 6 +; REMAT-NEXT: addiw a0, a0, -1536 +; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: sd a0, 88(sp) # 8-byte Folded Spill +; REMAT-NEXT: li a0, 23 +; REMAT-NEXT: slli a0, a0, 10 +; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: sd a0, 80(sp) # 8-byte Folded Spill ; REMAT-NEXT: lui a0, 6 ; REMAT-NEXT: addiw a0, a0, -512 ; REMAT-NEXT: add a0, a1, a0 @@ -1854,8 +1795,7 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: csrr a0, vlenb -; REMAT-NEXT: li a1, 14 -; REMAT-NEXT: mul a0, a0, a1 +; REMAT-NEXT: slli a0, a0, 3 ; REMAT-NEXT: add sp, sp, a0 ; REMAT-NEXT: .cfi_def_cfa sp, 544 ; REMAT-NEXT: ld ra, 536(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll index 575a757149ebb..0b5856a7000dd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll @@ -5682,28 +5682,16 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; ; RV32ZVE32F-LABEL: mscatter_baseidx_v8i64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: addi sp, sp, -48 -; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 48 -; RV32ZVE32F-NEXT: sw s0, 44(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s1, 40(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s2, 36(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s3, 32(sp) # 
4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s4, 28(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s5, 24(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s6, 20(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s7, 16(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s8, 12(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s9, 8(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s2, 4(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s3, 0(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 ; RV32ZVE32F-NEXT: .cfi_offset s2, -12 ; RV32ZVE32F-NEXT: .cfi_offset s3, -16 -; RV32ZVE32F-NEXT: .cfi_offset s4, -20 -; RV32ZVE32F-NEXT: .cfi_offset s5, -24 -; RV32ZVE32F-NEXT: .cfi_offset s6, -28 -; RV32ZVE32F-NEXT: .cfi_offset s7, -32 -; RV32ZVE32F-NEXT: .cfi_offset s8, -36 -; RV32ZVE32F-NEXT: .cfi_offset s9, -40 ; RV32ZVE32F-NEXT: .cfi_remember_state ; RV32ZVE32F-NEXT: lw a3, 56(a0) ; RV32ZVE32F-NEXT: lw a4, 60(a0) @@ -5715,30 +5703,30 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; RV32ZVE32F-NEXT: lw t4, 28(a0) ; RV32ZVE32F-NEXT: lw t1, 32(a0) ; RV32ZVE32F-NEXT: lw t2, 36(a0) +; RV32ZVE32F-NEXT: lw t5, 0(a2) +; RV32ZVE32F-NEXT: lw t6, 8(a2) +; RV32ZVE32F-NEXT: lw s0, 16(a2) +; RV32ZVE32F-NEXT: lw s1, 24(a2) +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vmv.v.x v8, t5 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t6 +; RV32ZVE32F-NEXT: lw t5, 32(a2) +; RV32ZVE32F-NEXT: lw t6, 40(a2) +; RV32ZVE32F-NEXT: lw s2, 48(a2) +; RV32ZVE32F-NEXT: lw s3, 56(a2) +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s0 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s1 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t5 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t6 ; RV32ZVE32F-NEXT: lw s0, 8(a0) ; RV32ZVE32F-NEXT: lw s1, 12(a0) ; RV32ZVE32F-NEXT: lw t5, 16(a0) ; RV32ZVE32F-NEXT: lw t6, 20(a0) -; RV32ZVE32F-NEXT: lw s2, 32(a2) -; RV32ZVE32F-NEXT: lw s3, 40(a2) -; RV32ZVE32F-NEXT: lw s4, 48(a2) -; RV32ZVE32F-NEXT: lw s5, 56(a2) -; RV32ZVE32F-NEXT: lw s6, 0(a2) -; RV32ZVE32F-NEXT: lw s7, 8(a2) -; RV32ZVE32F-NEXT: lw s8, 16(a2) -; RV32ZVE32F-NEXT: lw s9, 24(a2) -; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vmv.v.x v8, s6 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s2 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a2, v0 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s7 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s8 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s9 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s2 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s3 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s4 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s5 ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 ; RV32ZVE32F-NEXT: andi s2, a2, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 @@ -5771,27 +5759,15 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; RV32ZVE32F-NEXT: sw a3, 0(a0) ; RV32ZVE32F-NEXT: sw a4, 4(a0) ; RV32ZVE32F-NEXT: .LBB51_9: # %else14 -; RV32ZVE32F-NEXT: lw s0, 44(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s1, 40(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s2, 36(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s3, 32(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s4, 28(sp) # 4-byte Folded Reload -; 
RV32ZVE32F-NEXT: lw s5, 24(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s6, 20(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s7, 16(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s8, 12(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s9, 8(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s2, 4(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s3, 0(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: .cfi_restore s0 ; RV32ZVE32F-NEXT: .cfi_restore s1 ; RV32ZVE32F-NEXT: .cfi_restore s2 ; RV32ZVE32F-NEXT: .cfi_restore s3 -; RV32ZVE32F-NEXT: .cfi_restore s4 -; RV32ZVE32F-NEXT: .cfi_restore s5 -; RV32ZVE32F-NEXT: .cfi_restore s6 -; RV32ZVE32F-NEXT: .cfi_restore s7 -; RV32ZVE32F-NEXT: .cfi_restore s8 -; RV32ZVE32F-NEXT: .cfi_restore s9 -; RV32ZVE32F-NEXT: addi sp, sp, 48 +; RV32ZVE32F-NEXT: addi sp, sp, 16 ; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 0 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB51_10: # %cond.store diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll index a11c02dd5e2cb..036fee6a13ca4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll @@ -1306,6 +1306,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: sb a0, 219(sp) ; ZVFHMIN32-NEXT: lh a0, 564(sp) ; ZVFHMIN32-NEXT: lh a1, 308(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 218(sp) +; ZVFHMIN32-NEXT: lh a0, 562(sp) +; ZVFHMIN32-NEXT: lh a1, 306(sp) ; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 7 ; ZVFHMIN32-NEXT: csrr a2, vlenb @@ -1358,86 +1364,82 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma ; ZVFHMIN32-NEXT: vslidedown.vi v26, v8, 15 -; ZVFHMIN32-NEXT: vslidedown.vi v20, v8, 14 -; ZVFHMIN32-NEXT: vslidedown.vi v28, v8, 13 -; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 12 -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: slli a2, a2, 1 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vslidedown.vi v28, v8, 14 +; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 13 +; ZVFHMIN32-NEXT: addi a2, sp, 848 ; ZVFHMIN32-NEXT: vs2r.v v10, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vslidedown.vi v4, v8, 11 -; ZVFHMIN32-NEXT: vslidedown.vi v2, v8, 10 -; ZVFHMIN32-NEXT: vslidedown.vi v30, v8, 9 -; ZVFHMIN32-NEXT: vslidedown.vi v22, v8, 8 -; ZVFHMIN32-NEXT: vmv.x.s t5, v16 +; ZVFHMIN32-NEXT: vslidedown.vi v6, v8, 12 +; ZVFHMIN32-NEXT: vslidedown.vi v2, v8, 11 +; ZVFHMIN32-NEXT: vslidedown.vi v22, v8, 10 +; ZVFHMIN32-NEXT: vslidedown.vi v20, v8, 9 +; ZVFHMIN32-NEXT: vslidedown.vi v18, v8, 8 +; ZVFHMIN32-NEXT: vmv.x.s a3, v16 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 218(sp) -; ZVFHMIN32-NEXT: lh a0, 562(sp) -; ZVFHMIN32-NEXT: lh a1, 306(sp) +; ZVFHMIN32-NEXT: sb a0, 217(sp) +; ZVFHMIN32-NEXT: lh a0, 560(sp) +; ZVFHMIN32-NEXT: lh a1, 304(sp) ; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; ZVFHMIN32-NEXT: vslidedown.vi v3, v16, 7 -; ZVFHMIN32-NEXT: vslidedown.vi v31, v16, 6 -; ZVFHMIN32-NEXT: vslidedown.vi 
v5, v16, 5 +; ZVFHMIN32-NEXT: vslidedown.vi v21, v16, 7 +; ZVFHMIN32-NEXT: vslidedown.vi v3, v16, 6 +; ZVFHMIN32-NEXT: vslidedown.vi v19, v16, 5 ; ZVFHMIN32-NEXT: vslidedown.vi v23, v16, 4 ; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 3 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 18 -; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: li a4, 10 +; ZVFHMIN32-NEXT: mul a2, a2, a4 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 ; ZVFHMIN32-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 2 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 22 -; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: slli a2, a2, 4 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 ; ZVFHMIN32-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 1 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 21 -; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: slli a4, a2, 4 +; ZVFHMIN32-NEXT: sub a2, a4, a2 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 ; ZVFHMIN32-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma -; ZVFHMIN32-NEXT: vslidedown.vi v18, v16, 15 -; ZVFHMIN32-NEXT: vslidedown.vi v14, v16, 14 -; ZVFHMIN32-NEXT: vslidedown.vi v12, v16, 13 -; ZVFHMIN32-NEXT: vslidedown.vi v10, v16, 12 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 11 -; ZVFHMIN32-NEXT: vslidedown.vi v6, v16, 10 +; ZVFHMIN32-NEXT: vslidedown.vi v14, v16, 15 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 14 +; ZVFHMIN32-NEXT: vslidedown.vi v10, v16, 13 +; ZVFHMIN32-NEXT: vslidedown.vi v12, v16, 12 +; ZVFHMIN32-NEXT: vslidedown.vi v30, v16, 11 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 19 -; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: slli a4, a2, 4 +; ZVFHMIN32-NEXT: add a2, a4, a2 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vslidedown.vi v6, v16, 9 +; ZVFHMIN32-NEXT: vs2r.v v30, (a2) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v30, v16, 10 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 14 -; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: li a4, 11 +; ZVFHMIN32-NEXT: mul a2, a2, a4 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vslidedown.vi v6, v16, 8 +; ZVFHMIN32-NEXT: vs2r.v v30, (a2) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v4, v16, 9 +; ZVFHMIN32-NEXT: vslidedown.vi v30, v16, 8 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 217(sp) -; ZVFHMIN32-NEXT: lh a0, 560(sp) -; ZVFHMIN32-NEXT: lh a1, 304(sp) +; ZVFHMIN32-NEXT: sb a0, 216(sp) +; ZVFHMIN32-NEXT: lh a0, 558(sp) +; ZVFHMIN32-NEXT: lh a1, 302(sp) ; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; ZVFHMIN32-NEXT: vslidedown.vi v9, v0, 7 -; ZVFHMIN32-NEXT: vslidedown.vi v11, v0, 6 -; ZVFHMIN32-NEXT: vslidedown.vi v13, v0, 5 +; ZVFHMIN32-NEXT: vslidedown.vi v11, v0, 7 +; ZVFHMIN32-NEXT: vslidedown.vi v7, v0, 6 +; ZVFHMIN32-NEXT: vslidedown.vi v9, v0, 5 ; ZVFHMIN32-NEXT: vslidedown.vi v29, v0, 4 -; ZVFHMIN32-NEXT: vslidedown.vi v27, v0, 3 -; ZVFHMIN32-NEXT: vslidedown.vi v7, v0, 2 -; ZVFHMIN32-NEXT: vslidedown.vi v21, v0, 1 +; ZVFHMIN32-NEXT: vslidedown.vi v31, v0, 3 +; ZVFHMIN32-NEXT: vslidedown.vi v5, v0, 2 +; ZVFHMIN32-NEXT: 
vslidedown.vi v27, v0, 1 ; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma ; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 15 ; ZVFHMIN32-NEXT: csrr a2, vlenb @@ -1447,99 +1449,88 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 14 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: slli a2, a2, 3 +; ZVFHMIN32-NEXT: slli a2, a2, 1 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 ; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 13 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 6 -; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: li a4, 6 +; ZVFHMIN32-NEXT: mul a2, a2, a4 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 ; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 12 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 12 -; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: slli a2, a2, 3 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 ; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 11 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 10 -; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: li a4, 13 +; ZVFHMIN32-NEXT: mul a2, a2, a4 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 ; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 10 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: slli a2, a2, 4 +; ZVFHMIN32-NEXT: li a4, 19 +; ZVFHMIN32-NEXT: mul a2, a2, a4 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 ; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 9 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a4, 21 +; ZVFHMIN32-NEXT: mul a2, a2, a4 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vslidedown.vi v0, v0, 8 -; ZVFHMIN32-NEXT: addi a2, sp, 848 -; ZVFHMIN32-NEXT: vs2r.v v0, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vmv.x.s t4, v26 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 216(sp) -; ZVFHMIN32-NEXT: lh a0, 558(sp) -; ZVFHMIN32-NEXT: lh a1, 302(sp) -; ZVFHMIN32-NEXT: vmv.x.s t3, v20 -; ZVFHMIN32-NEXT: vmv.x.s t1, v28 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 215(sp) ; ZVFHMIN32-NEXT: lh a0, 556(sp) ; ZVFHMIN32-NEXT: lh a1, 300(sp) -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: slli a2, a2, 1 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vl2r.v v0, (a2) # Unknown-size Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s t2, v0 -; ZVFHMIN32-NEXT: vmv.x.s t0, v4 +; ZVFHMIN32-NEXT: vmv.x.s t3, v26 +; ZVFHMIN32-NEXT: vmv.x.s t2, v28 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 214(sp) ; ZVFHMIN32-NEXT: lh a0, 554(sp) ; ZVFHMIN32-NEXT: lh a1, 298(sp) -; ZVFHMIN32-NEXT: vmv.x.s a7, v2 -; ZVFHMIN32-NEXT: vmv.x.s a6, v30 +; ZVFHMIN32-NEXT: addi a2, sp, 848 +; ZVFHMIN32-NEXT: vl2r.v v16, (a2) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s t1, 
v16 +; ZVFHMIN32-NEXT: vmv.x.s t0, v6 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 213(sp) ; ZVFHMIN32-NEXT: lh a0, 552(sp) ; ZVFHMIN32-NEXT: lh a1, 296(sp) -; ZVFHMIN32-NEXT: vmv.x.s a2, v22 -; ZVFHMIN32-NEXT: sw a2, 104(sp) # 4-byte Folded Spill -; ZVFHMIN32-NEXT: vmv.x.s a2, v18 -; ZVFHMIN32-NEXT: sw a2, 108(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: vmv.x.s a7, v2 +; ZVFHMIN32-NEXT: vmv.x.s a6, v22 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 212(sp) ; ZVFHMIN32-NEXT: lh a0, 550(sp) ; ZVFHMIN32-NEXT: lh a1, 294(sp) -; ZVFHMIN32-NEXT: vmv.x.s a2, v14 -; ZVFHMIN32-NEXT: sw a2, 112(sp) # 4-byte Folded Spill -; ZVFHMIN32-NEXT: vmv.x.s a2, v12 -; ZVFHMIN32-NEXT: sw a2, 116(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: vmv.x.s a5, v20 +; ZVFHMIN32-NEXT: vmv.x.s a2, v18 +; ZVFHMIN32-NEXT: sw a2, 108(sp) # 4-byte Folded Spill ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 211(sp) ; ZVFHMIN32-NEXT: lh a0, 548(sp) ; ZVFHMIN32-NEXT: lh a1, 292(sp) -; ZVFHMIN32-NEXT: vmv.x.s a2, v10 -; ZVFHMIN32-NEXT: sw a2, 120(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: vmv.x.s a2, v14 +; ZVFHMIN32-NEXT: sw a2, 116(sp) # 4-byte Folded Spill ; ZVFHMIN32-NEXT: vmv.x.s a2, v8 ; ZVFHMIN32-NEXT: sw a2, 124(sp) # 4-byte Folded Spill ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 @@ -1548,33 +1539,27 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: sb a0, 210(sp) ; ZVFHMIN32-NEXT: lh a0, 546(sp) ; ZVFHMIN32-NEXT: lh a1, 290(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, t5 -; ZVFHMIN32-NEXT: vmv.x.s t5, v24 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a3 +; ZVFHMIN32-NEXT: vmv.x.s a3, v24 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa3, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa4, fa3 ; ZVFHMIN32-NEXT: sb a0, 209(sp) ; ZVFHMIN32-NEXT: lh a0, 544(sp) ; ZVFHMIN32-NEXT: lh a1, 288(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, t5 -; ZVFHMIN32-NEXT: feq.h t5, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb t5, 192(sp) +; ZVFHMIN32-NEXT: sb a3, 192(sp) ; ZVFHMIN32-NEXT: sb a0, 208(sp) ; ZVFHMIN32-NEXT: lh a0, 738(sp) ; ZVFHMIN32-NEXT: lh a1, 482(sp) -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 29 -; ZVFHMIN32-NEXT: mul a2, a2, a3 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: lh s7, 848(a2) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 28 -; ZVFHMIN32-NEXT: mul a2, a2, a3 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: lh s4, 848(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s a2, v10 +; ZVFHMIN32-NEXT: sw a2, 112(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: vmv.x.s a2, v12 +; ZVFHMIN32-NEXT: sw a2, 120(sp) # 4-byte Folded Spill ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 @@ -1582,15 +1567,15 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: lh a0, 736(sp) ; ZVFHMIN32-NEXT: lh a1, 480(sp) ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 27 +; ZVFHMIN32-NEXT: li a3, 29 ; ZVFHMIN32-NEXT: mul a2, a2, a3 ; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: lh s8, 848(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: lh s5, 
848(a2) # 8-byte Folded Reload ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 26 +; ZVFHMIN32-NEXT: li a3, 28 ; ZVFHMIN32-NEXT: mul a2, a2, a3 ; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: lh s5, 848(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: lh s2, 848(a2) # 8-byte Folded Reload ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 @@ -1598,15 +1583,15 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: lh a0, 734(sp) ; ZVFHMIN32-NEXT: lh a1, 478(sp) ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 25 +; ZVFHMIN32-NEXT: li a3, 27 ; ZVFHMIN32-NEXT: mul a2, a2, a3 ; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: lh s9, 848(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: lh s6, 848(a2) # 8-byte Folded Reload ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 24 +; ZVFHMIN32-NEXT: li a3, 26 ; ZVFHMIN32-NEXT: mul a2, a2, a3 ; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: lh s6, 848(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: lh s3, 848(a2) # 8-byte Folded Reload ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 @@ -1614,138 +1599,148 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: lh a0, 732(sp) ; ZVFHMIN32-NEXT: lh a1, 476(sp) ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 23 +; ZVFHMIN32-NEXT: li a3, 25 ; ZVFHMIN32-NEXT: mul a2, a2, a3 ; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: lh s3, 848(a2) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s t5, v3 +; ZVFHMIN32-NEXT: lh s7, 848(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 24 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: lh s4, 848(a2) # 8-byte Folded Reload ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 174(sp) ; ZVFHMIN32-NEXT: lh a0, 730(sp) ; ZVFHMIN32-NEXT: lh a1, 474(sp) -; ZVFHMIN32-NEXT: vmv.x.s s2, v31 -; ZVFHMIN32-NEXT: vmv.x.s t6, v5 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 23 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: lh s8, 848(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s t4, v21 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 173(sp) -; ZVFHMIN32-NEXT: lh a1, 728(sp) -; ZVFHMIN32-NEXT: lh s10, 472(sp) -; ZVFHMIN32-NEXT: vmv.x.s a3, v9 -; ZVFHMIN32-NEXT: vmv.x.s a4, v11 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, s10 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: sb a1, 172(sp) -; ZVFHMIN32-NEXT: lh a1, 726(sp) -; ZVFHMIN32-NEXT: lh s10, 470(sp) -; ZVFHMIN32-NEXT: vmv.x.s a2, v13 -; ZVFHMIN32-NEXT: vmv.x.s s11, v29 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, s10 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: sb a1, 171(sp) -; ZVFHMIN32-NEXT: lh ra, 724(sp) -; ZVFHMIN32-NEXT: lh a0, 468(sp) -; ZVFHMIN32-NEXT: vmv.x.s a5, v27 -; ZVFHMIN32-NEXT: vmv.x.s s10, v7 -; ZVFHMIN32-NEXT: fmv.h.x fa5, ra -; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 +; ZVFHMIN32-NEXT: lh a0, 728(sp) +; ZVFHMIN32-NEXT: lh a1, 472(sp) +; ZVFHMIN32-NEXT: vmv.x.s t6, v3 +; ZVFHMIN32-NEXT: vmv.x.s t5, v19 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; 
ZVFHMIN32-NEXT: sb a0, 172(sp) +; ZVFHMIN32-NEXT: lh a0, 726(sp) +; ZVFHMIN32-NEXT: lh a1, 470(sp) +; ZVFHMIN32-NEXT: vmv.x.s s10, v11 +; ZVFHMIN32-NEXT: vmv.x.s s11, v7 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 171(sp) +; ZVFHMIN32-NEXT: lh a0, 724(sp) +; ZVFHMIN32-NEXT: lh s9, 468(sp) +; ZVFHMIN32-NEXT: vmv.x.s a4, v9 +; ZVFHMIN32-NEXT: vmv.x.s ra, v29 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, s9 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 170(sp) ; ZVFHMIN32-NEXT: lh a0, 722(sp) ; ZVFHMIN32-NEXT: lh a1, 466(sp) -; ZVFHMIN32-NEXT: vmv.x.s ra, v21 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s7 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa4, fa3 +; ZVFHMIN32-NEXT: vmv.x.s s9, v31 +; ZVFHMIN32-NEXT: vmv.x.s a3, v5 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 169(sp) ; ZVFHMIN32-NEXT: lh a0, 720(sp) ; ZVFHMIN32-NEXT: lh a1, 464(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, s4 -; ZVFHMIN32-NEXT: fmv.h.x fa3, s8 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa1, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 +; ZVFHMIN32-NEXT: vmv.x.s a2, v27 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s5 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa4, fa3 ; ZVFHMIN32-NEXT: sb a0, 168(sp) ; ZVFHMIN32-NEXT: lh a0, 718(sp) ; ZVFHMIN32-NEXT: lh a1, 462(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa2, s5 -; ZVFHMIN32-NEXT: fmv.h.x fa1, s9 -; ZVFHMIN32-NEXT: fmv.h.x fa0, a0 -; ZVFHMIN32-NEXT: fmv.h.x ft0, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa0, ft0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, s2 +; ZVFHMIN32-NEXT: fmv.h.x fa3, s6 +; ZVFHMIN32-NEXT: fmv.h.x fa2, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa1, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN32-NEXT: sb a0, 167(sp) ; ZVFHMIN32-NEXT: lh a0, 716(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa0, s6 ; ZVFHMIN32-NEXT: lh a1, 460(sp) -; ZVFHMIN32-NEXT: fmv.h.x ft0, a3 +; ZVFHMIN32-NEXT: fmv.h.x fa2, s3 +; ZVFHMIN32-NEXT: fmv.h.x fa1, s7 +; ZVFHMIN32-NEXT: fmv.h.x fa0, a0 +; ZVFHMIN32-NEXT: fmv.h.x ft0, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa0, ft0 +; ZVFHMIN32-NEXT: sb a0, 166(sp) +; ZVFHMIN32-NEXT: lh a0, 714(sp) +; ZVFHMIN32-NEXT: lh a1, 458(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa0, s4 +; ZVFHMIN32-NEXT: fmv.h.x ft0, s8 ; ZVFHMIN32-NEXT: fmv.h.x ft1, a0 -; ZVFHMIN32-NEXT: feq.h a0, fa5, ft0 +; ZVFHMIN32-NEXT: fmv.h.x ft2, a1 +; ZVFHMIN32-NEXT: feq.h a0, ft1, ft2 +; ZVFHMIN32-NEXT: sb a0, 165(sp) +; ZVFHMIN32-NEXT: lh a0, 712(sp) +; ZVFHMIN32-NEXT: lh a1, 456(sp) +; ZVFHMIN32-NEXT: fmv.h.x ft1, s10 +; ZVFHMIN32-NEXT: fmv.h.x ft2, s11 +; ZVFHMIN32-NEXT: fmv.h.x ft3, a0 +; ZVFHMIN32-NEXT: fmv.h.x ft4, a1 +; ZVFHMIN32-NEXT: feq.h a0, ft3, ft4 +; ZVFHMIN32-NEXT: sb a0, 164(sp) +; ZVFHMIN32-NEXT: lh a0, 710(sp) +; ZVFHMIN32-NEXT: fmv.h.x ft3, a4 +; ZVFHMIN32-NEXT: lh a1, 454(sp) +; ZVFHMIN32-NEXT: fmv.h.x ft4, ra +; ZVFHMIN32-NEXT: fmv.h.x ft5, a0 +; ZVFHMIN32-NEXT: feq.h a0, fa5, ft1 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: feq.h a1, ft1, fa5 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN32-NEXT: sb a1, 166(sp) -; ZVFHMIN32-NEXT: lh a1, 714(sp) -; ZVFHMIN32-NEXT: fmv.h.x ft0, a2 -; ZVFHMIN32-NEXT: lh a2, 458(sp) -; ZVFHMIN32-NEXT: feq.h a3, fa4, fa5 +; ZVFHMIN32-NEXT: feq.h a1, ft5, fa5 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a3 +; ZVFHMIN32-NEXT: sb a1, 163(sp) +; ZVFHMIN32-NEXT: lh a1, 708(sp) +; ZVFHMIN32-NEXT: 
fmv.h.x ft1, a2 +; ZVFHMIN32-NEXT: lh a2, 452(sp) +; ZVFHMIN32-NEXT: feq.h a3, fa0, fa5 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: feq.h a1, fa3, ft0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s3 -; ZVFHMIN32-NEXT: sb a2, 165(sp) -; ZVFHMIN32-NEXT: lh a2, 712(sp) -; ZVFHMIN32-NEXT: lh a4, 456(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, s11 -; ZVFHMIN32-NEXT: feq.h s3, fa2, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a4 -; ZVFHMIN32-NEXT: feq.h a2, fa4, fa3 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN32-NEXT: sb a2, 164(sp) -; ZVFHMIN32-NEXT: lh a2, 710(sp) -; ZVFHMIN32-NEXT: lh a4, 454(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa3, s10 -; ZVFHMIN32-NEXT: feq.h a5, fa1, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a4 -; ZVFHMIN32-NEXT: feq.h a2, fa4, fa2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, ra -; ZVFHMIN32-NEXT: sb a2, 163(sp) -; ZVFHMIN32-NEXT: lh a2, 708(sp) -; ZVFHMIN32-NEXT: lh a4, 452(sp) -; ZVFHMIN32-NEXT: feq.h s4, fa0, fa3 -; ZVFHMIN32-NEXT: feq.h s5, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a1, ft0, ft1 +; ZVFHMIN32-NEXT: fmv.h.x fa0, a2 +; ZVFHMIN32-NEXT: feq.h a2, fa5, fa0 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s9 ; ZVFHMIN32-NEXT: sb a2, 162(sp) ; ZVFHMIN32-NEXT: lh a2, 706(sp) ; ZVFHMIN32-NEXT: lh a4, 450(sp) -; ZVFHMIN32-NEXT: sb s5, 129(sp) -; ZVFHMIN32-NEXT: sb s4, 130(sp) -; ZVFHMIN32-NEXT: sb a5, 131(sp) -; ZVFHMIN32-NEXT: sb s3, 132(sp) +; ZVFHMIN32-NEXT: sb a1, 129(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa1, fa5 +; ZVFHMIN32-NEXT: sb a3, 130(sp) +; ZVFHMIN32-NEXT: feq.h a3, fa2, ft4 +; ZVFHMIN32-NEXT: sb a1, 131(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa4, ft2 +; ZVFHMIN32-NEXT: sb a3, 132(sp) +; ZVFHMIN32-NEXT: feq.h a3, fa3, ft3 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 ; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN32-NEXT: sb a1, 133(sp) -; ZVFHMIN32-NEXT: sb a3, 134(sp) +; ZVFHMIN32-NEXT: sb a3, 133(sp) +; ZVFHMIN32-NEXT: sb a1, 134(sp) ; ZVFHMIN32-NEXT: sb a0, 135(sp) ; ZVFHMIN32-NEXT: sb a2, 161(sp) ; ZVFHMIN32-NEXT: lh a0, 610(sp) ; ZVFHMIN32-NEXT: lh a1, 354(sp) -; ZVFHMIN32-NEXT: vmv.x.s s6, v23 +; ZVFHMIN32-NEXT: vmv.x.s s4, v23 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 18 +; ZVFHMIN32-NEXT: li a3, 10 ; ZVFHMIN32-NEXT: mul a2, a2, a3 ; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: lh s5, 848(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: lh s2, 848(a2) # 8-byte Folded Reload ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 @@ -1753,13 +1748,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: lh a0, 608(sp) ; ZVFHMIN32-NEXT: lh a1, 352(sp) ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 22 -; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: slli a2, a2, 4 ; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: lh s4, 848(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: lh s5, 848(a2) # 8-byte Folded Reload ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 21 -; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: slli a3, a2, 4 +; ZVFHMIN32-NEXT: sub a2, a3, a2 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: lh s3, 848(a2) # 8-byte Folded Reload ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 @@ -1768,148 +1762,148 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: sb a0, 240(sp) ; 
ZVFHMIN32-NEXT: lh a0, 606(sp) ; ZVFHMIN32-NEXT: lh a1, 350(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa3, t5 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa4, fa2 +; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 7 +; ZVFHMIN32-NEXT: vmv.x.s s6, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 239(sp) ; ZVFHMIN32-NEXT: lh a0, 604(sp) ; ZVFHMIN32-NEXT: lh a1, 348(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, t6 -; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 7 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa1, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 6 +; ZVFHMIN32-NEXT: vmv.x.s s7, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 238(sp) ; ZVFHMIN32-NEXT: lh a0, 602(sp) ; ZVFHMIN32-NEXT: lh a1, 346(sp) -; ZVFHMIN32-NEXT: vmv.x.s a2, v8 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 6 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa1, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 5 +; ZVFHMIN32-NEXT: vmv.x.s s8, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 237(sp) ; ZVFHMIN32-NEXT: lh a0, 600(sp) ; ZVFHMIN32-NEXT: lh a1, 344(sp) -; ZVFHMIN32-NEXT: vmv.x.s a3, v8 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 5 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa1, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 4 +; ZVFHMIN32-NEXT: vmv.x.s s9, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 236(sp) ; ZVFHMIN32-NEXT: lh a0, 598(sp) ; ZVFHMIN32-NEXT: lh a1, 342(sp) -; ZVFHMIN32-NEXT: vmv.x.s a4, v8 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 4 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa1, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 3 +; ZVFHMIN32-NEXT: vmv.x.s s10, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 235(sp) ; ZVFHMIN32-NEXT: lh a0, 596(sp) ; ZVFHMIN32-NEXT: lh a1, 340(sp) -; ZVFHMIN32-NEXT: vmv.x.s a5, v8 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 3 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa1, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 2 +; ZVFHMIN32-NEXT: vmv.x.s s11, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 234(sp) ; ZVFHMIN32-NEXT: lh a0, 594(sp) ; ZVFHMIN32-NEXT: lh a1, 338(sp) -; ZVFHMIN32-NEXT: vmv.x.s t6, v8 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 2 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa1, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 1 +; ZVFHMIN32-NEXT: vmv.x.s ra, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 233(sp) ; ZVFHMIN32-NEXT: lh a0, 592(sp) ; ZVFHMIN32-NEXT: lh a1, 336(sp) -; ZVFHMIN32-NEXT: vmv.x.s s2, v8 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 1 -; 
ZVFHMIN32-NEXT: fmv.h.x fa2, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa1, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 +; ZVFHMIN32-NEXT: fmv.h.x fa5, t4 +; ZVFHMIN32-NEXT: fmv.h.x fa4, t6 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 ; ZVFHMIN32-NEXT: sb a0, 232(sp) ; ZVFHMIN32-NEXT: lh a0, 590(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa2, a2 ; ZVFHMIN32-NEXT: lh a1, 334(sp) -; ZVFHMIN32-NEXT: vmv.x.s a2, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa3, t5 +; ZVFHMIN32-NEXT: fmv.h.x fa2, s4 ; ZVFHMIN32-NEXT: fmv.h.x fa1, a0 -; ZVFHMIN32-NEXT: feq.h t5, fa3, fa2 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa1, fa3 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a3 +; ZVFHMIN32-NEXT: fmv.h.x fa0, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa1, fa0 ; ZVFHMIN32-NEXT: sb a0, 231(sp) ; ZVFHMIN32-NEXT: lh a0, 588(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa2, a4 ; ZVFHMIN32-NEXT: lh a1, 332(sp) -; ZVFHMIN32-NEXT: feq.h a3, fa5, fa3 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: feq.h a0, fa4, fa2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s6 -; ZVFHMIN32-NEXT: sb a1, 230(sp) -; ZVFHMIN32-NEXT: lh a1, 586(sp) -; ZVFHMIN32-NEXT: lh a4, 330(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN32-NEXT: feq.h a5, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa1, s2 +; ZVFHMIN32-NEXT: fmv.h.x fa0, s5 +; ZVFHMIN32-NEXT: fmv.h.x ft0, a0 +; ZVFHMIN32-NEXT: fmv.h.x ft1, a1 +; ZVFHMIN32-NEXT: feq.h a0, ft0, ft1 +; ZVFHMIN32-NEXT: sb a0, 230(sp) +; ZVFHMIN32-NEXT: lh a0, 586(sp) +; ZVFHMIN32-NEXT: fmv.h.x ft0, s3 +; ZVFHMIN32-NEXT: lh a1, 330(sp) +; ZVFHMIN32-NEXT: fmv.h.x ft1, s6 +; ZVFHMIN32-NEXT: fmv.h.x ft2, a0 +; ZVFHMIN32-NEXT: feq.h a0, fa5, ft1 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s5 +; ZVFHMIN32-NEXT: feq.h a1, ft2, fa5 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s7 ; ZVFHMIN32-NEXT: sb a1, 229(sp) ; ZVFHMIN32-NEXT: lh a1, 584(sp) -; ZVFHMIN32-NEXT: lh a4, 328(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, t6 -; ZVFHMIN32-NEXT: feq.h t6, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x ft1, s8 +; ZVFHMIN32-NEXT: lh a2, 328(sp) +; ZVFHMIN32-NEXT: feq.h a3, fa4, fa5 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s4 -; ZVFHMIN32-NEXT: sb a1, 228(sp) -; ZVFHMIN32-NEXT: lh a1, 582(sp) -; ZVFHMIN32-NEXT: lh a4, 326(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, s2 -; ZVFHMIN32-NEXT: feq.h s2, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s3 -; ZVFHMIN32-NEXT: sb a1, 227(sp) -; ZVFHMIN32-NEXT: lh a1, 580(sp) -; ZVFHMIN32-NEXT: lh a4, 324(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa3, ft1 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 ; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: sb a1, 226(sp) -; ZVFHMIN32-NEXT: lh a1, 578(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, s9 +; ZVFHMIN32-NEXT: sb a2, 228(sp) +; ZVFHMIN32-NEXT: lh a2, 582(sp) +; ZVFHMIN32-NEXT: lh a4, 326(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, s10 +; ZVFHMIN32-NEXT: feq.h t4, fa2, fa5 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a4 +; ZVFHMIN32-NEXT: feq.h a2, fa5, fa3 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s11 +; ZVFHMIN32-NEXT: fmv.h.x fa3, ra +; ZVFHMIN32-NEXT: sb a2, 227(sp) +; ZVFHMIN32-NEXT: lh a2, 580(sp) +; ZVFHMIN32-NEXT: lh a4, 
324(sp) +; ZVFHMIN32-NEXT: feq.h t5, fa0, fa5 +; ZVFHMIN32-NEXT: feq.h t6, ft0, fa3 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a4 +; ZVFHMIN32-NEXT: feq.h a2, fa5, fa3 +; ZVFHMIN32-NEXT: sb a2, 226(sp) +; ZVFHMIN32-NEXT: lh a2, 578(sp) ; ZVFHMIN32-NEXT: lh a4, 322(sp) -; ZVFHMIN32-NEXT: sb a2, 193(sp) -; ZVFHMIN32-NEXT: sb s2, 194(sp) +; ZVFHMIN32-NEXT: sb t6, 193(sp) +; ZVFHMIN32-NEXT: feq.h t6, fa1, fa4 +; ZVFHMIN32-NEXT: sb t5, 194(sp) ; ZVFHMIN32-NEXT: sb t6, 195(sp) -; ZVFHMIN32-NEXT: sb a5, 196(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: sb t4, 196(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 197(sp) +; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 197(sp) ; ZVFHMIN32-NEXT: sb a3, 198(sp) -; ZVFHMIN32-NEXT: sb t5, 199(sp) -; ZVFHMIN32-NEXT: sb a1, 225(sp) +; ZVFHMIN32-NEXT: sb a0, 199(sp) +; ZVFHMIN32-NEXT: sb a2, 225(sp) ; ZVFHMIN32-NEXT: lh a0, 766(sp) ; ZVFHMIN32-NEXT: lh a1, 510(sp) ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 19 -; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: slli a3, a2, 4 +; ZVFHMIN32-NEXT: add a2, a3, a2 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 ; ZVFHMIN32-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload ; ZVFHMIN32-NEXT: vmv.x.s s2, v8 ; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 14 +; ZVFHMIN32-NEXT: li a3, 11 ; ZVFHMIN32-NEXT: mul a2, a2, a3 ; ZVFHMIN32-NEXT: add a2, sp, a2 ; ZVFHMIN32-NEXT: addi a2, a2, 848 @@ -1921,301 +1915,305 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: sb a0, 191(sp) ; ZVFHMIN32-NEXT: lh a0, 764(sp) ; ZVFHMIN32-NEXT: lh a1, 508(sp) -; ZVFHMIN32-NEXT: vmv.x.s t5, v6 -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: slli a2, a2, 2 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s a2, v8 +; ZVFHMIN32-NEXT: vmv.x.s t5, v4 +; ZVFHMIN32-NEXT: vmv.x.s t4, v30 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 190(sp) ; ZVFHMIN32-NEXT: lh a0, 762(sp) ; ZVFHMIN32-NEXT: lh a1, 506(sp) +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: slli a2, a2, 2 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s a2, v8 ; ZVFHMIN32-NEXT: csrr a3, vlenb -; ZVFHMIN32-NEXT: slli a3, a3, 3 +; ZVFHMIN32-NEXT: slli a3, a3, 1 ; ZVFHMIN32-NEXT: add a3, sp, a3 ; ZVFHMIN32-NEXT: addi a3, a3, 848 ; ZVFHMIN32-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload ; ZVFHMIN32-NEXT: vmv.x.s a3, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 189(sp) +; ZVFHMIN32-NEXT: lh a0, 760(sp) +; ZVFHMIN32-NEXT: lh a1, 504(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, t3 ; ZVFHMIN32-NEXT: csrr a4, vlenb -; ZVFHMIN32-NEXT: li a5, 6 -; ZVFHMIN32-NEXT: mul a4, a4, a5 +; ZVFHMIN32-NEXT: li t3, 6 +; ZVFHMIN32-NEXT: mul a4, a4, t3 ; ZVFHMIN32-NEXT: add a4, sp, a4 ; ZVFHMIN32-NEXT: addi a4, a4, 848 ; ZVFHMIN32-NEXT: vl2r.v v8, (a4) # Unknown-size Folded Reload ; ZVFHMIN32-NEXT: vmv.x.s a4, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 189(sp) -; ZVFHMIN32-NEXT: lh a1, 
760(sp) -; ZVFHMIN32-NEXT: lh a5, 504(sp) -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: li s3, 12 -; ZVFHMIN32-NEXT: mul a0, a0, s3 -; ZVFHMIN32-NEXT: add a0, sp, a0 -; ZVFHMIN32-NEXT: addi a0, a0, 848 -; ZVFHMIN32-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s s5, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa4, fa3 +; ZVFHMIN32-NEXT: sb a0, 188(sp) +; ZVFHMIN32-NEXT: lh a0, 758(sp) +; ZVFHMIN32-NEXT: lh a1, 502(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, t2 +; ZVFHMIN32-NEXT: csrr t2, vlenb +; ZVFHMIN32-NEXT: slli t2, t2, 3 +; ZVFHMIN32-NEXT: add t2, sp, t2 +; ZVFHMIN32-NEXT: addi t2, t2, 848 +; ZVFHMIN32-NEXT: vl2r.v v8, (t2) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s t2, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN32-NEXT: sb a0, 187(sp) +; ZVFHMIN32-NEXT: lh a0, 756(sp) +; ZVFHMIN32-NEXT: lh a1, 500(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa3, t1 +; ZVFHMIN32-NEXT: csrr t1, vlenb +; ZVFHMIN32-NEXT: li t3, 13 +; ZVFHMIN32-NEXT: mul t1, t1, t3 +; ZVFHMIN32-NEXT: add t1, sp, t1 +; ZVFHMIN32-NEXT: addi t1, t1, 848 +; ZVFHMIN32-NEXT: vl2r.v v8, (t1) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s t3, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa2, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa1, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 +; ZVFHMIN32-NEXT: sb a0, 186(sp) +; ZVFHMIN32-NEXT: lh a0, 754(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa2, t0 +; ZVFHMIN32-NEXT: lh a1, 498(sp) +; ZVFHMIN32-NEXT: csrr t0, vlenb +; ZVFHMIN32-NEXT: li t1, 19 +; ZVFHMIN32-NEXT: mul t0, t0, t1 +; ZVFHMIN32-NEXT: add t0, sp, t0 +; ZVFHMIN32-NEXT: addi t0, t0, 848 +; ZVFHMIN32-NEXT: vl2r.v v8, (t0) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s s3, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa1, a0 ; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: li s3, 10 -; ZVFHMIN32-NEXT: mul a0, a0, s3 +; ZVFHMIN32-NEXT: li t0, 21 +; ZVFHMIN32-NEXT: mul a0, a0, t0 ; ZVFHMIN32-NEXT: add a0, sp, a0 ; ZVFHMIN32-NEXT: addi a0, a0, 848 ; ZVFHMIN32-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload ; ZVFHMIN32-NEXT: vmv.x.s a0, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: sb a1, 188(sp) -; ZVFHMIN32-NEXT: lh a1, 758(sp) -; ZVFHMIN32-NEXT: lh a5, 502(sp) -; ZVFHMIN32-NEXT: csrr s3, vlenb -; ZVFHMIN32-NEXT: slli s3, s3, 4 -; ZVFHMIN32-NEXT: add s3, sp, s3 -; ZVFHMIN32-NEXT: addi s3, s3, 848 -; ZVFHMIN32-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s s4, v8 -; ZVFHMIN32-NEXT: vmv.x.s s3, v16 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t4 -; ZVFHMIN32-NEXT: sb a1, 187(sp) -; ZVFHMIN32-NEXT: lh a1, 756(sp) -; ZVFHMIN32-NEXT: lh a5, 500(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h t4, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t3 -; ZVFHMIN32-NEXT: sb a1, 186(sp) -; ZVFHMIN32-NEXT: lh a1, 754(sp) -; ZVFHMIN32-NEXT: lh a2, 498(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN32-NEXT: feq.h t3, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t1 +; ZVFHMIN32-NEXT: fmv.h.x fa0, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa1, fa0 +; ZVFHMIN32-NEXT: fmv.h.x fa1, a2 ; ZVFHMIN32-NEXT: sb 
a1, 185(sp) ; ZVFHMIN32-NEXT: lh a1, 752(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa0, a3 ; ZVFHMIN32-NEXT: lh a2, 496(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN32-NEXT: feq.h t1, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h t0, fa5, fa1 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: feq.h t1, fa4, fa0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 ; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t2 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a4 ; ZVFHMIN32-NEXT: sb a1, 184(sp) ; ZVFHMIN32-NEXT: lh a1, 750(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, t2 ; ZVFHMIN32-NEXT: lh a2, 494(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, s5 -; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t0 -; ZVFHMIN32-NEXT: sb a1, 183(sp) -; ZVFHMIN32-NEXT: lh a1, 748(sp) -; ZVFHMIN32-NEXT: lh a2, 492(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a3, fa3, fa5 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa2, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a7 -; ZVFHMIN32-NEXT: sb a1, 182(sp) -; ZVFHMIN32-NEXT: lh a1, 746(sp) -; ZVFHMIN32-NEXT: lh a2, 490(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, s4 -; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a2, 183(sp) +; ZVFHMIN32-NEXT: lh a2, 748(sp) +; ZVFHMIN32-NEXT: lh a4, 492(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, t3 +; ZVFHMIN32-NEXT: feq.h a7, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a6 -; ZVFHMIN32-NEXT: sb a1, 181(sp) -; ZVFHMIN32-NEXT: lh a1, 744(sp) -; ZVFHMIN32-NEXT: lh a2, 488(sp) +; ZVFHMIN32-NEXT: sb a2, 182(sp) +; ZVFHMIN32-NEXT: lh a2, 746(sp) +; ZVFHMIN32-NEXT: lh a4, 490(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa4, s3 ; ZVFHMIN32-NEXT: feq.h a6, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: lw a2, 104(sp) # 4-byte Folded Reload ; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN32-NEXT: addi a2, sp, 848 -; ZVFHMIN32-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s a2, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a5 +; ZVFHMIN32-NEXT: sb a2, 181(sp) +; ZVFHMIN32-NEXT: lh a2, 744(sp) +; ZVFHMIN32-NEXT: lh a4, 488(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN32-NEXT: lw a4, 108(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x fa5, a4 +; ZVFHMIN32-NEXT: vmv.x.s a5, v0 ; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma ; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 15 -; ZVFHMIN32-NEXT: vmv.x.s a5, v8 -; ZVFHMIN32-NEXT: sb a1, 180(sp) -; ZVFHMIN32-NEXT: lh a1, 742(sp) -; ZVFHMIN32-NEXT: lh a7, 486(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: vmv.x.s a4, v8 +; ZVFHMIN32-NEXT: sb a2, 180(sp) +; ZVFHMIN32-NEXT: lh a2, 742(sp) +; ZVFHMIN32-NEXT: lh t2, 486(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 +; ZVFHMIN32-NEXT: feq.h a5, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN32-NEXT: fmv.h.x fa4, t2 ; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x 
fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a7 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: sb a1, 179(sp) -; ZVFHMIN32-NEXT: lh a1, 740(sp) -; ZVFHMIN32-NEXT: lh a7, 484(sp) -; ZVFHMIN32-NEXT: sb a3, 140(sp) -; ZVFHMIN32-NEXT: sb t1, 141(sp) -; ZVFHMIN32-NEXT: sb t3, 142(sp) -; ZVFHMIN32-NEXT: sb t4, 143(sp) -; ZVFHMIN32-NEXT: sb a2, 136(sp) -; ZVFHMIN32-NEXT: sb a6, 137(sp) -; ZVFHMIN32-NEXT: sb a4, 138(sp) -; ZVFHMIN32-NEXT: sb a0, 139(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a7 +; ZVFHMIN32-NEXT: sb a2, 179(sp) +; ZVFHMIN32-NEXT: lh a2, 740(sp) +; ZVFHMIN32-NEXT: lh t2, 484(sp) +; ZVFHMIN32-NEXT: sb a1, 140(sp) +; ZVFHMIN32-NEXT: sb a3, 141(sp) +; ZVFHMIN32-NEXT: sb t1, 142(sp) +; ZVFHMIN32-NEXT: sb t0, 143(sp) +; ZVFHMIN32-NEXT: sb a5, 136(sp) +; ZVFHMIN32-NEXT: sb a0, 137(sp) +; ZVFHMIN32-NEXT: sb a6, 138(sp) +; ZVFHMIN32-NEXT: sb a7, 139(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN32-NEXT: fmv.h.x fa4, t2 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 178(sp) -; ZVFHMIN32-NEXT: lh a1, 638(sp) -; ZVFHMIN32-NEXT: lh a2, 382(sp) +; ZVFHMIN32-NEXT: lh a0, 638(sp) +; ZVFHMIN32-NEXT: lh a1, 382(sp) ; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 14 -; ZVFHMIN32-NEXT: vmv.x.s a0, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: sb a1, 255(sp) -; ZVFHMIN32-NEXT: lh a1, 636(sp) -; ZVFHMIN32-NEXT: lh a2, 380(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 13 ; ZVFHMIN32-NEXT: vmv.x.s t2, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: sb a1, 254(sp) -; ZVFHMIN32-NEXT: lh a1, 634(sp) -; ZVFHMIN32-NEXT: lh a2, 378(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 12 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 255(sp) +; ZVFHMIN32-NEXT: lh a0, 636(sp) +; ZVFHMIN32-NEXT: lh a1, 380(sp) +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 13 ; ZVFHMIN32-NEXT: vmv.x.s t1, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: sb a1, 253(sp) -; ZVFHMIN32-NEXT: lh a1, 632(sp) -; ZVFHMIN32-NEXT: lh a2, 376(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 11 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 254(sp) +; ZVFHMIN32-NEXT: lh a0, 634(sp) +; ZVFHMIN32-NEXT: lh a1, 378(sp) +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 12 ; ZVFHMIN32-NEXT: vmv.x.s t0, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: sb a1, 252(sp) -; ZVFHMIN32-NEXT: lh a1, 630(sp) -; ZVFHMIN32-NEXT: lh a2, 374(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 10 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 253(sp) +; ZVFHMIN32-NEXT: lh a0, 632(sp) +; ZVFHMIN32-NEXT: lh a1, 376(sp) +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 11 ; ZVFHMIN32-NEXT: vmv.x.s a7, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: sb a1, 251(sp) -; ZVFHMIN32-NEXT: lh a1, 628(sp) -; ZVFHMIN32-NEXT: lh a2, 372(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 9 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb 
a0, 252(sp) +; ZVFHMIN32-NEXT: lh a0, 630(sp) +; ZVFHMIN32-NEXT: lh a1, 374(sp) +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 10 ; ZVFHMIN32-NEXT: vmv.x.s a6, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: lw a2, 108(sp) # 4-byte Folded Reload -; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN32-NEXT: sb a1, 250(sp) -; ZVFHMIN32-NEXT: lh a1, 626(sp) -; ZVFHMIN32-NEXT: lh a2, 370(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: lw a2, 112(sp) # 4-byte Folded Reload -; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN32-NEXT: sb a1, 249(sp) -; ZVFHMIN32-NEXT: lh a1, 624(sp) -; ZVFHMIN32-NEXT: lh a2, 368(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 -; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: lw a1, 116(sp) # 4-byte Folded Reload -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: sb a0, 248(sp) -; ZVFHMIN32-NEXT: lh a0, 622(sp) -; ZVFHMIN32-NEXT: lh a1, 366(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, t2 -; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 251(sp) +; ZVFHMIN32-NEXT: lh a0, 628(sp) +; ZVFHMIN32-NEXT: lh a1, 372(sp) +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 9 +; ZVFHMIN32-NEXT: vmv.x.s a5, v8 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: lw a1, 120(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: lw a1, 116(sp) # 4-byte Folded Reload ; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: sb a0, 247(sp) -; ZVFHMIN32-NEXT: lh a0, 620(sp) -; ZVFHMIN32-NEXT: lh a1, 364(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, t1 -; ZVFHMIN32-NEXT: feq.h a5, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 250(sp) +; ZVFHMIN32-NEXT: lh a0, 626(sp) +; ZVFHMIN32-NEXT: lh a1, 370(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: lw a1, 124(sp) # 4-byte Folded Reload ; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: sb a0, 246(sp) -; ZVFHMIN32-NEXT: lh a0, 618(sp) -; ZVFHMIN32-NEXT: lh a1, 362(sp) +; ZVFHMIN32-NEXT: sb a0, 249(sp) +; ZVFHMIN32-NEXT: lh a1, 624(sp) +; ZVFHMIN32-NEXT: lh a3, 368(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, t2 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: lw a3, 112(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x fa5, a3 +; ZVFHMIN32-NEXT: sb a1, 248(sp) +; ZVFHMIN32-NEXT: lh a1, 622(sp) +; ZVFHMIN32-NEXT: lh a3, 366(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, t1 +; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: lw a3, 120(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x fa5, a3 +; ZVFHMIN32-NEXT: sb a1, 247(sp) +; ZVFHMIN32-NEXT: lh a1, 620(sp) +; ZVFHMIN32-NEXT: lh a3, 364(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa4, t0 ; ZVFHMIN32-NEXT: feq.h t0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN32-NEXT: 
feq.h a1, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, s2 -; ZVFHMIN32-NEXT: sb a0, 245(sp) -; ZVFHMIN32-NEXT: lh a0, 616(sp) -; ZVFHMIN32-NEXT: lh a1, 360(sp) +; ZVFHMIN32-NEXT: sb a1, 246(sp) +; ZVFHMIN32-NEXT: lh a1, 618(sp) +; ZVFHMIN32-NEXT: lh a3, 362(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa4, a7 ; ZVFHMIN32-NEXT: feq.h a7, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, t6 -; ZVFHMIN32-NEXT: sb a0, 244(sp) -; ZVFHMIN32-NEXT: lh a0, 614(sp) -; ZVFHMIN32-NEXT: lh a1, 358(sp) +; ZVFHMIN32-NEXT: sb a1, 245(sp) +; ZVFHMIN32-NEXT: lh a1, 616(sp) +; ZVFHMIN32-NEXT: lh a3, 360(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa4, a6 ; ZVFHMIN32-NEXT: feq.h a6, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, t5 +; ZVFHMIN32-NEXT: sb a1, 244(sp) +; ZVFHMIN32-NEXT: lh a1, 614(sp) +; ZVFHMIN32-NEXT: lh a3, 358(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 +; ZVFHMIN32-NEXT: feq.h a5, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, t4 ; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 8 -; ZVFHMIN32-NEXT: vmv.x.s a1, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: sb a0, 243(sp) -; ZVFHMIN32-NEXT: lh a0, 612(sp) -; ZVFHMIN32-NEXT: lh a1, 356(sp) -; ZVFHMIN32-NEXT: sb a5, 204(sp) -; ZVFHMIN32-NEXT: sb a2, 205(sp) -; ZVFHMIN32-NEXT: sb a3, 206(sp) -; ZVFHMIN32-NEXT: sb a4, 207(sp) -; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN32-NEXT: sb a2, 200(sp) -; ZVFHMIN32-NEXT: sb a6, 201(sp) -; ZVFHMIN32-NEXT: sb a7, 202(sp) -; ZVFHMIN32-NEXT: sb t0, 203(sp) -; ZVFHMIN32-NEXT: li a2, 128 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: vmv.x.s a3, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN32-NEXT: sb a1, 243(sp) +; ZVFHMIN32-NEXT: lh a1, 612(sp) +; ZVFHMIN32-NEXT: lh a3, 356(sp) +; ZVFHMIN32-NEXT: sb t0, 204(sp) +; ZVFHMIN32-NEXT: sb a4, 205(sp) +; ZVFHMIN32-NEXT: sb a0, 206(sp) +; ZVFHMIN32-NEXT: sb a2, 207(sp) ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 242(sp) -; ZVFHMIN32-NEXT: addi a0, sp, 128 -; ZVFHMIN32-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; ZVFHMIN32-NEXT: vle8.v v8, (a0) +; ZVFHMIN32-NEXT: sb a0, 200(sp) +; ZVFHMIN32-NEXT: sb a5, 201(sp) +; ZVFHMIN32-NEXT: sb a6, 202(sp) +; ZVFHMIN32-NEXT: sb a7, 203(sp) +; ZVFHMIN32-NEXT: li a0, 128 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 242(sp) +; ZVFHMIN32-NEXT: addi a1, sp, 128 +; ZVFHMIN32-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; ZVFHMIN32-NEXT: vle8.v v8, (a1) ; ZVFHMIN32-NEXT: vand.vi v8, v8, 1 ; ZVFHMIN32-NEXT: vmsne.vi v0, v8, 0 ; ZVFHMIN32-NEXT: addi sp, s0, -896 @@ -2442,6 +2440,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: sb a0, 219(sp) ; ZVFHMIN64-NEXT: lh a0, 564(sp) ; ZVFHMIN64-NEXT: lh a1, 308(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 218(sp) +; ZVFHMIN64-NEXT: lh a0, 562(sp) +; ZVFHMIN64-NEXT: lh a1, 306(sp) ; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; 
ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 7 ; ZVFHMIN64-NEXT: csrr a2, vlenb @@ -2494,86 +2498,82 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma ; ZVFHMIN64-NEXT: vslidedown.vi v26, v8, 15 -; ZVFHMIN64-NEXT: vslidedown.vi v20, v8, 14 -; ZVFHMIN64-NEXT: vslidedown.vi v28, v8, 13 -; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 12 -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: slli a2, a2, 1 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vslidedown.vi v28, v8, 14 +; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 13 +; ZVFHMIN64-NEXT: addi a2, sp, 800 ; ZVFHMIN64-NEXT: vs2r.v v10, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vslidedown.vi v4, v8, 11 -; ZVFHMIN64-NEXT: vslidedown.vi v2, v8, 10 -; ZVFHMIN64-NEXT: vslidedown.vi v30, v8, 9 -; ZVFHMIN64-NEXT: vslidedown.vi v22, v8, 8 -; ZVFHMIN64-NEXT: vmv.x.s t5, v16 +; ZVFHMIN64-NEXT: vslidedown.vi v6, v8, 12 +; ZVFHMIN64-NEXT: vslidedown.vi v2, v8, 11 +; ZVFHMIN64-NEXT: vslidedown.vi v22, v8, 10 +; ZVFHMIN64-NEXT: vslidedown.vi v20, v8, 9 +; ZVFHMIN64-NEXT: vslidedown.vi v18, v8, 8 +; ZVFHMIN64-NEXT: vmv.x.s a3, v16 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 218(sp) -; ZVFHMIN64-NEXT: lh a0, 562(sp) -; ZVFHMIN64-NEXT: lh a1, 306(sp) +; ZVFHMIN64-NEXT: sb a0, 217(sp) +; ZVFHMIN64-NEXT: lh a0, 560(sp) +; ZVFHMIN64-NEXT: lh a1, 304(sp) ; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; ZVFHMIN64-NEXT: vslidedown.vi v3, v16, 7 -; ZVFHMIN64-NEXT: vslidedown.vi v31, v16, 6 -; ZVFHMIN64-NEXT: vslidedown.vi v5, v16, 5 +; ZVFHMIN64-NEXT: vslidedown.vi v21, v16, 7 +; ZVFHMIN64-NEXT: vslidedown.vi v3, v16, 6 +; ZVFHMIN64-NEXT: vslidedown.vi v19, v16, 5 ; ZVFHMIN64-NEXT: vslidedown.vi v23, v16, 4 ; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 3 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 18 -; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: li a4, 10 +; ZVFHMIN64-NEXT: mul a2, a2, a4 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 2 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 22 -; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: slli a2, a2, 4 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 1 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 21 -; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: slli a4, a2, 4 +; ZVFHMIN64-NEXT: sub a2, a4, a2 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma -; ZVFHMIN64-NEXT: vslidedown.vi v18, v16, 15 -; ZVFHMIN64-NEXT: vslidedown.vi v14, v16, 14 -; ZVFHMIN64-NEXT: vslidedown.vi v12, v16, 13 -; ZVFHMIN64-NEXT: vslidedown.vi v10, v16, 12 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 11 -; ZVFHMIN64-NEXT: vslidedown.vi v6, v16, 10 +; ZVFHMIN64-NEXT: vslidedown.vi v14, v16, 15 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 14 +; ZVFHMIN64-NEXT: vslidedown.vi v10, v16, 13 +; ZVFHMIN64-NEXT: vslidedown.vi v12, v16, 12 +; ZVFHMIN64-NEXT: vslidedown.vi v30, v16, 11 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 19 -; ZVFHMIN64-NEXT: mul a2, 
a2, a3 +; ZVFHMIN64-NEXT: slli a4, a2, 4 +; ZVFHMIN64-NEXT: add a2, a4, a2 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vslidedown.vi v6, v16, 9 +; ZVFHMIN64-NEXT: vs2r.v v30, (a2) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v30, v16, 10 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 14 -; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: li a4, 11 +; ZVFHMIN64-NEXT: mul a2, a2, a4 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vslidedown.vi v6, v16, 8 +; ZVFHMIN64-NEXT: vs2r.v v30, (a2) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v4, v16, 9 +; ZVFHMIN64-NEXT: vslidedown.vi v30, v16, 8 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 217(sp) -; ZVFHMIN64-NEXT: lh a0, 560(sp) -; ZVFHMIN64-NEXT: lh a1, 304(sp) -; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; ZVFHMIN64-NEXT: vslidedown.vi v9, v0, 7 -; ZVFHMIN64-NEXT: vslidedown.vi v11, v0, 6 -; ZVFHMIN64-NEXT: vslidedown.vi v13, v0, 5 -; ZVFHMIN64-NEXT: vslidedown.vi v29, v0, 4 -; ZVFHMIN64-NEXT: vslidedown.vi v27, v0, 3 -; ZVFHMIN64-NEXT: vslidedown.vi v7, v0, 2 -; ZVFHMIN64-NEXT: vslidedown.vi v21, v0, 1 +; ZVFHMIN64-NEXT: sb a0, 216(sp) +; ZVFHMIN64-NEXT: lh a0, 558(sp) +; ZVFHMIN64-NEXT: lh a1, 302(sp) +; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFHMIN64-NEXT: vslidedown.vi v11, v0, 7 +; ZVFHMIN64-NEXT: vslidedown.vi v7, v0, 6 +; ZVFHMIN64-NEXT: vslidedown.vi v9, v0, 5 +; ZVFHMIN64-NEXT: vslidedown.vi v29, v0, 4 +; ZVFHMIN64-NEXT: vslidedown.vi v31, v0, 3 +; ZVFHMIN64-NEXT: vslidedown.vi v5, v0, 2 +; ZVFHMIN64-NEXT: vslidedown.vi v27, v0, 1 ; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma ; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 15 ; ZVFHMIN64-NEXT: csrr a2, vlenb @@ -2583,99 +2583,88 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 14 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: slli a2, a2, 3 +; ZVFHMIN64-NEXT: slli a2, a2, 1 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 13 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 6 -; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: li a4, 6 +; ZVFHMIN64-NEXT: mul a2, a2, a4 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 12 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 12 -; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: slli a2, a2, 3 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 11 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 10 -; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: li a4, 13 +; ZVFHMIN64-NEXT: mul a2, a2, a4 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 10 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: slli a2, a2, 4 +; ZVFHMIN64-NEXT: li a4, 
19 +; ZVFHMIN64-NEXT: mul a2, a2, a4 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 9 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a4, 21 +; ZVFHMIN64-NEXT: mul a2, a2, a4 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vslidedown.vi v0, v0, 8 -; ZVFHMIN64-NEXT: addi a2, sp, 800 -; ZVFHMIN64-NEXT: vs2r.v v0, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vmv.x.s t4, v26 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 216(sp) -; ZVFHMIN64-NEXT: lh a0, 558(sp) -; ZVFHMIN64-NEXT: lh a1, 302(sp) -; ZVFHMIN64-NEXT: vmv.x.s t3, v20 -; ZVFHMIN64-NEXT: vmv.x.s t1, v28 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 215(sp) ; ZVFHMIN64-NEXT: lh a0, 556(sp) ; ZVFHMIN64-NEXT: lh a1, 300(sp) -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: slli a2, a2, 1 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vl2r.v v0, (a2) # Unknown-size Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s t2, v0 -; ZVFHMIN64-NEXT: vmv.x.s t0, v4 +; ZVFHMIN64-NEXT: vmv.x.s t3, v26 +; ZVFHMIN64-NEXT: vmv.x.s t2, v28 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 214(sp) ; ZVFHMIN64-NEXT: lh a0, 554(sp) ; ZVFHMIN64-NEXT: lh a1, 298(sp) -; ZVFHMIN64-NEXT: vmv.x.s a7, v2 -; ZVFHMIN64-NEXT: vmv.x.s a6, v30 +; ZVFHMIN64-NEXT: addi a2, sp, 800 +; ZVFHMIN64-NEXT: vl2r.v v16, (a2) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s t1, v16 +; ZVFHMIN64-NEXT: vmv.x.s t0, v6 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 213(sp) ; ZVFHMIN64-NEXT: lh a0, 552(sp) ; ZVFHMIN64-NEXT: lh a1, 296(sp) -; ZVFHMIN64-NEXT: vmv.x.s a2, v22 -; ZVFHMIN64-NEXT: sd a2, 80(sp) # 8-byte Folded Spill -; ZVFHMIN64-NEXT: vmv.x.s a2, v18 -; ZVFHMIN64-NEXT: sd a2, 88(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: vmv.x.s a7, v2 +; ZVFHMIN64-NEXT: vmv.x.s a6, v22 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 212(sp) ; ZVFHMIN64-NEXT: lh a0, 550(sp) ; ZVFHMIN64-NEXT: lh a1, 294(sp) -; ZVFHMIN64-NEXT: vmv.x.s a2, v14 -; ZVFHMIN64-NEXT: sd a2, 96(sp) # 8-byte Folded Spill -; ZVFHMIN64-NEXT: vmv.x.s a2, v12 -; ZVFHMIN64-NEXT: sd a2, 104(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: vmv.x.s a5, v20 +; ZVFHMIN64-NEXT: vmv.x.s a2, v18 +; ZVFHMIN64-NEXT: sd a2, 88(sp) # 8-byte Folded Spill ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 211(sp) ; ZVFHMIN64-NEXT: lh a0, 548(sp) ; ZVFHMIN64-NEXT: lh a1, 292(sp) -; ZVFHMIN64-NEXT: vmv.x.s a2, v10 -; ZVFHMIN64-NEXT: sd a2, 112(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: vmv.x.s a2, v14 +; ZVFHMIN64-NEXT: sd a2, 104(sp) # 8-byte Folded Spill ; ZVFHMIN64-NEXT: vmv.x.s a2, v8 ; ZVFHMIN64-NEXT: sd a2, 120(sp) # 8-byte Folded Spill ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 @@ -2684,33 +2673,27 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: sb a0, 210(sp) ; ZVFHMIN64-NEXT: lh a0, 546(sp) ; ZVFHMIN64-NEXT: lh a1, 
290(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, t5 -; ZVFHMIN64-NEXT: vmv.x.s t5, v24 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a3 +; ZVFHMIN64-NEXT: vmv.x.s a3, v24 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa3, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa4, fa3 ; ZVFHMIN64-NEXT: sb a0, 209(sp) ; ZVFHMIN64-NEXT: lh a0, 544(sp) ; ZVFHMIN64-NEXT: lh a1, 288(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, t5 -; ZVFHMIN64-NEXT: feq.h t5, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb t5, 192(sp) +; ZVFHMIN64-NEXT: sb a3, 192(sp) ; ZVFHMIN64-NEXT: sb a0, 208(sp) ; ZVFHMIN64-NEXT: lh a0, 738(sp) ; ZVFHMIN64-NEXT: lh a1, 482(sp) -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 29 -; ZVFHMIN64-NEXT: mul a2, a2, a3 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: lh s7, 800(a2) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 28 -; ZVFHMIN64-NEXT: mul a2, a2, a3 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: lh s4, 800(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s a2, v10 +; ZVFHMIN64-NEXT: sd a2, 96(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: vmv.x.s a2, v12 +; ZVFHMIN64-NEXT: sd a2, 112(sp) # 8-byte Folded Spill ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 @@ -2718,15 +2701,15 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: lh a0, 736(sp) ; ZVFHMIN64-NEXT: lh a1, 480(sp) ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 27 +; ZVFHMIN64-NEXT: li a3, 29 ; ZVFHMIN64-NEXT: mul a2, a2, a3 ; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: lh s8, 800(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: lh s5, 800(a2) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 26 +; ZVFHMIN64-NEXT: li a3, 28 ; ZVFHMIN64-NEXT: mul a2, a2, a3 ; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: lh s5, 800(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: lh s2, 800(a2) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 @@ -2734,15 +2717,15 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: lh a0, 734(sp) ; ZVFHMIN64-NEXT: lh a1, 478(sp) ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 25 +; ZVFHMIN64-NEXT: li a3, 27 ; ZVFHMIN64-NEXT: mul a2, a2, a3 ; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: lh s9, 800(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: lh s6, 800(a2) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 24 +; ZVFHMIN64-NEXT: li a3, 26 ; ZVFHMIN64-NEXT: mul a2, a2, a3 ; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: lh s6, 800(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: lh s3, 800(a2) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 @@ -2750,138 +2733,148 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: lh a0, 732(sp) ; ZVFHMIN64-NEXT: lh a1, 476(sp) ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 23 +; ZVFHMIN64-NEXT: li a3, 25 ; ZVFHMIN64-NEXT: mul a2, a2, a3 ; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: lh s3, 800(a2) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s t5, v3 +; ZVFHMIN64-NEXT: lh s7, 800(a2) # 8-byte Folded Reload +; 
ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 24 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: lh s4, 800(a2) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 174(sp) ; ZVFHMIN64-NEXT: lh a0, 730(sp) ; ZVFHMIN64-NEXT: lh a1, 474(sp) -; ZVFHMIN64-NEXT: vmv.x.s s2, v31 -; ZVFHMIN64-NEXT: vmv.x.s t6, v5 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 23 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: lh s8, 800(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s t4, v21 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 173(sp) -; ZVFHMIN64-NEXT: lh a1, 728(sp) -; ZVFHMIN64-NEXT: lh s10, 472(sp) -; ZVFHMIN64-NEXT: vmv.x.s a3, v9 -; ZVFHMIN64-NEXT: vmv.x.s a4, v11 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, s10 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: sb a1, 172(sp) -; ZVFHMIN64-NEXT: lh a1, 726(sp) -; ZVFHMIN64-NEXT: lh s10, 470(sp) -; ZVFHMIN64-NEXT: vmv.x.s a2, v13 -; ZVFHMIN64-NEXT: vmv.x.s s11, v29 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, s10 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: sb a1, 171(sp) -; ZVFHMIN64-NEXT: lh ra, 724(sp) -; ZVFHMIN64-NEXT: lh a0, 468(sp) -; ZVFHMIN64-NEXT: vmv.x.s a5, v27 -; ZVFHMIN64-NEXT: vmv.x.s s10, v7 -; ZVFHMIN64-NEXT: fmv.h.x fa5, ra -; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 +; ZVFHMIN64-NEXT: lh a0, 728(sp) +; ZVFHMIN64-NEXT: lh a1, 472(sp) +; ZVFHMIN64-NEXT: vmv.x.s t6, v3 +; ZVFHMIN64-NEXT: vmv.x.s t5, v19 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 172(sp) +; ZVFHMIN64-NEXT: lh a0, 726(sp) +; ZVFHMIN64-NEXT: lh a1, 470(sp) +; ZVFHMIN64-NEXT: vmv.x.s s10, v11 +; ZVFHMIN64-NEXT: vmv.x.s s11, v7 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 171(sp) +; ZVFHMIN64-NEXT: lh a0, 724(sp) +; ZVFHMIN64-NEXT: lh s9, 468(sp) +; ZVFHMIN64-NEXT: vmv.x.s a4, v9 +; ZVFHMIN64-NEXT: vmv.x.s ra, v29 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, s9 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 170(sp) ; ZVFHMIN64-NEXT: lh a0, 722(sp) ; ZVFHMIN64-NEXT: lh a1, 466(sp) -; ZVFHMIN64-NEXT: vmv.x.s ra, v21 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s7 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa4, fa3 +; ZVFHMIN64-NEXT: vmv.x.s s9, v31 +; ZVFHMIN64-NEXT: vmv.x.s a3, v5 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 169(sp) ; ZVFHMIN64-NEXT: lh a0, 720(sp) ; ZVFHMIN64-NEXT: lh a1, 464(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, s4 -; ZVFHMIN64-NEXT: fmv.h.x fa3, s8 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa1, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 +; ZVFHMIN64-NEXT: vmv.x.s a2, v27 +; ZVFHMIN64-NEXT: fmv.h.x fa5, s5 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa4, fa3 ; ZVFHMIN64-NEXT: sb a0, 168(sp) ; ZVFHMIN64-NEXT: lh a0, 718(sp) ; ZVFHMIN64-NEXT: lh a1, 462(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa2, s5 -; ZVFHMIN64-NEXT: fmv.h.x fa1, s9 -; ZVFHMIN64-NEXT: fmv.h.x fa0, a0 -; ZVFHMIN64-NEXT: fmv.h.x ft0, a1 -; 
ZVFHMIN64-NEXT: feq.h a0, fa0, ft0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, s2 +; ZVFHMIN64-NEXT: fmv.h.x fa3, s6 +; ZVFHMIN64-NEXT: fmv.h.x fa2, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa1, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN64-NEXT: sb a0, 167(sp) ; ZVFHMIN64-NEXT: lh a0, 716(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa0, s6 ; ZVFHMIN64-NEXT: lh a1, 460(sp) -; ZVFHMIN64-NEXT: fmv.h.x ft0, a3 +; ZVFHMIN64-NEXT: fmv.h.x fa2, s3 +; ZVFHMIN64-NEXT: fmv.h.x fa1, s7 +; ZVFHMIN64-NEXT: fmv.h.x fa0, a0 +; ZVFHMIN64-NEXT: fmv.h.x ft0, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa0, ft0 +; ZVFHMIN64-NEXT: sb a0, 166(sp) +; ZVFHMIN64-NEXT: lh a0, 714(sp) +; ZVFHMIN64-NEXT: lh a1, 458(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa0, s4 +; ZVFHMIN64-NEXT: fmv.h.x ft0, s8 ; ZVFHMIN64-NEXT: fmv.h.x ft1, a0 -; ZVFHMIN64-NEXT: feq.h a0, fa5, ft0 +; ZVFHMIN64-NEXT: fmv.h.x ft2, a1 +; ZVFHMIN64-NEXT: feq.h a0, ft1, ft2 +; ZVFHMIN64-NEXT: sb a0, 165(sp) +; ZVFHMIN64-NEXT: lh a0, 712(sp) +; ZVFHMIN64-NEXT: lh a1, 456(sp) +; ZVFHMIN64-NEXT: fmv.h.x ft1, s10 +; ZVFHMIN64-NEXT: fmv.h.x ft2, s11 +; ZVFHMIN64-NEXT: fmv.h.x ft3, a0 +; ZVFHMIN64-NEXT: fmv.h.x ft4, a1 +; ZVFHMIN64-NEXT: feq.h a0, ft3, ft4 +; ZVFHMIN64-NEXT: sb a0, 164(sp) +; ZVFHMIN64-NEXT: lh a0, 710(sp) +; ZVFHMIN64-NEXT: fmv.h.x ft3, a4 +; ZVFHMIN64-NEXT: lh a1, 454(sp) +; ZVFHMIN64-NEXT: fmv.h.x ft4, ra +; ZVFHMIN64-NEXT: fmv.h.x ft5, a0 +; ZVFHMIN64-NEXT: feq.h a0, fa5, ft1 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: feq.h a1, ft1, fa5 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN64-NEXT: sb a1, 166(sp) -; ZVFHMIN64-NEXT: lh a1, 714(sp) -; ZVFHMIN64-NEXT: fmv.h.x ft0, a2 -; ZVFHMIN64-NEXT: lh a2, 458(sp) -; ZVFHMIN64-NEXT: feq.h a3, fa4, fa5 +; ZVFHMIN64-NEXT: feq.h a1, ft5, fa5 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a3 +; ZVFHMIN64-NEXT: sb a1, 163(sp) +; ZVFHMIN64-NEXT: lh a1, 708(sp) +; ZVFHMIN64-NEXT: fmv.h.x ft1, a2 +; ZVFHMIN64-NEXT: lh a2, 452(sp) +; ZVFHMIN64-NEXT: feq.h a3, fa0, fa5 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: feq.h a1, fa3, ft0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s3 -; ZVFHMIN64-NEXT: sb a2, 165(sp) -; ZVFHMIN64-NEXT: lh a2, 712(sp) -; ZVFHMIN64-NEXT: lh a4, 456(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, s11 -; ZVFHMIN64-NEXT: feq.h s3, fa2, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a4 -; ZVFHMIN64-NEXT: feq.h a2, fa4, fa3 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN64-NEXT: sb a2, 164(sp) -; ZVFHMIN64-NEXT: lh a2, 710(sp) -; ZVFHMIN64-NEXT: lh a4, 454(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa3, s10 -; ZVFHMIN64-NEXT: feq.h a5, fa1, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a4 -; ZVFHMIN64-NEXT: feq.h a2, fa4, fa2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, ra -; ZVFHMIN64-NEXT: sb a2, 163(sp) -; ZVFHMIN64-NEXT: lh a2, 708(sp) -; ZVFHMIN64-NEXT: lh a4, 452(sp) -; ZVFHMIN64-NEXT: feq.h s4, fa0, fa3 -; ZVFHMIN64-NEXT: feq.h s5, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a1, ft0, ft1 +; ZVFHMIN64-NEXT: fmv.h.x fa0, a2 +; ZVFHMIN64-NEXT: feq.h a2, fa5, fa0 +; ZVFHMIN64-NEXT: fmv.h.x fa5, s9 ; ZVFHMIN64-NEXT: sb a2, 162(sp) ; ZVFHMIN64-NEXT: lh a2, 706(sp) ; ZVFHMIN64-NEXT: lh a4, 450(sp) -; ZVFHMIN64-NEXT: sb s5, 129(sp) -; ZVFHMIN64-NEXT: sb s4, 130(sp) -; ZVFHMIN64-NEXT: sb a5, 131(sp) -; ZVFHMIN64-NEXT: sb s3, 132(sp) +; ZVFHMIN64-NEXT: sb a1, 129(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa1, fa5 +; ZVFHMIN64-NEXT: sb a3, 130(sp) +; ZVFHMIN64-NEXT: feq.h a3, 
fa2, ft4 +; ZVFHMIN64-NEXT: sb a1, 131(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa4, ft2 +; ZVFHMIN64-NEXT: sb a3, 132(sp) +; ZVFHMIN64-NEXT: feq.h a3, fa3, ft3 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 ; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN64-NEXT: sb a1, 133(sp) -; ZVFHMIN64-NEXT: sb a3, 134(sp) +; ZVFHMIN64-NEXT: sb a3, 133(sp) +; ZVFHMIN64-NEXT: sb a1, 134(sp) ; ZVFHMIN64-NEXT: sb a0, 135(sp) ; ZVFHMIN64-NEXT: sb a2, 161(sp) ; ZVFHMIN64-NEXT: lh a0, 610(sp) ; ZVFHMIN64-NEXT: lh a1, 354(sp) -; ZVFHMIN64-NEXT: vmv.x.s s6, v23 +; ZVFHMIN64-NEXT: vmv.x.s s4, v23 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 18 +; ZVFHMIN64-NEXT: li a3, 10 ; ZVFHMIN64-NEXT: mul a2, a2, a3 ; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: lh s5, 800(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: lh s2, 800(a2) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 @@ -2889,13 +2882,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: lh a0, 608(sp) ; ZVFHMIN64-NEXT: lh a1, 352(sp) ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 22 -; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: slli a2, a2, 4 ; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: lh s4, 800(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: lh s5, 800(a2) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 21 -; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: slli a3, a2, 4 +; ZVFHMIN64-NEXT: sub a2, a3, a2 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: lh s3, 800(a2) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 @@ -2904,148 +2896,148 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: sb a0, 240(sp) ; ZVFHMIN64-NEXT: lh a0, 606(sp) ; ZVFHMIN64-NEXT: lh a1, 350(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa3, t5 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa4, fa2 +; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 7 +; ZVFHMIN64-NEXT: vmv.x.s s6, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 239(sp) ; ZVFHMIN64-NEXT: lh a0, 604(sp) ; ZVFHMIN64-NEXT: lh a1, 348(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, t6 -; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 7 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa1, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 6 +; ZVFHMIN64-NEXT: vmv.x.s s7, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 238(sp) ; ZVFHMIN64-NEXT: lh a0, 602(sp) ; ZVFHMIN64-NEXT: lh a1, 346(sp) -; ZVFHMIN64-NEXT: vmv.x.s a2, v8 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 6 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa1, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 5 +; ZVFHMIN64-NEXT: vmv.x.s s8, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 237(sp) ; ZVFHMIN64-NEXT: lh a0, 600(sp) ; ZVFHMIN64-NEXT: lh a1, 344(sp) -; ZVFHMIN64-NEXT: vmv.x.s a3, v8 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 5 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a0 -; ZVFHMIN64-NEXT: 
fmv.h.x fa1, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 4 +; ZVFHMIN64-NEXT: vmv.x.s s9, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 236(sp) ; ZVFHMIN64-NEXT: lh a0, 598(sp) ; ZVFHMIN64-NEXT: lh a1, 342(sp) -; ZVFHMIN64-NEXT: vmv.x.s a4, v8 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 4 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa1, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 3 +; ZVFHMIN64-NEXT: vmv.x.s s10, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 235(sp) ; ZVFHMIN64-NEXT: lh a0, 596(sp) ; ZVFHMIN64-NEXT: lh a1, 340(sp) -; ZVFHMIN64-NEXT: vmv.x.s a5, v8 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 3 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa1, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 2 +; ZVFHMIN64-NEXT: vmv.x.s s11, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 234(sp) ; ZVFHMIN64-NEXT: lh a0, 594(sp) ; ZVFHMIN64-NEXT: lh a1, 338(sp) -; ZVFHMIN64-NEXT: vmv.x.s t6, v8 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 2 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa1, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 1 +; ZVFHMIN64-NEXT: vmv.x.s ra, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 233(sp) ; ZVFHMIN64-NEXT: lh a0, 592(sp) ; ZVFHMIN64-NEXT: lh a1, 336(sp) -; ZVFHMIN64-NEXT: vmv.x.s s2, v8 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 1 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa1, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 +; ZVFHMIN64-NEXT: fmv.h.x fa5, t4 +; ZVFHMIN64-NEXT: fmv.h.x fa4, t6 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa2, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 ; ZVFHMIN64-NEXT: sb a0, 232(sp) ; ZVFHMIN64-NEXT: lh a0, 590(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa2, a2 ; ZVFHMIN64-NEXT: lh a1, 334(sp) -; ZVFHMIN64-NEXT: vmv.x.s a2, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa3, t5 +; ZVFHMIN64-NEXT: fmv.h.x fa2, s4 ; ZVFHMIN64-NEXT: fmv.h.x fa1, a0 -; ZVFHMIN64-NEXT: feq.h t5, fa3, fa2 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa1, fa3 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a3 +; ZVFHMIN64-NEXT: fmv.h.x fa0, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa1, fa0 ; ZVFHMIN64-NEXT: sb a0, 231(sp) ; ZVFHMIN64-NEXT: lh a0, 588(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa2, a4 ; ZVFHMIN64-NEXT: lh a1, 332(sp) -; ZVFHMIN64-NEXT: feq.h a3, fa5, fa3 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: feq.h a0, fa4, fa2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s6 -; ZVFHMIN64-NEXT: sb a1, 230(sp) -; ZVFHMIN64-NEXT: lh a1, 586(sp) -; ZVFHMIN64-NEXT: lh a4, 330(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN64-NEXT: feq.h a5, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa1, s2 +; ZVFHMIN64-NEXT: fmv.h.x fa0, s5 +; ZVFHMIN64-NEXT: fmv.h.x ft0, a0 +; ZVFHMIN64-NEXT: fmv.h.x ft1, a1 +; ZVFHMIN64-NEXT: feq.h a0, ft0, ft1 +; ZVFHMIN64-NEXT: sb a0, 230(sp) +; ZVFHMIN64-NEXT: lh a0, 586(sp) +; ZVFHMIN64-NEXT: fmv.h.x ft0, s3 +; ZVFHMIN64-NEXT: lh a1, 330(sp) +; ZVFHMIN64-NEXT: fmv.h.x ft1, s6 +; ZVFHMIN64-NEXT: fmv.h.x ft2, a0 +; ZVFHMIN64-NEXT: feq.h a0, 
fa5, ft1 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s5 +; ZVFHMIN64-NEXT: feq.h a1, ft2, fa5 +; ZVFHMIN64-NEXT: fmv.h.x fa5, s7 ; ZVFHMIN64-NEXT: sb a1, 229(sp) ; ZVFHMIN64-NEXT: lh a1, 584(sp) -; ZVFHMIN64-NEXT: lh a4, 328(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, t6 -; ZVFHMIN64-NEXT: feq.h t6, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s4 -; ZVFHMIN64-NEXT: sb a1, 228(sp) -; ZVFHMIN64-NEXT: lh a1, 582(sp) -; ZVFHMIN64-NEXT: lh a4, 326(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, s2 -; ZVFHMIN64-NEXT: feq.h s2, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x ft1, s8 +; ZVFHMIN64-NEXT: lh a2, 328(sp) +; ZVFHMIN64-NEXT: feq.h a3, fa4, fa5 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s3 -; ZVFHMIN64-NEXT: sb a1, 227(sp) -; ZVFHMIN64-NEXT: lh a1, 580(sp) -; ZVFHMIN64-NEXT: lh a4, 324(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa3, ft1 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 ; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: sb a1, 226(sp) -; ZVFHMIN64-NEXT: lh a1, 578(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, s9 +; ZVFHMIN64-NEXT: sb a2, 228(sp) +; ZVFHMIN64-NEXT: lh a2, 582(sp) +; ZVFHMIN64-NEXT: lh a4, 326(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, s10 +; ZVFHMIN64-NEXT: feq.h t4, fa2, fa5 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a4 +; ZVFHMIN64-NEXT: feq.h a2, fa5, fa3 +; ZVFHMIN64-NEXT: fmv.h.x fa5, s11 +; ZVFHMIN64-NEXT: fmv.h.x fa3, ra +; ZVFHMIN64-NEXT: sb a2, 227(sp) +; ZVFHMIN64-NEXT: lh a2, 580(sp) +; ZVFHMIN64-NEXT: lh a4, 324(sp) +; ZVFHMIN64-NEXT: feq.h t5, fa0, fa5 +; ZVFHMIN64-NEXT: feq.h t6, ft0, fa3 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a4 +; ZVFHMIN64-NEXT: feq.h a2, fa5, fa3 +; ZVFHMIN64-NEXT: sb a2, 226(sp) +; ZVFHMIN64-NEXT: lh a2, 578(sp) ; ZVFHMIN64-NEXT: lh a4, 322(sp) -; ZVFHMIN64-NEXT: sb a2, 193(sp) -; ZVFHMIN64-NEXT: sb s2, 194(sp) +; ZVFHMIN64-NEXT: sb t6, 193(sp) +; ZVFHMIN64-NEXT: feq.h t6, fa1, fa4 +; ZVFHMIN64-NEXT: sb t5, 194(sp) ; ZVFHMIN64-NEXT: sb t6, 195(sp) -; ZVFHMIN64-NEXT: sb a5, 196(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: sb t4, 196(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 197(sp) +; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 197(sp) ; ZVFHMIN64-NEXT: sb a3, 198(sp) -; ZVFHMIN64-NEXT: sb t5, 199(sp) -; ZVFHMIN64-NEXT: sb a1, 225(sp) +; ZVFHMIN64-NEXT: sb a0, 199(sp) +; ZVFHMIN64-NEXT: sb a2, 225(sp) ; ZVFHMIN64-NEXT: lh a0, 766(sp) ; ZVFHMIN64-NEXT: lh a1, 510(sp) ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 19 -; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: slli a3, a2, 4 +; ZVFHMIN64-NEXT: add a2, a3, a2 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 ; ZVFHMIN64-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload ; ZVFHMIN64-NEXT: vmv.x.s s2, v8 ; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 14 +; ZVFHMIN64-NEXT: li a3, 11 ; ZVFHMIN64-NEXT: mul a2, a2, a3 ; ZVFHMIN64-NEXT: add a2, sp, a2 ; ZVFHMIN64-NEXT: addi a2, a2, 800 @@ -3057,301 +3049,305 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: sb a0, 191(sp) ; 
ZVFHMIN64-NEXT: lh a0, 764(sp) ; ZVFHMIN64-NEXT: lh a1, 508(sp) -; ZVFHMIN64-NEXT: vmv.x.s t5, v6 -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: slli a2, a2, 2 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s a2, v8 +; ZVFHMIN64-NEXT: vmv.x.s t5, v4 +; ZVFHMIN64-NEXT: vmv.x.s t4, v30 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 190(sp) ; ZVFHMIN64-NEXT: lh a0, 762(sp) ; ZVFHMIN64-NEXT: lh a1, 506(sp) +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: slli a2, a2, 2 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s a2, v8 ; ZVFHMIN64-NEXT: csrr a3, vlenb -; ZVFHMIN64-NEXT: slli a3, a3, 3 +; ZVFHMIN64-NEXT: slli a3, a3, 1 ; ZVFHMIN64-NEXT: add a3, sp, a3 ; ZVFHMIN64-NEXT: addi a3, a3, 800 ; ZVFHMIN64-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload ; ZVFHMIN64-NEXT: vmv.x.s a3, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 189(sp) +; ZVFHMIN64-NEXT: lh a0, 760(sp) +; ZVFHMIN64-NEXT: lh a1, 504(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, t3 ; ZVFHMIN64-NEXT: csrr a4, vlenb -; ZVFHMIN64-NEXT: li a5, 6 -; ZVFHMIN64-NEXT: mul a4, a4, a5 +; ZVFHMIN64-NEXT: li t3, 6 +; ZVFHMIN64-NEXT: mul a4, a4, t3 ; ZVFHMIN64-NEXT: add a4, sp, a4 ; ZVFHMIN64-NEXT: addi a4, a4, 800 ; ZVFHMIN64-NEXT: vl2r.v v8, (a4) # Unknown-size Folded Reload ; ZVFHMIN64-NEXT: vmv.x.s a4, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 189(sp) -; ZVFHMIN64-NEXT: lh a1, 760(sp) -; ZVFHMIN64-NEXT: lh a5, 504(sp) -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: li s3, 12 -; ZVFHMIN64-NEXT: mul a0, a0, s3 -; ZVFHMIN64-NEXT: add a0, sp, a0 -; ZVFHMIN64-NEXT: addi a0, a0, 800 -; ZVFHMIN64-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s s5, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa4, fa3 +; ZVFHMIN64-NEXT: sb a0, 188(sp) +; ZVFHMIN64-NEXT: lh a0, 758(sp) +; ZVFHMIN64-NEXT: lh a1, 502(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, t2 +; ZVFHMIN64-NEXT: csrr t2, vlenb +; ZVFHMIN64-NEXT: slli t2, t2, 3 +; ZVFHMIN64-NEXT: add t2, sp, t2 +; ZVFHMIN64-NEXT: addi t2, t2, 800 +; ZVFHMIN64-NEXT: vl2r.v v8, (t2) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s t2, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa2, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN64-NEXT: sb a0, 187(sp) +; ZVFHMIN64-NEXT: lh a0, 756(sp) +; ZVFHMIN64-NEXT: lh a1, 500(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa3, t1 +; ZVFHMIN64-NEXT: csrr t1, vlenb +; ZVFHMIN64-NEXT: li t3, 13 +; ZVFHMIN64-NEXT: mul t1, t1, t3 +; ZVFHMIN64-NEXT: add t1, sp, t1 +; ZVFHMIN64-NEXT: addi t1, t1, 800 +; ZVFHMIN64-NEXT: vl2r.v v8, (t1) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s t3, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa2, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa1, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 +; ZVFHMIN64-NEXT: sb a0, 186(sp) +; ZVFHMIN64-NEXT: lh a0, 754(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa2, t0 +; ZVFHMIN64-NEXT: lh a1, 498(sp) +; ZVFHMIN64-NEXT: csrr t0, vlenb +; ZVFHMIN64-NEXT: li t1, 19 +; ZVFHMIN64-NEXT: mul t0, t0, t1 +; ZVFHMIN64-NEXT: add t0, sp, t0 +; ZVFHMIN64-NEXT: addi t0, t0, 800 +; 
ZVFHMIN64-NEXT: vl2r.v v8, (t0) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s s3, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa1, a0 ; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: li s3, 10 -; ZVFHMIN64-NEXT: mul a0, a0, s3 +; ZVFHMIN64-NEXT: li t0, 21 +; ZVFHMIN64-NEXT: mul a0, a0, t0 ; ZVFHMIN64-NEXT: add a0, sp, a0 ; ZVFHMIN64-NEXT: addi a0, a0, 800 ; ZVFHMIN64-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload ; ZVFHMIN64-NEXT: vmv.x.s a0, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: sb a1, 188(sp) -; ZVFHMIN64-NEXT: lh a1, 758(sp) -; ZVFHMIN64-NEXT: lh a5, 502(sp) -; ZVFHMIN64-NEXT: csrr s3, vlenb -; ZVFHMIN64-NEXT: slli s3, s3, 4 -; ZVFHMIN64-NEXT: add s3, sp, s3 -; ZVFHMIN64-NEXT: addi s3, s3, 800 -; ZVFHMIN64-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s s4, v8 -; ZVFHMIN64-NEXT: vmv.x.s s3, v16 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t4 -; ZVFHMIN64-NEXT: sb a1, 187(sp) -; ZVFHMIN64-NEXT: lh a1, 756(sp) -; ZVFHMIN64-NEXT: lh a5, 500(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h t4, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t3 -; ZVFHMIN64-NEXT: sb a1, 186(sp) -; ZVFHMIN64-NEXT: lh a1, 754(sp) -; ZVFHMIN64-NEXT: lh a2, 498(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN64-NEXT: feq.h t3, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t1 +; ZVFHMIN64-NEXT: fmv.h.x fa0, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa1, fa0 +; ZVFHMIN64-NEXT: fmv.h.x fa1, a2 ; ZVFHMIN64-NEXT: sb a1, 185(sp) ; ZVFHMIN64-NEXT: lh a1, 752(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa0, a3 ; ZVFHMIN64-NEXT: lh a2, 496(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN64-NEXT: feq.h t1, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h t0, fa5, fa1 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: feq.h t1, fa4, fa0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 ; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t2 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a4 ; ZVFHMIN64-NEXT: sb a1, 184(sp) ; ZVFHMIN64-NEXT: lh a1, 750(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, t2 ; ZVFHMIN64-NEXT: lh a2, 494(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, s5 -; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t0 -; ZVFHMIN64-NEXT: sb a1, 183(sp) -; ZVFHMIN64-NEXT: lh a1, 748(sp) -; ZVFHMIN64-NEXT: lh a2, 492(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a3, fa3, fa5 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa2, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a7 -; ZVFHMIN64-NEXT: sb a1, 182(sp) -; ZVFHMIN64-NEXT: lh a1, 746(sp) -; ZVFHMIN64-NEXT: lh a2, 490(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, s4 -; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a2, 183(sp) +; ZVFHMIN64-NEXT: lh a2, 748(sp) +; ZVFHMIN64-NEXT: lh a4, 492(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, t3 +; ZVFHMIN64-NEXT: feq.h a7, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 +; 
ZVFHMIN64-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a6 -; ZVFHMIN64-NEXT: sb a1, 181(sp) -; ZVFHMIN64-NEXT: lh a1, 744(sp) -; ZVFHMIN64-NEXT: lh a2, 488(sp) +; ZVFHMIN64-NEXT: sb a2, 182(sp) +; ZVFHMIN64-NEXT: lh a2, 746(sp) +; ZVFHMIN64-NEXT: lh a4, 490(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa4, s3 ; ZVFHMIN64-NEXT: feq.h a6, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: ld a2, 80(sp) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN64-NEXT: addi a2, sp, 800 -; ZVFHMIN64-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s a2, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a5 +; ZVFHMIN64-NEXT: sb a2, 181(sp) +; ZVFHMIN64-NEXT: lh a2, 744(sp) +; ZVFHMIN64-NEXT: lh a4, 488(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN64-NEXT: ld a4, 88(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fa5, a4 +; ZVFHMIN64-NEXT: vmv.x.s a5, v0 ; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma ; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 15 -; ZVFHMIN64-NEXT: vmv.x.s a5, v8 -; ZVFHMIN64-NEXT: sb a1, 180(sp) -; ZVFHMIN64-NEXT: lh a1, 742(sp) -; ZVFHMIN64-NEXT: lh a7, 486(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN64-NEXT: vmv.x.s a4, v8 +; ZVFHMIN64-NEXT: sb a2, 180(sp) +; ZVFHMIN64-NEXT: lh a2, 742(sp) +; ZVFHMIN64-NEXT: lh t2, 486(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 +; ZVFHMIN64-NEXT: feq.h a5, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN64-NEXT: fmv.h.x fa4, t2 ; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a7 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: sb a1, 179(sp) -; ZVFHMIN64-NEXT: lh a1, 740(sp) -; ZVFHMIN64-NEXT: lh a7, 484(sp) -; ZVFHMIN64-NEXT: sb a3, 140(sp) -; ZVFHMIN64-NEXT: sb t1, 141(sp) -; ZVFHMIN64-NEXT: sb t3, 142(sp) -; ZVFHMIN64-NEXT: sb t4, 143(sp) -; ZVFHMIN64-NEXT: sb a2, 136(sp) -; ZVFHMIN64-NEXT: sb a6, 137(sp) -; ZVFHMIN64-NEXT: sb a4, 138(sp) -; ZVFHMIN64-NEXT: sb a0, 139(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a7 +; ZVFHMIN64-NEXT: sb a2, 179(sp) +; ZVFHMIN64-NEXT: lh a2, 740(sp) +; ZVFHMIN64-NEXT: lh t2, 484(sp) +; ZVFHMIN64-NEXT: sb a1, 140(sp) +; ZVFHMIN64-NEXT: sb a3, 141(sp) +; ZVFHMIN64-NEXT: sb t1, 142(sp) +; ZVFHMIN64-NEXT: sb t0, 143(sp) +; ZVFHMIN64-NEXT: sb a5, 136(sp) +; ZVFHMIN64-NEXT: sb a0, 137(sp) +; ZVFHMIN64-NEXT: sb a6, 138(sp) +; ZVFHMIN64-NEXT: sb a7, 139(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN64-NEXT: fmv.h.x fa4, t2 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 178(sp) -; ZVFHMIN64-NEXT: lh a1, 638(sp) -; ZVFHMIN64-NEXT: lh a2, 382(sp) +; ZVFHMIN64-NEXT: lh a0, 638(sp) +; ZVFHMIN64-NEXT: lh a1, 382(sp) ; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 14 -; ZVFHMIN64-NEXT: vmv.x.s a0, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: sb a1, 255(sp) -; ZVFHMIN64-NEXT: lh a1, 636(sp) -; ZVFHMIN64-NEXT: lh a2, 380(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 13 ; ZVFHMIN64-NEXT: vmv.x.s t2, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: sb a1, 254(sp) -; ZVFHMIN64-NEXT: lh a1, 634(sp) 
-; ZVFHMIN64-NEXT: lh a2, 378(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 12 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 255(sp) +; ZVFHMIN64-NEXT: lh a0, 636(sp) +; ZVFHMIN64-NEXT: lh a1, 380(sp) +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 13 ; ZVFHMIN64-NEXT: vmv.x.s t1, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: sb a1, 253(sp) -; ZVFHMIN64-NEXT: lh a1, 632(sp) -; ZVFHMIN64-NEXT: lh a2, 376(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 11 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 254(sp) +; ZVFHMIN64-NEXT: lh a0, 634(sp) +; ZVFHMIN64-NEXT: lh a1, 378(sp) +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 12 ; ZVFHMIN64-NEXT: vmv.x.s t0, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: sb a1, 252(sp) -; ZVFHMIN64-NEXT: lh a1, 630(sp) -; ZVFHMIN64-NEXT: lh a2, 374(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 10 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 253(sp) +; ZVFHMIN64-NEXT: lh a0, 632(sp) +; ZVFHMIN64-NEXT: lh a1, 376(sp) +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 11 ; ZVFHMIN64-NEXT: vmv.x.s a7, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: sb a1, 251(sp) -; ZVFHMIN64-NEXT: lh a1, 628(sp) -; ZVFHMIN64-NEXT: lh a2, 372(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 9 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 252(sp) +; ZVFHMIN64-NEXT: lh a0, 630(sp) +; ZVFHMIN64-NEXT: lh a1, 374(sp) +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 10 ; ZVFHMIN64-NEXT: vmv.x.s a6, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: ld a2, 88(sp) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN64-NEXT: sb a1, 250(sp) -; ZVFHMIN64-NEXT: lh a1, 626(sp) -; ZVFHMIN64-NEXT: lh a2, 370(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: ld a2, 96(sp) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN64-NEXT: sb a1, 249(sp) -; ZVFHMIN64-NEXT: lh a1, 624(sp) -; ZVFHMIN64-NEXT: lh a2, 368(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 -; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: ld a1, 104(sp) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: sb a0, 248(sp) -; ZVFHMIN64-NEXT: lh a0, 622(sp) -; ZVFHMIN64-NEXT: lh a1, 366(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, t2 -; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 251(sp) +; ZVFHMIN64-NEXT: lh a0, 628(sp) +; ZVFHMIN64-NEXT: lh a1, 372(sp) +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 9 +; ZVFHMIN64-NEXT: vmv.x.s a5, v8 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: ld a1, 112(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: ld a1, 
104(sp) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: sb a0, 247(sp) -; ZVFHMIN64-NEXT: lh a0, 620(sp) -; ZVFHMIN64-NEXT: lh a1, 364(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, t1 -; ZVFHMIN64-NEXT: feq.h a5, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 250(sp) +; ZVFHMIN64-NEXT: lh a0, 626(sp) +; ZVFHMIN64-NEXT: lh a1, 370(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: ld a1, 120(sp) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: sb a0, 246(sp) -; ZVFHMIN64-NEXT: lh a0, 618(sp) -; ZVFHMIN64-NEXT: lh a1, 362(sp) +; ZVFHMIN64-NEXT: sb a0, 249(sp) +; ZVFHMIN64-NEXT: lh a1, 624(sp) +; ZVFHMIN64-NEXT: lh a3, 368(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, t2 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: ld a3, 96(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fa5, a3 +; ZVFHMIN64-NEXT: sb a1, 248(sp) +; ZVFHMIN64-NEXT: lh a1, 622(sp) +; ZVFHMIN64-NEXT: lh a3, 366(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, t1 +; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: ld a3, 112(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fa5, a3 +; ZVFHMIN64-NEXT: sb a1, 247(sp) +; ZVFHMIN64-NEXT: lh a1, 620(sp) +; ZVFHMIN64-NEXT: lh a3, 364(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa4, t0 ; ZVFHMIN64-NEXT: feq.h t0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, s2 -; ZVFHMIN64-NEXT: sb a0, 245(sp) -; ZVFHMIN64-NEXT: lh a0, 616(sp) -; ZVFHMIN64-NEXT: lh a1, 360(sp) +; ZVFHMIN64-NEXT: sb a1, 246(sp) +; ZVFHMIN64-NEXT: lh a1, 618(sp) +; ZVFHMIN64-NEXT: lh a3, 362(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa4, a7 ; ZVFHMIN64-NEXT: feq.h a7, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, t6 -; ZVFHMIN64-NEXT: sb a0, 244(sp) -; ZVFHMIN64-NEXT: lh a0, 614(sp) -; ZVFHMIN64-NEXT: lh a1, 358(sp) +; ZVFHMIN64-NEXT: sb a1, 245(sp) +; ZVFHMIN64-NEXT: lh a1, 616(sp) +; ZVFHMIN64-NEXT: lh a3, 360(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa4, a6 ; ZVFHMIN64-NEXT: feq.h a6, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, t5 +; ZVFHMIN64-NEXT: sb a1, 244(sp) +; ZVFHMIN64-NEXT: lh a1, 614(sp) +; ZVFHMIN64-NEXT: lh a3, 358(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 +; ZVFHMIN64-NEXT: feq.h a5, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, t4 ; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 8 -; ZVFHMIN64-NEXT: vmv.x.s a1, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: sb a0, 243(sp) -; ZVFHMIN64-NEXT: lh a0, 612(sp) -; ZVFHMIN64-NEXT: lh a1, 356(sp) -; ZVFHMIN64-NEXT: sb a5, 204(sp) -; ZVFHMIN64-NEXT: sb a2, 205(sp) -; ZVFHMIN64-NEXT: 
sb a3, 206(sp) -; ZVFHMIN64-NEXT: sb a4, 207(sp) -; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN64-NEXT: sb a2, 200(sp) -; ZVFHMIN64-NEXT: sb a6, 201(sp) -; ZVFHMIN64-NEXT: sb a7, 202(sp) -; ZVFHMIN64-NEXT: sb t0, 203(sp) -; ZVFHMIN64-NEXT: li a2, 128 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: vmv.x.s a3, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN64-NEXT: sb a1, 243(sp) +; ZVFHMIN64-NEXT: lh a1, 612(sp) +; ZVFHMIN64-NEXT: lh a3, 356(sp) +; ZVFHMIN64-NEXT: sb t0, 204(sp) +; ZVFHMIN64-NEXT: sb a4, 205(sp) +; ZVFHMIN64-NEXT: sb a0, 206(sp) +; ZVFHMIN64-NEXT: sb a2, 207(sp) ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 242(sp) -; ZVFHMIN64-NEXT: addi a0, sp, 128 -; ZVFHMIN64-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; ZVFHMIN64-NEXT: vle8.v v8, (a0) +; ZVFHMIN64-NEXT: sb a0, 200(sp) +; ZVFHMIN64-NEXT: sb a5, 201(sp) +; ZVFHMIN64-NEXT: sb a6, 202(sp) +; ZVFHMIN64-NEXT: sb a7, 203(sp) +; ZVFHMIN64-NEXT: li a0, 128 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 242(sp) +; ZVFHMIN64-NEXT: addi a1, sp, 128 +; ZVFHMIN64-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; ZVFHMIN64-NEXT: vle8.v v8, (a1) ; ZVFHMIN64-NEXT: vand.vi v8, v8, 1 ; ZVFHMIN64-NEXT: vmsne.vi v0, v8, 0 ; ZVFHMIN64-NEXT: addi sp, s0, -896 diff --git a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll index 5b272c98a1e0a..dd2a8240ee253 100644 --- a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll +++ b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll @@ -507,34 +507,26 @@ define <8 x i1> @match_v8i8_v16i8(<8 x i8> %op1, <16 x i8> %op2, <8 x i1> %mask) define @match_nxv16i8_v32i8( %op1, <32 x i8> %op2, %mask) { ; RV32-LABEL: match_nxv16i8_v32i8: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -64 -; RV32-NEXT: .cfi_def_cfa_offset 64 -; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s1, 52(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s2, 48(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s3, 44(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s4, 40(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s5, 36(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s6, 32(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s7, 28(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s8, 24(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s9, 20(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s10, 16(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s11, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: .cfi_offset s1, -12 -; RV32-NEXT: .cfi_offset s2, -16 -; RV32-NEXT: .cfi_offset s3, -20 -; RV32-NEXT: .cfi_offset s4, -24 -; RV32-NEXT: .cfi_offset s5, -28 -; RV32-NEXT: .cfi_offset s6, -32 -; RV32-NEXT: .cfi_offset s7, -36 -; RV32-NEXT: .cfi_offset s8, -40 -; RV32-NEXT: .cfi_offset s9, -44 -; RV32-NEXT: .cfi_offset s10, -48 -; RV32-NEXT: .cfi_offset s11, -52 +; RV32-NEXT: addi sp, sp, -48 +; RV32-NEXT: .cfi_def_cfa_offset 48 +; RV32-NEXT: sw s0, 44(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 40(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 36(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 32(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s6, 20(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s7, 16(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s8, 12(sp) # 4-byte Folded Spill +; 
RV32-NEXT: .cfi_offset s0, -4 +; RV32-NEXT: .cfi_offset s1, -8 +; RV32-NEXT: .cfi_offset s2, -12 +; RV32-NEXT: .cfi_offset s3, -16 +; RV32-NEXT: .cfi_offset s4, -20 +; RV32-NEXT: .cfi_offset s5, -24 +; RV32-NEXT: .cfi_offset s6, -28 +; RV32-NEXT: .cfi_offset s7, -32 +; RV32-NEXT: .cfi_offset s8, -36 ; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: vslidedown.vi v12, v10, 1 @@ -592,43 +584,43 @@ define @match_nxv16i8_v32i8( %op1, <32 x i8 ; RV32-NEXT: vmv.x.s s5, v15 ; RV32-NEXT: vmv.x.s s6, v16 ; RV32-NEXT: vmv.x.s s7, v17 -; RV32-NEXT: vmv.x.s s8, v18 -; RV32-NEXT: vmv.x.s s9, v19 -; RV32-NEXT: vmv.x.s s10, v20 -; RV32-NEXT: vmv.x.s s11, v21 -; RV32-NEXT: vsetvli ra, zero, e8, m2, ta, ma +; RV32-NEXT: vsetvli s8, zero, e8, m2, ta, ma ; RV32-NEXT: vmseq.vx v12, v8, a0 -; RV32-NEXT: vmv.x.s a0, v22 +; RV32-NEXT: vmv.x.s a0, v18 ; RV32-NEXT: vmseq.vx v13, v8, s2 -; RV32-NEXT: vmv.x.s s2, v23 +; RV32-NEXT: vmv.x.s s2, v19 ; RV32-NEXT: vmseq.vx v14, v8, s3 -; RV32-NEXT: vmv.x.s s3, v11 -; RV32-NEXT: vmseq.vx v11, v8, s4 -; RV32-NEXT: vmv.x.s s4, v24 -; RV32-NEXT: vmseq.vx v15, v8, s5 -; RV32-NEXT: vmv.x.s s5, v10 +; RV32-NEXT: vmv.x.s s3, v20 +; RV32-NEXT: vmseq.vx v15, v8, s4 +; RV32-NEXT: vmv.x.s s4, v21 +; RV32-NEXT: vmseq.vx v16, v8, s5 +; RV32-NEXT: vmv.x.s s5, v22 +; RV32-NEXT: vmseq.vx v17, v8, s6 +; RV32-NEXT: vmv.x.s s6, v23 +; RV32-NEXT: vmseq.vx v18, v8, s7 +; RV32-NEXT: vmv.x.s s7, v11 +; RV32-NEXT: vmseq.vx v11, v8, a0 +; RV32-NEXT: vmv.x.s a0, v24 +; RV32-NEXT: vmseq.vx v19, v8, s2 +; RV32-NEXT: vmv.x.s s2, v10 ; RV32-NEXT: vmor.mm v10, v12, v13 -; RV32-NEXT: vmseq.vx v12, v8, s6 ; RV32-NEXT: vmor.mm v10, v10, v14 -; RV32-NEXT: vmseq.vx v13, v8, s7 -; RV32-NEXT: vmor.mm v10, v10, v11 -; RV32-NEXT: vmseq.vx v11, v8, s8 ; RV32-NEXT: vmor.mm v10, v10, v15 -; RV32-NEXT: vmseq.vx v14, v8, s9 -; RV32-NEXT: vmor.mm v10, v10, v12 -; RV32-NEXT: vmseq.vx v12, v8, s10 -; RV32-NEXT: vmor.mm v10, v10, v13 -; RV32-NEXT: vmseq.vx v13, v8, s11 -; RV32-NEXT: vmor.mm v10, v10, v11 -; RV32-NEXT: vmseq.vx v11, v8, a0 -; RV32-NEXT: vmor.mm v10, v10, v14 -; RV32-NEXT: vmseq.vx v14, v8, s2 -; RV32-NEXT: vmor.mm v10, v10, v12 +; RV32-NEXT: vmor.mm v10, v10, v16 +; RV32-NEXT: vmor.mm v10, v10, v17 ; RV32-NEXT: vmseq.vx v12, v8, s3 -; RV32-NEXT: vmor.mm v10, v10, v13 +; RV32-NEXT: vmor.mm v10, v10, v18 ; RV32-NEXT: vmseq.vx v13, v8, s4 ; RV32-NEXT: vmor.mm v10, v10, v11 ; RV32-NEXT: vmseq.vx v11, v8, s5 +; RV32-NEXT: vmor.mm v10, v10, v19 +; RV32-NEXT: vmseq.vx v14, v8, s6 +; RV32-NEXT: vmor.mm v10, v10, v12 +; RV32-NEXT: vmseq.vx v12, v8, s7 +; RV32-NEXT: vmor.mm v10, v10, v13 +; RV32-NEXT: vmseq.vx v13, v8, a0 +; RV32-NEXT: vmor.mm v10, v10, v11 +; RV32-NEXT: vmseq.vx v11, v8, s2 ; RV32-NEXT: vmor.mm v10, v10, v14 ; RV32-NEXT: vmseq.vx v14, v8, a1 ; RV32-NEXT: vmor.mm v10, v10, v12 @@ -666,20 +658,15 @@ define @match_nxv16i8_v32i8( %op1, <32 x i8 ; RV32-NEXT: vmseq.vx v11, v8, s1 ; RV32-NEXT: vmor.mm v8, v10, v11 ; RV32-NEXT: vmand.mm v0, v8, v0 -; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s1, 52(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s2, 48(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s3, 44(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s4, 40(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s5, 36(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s6, 32(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s7, 28(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s8, 24(sp) # 4-byte Folded Reload -; RV32-NEXT: lw 
s9, 20(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s10, 16(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s11, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: .cfi_restore ra +; RV32-NEXT: lw s0, 44(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 40(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 36(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s3, 32(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s4, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s5, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s6, 20(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s7, 16(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s8, 12(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore s0 ; RV32-NEXT: .cfi_restore s1 ; RV32-NEXT: .cfi_restore s2 @@ -689,43 +676,32 @@ define @match_nxv16i8_v32i8( %op1, <32 x i8 ; RV32-NEXT: .cfi_restore s6 ; RV32-NEXT: .cfi_restore s7 ; RV32-NEXT: .cfi_restore s8 -; RV32-NEXT: .cfi_restore s9 -; RV32-NEXT: .cfi_restore s10 -; RV32-NEXT: .cfi_restore s11 -; RV32-NEXT: addi sp, sp, 64 +; RV32-NEXT: addi sp, sp, 48 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: match_nxv16i8_v32i8: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -112 -; RV64-NEXT: .cfi_def_cfa_offset 112 -; RV64-NEXT: sd ra, 104(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 96(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s1, 88(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s2, 80(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s3, 72(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s4, 64(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s5, 56(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s6, 48(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s7, 40(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s8, 32(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s9, 24(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s10, 16(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s11, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: .cfi_offset s1, -24 -; RV64-NEXT: .cfi_offset s2, -32 -; RV64-NEXT: .cfi_offset s3, -40 -; RV64-NEXT: .cfi_offset s4, -48 -; RV64-NEXT: .cfi_offset s5, -56 -; RV64-NEXT: .cfi_offset s6, -64 -; RV64-NEXT: .cfi_offset s7, -72 -; RV64-NEXT: .cfi_offset s8, -80 -; RV64-NEXT: .cfi_offset s9, -88 -; RV64-NEXT: .cfi_offset s10, -96 -; RV64-NEXT: .cfi_offset s11, -104 +; RV64-NEXT: addi sp, sp, -80 +; RV64-NEXT: .cfi_def_cfa_offset 80 +; RV64-NEXT: sd s0, 72(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 64(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s2, 56(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s3, 48(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s4, 40(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s5, 32(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s6, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s7, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s8, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset s0, -8 +; RV64-NEXT: .cfi_offset s1, -16 +; RV64-NEXT: .cfi_offset s2, -24 +; RV64-NEXT: .cfi_offset s3, -32 +; RV64-NEXT: .cfi_offset s4, -40 +; RV64-NEXT: .cfi_offset s5, -48 +; RV64-NEXT: .cfi_offset s6, -56 +; RV64-NEXT: .cfi_offset s7, -64 +; RV64-NEXT: .cfi_offset s8, -72 ; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: vslidedown.vi v12, v10, 1 @@ -783,43 +759,43 @@ define @match_nxv16i8_v32i8( %op1, <32 x i8 ; RV64-NEXT: vmv.x.s s5, v15 ; RV64-NEXT: vmv.x.s s6, v16 ; RV64-NEXT: vmv.x.s s7, v17 -; RV64-NEXT: vmv.x.s s8, v18 -; RV64-NEXT: vmv.x.s s9, v19 -; RV64-NEXT: vmv.x.s s10, v20 -; RV64-NEXT: vmv.x.s s11, v21 -; RV64-NEXT: vsetvli ra, zero, e8, m2, ta, ma +; 
RV64-NEXT: vsetvli s8, zero, e8, m2, ta, ma ; RV64-NEXT: vmseq.vx v12, v8, a0 -; RV64-NEXT: vmv.x.s a0, v22 +; RV64-NEXT: vmv.x.s a0, v18 ; RV64-NEXT: vmseq.vx v13, v8, s2 -; RV64-NEXT: vmv.x.s s2, v23 +; RV64-NEXT: vmv.x.s s2, v19 ; RV64-NEXT: vmseq.vx v14, v8, s3 -; RV64-NEXT: vmv.x.s s3, v11 -; RV64-NEXT: vmseq.vx v11, v8, s4 -; RV64-NEXT: vmv.x.s s4, v24 -; RV64-NEXT: vmseq.vx v15, v8, s5 -; RV64-NEXT: vmv.x.s s5, v10 +; RV64-NEXT: vmv.x.s s3, v20 +; RV64-NEXT: vmseq.vx v15, v8, s4 +; RV64-NEXT: vmv.x.s s4, v21 +; RV64-NEXT: vmseq.vx v16, v8, s5 +; RV64-NEXT: vmv.x.s s5, v22 +; RV64-NEXT: vmseq.vx v17, v8, s6 +; RV64-NEXT: vmv.x.s s6, v23 +; RV64-NEXT: vmseq.vx v18, v8, s7 +; RV64-NEXT: vmv.x.s s7, v11 +; RV64-NEXT: vmseq.vx v11, v8, a0 +; RV64-NEXT: vmv.x.s a0, v24 +; RV64-NEXT: vmseq.vx v19, v8, s2 +; RV64-NEXT: vmv.x.s s2, v10 ; RV64-NEXT: vmor.mm v10, v12, v13 -; RV64-NEXT: vmseq.vx v12, v8, s6 ; RV64-NEXT: vmor.mm v10, v10, v14 -; RV64-NEXT: vmseq.vx v13, v8, s7 -; RV64-NEXT: vmor.mm v10, v10, v11 -; RV64-NEXT: vmseq.vx v11, v8, s8 ; RV64-NEXT: vmor.mm v10, v10, v15 -; RV64-NEXT: vmseq.vx v14, v8, s9 -; RV64-NEXT: vmor.mm v10, v10, v12 -; RV64-NEXT: vmseq.vx v12, v8, s10 -; RV64-NEXT: vmor.mm v10, v10, v13 -; RV64-NEXT: vmseq.vx v13, v8, s11 -; RV64-NEXT: vmor.mm v10, v10, v11 -; RV64-NEXT: vmseq.vx v11, v8, a0 -; RV64-NEXT: vmor.mm v10, v10, v14 -; RV64-NEXT: vmseq.vx v14, v8, s2 -; RV64-NEXT: vmor.mm v10, v10, v12 +; RV64-NEXT: vmor.mm v10, v10, v16 +; RV64-NEXT: vmor.mm v10, v10, v17 ; RV64-NEXT: vmseq.vx v12, v8, s3 -; RV64-NEXT: vmor.mm v10, v10, v13 +; RV64-NEXT: vmor.mm v10, v10, v18 ; RV64-NEXT: vmseq.vx v13, v8, s4 ; RV64-NEXT: vmor.mm v10, v10, v11 ; RV64-NEXT: vmseq.vx v11, v8, s5 +; RV64-NEXT: vmor.mm v10, v10, v19 +; RV64-NEXT: vmseq.vx v14, v8, s6 +; RV64-NEXT: vmor.mm v10, v10, v12 +; RV64-NEXT: vmseq.vx v12, v8, s7 +; RV64-NEXT: vmor.mm v10, v10, v13 +; RV64-NEXT: vmseq.vx v13, v8, a0 +; RV64-NEXT: vmor.mm v10, v10, v11 +; RV64-NEXT: vmseq.vx v11, v8, s2 ; RV64-NEXT: vmor.mm v10, v10, v14 ; RV64-NEXT: vmseq.vx v14, v8, a1 ; RV64-NEXT: vmor.mm v10, v10, v12 @@ -857,20 +833,15 @@ define @match_nxv16i8_v32i8( %op1, <32 x i8 ; RV64-NEXT: vmseq.vx v11, v8, s1 ; RV64-NEXT: vmor.mm v8, v10, v11 ; RV64-NEXT: vmand.mm v0, v8, v0 -; RV64-NEXT: ld ra, 104(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s0, 96(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s1, 88(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s2, 80(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s3, 72(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s4, 64(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s5, 56(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s6, 48(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s7, 40(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s8, 32(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s9, 24(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s10, 16(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s11, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: .cfi_restore ra +; RV64-NEXT: ld s0, 72(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 64(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 56(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s3, 48(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s4, 40(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s5, 32(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s6, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s7, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s8, 8(sp) # 8-byte Folded Reload ; RV64-NEXT: .cfi_restore s0 ; RV64-NEXT: .cfi_restore s1 ; RV64-NEXT: .cfi_restore s2 @@ -880,10 +851,7 @@ define 
@match_nxv16i8_v32i8( %op1, <32 x i8 ; RV64-NEXT: .cfi_restore s6 ; RV64-NEXT: .cfi_restore s7 ; RV64-NEXT: .cfi_restore s8 -; RV64-NEXT: .cfi_restore s9 -; RV64-NEXT: .cfi_restore s10 -; RV64-NEXT: .cfi_restore s11 -; RV64-NEXT: addi sp, sp, 112 +; RV64-NEXT: addi sp, sp, 80 ; RV64-NEXT: .cfi_def_cfa_offset 0 ; RV64-NEXT: ret %r = tail call @llvm.experimental.vector.match( %op1, <32 x i8> %op2, %mask) @@ -893,20 +861,16 @@ define @match_nxv16i8_v32i8( %op1, <32 x i8 define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask) { ; RV32-LABEL: match_v16i8_v32i8: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: sw s0, 44(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s1, 40(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s2, 36(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s3, 32(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s4, 28(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s5, 24(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s6, 20(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s7, 16(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s8, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s9, 8(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s10, 4(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s11, 0(sp) # 4-byte Folded Spill +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw s0, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 20(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 16(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 8(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s6, 4(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s7, 0(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset s0, -4 ; RV32-NEXT: .cfi_offset s1, -8 ; RV32-NEXT: .cfi_offset s2, -12 @@ -915,10 +879,6 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m ; RV32-NEXT: .cfi_offset s5, -24 ; RV32-NEXT: .cfi_offset s6, -28 ; RV32-NEXT: .cfi_offset s7, -32 -; RV32-NEXT: .cfi_offset s8, -36 -; RV32-NEXT: .cfi_offset s9, -40 -; RV32-NEXT: .cfi_offset s10, -44 -; RV32-NEXT: .cfi_offset s11, -48 ; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: vslidedown.vi v9, v10, 1 @@ -976,42 +936,42 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m ; RV32-NEXT: vmv.x.s s5, v14 ; RV32-NEXT: vmv.x.s s6, v15 ; RV32-NEXT: vmv.x.s s7, v16 -; RV32-NEXT: vmv.x.s s8, v17 -; RV32-NEXT: vmv.x.s s9, v18 -; RV32-NEXT: vmv.x.s s10, v19 -; RV32-NEXT: vmv.x.s s11, v20 ; RV32-NEXT: vmseq.vx v9, v8, a0 -; RV32-NEXT: vmv.x.s a0, v21 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: vmseq.vx v12, v8, s2 -; RV32-NEXT: vmv.x.s s2, v22 +; RV32-NEXT: vmv.x.s s2, v18 ; RV32-NEXT: vmseq.vx v13, v8, s3 -; RV32-NEXT: vmv.x.s s3, v11 -; RV32-NEXT: vmseq.vx v11, v8, s4 -; RV32-NEXT: vmv.x.s s4, v23 -; RV32-NEXT: vmseq.vx v14, v8, s5 -; RV32-NEXT: vmv.x.s s5, v10 +; RV32-NEXT: vmv.x.s s3, v19 +; RV32-NEXT: vmseq.vx v14, v8, s4 +; RV32-NEXT: vmv.x.s s4, v20 +; RV32-NEXT: vmseq.vx v15, v8, s5 +; RV32-NEXT: vmv.x.s s5, v21 +; RV32-NEXT: vmseq.vx v16, v8, s6 +; RV32-NEXT: vmv.x.s s6, v22 +; RV32-NEXT: vmseq.vx v17, v8, s7 +; RV32-NEXT: vmv.x.s s7, v11 +; RV32-NEXT: vmseq.vx v11, v8, a0 +; RV32-NEXT: vmv.x.s a0, v23 +; RV32-NEXT: vmseq.vx v18, v8, s2 +; RV32-NEXT: vmv.x.s s2, v10 ; RV32-NEXT: vmor.mm v9, v9, v12 -; RV32-NEXT: vmseq.vx v10, v8, s6 ; RV32-NEXT: vmor.mm v9, v9, v13 -; RV32-NEXT: vmseq.vx v12, v8, s7 -; RV32-NEXT: 
vmor.mm v9, v9, v11 -; RV32-NEXT: vmseq.vx v11, v8, s8 ; RV32-NEXT: vmor.mm v9, v9, v14 -; RV32-NEXT: vmseq.vx v13, v8, s9 -; RV32-NEXT: vmor.mm v9, v9, v10 -; RV32-NEXT: vmseq.vx v10, v8, s10 -; RV32-NEXT: vmor.mm v9, v9, v12 -; RV32-NEXT: vmseq.vx v12, v8, s11 -; RV32-NEXT: vmor.mm v9, v9, v11 -; RV32-NEXT: vmseq.vx v11, v8, a0 -; RV32-NEXT: vmor.mm v9, v9, v13 -; RV32-NEXT: vmseq.vx v13, v8, s2 -; RV32-NEXT: vmor.mm v9, v9, v10 +; RV32-NEXT: vmor.mm v9, v9, v15 +; RV32-NEXT: vmor.mm v9, v9, v16 ; RV32-NEXT: vmseq.vx v10, v8, s3 -; RV32-NEXT: vmor.mm v9, v9, v12 +; RV32-NEXT: vmor.mm v9, v9, v17 ; RV32-NEXT: vmseq.vx v12, v8, s4 ; RV32-NEXT: vmor.mm v9, v9, v11 ; RV32-NEXT: vmseq.vx v11, v8, s5 +; RV32-NEXT: vmor.mm v9, v9, v18 +; RV32-NEXT: vmseq.vx v13, v8, s6 +; RV32-NEXT: vmor.mm v9, v9, v10 +; RV32-NEXT: vmseq.vx v10, v8, s7 +; RV32-NEXT: vmor.mm v9, v9, v12 +; RV32-NEXT: vmseq.vx v12, v8, a0 +; RV32-NEXT: vmor.mm v9, v9, v11 +; RV32-NEXT: vmseq.vx v11, v8, s2 ; RV32-NEXT: vmor.mm v9, v9, v13 ; RV32-NEXT: vmseq.vx v13, v8, a1 ; RV32-NEXT: vmor.mm v9, v9, v10 @@ -1049,18 +1009,14 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m ; RV32-NEXT: vmseq.vx v8, v8, s1 ; RV32-NEXT: vmor.mm v8, v9, v8 ; RV32-NEXT: vmand.mm v0, v8, v0 -; RV32-NEXT: lw s0, 44(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s1, 40(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s2, 36(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s3, 32(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s4, 28(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s5, 24(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s6, 20(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s7, 16(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s8, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s9, 8(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s10, 4(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s11, 0(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 20(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s3, 16(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s4, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s5, 8(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s6, 4(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s7, 0(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore s0 ; RV32-NEXT: .cfi_restore s1 ; RV32-NEXT: .cfi_restore s2 @@ -1069,30 +1025,22 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m ; RV32-NEXT: .cfi_restore s5 ; RV32-NEXT: .cfi_restore s6 ; RV32-NEXT: .cfi_restore s7 -; RV32-NEXT: .cfi_restore s8 -; RV32-NEXT: .cfi_restore s9 -; RV32-NEXT: .cfi_restore s10 -; RV32-NEXT: .cfi_restore s11 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: match_v16i8_v32i8: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -96 -; RV64-NEXT: .cfi_def_cfa_offset 96 -; RV64-NEXT: sd s0, 88(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s1, 80(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s2, 72(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s3, 64(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s4, 56(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s5, 48(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s6, 40(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s7, 32(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s8, 24(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s9, 16(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s10, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s11, 0(sp) # 8-byte Folded Spill +; RV64-NEXT: addi 
sp, sp, -64 +; RV64-NEXT: .cfi_def_cfa_offset 64 +; RV64-NEXT: sd s0, 56(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 48(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s2, 40(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s3, 32(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s4, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s5, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s6, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s7, 0(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset s0, -8 ; RV64-NEXT: .cfi_offset s1, -16 ; RV64-NEXT: .cfi_offset s2, -24 @@ -1101,10 +1049,6 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m ; RV64-NEXT: .cfi_offset s5, -48 ; RV64-NEXT: .cfi_offset s6, -56 ; RV64-NEXT: .cfi_offset s7, -64 -; RV64-NEXT: .cfi_offset s8, -72 -; RV64-NEXT: .cfi_offset s9, -80 -; RV64-NEXT: .cfi_offset s10, -88 -; RV64-NEXT: .cfi_offset s11, -96 ; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: vslidedown.vi v9, v10, 1 @@ -1162,42 +1106,42 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m ; RV64-NEXT: vmv.x.s s5, v14 ; RV64-NEXT: vmv.x.s s6, v15 ; RV64-NEXT: vmv.x.s s7, v16 -; RV64-NEXT: vmv.x.s s8, v17 -; RV64-NEXT: vmv.x.s s9, v18 -; RV64-NEXT: vmv.x.s s10, v19 -; RV64-NEXT: vmv.x.s s11, v20 ; RV64-NEXT: vmseq.vx v9, v8, a0 -; RV64-NEXT: vmv.x.s a0, v21 +; RV64-NEXT: vmv.x.s a0, v17 ; RV64-NEXT: vmseq.vx v12, v8, s2 -; RV64-NEXT: vmv.x.s s2, v22 +; RV64-NEXT: vmv.x.s s2, v18 ; RV64-NEXT: vmseq.vx v13, v8, s3 -; RV64-NEXT: vmv.x.s s3, v11 -; RV64-NEXT: vmseq.vx v11, v8, s4 -; RV64-NEXT: vmv.x.s s4, v23 -; RV64-NEXT: vmseq.vx v14, v8, s5 -; RV64-NEXT: vmv.x.s s5, v10 +; RV64-NEXT: vmv.x.s s3, v19 +; RV64-NEXT: vmseq.vx v14, v8, s4 +; RV64-NEXT: vmv.x.s s4, v20 +; RV64-NEXT: vmseq.vx v15, v8, s5 +; RV64-NEXT: vmv.x.s s5, v21 +; RV64-NEXT: vmseq.vx v16, v8, s6 +; RV64-NEXT: vmv.x.s s6, v22 +; RV64-NEXT: vmseq.vx v17, v8, s7 +; RV64-NEXT: vmv.x.s s7, v11 +; RV64-NEXT: vmseq.vx v11, v8, a0 +; RV64-NEXT: vmv.x.s a0, v23 +; RV64-NEXT: vmseq.vx v18, v8, s2 +; RV64-NEXT: vmv.x.s s2, v10 ; RV64-NEXT: vmor.mm v9, v9, v12 -; RV64-NEXT: vmseq.vx v10, v8, s6 ; RV64-NEXT: vmor.mm v9, v9, v13 -; RV64-NEXT: vmseq.vx v12, v8, s7 -; RV64-NEXT: vmor.mm v9, v9, v11 -; RV64-NEXT: vmseq.vx v11, v8, s8 ; RV64-NEXT: vmor.mm v9, v9, v14 -; RV64-NEXT: vmseq.vx v13, v8, s9 -; RV64-NEXT: vmor.mm v9, v9, v10 -; RV64-NEXT: vmseq.vx v10, v8, s10 -; RV64-NEXT: vmor.mm v9, v9, v12 -; RV64-NEXT: vmseq.vx v12, v8, s11 -; RV64-NEXT: vmor.mm v9, v9, v11 -; RV64-NEXT: vmseq.vx v11, v8, a0 -; RV64-NEXT: vmor.mm v9, v9, v13 -; RV64-NEXT: vmseq.vx v13, v8, s2 -; RV64-NEXT: vmor.mm v9, v9, v10 +; RV64-NEXT: vmor.mm v9, v9, v15 +; RV64-NEXT: vmor.mm v9, v9, v16 ; RV64-NEXT: vmseq.vx v10, v8, s3 -; RV64-NEXT: vmor.mm v9, v9, v12 +; RV64-NEXT: vmor.mm v9, v9, v17 ; RV64-NEXT: vmseq.vx v12, v8, s4 ; RV64-NEXT: vmor.mm v9, v9, v11 ; RV64-NEXT: vmseq.vx v11, v8, s5 +; RV64-NEXT: vmor.mm v9, v9, v18 +; RV64-NEXT: vmseq.vx v13, v8, s6 +; RV64-NEXT: vmor.mm v9, v9, v10 +; RV64-NEXT: vmseq.vx v10, v8, s7 +; RV64-NEXT: vmor.mm v9, v9, v12 +; RV64-NEXT: vmseq.vx v12, v8, a0 +; RV64-NEXT: vmor.mm v9, v9, v11 +; RV64-NEXT: vmseq.vx v11, v8, s2 ; RV64-NEXT: vmor.mm v9, v9, v13 ; RV64-NEXT: vmseq.vx v13, v8, a1 ; RV64-NEXT: vmor.mm v9, v9, v10 @@ -1235,18 +1179,14 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m ; RV64-NEXT: vmseq.vx v8, v8, s1 ; RV64-NEXT: vmor.mm v8, v9, v8 ; RV64-NEXT: vmand.mm v0, v8, v0 -; 
RV64-NEXT: ld s0, 88(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s1, 80(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s2, 72(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s3, 64(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s4, 56(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s5, 48(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s6, 40(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s7, 32(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s8, 24(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s9, 16(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s10, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s11, 0(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 56(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 48(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 40(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s3, 32(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s4, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s5, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s6, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s7, 0(sp) # 8-byte Folded Reload ; RV64-NEXT: .cfi_restore s0 ; RV64-NEXT: .cfi_restore s1 ; RV64-NEXT: .cfi_restore s2 @@ -1255,11 +1195,7 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m ; RV64-NEXT: .cfi_restore s5 ; RV64-NEXT: .cfi_restore s6 ; RV64-NEXT: .cfi_restore s7 -; RV64-NEXT: .cfi_restore s8 -; RV64-NEXT: .cfi_restore s9 -; RV64-NEXT: .cfi_restore s10 -; RV64-NEXT: .cfi_restore s11 -; RV64-NEXT: addi sp, sp, 96 +; RV64-NEXT: addi sp, sp, 64 ; RV64-NEXT: .cfi_def_cfa_offset 0 ; RV64-NEXT: ret %r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask) diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll index 123048d996360..22e6f23d4d6e6 100644 --- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -2203,139 +2203,136 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: lshr_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -128 -; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu s1, 0(a0) +; RV32I-NEXT: addi sp, sp, -112 +; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: 
lbu a3, 0(a0) ; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu t1, 4(a0) -; RV32I-NEXT: lbu t3, 5(a0) -; RV32I-NEXT: lbu t4, 6(a0) -; RV32I-NEXT: lbu s0, 7(a0) -; RV32I-NEXT: lbu t2, 8(a0) -; RV32I-NEXT: lbu s3, 9(a0) -; RV32I-NEXT: lbu s6, 10(a0) -; RV32I-NEXT: lbu s8, 11(a0) -; RV32I-NEXT: lbu s9, 12(a0) -; RV32I-NEXT: lbu s10, 13(a0) -; RV32I-NEXT: lbu s4, 14(a0) -; RV32I-NEXT: lbu s7, 15(a0) -; RV32I-NEXT: lbu s5, 16(a0) -; RV32I-NEXT: lbu s11, 17(a0) -; RV32I-NEXT: lbu ra, 18(a0) -; RV32I-NEXT: lbu a3, 19(a0) -; RV32I-NEXT: lbu t5, 20(a0) -; RV32I-NEXT: lbu t6, 21(a0) -; RV32I-NEXT: lbu a7, 22(a0) -; RV32I-NEXT: lbu t0, 23(a0) +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t3, 8(a0) +; RV32I-NEXT: lbu t4, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) +; RV32I-NEXT: lbu s0, 12(a0) +; RV32I-NEXT: lbu s1, 13(a0) +; RV32I-NEXT: lbu s2, 14(a0) +; RV32I-NEXT: lbu s3, 15(a0) +; RV32I-NEXT: lbu s4, 16(a0) +; RV32I-NEXT: lbu s5, 17(a0) +; RV32I-NEXT: lbu s6, 18(a0) +; RV32I-NEXT: lbu s7, 19(a0) ; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: slli t3, t3, 8 -; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli s0, s0, 24 -; RV32I-NEXT: or a4, a4, s1 -; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or a3, a4, a3 ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t3, t1 -; RV32I-NEXT: or a6, s0, t4 -; RV32I-NEXT: lbu t1, 24(a0) +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: lbu s8, 20(a0) +; RV32I-NEXT: lbu s9, 21(a0) +; RV32I-NEXT: lbu s10, 22(a0) +; RV32I-NEXT: lbu s11, 23(a0) +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: slli s1, s1, 8 +; RV32I-NEXT: slli s2, s2, 16 +; RV32I-NEXT: slli s3, s3, 24 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, t6, t5 +; RV32I-NEXT: or t1, s1, s0 +; RV32I-NEXT: or t2, s3, s2 +; RV32I-NEXT: lbu t6, 24(a0) ; RV32I-NEXT: lbu s0, 25(a0) ; RV32I-NEXT: lbu s1, 26(a0) ; RV32I-NEXT: lbu s2, 27(a0) -; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: slli s5, s5, 8 ; RV32I-NEXT: slli s6, s6, 16 -; RV32I-NEXT: slli s8, s8, 24 -; RV32I-NEXT: slli s10, s10, 8 -; RV32I-NEXT: or t2, s3, t2 -; RV32I-NEXT: or t3, s8, s6 -; RV32I-NEXT: or t4, s10, s9 -; RV32I-NEXT: lbu s3, 28(a0) -; RV32I-NEXT: lbu s6, 29(a0) -; RV32I-NEXT: lbu s8, 30(a0) -; RV32I-NEXT: lbu s9, 31(a0) -; RV32I-NEXT: slli s4, s4, 16 ; RV32I-NEXT: slli s7, s7, 24 -; RV32I-NEXT: slli s11, s11, 8 -; RV32I-NEXT: slli ra, ra, 16 -; RV32I-NEXT: slli a3, a3, 24 -; RV32I-NEXT: or a0, s7, s4 -; RV32I-NEXT: or s4, s11, s5 -; RV32I-NEXT: or s5, a3, ra -; RV32I-NEXT: lbu a3, 0(a1) -; RV32I-NEXT: lbu s7, 1(a1) -; RV32I-NEXT: lbu s10, 2(a1) +; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: or t3, s5, s4 +; RV32I-NEXT: or t4, s7, s6 +; RV32I-NEXT: or t5, s9, s8 +; RV32I-NEXT: lbu s3, 28(a0) +; RV32I-NEXT: lbu s4, 29(a0) +; RV32I-NEXT: lbu s5, 30(a0) +; RV32I-NEXT: lbu s6, 31(a0) +; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: slli s11, s11, 24 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: slli s1, s1, 16 +; RV32I-NEXT: slli s2, s2, 24 +; RV32I-NEXT: or a0, s11, s10 +; RV32I-NEXT: or t6, s0, t6 +; RV32I-NEXT: or s0, s2, s1 +; RV32I-NEXT: lbu s1, 0(a1) +; RV32I-NEXT: lbu s2, 1(a1) +; RV32I-NEXT: lbu s7, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) 
+; RV32I-NEXT: sw zero, 48(sp) +; RV32I-NEXT: sw zero, 52(sp) ; RV32I-NEXT: sw zero, 56(sp) ; RV32I-NEXT: sw zero, 60(sp) -; RV32I-NEXT: sw zero, 64(sp) -; RV32I-NEXT: sw zero, 68(sp) +; RV32I-NEXT: sw zero, 32(sp) +; RV32I-NEXT: sw zero, 36(sp) ; RV32I-NEXT: sw zero, 40(sp) ; RV32I-NEXT: sw zero, 44(sp) -; RV32I-NEXT: sw zero, 48(sp) -; RV32I-NEXT: sw zero, 52(sp) -; RV32I-NEXT: slli t6, t6, 8 -; RV32I-NEXT: or t5, t6, t5 -; RV32I-NEXT: addi t6, sp, 8 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli s2, s2, 24 -; RV32I-NEXT: slli s6, s6, 8 -; RV32I-NEXT: slli s8, s8, 16 -; RV32I-NEXT: slli s9, s9, 24 -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: or s3, s4, s3 +; RV32I-NEXT: mv s4, sp +; RV32I-NEXT: slli s5, s5, 16 +; RV32I-NEXT: slli s6, s6, 24 +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s7, s7, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: or t0, s0, t1 -; RV32I-NEXT: or t1, s2, s1 -; RV32I-NEXT: or s0, s6, s3 -; RV32I-NEXT: or s1, s9, s8 -; RV32I-NEXT: or a3, s7, a3 -; RV32I-NEXT: or a1, a1, s10 -; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: or a4, a4, s2 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: or a6, t3, t2 -; RV32I-NEXT: or a0, a0, t4 -; RV32I-NEXT: or t2, s5, s4 -; RV32I-NEXT: or a7, a7, t5 -; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: or s0, s1, s0 -; RV32I-NEXT: or a1, a1, a3 -; RV32I-NEXT: sw t2, 24(sp) -; RV32I-NEXT: sw a7, 28(sp) -; RV32I-NEXT: sw t0, 32(sp) -; RV32I-NEXT: sw s0, 36(sp) -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) -; RV32I-NEXT: sw a6, 16(sp) +; RV32I-NEXT: or s5, s6, s5 +; RV32I-NEXT: or s1, s2, s1 +; RV32I-NEXT: or a1, a1, s7 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or a0, a0, t5 +; RV32I-NEXT: or t0, s0, t6 +; RV32I-NEXT: or t1, s5, s3 +; RV32I-NEXT: or a1, a1, s1 +; RV32I-NEXT: sw a7, 16(sp) ; RV32I-NEXT: sw a0, 20(sp) +; RV32I-NEXT: sw t0, 24(sp) +; RV32I-NEXT: sw t1, 28(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a6, 12(sp) ; RV32I-NEXT: slli t1, a1, 3 ; RV32I-NEXT: andi a1, a1, 28 -; RV32I-NEXT: add a1, t6, a1 +; RV32I-NEXT: add a1, s4, a1 ; RV32I-NEXT: andi a0, t1, 24 -; RV32I-NEXT: xori t0, a0, 31 +; RV32I-NEXT: xori a7, a0, 31 ; RV32I-NEXT: lw a3, 0(a1) ; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: lw a5, 8(a1) ; RV32I-NEXT: lw a6, 12(a1) -; RV32I-NEXT: lw a7, 16(a1) +; RV32I-NEXT: lw t0, 16(a1) ; RV32I-NEXT: lw t2, 20(a1) ; RV32I-NEXT: lw t3, 24(a1) ; RV32I-NEXT: lw t4, 28(a1) @@ -2344,33 +2341,33 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srl a1, a3, t1 ; RV32I-NEXT: slli t6, a4, 1 ; RV32I-NEXT: srl a3, a6, t1 -; RV32I-NEXT: slli s0, a7, 1 +; RV32I-NEXT: slli s0, t0, 1 ; RV32I-NEXT: srl a4, a5, t1 ; RV32I-NEXT: slli s1, a6, 1 ; RV32I-NEXT: srl a5, t2, t1 ; RV32I-NEXT: slli s2, t3, 1 -; RV32I-NEXT: srl a6, a7, t1 +; RV32I-NEXT: srl a6, t0, t1 ; RV32I-NEXT: slli t2, t2, 1 -; RV32I-NEXT: srl a7, t3, t1 +; RV32I-NEXT: srl t0, t3, t1 ; RV32I-NEXT: slli t3, t4, 1 ; RV32I-NEXT: srl t1, t4, t1 -; RV32I-NEXT: sll t4, t5, t0 -; RV32I-NEXT: sll t5, t6, t0 -; RV32I-NEXT: sll t6, s0, t0 -; RV32I-NEXT: sll s0, s1, t0 -; RV32I-NEXT: sll s1, s2, t0 -; RV32I-NEXT: sll t2, t2, t0 -; RV32I-NEXT: sll t3, t3, t0 +; 
RV32I-NEXT: sll t4, t5, a7 +; RV32I-NEXT: sll t5, t6, a7 +; RV32I-NEXT: sll t6, s0, a7 +; RV32I-NEXT: sll s0, s1, a7 +; RV32I-NEXT: sll s1, s2, a7 +; RV32I-NEXT: sll t2, t2, a7 +; RV32I-NEXT: sll t3, t3, a7 ; RV32I-NEXT: srli s2, t1, 24 ; RV32I-NEXT: srli s3, t1, 16 ; RV32I-NEXT: srli s4, t1, 8 -; RV32I-NEXT: or t0, a0, t4 +; RV32I-NEXT: or a7, a0, t4 ; RV32I-NEXT: or t4, a1, t5 ; RV32I-NEXT: or t5, a3, t6 ; RV32I-NEXT: or s0, a4, s0 ; RV32I-NEXT: or s1, a5, s1 ; RV32I-NEXT: or t2, a6, t2 -; RV32I-NEXT: or t3, a7, t3 +; RV32I-NEXT: or t3, t0, t3 ; RV32I-NEXT: sb t1, 28(a2) ; RV32I-NEXT: sb s4, 29(a2) ; RV32I-NEXT: sb s3, 30(a2) @@ -2387,23 +2384,23 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli s6, s0, 24 ; RV32I-NEXT: srli s7, s0, 16 ; RV32I-NEXT: srli s0, s0, 8 -; RV32I-NEXT: srli s8, t5, 24 -; RV32I-NEXT: srli s9, t5, 16 -; RV32I-NEXT: srli t5, t5, 8 -; RV32I-NEXT: srli s10, t4, 24 -; RV32I-NEXT: srli s11, t4, 16 -; RV32I-NEXT: srli t4, t4, 8 -; RV32I-NEXT: sb a7, 24(a2) +; RV32I-NEXT: sb t0, 24(a2) +; RV32I-NEXT: srli t0, t5, 24 ; RV32I-NEXT: sb t3, 25(a2) +; RV32I-NEXT: srli t3, t5, 16 +; RV32I-NEXT: srli t5, t5, 8 ; RV32I-NEXT: sb t6, 26(a2) +; RV32I-NEXT: srli t6, t4, 24 ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli a7, t0, 24 +; RV32I-NEXT: srli t1, t4, 16 +; RV32I-NEXT: srli t4, t4, 8 ; RV32I-NEXT: sb a6, 16(a2) +; RV32I-NEXT: srli a6, a7, 24 ; RV32I-NEXT: sb t2, 17(a2) ; RV32I-NEXT: sb s3, 18(a2) ; RV32I-NEXT: sb s2, 19(a2) -; RV32I-NEXT: srli a6, t0, 16 -; RV32I-NEXT: srli t0, t0, 8 +; RV32I-NEXT: srli t2, a7, 16 +; RV32I-NEXT: srli a7, a7, 8 ; RV32I-NEXT: sb a5, 20(a2) ; RV32I-NEXT: sb s1, 21(a2) ; RV32I-NEXT: sb s5, 22(a2) @@ -2414,30 +2411,29 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb s6, 11(a2) ; RV32I-NEXT: sb a3, 12(a2) ; RV32I-NEXT: sb t5, 13(a2) -; RV32I-NEXT: sb s9, 14(a2) -; RV32I-NEXT: sb s8, 15(a2) +; RV32I-NEXT: sb t3, 14(a2) +; RV32I-NEXT: sb t0, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) ; RV32I-NEXT: sb t4, 1(a2) -; RV32I-NEXT: sb s11, 2(a2) -; RV32I-NEXT: sb s10, 3(a2) +; RV32I-NEXT: sb t1, 2(a2) +; RV32I-NEXT: sb t6, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: sb t0, 5(a2) -; RV32I-NEXT: sb a6, 6(a2) -; RV32I-NEXT: sb a7, 7(a2) -; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 128 +; RV32I-NEXT: sb a7, 5(a2) +; RV32I-NEXT: sb t2, 6(a2) +; RV32I-NEXT: sb a6, 7(a2) +; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 76(sp) # 
4-byte Folded Reload +; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 112 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %byteOff = load i256, ptr %byteOff.ptr, align 1 @@ -2682,132 +2678,128 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; ; RV32I-LABEL: lshr_32bytes_wordOff: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -128 -; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv a3, a1 -; RV32I-NEXT: lbu a5, 0(a0) -; RV32I-NEXT: lbu a7, 1(a0) -; RV32I-NEXT: lbu t0, 2(a0) -; RV32I-NEXT: lbu t1, 3(a0) -; RV32I-NEXT: lbu s2, 4(a0) -; RV32I-NEXT: lbu s4, 5(a0) -; RV32I-NEXT: lbu s5, 6(a0) -; RV32I-NEXT: lbu s6, 7(a0) -; RV32I-NEXT: lbu s3, 8(a0) -; RV32I-NEXT: lbu s9, 9(a0) -; RV32I-NEXT: lbu s10, 10(a0) -; RV32I-NEXT: lbu s11, 11(a0) -; RV32I-NEXT: lbu ra, 12(a0) -; RV32I-NEXT: lbu a1, 13(a0) -; RV32I-NEXT: lbu t4, 14(a0) -; RV32I-NEXT: lbu t6, 15(a0) -; RV32I-NEXT: lbu a4, 16(a0) -; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a6, 17(a0) -; RV32I-NEXT: lbu t2, 18(a0) -; RV32I-NEXT: lbu t3, 19(a0) -; RV32I-NEXT: lbu a4, 20(a0) -; RV32I-NEXT: lbu t5, 21(a0) +; RV32I-NEXT: addi sp, sp, -112 +; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t3, 8(a0) +; RV32I-NEXT: lbu t4, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) +; RV32I-NEXT: lbu s2, 12(a0) +; RV32I-NEXT: lbu s3, 13(a0) +; RV32I-NEXT: lbu s4, 14(a0) +; RV32I-NEXT: lbu s5, 15(a0) +; RV32I-NEXT: lbu s6, 16(a0) +; RV32I-NEXT: lbu s7, 17(a0) +; RV32I-NEXT: lbu s8, 18(a0) +; RV32I-NEXT: lbu s9, 19(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: lbu s10, 20(a0) +; RV32I-NEXT: lbu s11, 21(a0) ; RV32I-NEXT: lbu s0, 22(a0) ; 
RV32I-NEXT: lbu s1, 23(a0) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: slli t0, t0, 16 -; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: slli s4, s4, 8 -; RV32I-NEXT: slli s5, s5, 16 -; RV32I-NEXT: slli s6, s6, 24 -; RV32I-NEXT: or a5, a7, a5 -; RV32I-NEXT: or a7, t1, t0 -; RV32I-NEXT: or t0, s4, s2 -; RV32I-NEXT: or t1, s6, s5 -; RV32I-NEXT: lbu s2, 24(a0) -; RV32I-NEXT: lbu s6, 25(a0) -; RV32I-NEXT: lbu s7, 26(a0) -; RV32I-NEXT: lbu s8, 27(a0) -; RV32I-NEXT: slli s9, s9, 8 -; RV32I-NEXT: slli s10, s10, 16 -; RV32I-NEXT: slli s11, s11, 24 -; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or s3, s9, s3 -; RV32I-NEXT: or s4, s11, s10 -; RV32I-NEXT: or s5, a1, ra -; RV32I-NEXT: lbu s9, 28(a0) -; RV32I-NEXT: lbu a1, 29(a0) -; RV32I-NEXT: lbu s10, 30(a0) +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: slli s4, s4, 16 +; RV32I-NEXT: slli s5, s5, 24 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, t6, t5 +; RV32I-NEXT: or t1, s3, s2 +; RV32I-NEXT: or t2, s5, s4 +; RV32I-NEXT: lbu t3, 24(a0) +; RV32I-NEXT: lbu s2, 25(a0) +; RV32I-NEXT: lbu s3, 26(a0) +; RV32I-NEXT: lbu s4, 27(a0) +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: or t4, s7, s6 +; RV32I-NEXT: or t5, s9, s8 +; RV32I-NEXT: or t6, s11, s10 +; RV32I-NEXT: lbu s5, 28(a0) +; RV32I-NEXT: lbu s6, 29(a0) +; RV32I-NEXT: lbu s7, 30(a0) ; RV32I-NEXT: lbu a0, 31(a0) -; RV32I-NEXT: lbu a3, 0(a3) +; RV32I-NEXT: lbu a1, 0(a1) +; RV32I-NEXT: sw zero, 48(sp) +; RV32I-NEXT: sw zero, 52(sp) ; RV32I-NEXT: sw zero, 56(sp) ; RV32I-NEXT: sw zero, 60(sp) -; RV32I-NEXT: sw zero, 64(sp) -; RV32I-NEXT: sw zero, 68(sp) +; RV32I-NEXT: sw zero, 32(sp) +; RV32I-NEXT: sw zero, 36(sp) ; RV32I-NEXT: sw zero, 40(sp) ; RV32I-NEXT: sw zero, 44(sp) -; RV32I-NEXT: sw zero, 48(sp) -; RV32I-NEXT: sw zero, 52(sp) -; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: or t4, t6, t4 -; RV32I-NEXT: addi t6, sp, 8 -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: slli t2, t2, 16 -; RV32I-NEXT: slli t3, t3, 24 -; RV32I-NEXT: slli t5, t5, 8 ; RV32I-NEXT: slli s0, s0, 16 ; RV32I-NEXT: slli s1, s1, 24 +; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: mv s1, sp +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s3, s3, 16 +; RV32I-NEXT: slli s4, s4, 24 ; RV32I-NEXT: slli s6, s6, 8 ; RV32I-NEXT: slli s7, s7, 16 -; RV32I-NEXT: slli s8, s8, 24 -; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: slli s10, s10, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: slli a3, a3, 2 -; RV32I-NEXT: lw s11, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: or a6, a6, s11 -; RV32I-NEXT: or t2, t3, t2 -; RV32I-NEXT: or a4, t5, a4 -; RV32I-NEXT: or s0, s1, s0 -; RV32I-NEXT: or t3, s6, s2 -; RV32I-NEXT: or t5, s8, s7 -; RV32I-NEXT: or a1, a1, s9 -; RV32I-NEXT: or a0, a0, s10 -; RV32I-NEXT: andi a3, a3, 28 -; RV32I-NEXT: or a5, a7, a5 -; RV32I-NEXT: or a7, t1, t0 -; RV32I-NEXT: or t0, s4, s3 -; RV32I-NEXT: or t1, t4, s5 -; RV32I-NEXT: or a6, t2, a6 -; RV32I-NEXT: or a4, s0, a4 -; RV32I-NEXT: or t2, t5, t3 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: add t6, t6, a3 -; RV32I-NEXT: sw a6, 24(sp) -; RV32I-NEXT: sw a4, 28(sp) -; RV32I-NEXT: sw t2, 32(sp) -; RV32I-NEXT: sw a0, 36(sp) +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: or t3, s2, t3 +; RV32I-NEXT: or s2, s4, s3 +; RV32I-NEXT: or s3, s6, s5 +; RV32I-NEXT: or a0, a0, s7 +; RV32I-NEXT: andi a1, a1, 28 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; 
RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, t5, t4 +; RV32I-NEXT: or t0, s0, t6 +; RV32I-NEXT: or t1, s2, t3 +; RV32I-NEXT: or a0, a0, s3 +; RV32I-NEXT: add s1, s1, a1 +; RV32I-NEXT: sw a7, 16(sp) +; RV32I-NEXT: sw t0, 20(sp) +; RV32I-NEXT: sw t1, 24(sp) +; RV32I-NEXT: sw a0, 28(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a4, 4(sp) ; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a7, 12(sp) -; RV32I-NEXT: sw t0, 16(sp) -; RV32I-NEXT: sw t1, 20(sp) -; RV32I-NEXT: lw a6, 16(t6) -; RV32I-NEXT: lw a5, 20(t6) -; RV32I-NEXT: lw a7, 24(t6) -; RV32I-NEXT: lw a1, 0(t6) -; RV32I-NEXT: lw a0, 4(t6) -; RV32I-NEXT: lw a4, 8(t6) -; RV32I-NEXT: lw a3, 12(t6) -; RV32I-NEXT: lw t0, 28(t6) +; RV32I-NEXT: sw a6, 12(sp) +; RV32I-NEXT: lw a6, 16(s1) +; RV32I-NEXT: lw a5, 20(s1) +; RV32I-NEXT: lw a7, 24(s1) +; RV32I-NEXT: lw a1, 0(s1) +; RV32I-NEXT: lw a0, 4(s1) +; RV32I-NEXT: lw a4, 8(s1) +; RV32I-NEXT: lw a3, 12(s1) +; RV32I-NEXT: lw t0, 28(s1) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 ; RV32I-NEXT: srli t3, a7, 8 @@ -2822,21 +2814,21 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: srli s5, a5, 8 ; RV32I-NEXT: srli s6, a4, 24 ; RV32I-NEXT: srli s7, a4, 16 -; RV32I-NEXT: srli s8, a4, 8 -; RV32I-NEXT: srli s9, a3, 24 -; RV32I-NEXT: srli s10, a3, 16 -; RV32I-NEXT: srli s11, a3, 8 ; RV32I-NEXT: sb a7, 24(a2) -; RV32I-NEXT: srli a7, a1, 24 +; RV32I-NEXT: srli a7, a4, 8 ; RV32I-NEXT: sb t3, 25(a2) +; RV32I-NEXT: srli t3, a3, 24 ; RV32I-NEXT: sb t2, 26(a2) +; RV32I-NEXT: srli t2, a3, 16 ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli t1, a1, 16 +; RV32I-NEXT: srli t1, a3, 8 ; RV32I-NEXT: sb t0, 28(a2) +; RV32I-NEXT: srli t0, a1, 24 ; RV32I-NEXT: sb t6, 29(a2) +; RV32I-NEXT: srli t6, a1, 16 ; RV32I-NEXT: sb t5, 30(a2) ; RV32I-NEXT: sb t4, 31(a2) -; RV32I-NEXT: srli t0, a1, 8 +; RV32I-NEXT: srli t4, a1, 8 ; RV32I-NEXT: sb a6, 16(a2) ; RV32I-NEXT: sb s2, 17(a2) ; RV32I-NEXT: sb s1, 18(a2) @@ -2848,36 +2840,35 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: sb s3, 23(a2) ; RV32I-NEXT: srli a5, a0, 16 ; RV32I-NEXT: sb a4, 8(a2) -; RV32I-NEXT: sb s8, 9(a2) +; RV32I-NEXT: sb a7, 9(a2) ; RV32I-NEXT: sb s7, 10(a2) ; RV32I-NEXT: sb s6, 11(a2) ; RV32I-NEXT: srli a4, a0, 8 ; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb s11, 13(a2) -; RV32I-NEXT: sb s10, 14(a2) -; RV32I-NEXT: sb s9, 15(a2) +; RV32I-NEXT: sb t1, 13(a2) +; RV32I-NEXT: sb t2, 14(a2) +; RV32I-NEXT: sb t3, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb t0, 1(a2) -; RV32I-NEXT: sb t1, 2(a2) -; RV32I-NEXT: sb a7, 3(a2) +; RV32I-NEXT: sb t4, 1(a2) +; RV32I-NEXT: sb t6, 2(a2) +; RV32I-NEXT: sb t0, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) ; RV32I-NEXT: sb a5, 6(a2) ; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 128 +; RV32I-NEXT: lw s0, 108(sp) # 
4-byte Folded Reload +; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 112 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %wordOff = load i256, ptr %wordOff.ptr, align 1 @@ -2903,111 +2894,111 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a5, 0(a0) -; RV64I-NEXT: lbu a7, 1(a0) -; RV64I-NEXT: lbu t2, 2(a0) -; RV64I-NEXT: lbu s3, 3(a0) -; RV64I-NEXT: lbu t0, 4(a0) -; RV64I-NEXT: lbu s8, 5(a0) -; RV64I-NEXT: lbu s9, 6(a0) -; RV64I-NEXT: lbu s10, 7(a0) -; RV64I-NEXT: lbu s2, 8(a0) -; RV64I-NEXT: lbu s4, 9(a0) -; RV64I-NEXT: lbu s5, 10(a0) -; RV64I-NEXT: lbu s6, 11(a0) -; RV64I-NEXT: lbu s7, 12(a0) -; RV64I-NEXT: lbu s11, 13(a0) -; RV64I-NEXT: lbu t1, 14(a0) -; RV64I-NEXT: lbu t3, 15(a0) -; RV64I-NEXT: lbu a3, 16(a0) -; RV64I-NEXT: lbu a6, 17(a0) -; RV64I-NEXT: lbu t4, 18(a0) -; RV64I-NEXT: lbu t5, 19(a0) -; RV64I-NEXT: lbu a4, 20(a0) -; RV64I-NEXT: lbu t6, 21(a0) -; RV64I-NEXT: lbu s0, 22(a0) -; RV64I-NEXT: lbu s1, 23(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: lbu s0, 12(a0) +; RV64I-NEXT: lbu s1, 13(a0) +; RV64I-NEXT: lbu s2, 14(a0) +; RV64I-NEXT: lbu s3, 15(a0) +; RV64I-NEXT: lbu s4, 16(a0) +; RV64I-NEXT: lbu s5, 17(a0) +; RV64I-NEXT: lbu s6, 18(a0) +; RV64I-NEXT: lbu s7, 19(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: lbu s8, 20(a0) +; RV64I-NEXT: lbu s9, 21(a0) +; RV64I-NEXT: lbu s10, 22(a0) +; RV64I-NEXT: lbu s11, 23(a0) +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: slli t6, t6, 24 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: slli s2, s2, 16 ; RV64I-NEXT: slli s3, s3, 24 -; RV64I-NEXT: slli s8, s8, 8 -; RV64I-NEXT: slli s9, s9, 16 -; RV64I-NEXT: slli s10, s10, 24 -; RV64I-NEXT: or a5, a7, a5 -; RV64I-NEXT: or a7, s3, t2 -; RV64I-NEXT: or t0, s8, t0 -; RV64I-NEXT: or t2, s10, s9 -; RV64I-NEXT: lbu s3, 24(a0) -; RV64I-NEXT: lbu s8, 25(a0) -; RV64I-NEXT: lbu s9, 26(a0) -; RV64I-NEXT: lbu s10, 27(a0) -; RV64I-NEXT: slli s4, s4, 8 -; RV64I-NEXT: slli s5, s5, 16 -; RV64I-NEXT: slli s6, s6, 24 -; RV64I-NEXT: slli s11, s11, 8 -; RV64I-NEXT: or s2, s4, s2 -; RV64I-NEXT: or s4, s6, s5 -; RV64I-NEXT: or s5, s11, s7 -; RV64I-NEXT: lbu s6, 28(a0) -; RV64I-NEXT: lbu 
s7, 29(a0) -; RV64I-NEXT: lbu s11, 30(a0) +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: or t1, s1, s0 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: lbu t3, 24(a0) +; RV64I-NEXT: lbu t4, 25(a0) +; RV64I-NEXT: lbu t5, 26(a0) +; RV64I-NEXT: lbu t6, 27(a0) +; RV64I-NEXT: slli s5, s5, 8 +; RV64I-NEXT: slli s6, s6, 16 +; RV64I-NEXT: slli s7, s7, 24 +; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: or s0, s5, s4 +; RV64I-NEXT: or s1, s7, s6 +; RV64I-NEXT: or s2, s9, s8 +; RV64I-NEXT: lbu s3, 28(a0) +; RV64I-NEXT: lbu s4, 29(a0) +; RV64I-NEXT: lbu s5, 30(a0) ; RV64I-NEXT: lbu a0, 31(a0) ; RV64I-NEXT: lbu a1, 0(a1) ; RV64I-NEXT: sd zero, 32(sp) ; RV64I-NEXT: sd zero, 40(sp) ; RV64I-NEXT: sd zero, 48(sp) ; RV64I-NEXT: sd zero, 56(sp) -; RV64I-NEXT: slli t1, t1, 16 -; RV64I-NEXT: slli t3, t3, 24 -; RV64I-NEXT: or t1, t3, t1 -; RV64I-NEXT: mv t3, sp -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: slli t4, t4, 16 -; RV64I-NEXT: slli t5, t5, 24 -; RV64I-NEXT: slli t6, t6, 8 -; RV64I-NEXT: slli s0, s0, 16 -; RV64I-NEXT: slli s1, s1, 24 -; RV64I-NEXT: slli s8, s8, 8 -; RV64I-NEXT: slli s9, s9, 16 -; RV64I-NEXT: slli s10, s10, 24 -; RV64I-NEXT: slli s7, s7, 8 -; RV64I-NEXT: slli s11, s11, 16 +; RV64I-NEXT: slli s10, s10, 16 +; RV64I-NEXT: slli s11, s11, 24 +; RV64I-NEXT: or s6, s11, s10 +; RV64I-NEXT: mv s7, sp +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: slli t6, t6, 24 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: slli s5, s5, 16 ; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: slli a1, a1, 3 -; RV64I-NEXT: or a3, a6, a3 -; RV64I-NEXT: or a6, t5, t4 -; RV64I-NEXT: or a4, t6, a4 -; RV64I-NEXT: or s0, s1, s0 -; RV64I-NEXT: or t4, s8, s3 -; RV64I-NEXT: or t5, s10, s9 -; RV64I-NEXT: or t6, s7, s6 -; RV64I-NEXT: or a0, a0, s11 +; RV64I-NEXT: or t3, t4, t3 +; RV64I-NEXT: or t4, t6, t5 +; RV64I-NEXT: or t5, s4, s3 +; RV64I-NEXT: or a0, a0, s5 ; RV64I-NEXT: andi a1, a1, 24 -; RV64I-NEXT: or a5, a7, a5 -; RV64I-NEXT: or a7, t2, t0 -; RV64I-NEXT: or t0, s4, s2 -; RV64I-NEXT: or t1, t1, s5 -; RV64I-NEXT: or a3, a6, a3 -; RV64I-NEXT: or a4, s0, a4 -; RV64I-NEXT: or a6, t5, t4 -; RV64I-NEXT: or a0, a0, t6 -; RV64I-NEXT: add t3, t3, a1 -; RV64I-NEXT: slli a7, a7, 32 -; RV64I-NEXT: slli t1, t1, 32 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: or s0, s1, s0 +; RV64I-NEXT: or a7, s6, s2 +; RV64I-NEXT: or t0, t4, t3 +; RV64I-NEXT: or a0, a0, t5 +; RV64I-NEXT: add s7, s7, a1 ; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: slli a7, a7, 32 ; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: or a1, a7, a5 -; RV64I-NEXT: or a5, t1, t0 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: or a0, a0, a6 -; RV64I-NEXT: sd a1, 0(sp) -; RV64I-NEXT: sd a5, 8(sp) -; RV64I-NEXT: sd a3, 16(sp) +; RV64I-NEXT: or a1, a6, a5 +; RV64I-NEXT: or a4, a7, s0 +; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a1, 8(sp) +; RV64I-NEXT: sd a4, 16(sp) ; RV64I-NEXT: sd a0, 24(sp) -; RV64I-NEXT: ld a4, 16(t3) -; RV64I-NEXT: ld a0, 8(t3) -; RV64I-NEXT: ld a1, 0(t3) -; RV64I-NEXT: ld a3, 24(t3) +; RV64I-NEXT: ld a4, 16(s7) +; RV64I-NEXT: ld a0, 8(s7) +; RV64I-NEXT: ld a1, 0(s7) +; RV64I-NEXT: ld a3, 24(s7) ; RV64I-NEXT: srli a5, a4, 56 ; RV64I-NEXT: srli a6, a4, 48 ; RV64I-NEXT: srli a7, a4, 40 @@ -3026,25 +3017,25 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV64I-NEXT: srli s5, a1, 48 ; RV64I-NEXT: srli s6, a1, 40 ; RV64I-NEXT: srli s7, a1, 32 -; 
RV64I-NEXT: srli s8, a1, 24 -; RV64I-NEXT: srli s9, a1, 16 -; RV64I-NEXT: srli s10, a1, 8 -; RV64I-NEXT: srli s11, a0, 56 ; RV64I-NEXT: sb t0, 20(a2) +; RV64I-NEXT: srli t0, a1, 24 ; RV64I-NEXT: sb a7, 21(a2) +; RV64I-NEXT: srli a7, a1, 16 ; RV64I-NEXT: sb a6, 22(a2) +; RV64I-NEXT: srli a6, a1, 8 ; RV64I-NEXT: sb a5, 23(a2) -; RV64I-NEXT: srli a5, a0, 48 +; RV64I-NEXT: srli a5, a0, 56 ; RV64I-NEXT: sb a4, 16(a2) +; RV64I-NEXT: srli a4, a0, 48 ; RV64I-NEXT: sb t3, 17(a2) ; RV64I-NEXT: sb t2, 18(a2) ; RV64I-NEXT: sb t1, 19(a2) -; RV64I-NEXT: srli a4, a0, 40 +; RV64I-NEXT: srli t1, a0, 40 ; RV64I-NEXT: sb s0, 28(a2) ; RV64I-NEXT: sb t6, 29(a2) ; RV64I-NEXT: sb t5, 30(a2) ; RV64I-NEXT: sb t4, 31(a2) -; RV64I-NEXT: srli a6, a0, 32 +; RV64I-NEXT: srli t2, a0, 32 ; RV64I-NEXT: sb a3, 24(a2) ; RV64I-NEXT: sb s3, 25(a2) ; RV64I-NEXT: sb s2, 26(a2) @@ -3054,19 +3045,19 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV64I-NEXT: sb s6, 5(a2) ; RV64I-NEXT: sb s5, 6(a2) ; RV64I-NEXT: sb s4, 7(a2) -; RV64I-NEXT: srli a7, a0, 16 +; RV64I-NEXT: srli t3, a0, 16 ; RV64I-NEXT: sb a1, 0(a2) -; RV64I-NEXT: sb s10, 1(a2) -; RV64I-NEXT: sb s9, 2(a2) -; RV64I-NEXT: sb s8, 3(a2) +; RV64I-NEXT: sb a6, 1(a2) +; RV64I-NEXT: sb a7, 2(a2) +; RV64I-NEXT: sb t0, 3(a2) ; RV64I-NEXT: srli a1, a0, 8 -; RV64I-NEXT: sb a6, 12(a2) -; RV64I-NEXT: sb a4, 13(a2) -; RV64I-NEXT: sb a5, 14(a2) -; RV64I-NEXT: sb s11, 15(a2) +; RV64I-NEXT: sb t2, 12(a2) +; RV64I-NEXT: sb t1, 13(a2) +; RV64I-NEXT: sb a4, 14(a2) +; RV64I-NEXT: sb a5, 15(a2) ; RV64I-NEXT: sb a0, 8(a2) ; RV64I-NEXT: sb a1, 9(a2) -; RV64I-NEXT: sb a7, 10(a2) +; RV64I-NEXT: sb t3, 10(a2) ; RV64I-NEXT: sb a3, 11(a2) ; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload @@ -3085,132 +3076,128 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; ; RV32I-LABEL: lshr_32bytes_dwordOff: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -128 -; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv a3, a1 -; RV32I-NEXT: lbu a5, 0(a0) -; RV32I-NEXT: lbu a7, 1(a0) -; RV32I-NEXT: lbu t0, 2(a0) -; RV32I-NEXT: lbu t1, 3(a0) -; RV32I-NEXT: lbu s2, 4(a0) -; RV32I-NEXT: lbu s4, 5(a0) -; RV32I-NEXT: lbu s5, 6(a0) -; RV32I-NEXT: lbu s6, 7(a0) -; RV32I-NEXT: lbu s3, 8(a0) -; RV32I-NEXT: lbu s9, 9(a0) -; RV32I-NEXT: lbu s10, 10(a0) -; RV32I-NEXT: lbu s11, 11(a0) -; RV32I-NEXT: lbu ra, 12(a0) -; RV32I-NEXT: lbu a1, 13(a0) -; RV32I-NEXT: lbu t4, 14(a0) -; RV32I-NEXT: lbu t6, 15(a0) -; RV32I-NEXT: lbu a4, 16(a0) -; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a6, 17(a0) -; RV32I-NEXT: lbu t2, 18(a0) -; RV32I-NEXT: lbu t3, 19(a0) -; RV32I-NEXT: lbu a4, 20(a0) -; RV32I-NEXT: lbu t5, 21(a0) -; RV32I-NEXT: lbu s0, 22(a0) -; RV32I-NEXT: lbu s1, 23(a0) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: slli t0, t0, 16 -; RV32I-NEXT: slli t1, t1, 24 -; 
RV32I-NEXT: slli s4, s4, 8 -; RV32I-NEXT: slli s5, s5, 16 -; RV32I-NEXT: slli s6, s6, 24 -; RV32I-NEXT: or a5, a7, a5 -; RV32I-NEXT: or a7, t1, t0 -; RV32I-NEXT: or t0, s4, s2 -; RV32I-NEXT: or t1, s6, s5 -; RV32I-NEXT: lbu s2, 24(a0) -; RV32I-NEXT: lbu s6, 25(a0) -; RV32I-NEXT: lbu s7, 26(a0) -; RV32I-NEXT: lbu s8, 27(a0) -; RV32I-NEXT: slli s9, s9, 8 -; RV32I-NEXT: slli s10, s10, 16 -; RV32I-NEXT: slli s11, s11, 24 -; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or s3, s9, s3 -; RV32I-NEXT: or s4, s11, s10 -; RV32I-NEXT: or s5, a1, ra -; RV32I-NEXT: lbu s9, 28(a0) -; RV32I-NEXT: lbu a1, 29(a0) -; RV32I-NEXT: lbu s10, 30(a0) +; RV32I-NEXT: addi sp, sp, -112 +; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t3, 8(a0) +; RV32I-NEXT: lbu t4, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) +; RV32I-NEXT: lbu s2, 12(a0) +; RV32I-NEXT: lbu s3, 13(a0) +; RV32I-NEXT: lbu s4, 14(a0) +; RV32I-NEXT: lbu s5, 15(a0) +; RV32I-NEXT: lbu s6, 16(a0) +; RV32I-NEXT: lbu s7, 17(a0) +; RV32I-NEXT: lbu s8, 18(a0) +; RV32I-NEXT: lbu s9, 19(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: lbu s10, 20(a0) +; RV32I-NEXT: lbu s11, 21(a0) +; RV32I-NEXT: lbu s0, 22(a0) +; RV32I-NEXT: lbu s1, 23(a0) +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: slli s4, s4, 16 +; RV32I-NEXT: slli s5, s5, 24 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, t6, t5 +; RV32I-NEXT: or t1, s3, s2 +; RV32I-NEXT: or t2, s5, s4 +; RV32I-NEXT: lbu t3, 24(a0) +; RV32I-NEXT: lbu s2, 25(a0) +; RV32I-NEXT: lbu s3, 26(a0) +; RV32I-NEXT: lbu s4, 27(a0) +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: or t4, s7, s6 +; RV32I-NEXT: or t5, s9, s8 +; RV32I-NEXT: or t6, s11, s10 +; RV32I-NEXT: lbu s5, 28(a0) +; RV32I-NEXT: lbu s6, 29(a0) +; RV32I-NEXT: lbu s7, 30(a0) ; RV32I-NEXT: lbu a0, 31(a0) -; RV32I-NEXT: lbu a3, 0(a3) +; RV32I-NEXT: lbu a1, 0(a1) +; RV32I-NEXT: sw zero, 48(sp) +; RV32I-NEXT: sw zero, 52(sp) ; RV32I-NEXT: sw zero, 56(sp) ; RV32I-NEXT: sw zero, 60(sp) -; RV32I-NEXT: sw zero, 64(sp) -; RV32I-NEXT: sw zero, 68(sp) +; RV32I-NEXT: sw zero, 32(sp) +; RV32I-NEXT: sw zero, 36(sp) ; RV32I-NEXT: sw zero, 40(sp) ; RV32I-NEXT: sw zero, 44(sp) -; RV32I-NEXT: sw zero, 48(sp) -; RV32I-NEXT: sw zero, 52(sp) -; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: or t4, 
t6, t4 -; RV32I-NEXT: addi t6, sp, 8 -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: slli t2, t2, 16 -; RV32I-NEXT: slli t3, t3, 24 -; RV32I-NEXT: slli t5, t5, 8 ; RV32I-NEXT: slli s0, s0, 16 ; RV32I-NEXT: slli s1, s1, 24 +; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: mv s1, sp +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s3, s3, 16 +; RV32I-NEXT: slli s4, s4, 24 ; RV32I-NEXT: slli s6, s6, 8 ; RV32I-NEXT: slli s7, s7, 16 -; RV32I-NEXT: slli s8, s8, 24 -; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: slli s10, s10, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: slli a3, a3, 3 -; RV32I-NEXT: lw s11, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: or a6, a6, s11 -; RV32I-NEXT: or t2, t3, t2 -; RV32I-NEXT: or a4, t5, a4 -; RV32I-NEXT: or s0, s1, s0 -; RV32I-NEXT: or t3, s6, s2 -; RV32I-NEXT: or t5, s8, s7 -; RV32I-NEXT: or a1, a1, s9 -; RV32I-NEXT: or a0, a0, s10 -; RV32I-NEXT: andi a3, a3, 24 -; RV32I-NEXT: or a5, a7, a5 -; RV32I-NEXT: or a7, t1, t0 -; RV32I-NEXT: or t0, s4, s3 -; RV32I-NEXT: or t1, t4, s5 -; RV32I-NEXT: or a6, t2, a6 -; RV32I-NEXT: or a4, s0, a4 -; RV32I-NEXT: or t2, t5, t3 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: add t6, t6, a3 -; RV32I-NEXT: sw a6, 24(sp) -; RV32I-NEXT: sw a4, 28(sp) -; RV32I-NEXT: sw t2, 32(sp) -; RV32I-NEXT: sw a0, 36(sp) +; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: or t3, s2, t3 +; RV32I-NEXT: or s2, s4, s3 +; RV32I-NEXT: or s3, s6, s5 +; RV32I-NEXT: or a0, a0, s7 +; RV32I-NEXT: andi a1, a1, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, t5, t4 +; RV32I-NEXT: or t0, s0, t6 +; RV32I-NEXT: or t1, s2, t3 +; RV32I-NEXT: or a0, a0, s3 +; RV32I-NEXT: add s1, s1, a1 +; RV32I-NEXT: sw a7, 16(sp) +; RV32I-NEXT: sw t0, 20(sp) +; RV32I-NEXT: sw t1, 24(sp) +; RV32I-NEXT: sw a0, 28(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a4, 4(sp) ; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a7, 12(sp) -; RV32I-NEXT: sw t0, 16(sp) -; RV32I-NEXT: sw t1, 20(sp) -; RV32I-NEXT: lw a6, 16(t6) -; RV32I-NEXT: lw a5, 20(t6) -; RV32I-NEXT: lw a7, 24(t6) -; RV32I-NEXT: lw a1, 0(t6) -; RV32I-NEXT: lw a0, 4(t6) -; RV32I-NEXT: lw a4, 8(t6) -; RV32I-NEXT: lw a3, 12(t6) -; RV32I-NEXT: lw t0, 28(t6) +; RV32I-NEXT: sw a6, 12(sp) +; RV32I-NEXT: lw a6, 16(s1) +; RV32I-NEXT: lw a5, 20(s1) +; RV32I-NEXT: lw a7, 24(s1) +; RV32I-NEXT: lw a1, 0(s1) +; RV32I-NEXT: lw a0, 4(s1) +; RV32I-NEXT: lw a4, 8(s1) +; RV32I-NEXT: lw a3, 12(s1) +; RV32I-NEXT: lw t0, 28(s1) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 ; RV32I-NEXT: srli t3, a7, 8 @@ -3225,21 +3212,21 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: srli s5, a5, 8 ; RV32I-NEXT: srli s6, a4, 24 ; RV32I-NEXT: srli s7, a4, 16 -; RV32I-NEXT: srli s8, a4, 8 -; RV32I-NEXT: srli s9, a3, 24 -; RV32I-NEXT: srli s10, a3, 16 -; RV32I-NEXT: srli s11, a3, 8 ; RV32I-NEXT: sb a7, 24(a2) -; RV32I-NEXT: srli a7, a1, 24 +; RV32I-NEXT: srli a7, a4, 8 ; RV32I-NEXT: sb t3, 25(a2) +; RV32I-NEXT: srli t3, a3, 24 ; RV32I-NEXT: sb t2, 26(a2) +; RV32I-NEXT: srli t2, a3, 16 ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli t1, a1, 16 +; RV32I-NEXT: srli t1, a3, 8 ; RV32I-NEXT: sb t0, 28(a2) +; RV32I-NEXT: srli t0, a1, 24 ; RV32I-NEXT: sb t6, 29(a2) +; RV32I-NEXT: srli t6, a1, 16 ; RV32I-NEXT: sb t5, 30(a2) ; RV32I-NEXT: sb t4, 31(a2) -; RV32I-NEXT: srli t0, a1, 8 +; RV32I-NEXT: srli t4, a1, 8 ; RV32I-NEXT: sb a6, 16(a2) ; RV32I-NEXT: sb s2, 17(a2) ; RV32I-NEXT: sb s1, 18(a2) @@ -3251,36 +3238,35 @@ define void 
@lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: sb s3, 23(a2) ; RV32I-NEXT: srli a5, a0, 16 ; RV32I-NEXT: sb a4, 8(a2) -; RV32I-NEXT: sb s8, 9(a2) +; RV32I-NEXT: sb a7, 9(a2) ; RV32I-NEXT: sb s7, 10(a2) ; RV32I-NEXT: sb s6, 11(a2) ; RV32I-NEXT: srli a4, a0, 8 ; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb s11, 13(a2) -; RV32I-NEXT: sb s10, 14(a2) -; RV32I-NEXT: sb s9, 15(a2) +; RV32I-NEXT: sb t1, 13(a2) +; RV32I-NEXT: sb t2, 14(a2) +; RV32I-NEXT: sb t3, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb t0, 1(a2) -; RV32I-NEXT: sb t1, 2(a2) -; RV32I-NEXT: sb a7, 3(a2) +; RV32I-NEXT: sb t4, 1(a2) +; RV32I-NEXT: sb t6, 2(a2) +; RV32I-NEXT: sb t0, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) ; RV32I-NEXT: sb a5, 6(a2) ; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 128 +; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 112 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %dwordOff = load i256, ptr %dwordOff.ptr, align 1 @@ -3524,132 +3510,129 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: shl_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -128 -; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu s1, 0(a0) +; RV32I-NEXT: addi sp, sp, -112 +; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 84(sp) # 4-byte 
Folded Spill +; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a0) ; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu t1, 4(a0) -; RV32I-NEXT: lbu t3, 5(a0) -; RV32I-NEXT: lbu t4, 6(a0) -; RV32I-NEXT: lbu s0, 7(a0) -; RV32I-NEXT: lbu t2, 8(a0) -; RV32I-NEXT: lbu s3, 9(a0) -; RV32I-NEXT: lbu s6, 10(a0) -; RV32I-NEXT: lbu s8, 11(a0) -; RV32I-NEXT: lbu s9, 12(a0) -; RV32I-NEXT: lbu s10, 13(a0) -; RV32I-NEXT: lbu s4, 14(a0) -; RV32I-NEXT: lbu s7, 15(a0) -; RV32I-NEXT: lbu s5, 16(a0) -; RV32I-NEXT: lbu s11, 17(a0) -; RV32I-NEXT: lbu ra, 18(a0) -; RV32I-NEXT: lbu a3, 19(a0) -; RV32I-NEXT: lbu t5, 20(a0) -; RV32I-NEXT: lbu t6, 21(a0) -; RV32I-NEXT: lbu a7, 22(a0) -; RV32I-NEXT: lbu t0, 23(a0) +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t3, 8(a0) +; RV32I-NEXT: lbu t4, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) +; RV32I-NEXT: lbu s0, 12(a0) +; RV32I-NEXT: lbu s1, 13(a0) +; RV32I-NEXT: lbu s2, 14(a0) +; RV32I-NEXT: lbu s3, 15(a0) +; RV32I-NEXT: lbu s4, 16(a0) +; RV32I-NEXT: lbu s5, 17(a0) +; RV32I-NEXT: lbu s6, 18(a0) +; RV32I-NEXT: lbu s7, 19(a0) ; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: slli t3, t3, 8 -; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli s0, s0, 24 -; RV32I-NEXT: or a4, a4, s1 -; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or a3, a4, a3 ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t3, t1 -; RV32I-NEXT: or a6, s0, t4 -; RV32I-NEXT: lbu t1, 24(a0) +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: lbu s8, 20(a0) +; RV32I-NEXT: lbu s9, 21(a0) +; RV32I-NEXT: lbu s10, 22(a0) +; RV32I-NEXT: lbu s11, 23(a0) +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: slli s1, s1, 8 +; RV32I-NEXT: slli s2, s2, 16 +; RV32I-NEXT: slli s3, s3, 24 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, t6, t5 +; RV32I-NEXT: or t1, s1, s0 +; RV32I-NEXT: or t2, s3, s2 +; RV32I-NEXT: lbu t6, 24(a0) ; RV32I-NEXT: lbu s0, 25(a0) ; RV32I-NEXT: lbu s1, 26(a0) ; RV32I-NEXT: lbu s2, 27(a0) -; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: slli s5, s5, 8 ; RV32I-NEXT: slli s6, s6, 16 -; RV32I-NEXT: slli s8, s8, 24 -; RV32I-NEXT: slli s10, s10, 8 -; RV32I-NEXT: or t2, s3, t2 -; RV32I-NEXT: or t3, s8, s6 -; RV32I-NEXT: or t4, s10, s9 -; RV32I-NEXT: lbu s3, 28(a0) -; RV32I-NEXT: lbu s6, 29(a0) -; RV32I-NEXT: lbu s8, 30(a0) -; RV32I-NEXT: lbu s9, 31(a0) -; RV32I-NEXT: slli s4, s4, 16 ; RV32I-NEXT: slli s7, s7, 24 -; RV32I-NEXT: slli s11, s11, 8 -; RV32I-NEXT: slli ra, ra, 16 -; RV32I-NEXT: slli a3, a3, 24 -; RV32I-NEXT: or a0, s7, s4 -; RV32I-NEXT: or s4, s11, s5 -; RV32I-NEXT: or s5, a3, ra -; RV32I-NEXT: lbu a3, 0(a1) -; RV32I-NEXT: lbu s7, 1(a1) -; RV32I-NEXT: lbu s10, 2(a1) +; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: or t3, s5, s4 +; RV32I-NEXT: or t4, s7, s6 +; RV32I-NEXT: or t5, s9, s8 +; RV32I-NEXT: lbu s3, 28(a0) +; RV32I-NEXT: lbu s4, 29(a0) +; RV32I-NEXT: lbu s5, 30(a0) +; RV32I-NEXT: lbu s6, 31(a0) +; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: slli s11, s11, 24 +; 
RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: slli s1, s1, 16 +; RV32I-NEXT: slli s2, s2, 24 +; RV32I-NEXT: or a0, s11, s10 +; RV32I-NEXT: or t6, s0, t6 +; RV32I-NEXT: or s0, s2, s1 +; RV32I-NEXT: lbu s1, 0(a1) +; RV32I-NEXT: lbu s2, 1(a1) +; RV32I-NEXT: lbu s7, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: sw zero, 16(sp) +; RV32I-NEXT: sw zero, 20(sp) ; RV32I-NEXT: sw zero, 24(sp) ; RV32I-NEXT: sw zero, 28(sp) -; RV32I-NEXT: sw zero, 32(sp) -; RV32I-NEXT: sw zero, 36(sp) +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: sw zero, 4(sp) ; RV32I-NEXT: sw zero, 8(sp) ; RV32I-NEXT: sw zero, 12(sp) -; RV32I-NEXT: sw zero, 16(sp) -; RV32I-NEXT: sw zero, 20(sp) -; RV32I-NEXT: slli t6, t6, 8 -; RV32I-NEXT: or t5, t6, t5 -; RV32I-NEXT: addi t6, sp, 40 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli s2, s2, 24 -; RV32I-NEXT: slli s6, s6, 8 -; RV32I-NEXT: slli s8, s8, 16 -; RV32I-NEXT: slli s9, s9, 24 -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: or s3, s4, s3 +; RV32I-NEXT: addi s4, sp, 32 +; RV32I-NEXT: slli s5, s5, 16 +; RV32I-NEXT: slli s6, s6, 24 +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s7, s7, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: or t0, s0, t1 -; RV32I-NEXT: or t1, s2, s1 -; RV32I-NEXT: or s0, s6, s3 -; RV32I-NEXT: or s1, s9, s8 -; RV32I-NEXT: or a3, s7, a3 -; RV32I-NEXT: or a1, a1, s10 -; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: or a4, a4, s2 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: or a6, t3, t2 -; RV32I-NEXT: or a0, a0, t4 -; RV32I-NEXT: or t2, s5, s4 -; RV32I-NEXT: or a7, a7, t5 -; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: or s0, s1, s0 -; RV32I-NEXT: or a1, a1, a3 -; RV32I-NEXT: sw t2, 56(sp) -; RV32I-NEXT: sw a7, 60(sp) -; RV32I-NEXT: sw t0, 64(sp) -; RV32I-NEXT: sw s0, 68(sp) -; RV32I-NEXT: sw a4, 40(sp) -; RV32I-NEXT: sw a5, 44(sp) -; RV32I-NEXT: sw a6, 48(sp) +; RV32I-NEXT: or s5, s6, s5 +; RV32I-NEXT: or s1, s2, s1 +; RV32I-NEXT: or a1, a1, s7 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or a0, a0, t5 +; RV32I-NEXT: or t0, s0, t6 +; RV32I-NEXT: or t1, s5, s3 +; RV32I-NEXT: or a1, a1, s1 +; RV32I-NEXT: sw a7, 48(sp) ; RV32I-NEXT: sw a0, 52(sp) +; RV32I-NEXT: sw t0, 56(sp) +; RV32I-NEXT: sw t1, 60(sp) +; RV32I-NEXT: sw a3, 32(sp) +; RV32I-NEXT: sw a4, 36(sp) +; RV32I-NEXT: sw a5, 40(sp) +; RV32I-NEXT: sw a6, 44(sp) ; RV32I-NEXT: slli a3, a1, 3 ; RV32I-NEXT: andi a1, a1, 28 -; RV32I-NEXT: sub a1, t6, a1 +; RV32I-NEXT: sub a1, s4, a1 ; RV32I-NEXT: andi a0, a3, 24 ; RV32I-NEXT: xori a0, a0, 31 ; RV32I-NEXT: lw a4, 0(a1) @@ -3664,10 +3647,10 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli t4, a4, 1 ; RV32I-NEXT: sll t5, a7, a3 ; RV32I-NEXT: srli t6, a6, 1 -; RV32I-NEXT: sll s0, a6, a3 +; RV32I-NEXT: sll a6, a6, a3 ; RV32I-NEXT: srli a5, a5, 1 -; RV32I-NEXT: sll s1, t1, a3 -; RV32I-NEXT: srli a6, t0, 1 +; RV32I-NEXT: sll s0, t1, a3 +; RV32I-NEXT: srli s1, t0, 1 ; RV32I-NEXT: sll s2, t0, a3 ; RV32I-NEXT: srli a7, a7, 1 ; RV32I-NEXT: sll s3, a1, a3 @@ -3675,56 +3658,56 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sll s4, t2, a3 ; RV32I-NEXT: srli t0, t1, 1 ; RV32I-NEXT: sll s5, a4, a3 -; RV32I-NEXT: srl t2, t4, a0 -; RV32I-NEXT: srl t4, t6, a0 -; RV32I-NEXT: 
srl t6, a5, a0 -; RV32I-NEXT: srl s6, a6, a0 -; RV32I-NEXT: srl s7, a7, a0 -; RV32I-NEXT: srl s8, a1, a0 -; RV32I-NEXT: srl s9, t0, a0 -; RV32I-NEXT: srli t1, s4, 24 -; RV32I-NEXT: srli a7, s3, 24 +; RV32I-NEXT: srl t4, t4, a0 +; RV32I-NEXT: srl a4, t6, a0 +; RV32I-NEXT: srl t1, a5, a0 +; RV32I-NEXT: srl t6, s1, a0 +; RV32I-NEXT: srl s1, a7, a0 +; RV32I-NEXT: srl s6, a1, a0 +; RV32I-NEXT: srl s7, t0, a0 +; RV32I-NEXT: srli t2, s4, 24 +; RV32I-NEXT: srli t0, s3, 24 ; RV32I-NEXT: srli a5, s2, 24 -; RV32I-NEXT: srli a3, s1, 24 -; RV32I-NEXT: srli a1, s0, 24 +; RV32I-NEXT: srli a3, s0, 24 +; RV32I-NEXT: srli a1, a6, 24 ; RV32I-NEXT: srli a0, t5, 24 -; RV32I-NEXT: srli s10, s5, 24 -; RV32I-NEXT: srli s11, s5, 16 -; RV32I-NEXT: srli ra, s5, 8 -; RV32I-NEXT: srli a4, t3, 24 -; RV32I-NEXT: or a6, t3, t2 -; RV32I-NEXT: or t0, t5, t4 -; RV32I-NEXT: or t2, s0, t6 -; RV32I-NEXT: or t3, s1, s6 -; RV32I-NEXT: or t4, s2, s7 -; RV32I-NEXT: or t5, s3, s8 -; RV32I-NEXT: or t6, s4, s9 +; RV32I-NEXT: srli s8, s5, 24 +; RV32I-NEXT: or a4, t5, a4 +; RV32I-NEXT: srli t5, s5, 16 +; RV32I-NEXT: or t1, a6, t1 +; RV32I-NEXT: srli s9, s5, 8 +; RV32I-NEXT: or a7, t3, t4 +; RV32I-NEXT: srli a6, t3, 24 +; RV32I-NEXT: or t3, s0, t6 +; RV32I-NEXT: or t4, s2, s1 +; RV32I-NEXT: or t6, s3, s6 +; RV32I-NEXT: or s0, s4, s7 ; RV32I-NEXT: sb s5, 0(a2) -; RV32I-NEXT: sb ra, 1(a2) -; RV32I-NEXT: sb s11, 2(a2) -; RV32I-NEXT: sb s10, 3(a2) -; RV32I-NEXT: srli s0, t6, 16 -; RV32I-NEXT: srli s1, t6, 8 -; RV32I-NEXT: srli s2, t5, 16 -; RV32I-NEXT: srli s3, t5, 8 +; RV32I-NEXT: sb s9, 1(a2) +; RV32I-NEXT: sb t5, 2(a2) +; RV32I-NEXT: sb s8, 3(a2) +; RV32I-NEXT: srli t5, s0, 16 +; RV32I-NEXT: srli s1, s0, 8 +; RV32I-NEXT: srli s2, t6, 16 +; RV32I-NEXT: srli s3, t6, 8 ; RV32I-NEXT: srli s4, t4, 16 ; RV32I-NEXT: srli s5, t4, 8 ; RV32I-NEXT: srli s6, t3, 16 ; RV32I-NEXT: srli s7, t3, 8 -; RV32I-NEXT: srli s8, t2, 16 -; RV32I-NEXT: srli s9, t2, 8 -; RV32I-NEXT: srli s10, t0, 16 -; RV32I-NEXT: srli s11, t0, 8 -; RV32I-NEXT: sb t6, 24(a2) +; RV32I-NEXT: sb s0, 24(a2) +; RV32I-NEXT: srli s0, t1, 16 ; RV32I-NEXT: sb s1, 25(a2) -; RV32I-NEXT: sb s0, 26(a2) -; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli t1, a6, 16 -; RV32I-NEXT: sb t5, 28(a2) +; RV32I-NEXT: srli s1, t1, 8 +; RV32I-NEXT: sb t5, 26(a2) +; RV32I-NEXT: srli t5, a4, 16 +; RV32I-NEXT: sb t2, 27(a2) +; RV32I-NEXT: srli t2, a4, 8 +; RV32I-NEXT: sb t6, 28(a2) +; RV32I-NEXT: srli t6, a7, 16 ; RV32I-NEXT: sb s3, 29(a2) ; RV32I-NEXT: sb s2, 30(a2) -; RV32I-NEXT: sb a7, 31(a2) -; RV32I-NEXT: srli a7, a6, 8 +; RV32I-NEXT: sb t0, 31(a2) +; RV32I-NEXT: srli t0, a7, 8 ; RV32I-NEXT: sb t4, 16(a2) ; RV32I-NEXT: sb s5, 17(a2) ; RV32I-NEXT: sb s4, 18(a2) @@ -3733,32 +3716,31 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb s7, 21(a2) ; RV32I-NEXT: sb s6, 22(a2) ; RV32I-NEXT: sb a3, 23(a2) -; RV32I-NEXT: sb t2, 8(a2) -; RV32I-NEXT: sb s9, 9(a2) -; RV32I-NEXT: sb s8, 10(a2) +; RV32I-NEXT: sb t1, 8(a2) +; RV32I-NEXT: sb s1, 9(a2) +; RV32I-NEXT: sb s0, 10(a2) ; RV32I-NEXT: sb a1, 11(a2) -; RV32I-NEXT: sb t0, 12(a2) -; RV32I-NEXT: sb s11, 13(a2) -; RV32I-NEXT: sb s10, 14(a2) +; RV32I-NEXT: sb a4, 12(a2) +; RV32I-NEXT: sb t2, 13(a2) +; RV32I-NEXT: sb t5, 14(a2) ; RV32I-NEXT: sb a0, 15(a2) -; RV32I-NEXT: sb a6, 4(a2) -; RV32I-NEXT: sb a7, 5(a2) -; RV32I-NEXT: sb t1, 6(a2) -; RV32I-NEXT: sb a4, 7(a2) -; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload -; 
RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 128 +; RV32I-NEXT: sb a7, 4(a2) +; RV32I-NEXT: sb t0, 5(a2) +; RV32I-NEXT: sb t6, 6(a2) +; RV32I-NEXT: sb a6, 7(a2) +; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 112 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %byteOff = load i256, ptr %byteOff.ptr, align 1 @@ -4003,132 +3985,128 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; ; RV32I-LABEL: shl_32bytes_wordOff: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -128 -; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv a3, a1 -; RV32I-NEXT: lbu a5, 0(a0) -; RV32I-NEXT: lbu a7, 1(a0) -; RV32I-NEXT: lbu t0, 2(a0) -; RV32I-NEXT: lbu t1, 3(a0) -; RV32I-NEXT: lbu s2, 4(a0) -; RV32I-NEXT: lbu s4, 5(a0) -; RV32I-NEXT: lbu s5, 6(a0) -; RV32I-NEXT: lbu s6, 7(a0) -; RV32I-NEXT: lbu s3, 8(a0) -; RV32I-NEXT: lbu s9, 9(a0) -; RV32I-NEXT: lbu s10, 10(a0) -; RV32I-NEXT: lbu s11, 11(a0) -; RV32I-NEXT: lbu ra, 12(a0) -; RV32I-NEXT: lbu a1, 13(a0) -; RV32I-NEXT: lbu t4, 14(a0) -; RV32I-NEXT: lbu t6, 15(a0) -; RV32I-NEXT: lbu a4, 16(a0) -; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a6, 17(a0) -; RV32I-NEXT: lbu t2, 18(a0) -; RV32I-NEXT: lbu t3, 19(a0) -; RV32I-NEXT: lbu a4, 20(a0) -; RV32I-NEXT: lbu t5, 21(a0) +; RV32I-NEXT: addi sp, sp, -112 +; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 72(sp) # 
4-byte Folded Spill +; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t3, 8(a0) +; RV32I-NEXT: lbu t4, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) +; RV32I-NEXT: lbu s2, 12(a0) +; RV32I-NEXT: lbu s3, 13(a0) +; RV32I-NEXT: lbu s4, 14(a0) +; RV32I-NEXT: lbu s5, 15(a0) +; RV32I-NEXT: lbu s6, 16(a0) +; RV32I-NEXT: lbu s7, 17(a0) +; RV32I-NEXT: lbu s8, 18(a0) +; RV32I-NEXT: lbu s9, 19(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: lbu s10, 20(a0) +; RV32I-NEXT: lbu s11, 21(a0) ; RV32I-NEXT: lbu s0, 22(a0) ; RV32I-NEXT: lbu s1, 23(a0) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: slli t0, t0, 16 -; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: slli s4, s4, 8 -; RV32I-NEXT: slli s5, s5, 16 -; RV32I-NEXT: slli s6, s6, 24 -; RV32I-NEXT: or a5, a7, a5 -; RV32I-NEXT: or a7, t1, t0 -; RV32I-NEXT: or t0, s4, s2 -; RV32I-NEXT: or t1, s6, s5 -; RV32I-NEXT: lbu s2, 24(a0) -; RV32I-NEXT: lbu s6, 25(a0) -; RV32I-NEXT: lbu s7, 26(a0) -; RV32I-NEXT: lbu s8, 27(a0) -; RV32I-NEXT: slli s9, s9, 8 -; RV32I-NEXT: slli s10, s10, 16 -; RV32I-NEXT: slli s11, s11, 24 -; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or s3, s9, s3 -; RV32I-NEXT: or s4, s11, s10 -; RV32I-NEXT: or s5, a1, ra -; RV32I-NEXT: lbu s9, 28(a0) -; RV32I-NEXT: lbu a1, 29(a0) -; RV32I-NEXT: lbu s10, 30(a0) +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: slli s4, s4, 16 +; RV32I-NEXT: slli s5, s5, 24 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, t6, t5 +; RV32I-NEXT: or t1, s3, s2 +; RV32I-NEXT: or t2, s5, s4 +; RV32I-NEXT: lbu t3, 24(a0) +; RV32I-NEXT: lbu s2, 25(a0) +; RV32I-NEXT: lbu s3, 26(a0) +; RV32I-NEXT: lbu s4, 27(a0) +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: or t4, s7, s6 +; RV32I-NEXT: or t5, s9, s8 +; RV32I-NEXT: or t6, s11, s10 +; RV32I-NEXT: lbu s5, 28(a0) +; RV32I-NEXT: lbu s6, 29(a0) +; RV32I-NEXT: lbu s7, 30(a0) ; RV32I-NEXT: lbu a0, 31(a0) -; RV32I-NEXT: lbu a3, 0(a3) +; RV32I-NEXT: lbu a1, 0(a1) +; RV32I-NEXT: sw zero, 16(sp) +; RV32I-NEXT: sw zero, 20(sp) ; RV32I-NEXT: sw zero, 24(sp) ; RV32I-NEXT: sw zero, 28(sp) -; RV32I-NEXT: sw zero, 32(sp) -; RV32I-NEXT: sw zero, 36(sp) +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: sw zero, 4(sp) ; RV32I-NEXT: sw zero, 8(sp) ; RV32I-NEXT: sw zero, 12(sp) -; RV32I-NEXT: sw zero, 16(sp) -; RV32I-NEXT: sw zero, 20(sp) -; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: or t4, t6, t4 -; RV32I-NEXT: addi t6, sp, 40 -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: slli t2, t2, 16 -; RV32I-NEXT: slli t3, t3, 24 -; RV32I-NEXT: slli t5, t5, 8 ; RV32I-NEXT: slli s0, s0, 16 ; RV32I-NEXT: slli s1, s1, 24 +; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: addi s1, sp, 32 +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s3, s3, 16 +; RV32I-NEXT: slli s4, s4, 24 ; RV32I-NEXT: slli s6, s6, 8 ; RV32I-NEXT: slli s7, s7, 16 -; 
RV32I-NEXT: slli s8, s8, 24 -; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: slli s10, s10, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: slli a3, a3, 2 -; RV32I-NEXT: lw s11, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: or a6, a6, s11 -; RV32I-NEXT: or t2, t3, t2 -; RV32I-NEXT: or a4, t5, a4 -; RV32I-NEXT: or s0, s1, s0 -; RV32I-NEXT: or t3, s6, s2 -; RV32I-NEXT: or t5, s8, s7 -; RV32I-NEXT: or a1, a1, s9 -; RV32I-NEXT: or a0, a0, s10 -; RV32I-NEXT: andi a3, a3, 28 -; RV32I-NEXT: or a5, a7, a5 -; RV32I-NEXT: or a7, t1, t0 -; RV32I-NEXT: or t0, s4, s3 -; RV32I-NEXT: or t1, t4, s5 -; RV32I-NEXT: or a6, t2, a6 -; RV32I-NEXT: or a4, s0, a4 -; RV32I-NEXT: or t2, t5, t3 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: sub t3, t6, a3 -; RV32I-NEXT: sw a6, 56(sp) -; RV32I-NEXT: sw a4, 60(sp) -; RV32I-NEXT: sw t2, 64(sp) -; RV32I-NEXT: sw a0, 68(sp) +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: or t3, s2, t3 +; RV32I-NEXT: or s2, s4, s3 +; RV32I-NEXT: or s3, s6, s5 +; RV32I-NEXT: or a0, a0, s7 +; RV32I-NEXT: andi a1, a1, 28 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, t5, t4 +; RV32I-NEXT: or t0, s0, t6 +; RV32I-NEXT: or t1, s2, t3 +; RV32I-NEXT: or a0, a0, s3 +; RV32I-NEXT: sub s1, s1, a1 +; RV32I-NEXT: sw a7, 48(sp) +; RV32I-NEXT: sw t0, 52(sp) +; RV32I-NEXT: sw t1, 56(sp) +; RV32I-NEXT: sw a0, 60(sp) +; RV32I-NEXT: sw a3, 32(sp) +; RV32I-NEXT: sw a4, 36(sp) ; RV32I-NEXT: sw a5, 40(sp) -; RV32I-NEXT: sw a7, 44(sp) -; RV32I-NEXT: sw t0, 48(sp) -; RV32I-NEXT: sw t1, 52(sp) -; RV32I-NEXT: lw a6, 16(t3) -; RV32I-NEXT: lw a5, 20(t3) -; RV32I-NEXT: lw a7, 24(t3) -; RV32I-NEXT: lw a1, 0(t3) -; RV32I-NEXT: lw a0, 4(t3) -; RV32I-NEXT: lw a4, 8(t3) -; RV32I-NEXT: lw a3, 12(t3) -; RV32I-NEXT: lw t0, 28(t3) +; RV32I-NEXT: sw a6, 44(sp) +; RV32I-NEXT: lw a6, 16(s1) +; RV32I-NEXT: lw a5, 20(s1) +; RV32I-NEXT: lw a7, 24(s1) +; RV32I-NEXT: lw a1, 0(s1) +; RV32I-NEXT: lw a0, 4(s1) +; RV32I-NEXT: lw a4, 8(s1) +; RV32I-NEXT: lw a3, 12(s1) +; RV32I-NEXT: lw t0, 28(s1) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 ; RV32I-NEXT: srli t3, a7, 8 @@ -4143,21 +4121,21 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: srli s5, a5, 8 ; RV32I-NEXT: srli s6, a4, 24 ; RV32I-NEXT: srli s7, a4, 16 -; RV32I-NEXT: srli s8, a4, 8 -; RV32I-NEXT: srli s9, a3, 24 -; RV32I-NEXT: srli s10, a3, 16 -; RV32I-NEXT: srli s11, a3, 8 ; RV32I-NEXT: sb a7, 24(a2) -; RV32I-NEXT: srli a7, a1, 24 +; RV32I-NEXT: srli a7, a4, 8 ; RV32I-NEXT: sb t3, 25(a2) +; RV32I-NEXT: srli t3, a3, 24 ; RV32I-NEXT: sb t2, 26(a2) +; RV32I-NEXT: srli t2, a3, 16 ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli t1, a1, 16 +; RV32I-NEXT: srli t1, a3, 8 ; RV32I-NEXT: sb t0, 28(a2) +; RV32I-NEXT: srli t0, a1, 24 ; RV32I-NEXT: sb t6, 29(a2) +; RV32I-NEXT: srli t6, a1, 16 ; RV32I-NEXT: sb t5, 30(a2) ; RV32I-NEXT: sb t4, 31(a2) -; RV32I-NEXT: srli t0, a1, 8 +; RV32I-NEXT: srli t4, a1, 8 ; RV32I-NEXT: sb a6, 16(a2) ; RV32I-NEXT: sb s2, 17(a2) ; RV32I-NEXT: sb s1, 18(a2) @@ -4169,36 +4147,35 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: sb s3, 23(a2) ; RV32I-NEXT: srli a5, a0, 16 ; RV32I-NEXT: sb a4, 8(a2) -; RV32I-NEXT: sb s8, 9(a2) +; RV32I-NEXT: sb a7, 9(a2) ; RV32I-NEXT: sb s7, 10(a2) ; RV32I-NEXT: sb s6, 11(a2) ; RV32I-NEXT: srli a4, a0, 8 ; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb s11, 13(a2) -; RV32I-NEXT: sb s10, 14(a2) -; RV32I-NEXT: sb s9, 15(a2) +; RV32I-NEXT: sb t1, 
13(a2) +; RV32I-NEXT: sb t2, 14(a2) +; RV32I-NEXT: sb t3, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb t0, 1(a2) -; RV32I-NEXT: sb t1, 2(a2) -; RV32I-NEXT: sb a7, 3(a2) +; RV32I-NEXT: sb t4, 1(a2) +; RV32I-NEXT: sb t6, 2(a2) +; RV32I-NEXT: sb t0, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) ; RV32I-NEXT: sb a5, 6(a2) ; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 128 +; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 112 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %wordOff = load i256, ptr %wordOff.ptr, align 1 @@ -4224,111 +4201,111 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a5, 0(a0) -; RV64I-NEXT: lbu a7, 1(a0) -; RV64I-NEXT: lbu t2, 2(a0) -; RV64I-NEXT: lbu s3, 3(a0) -; RV64I-NEXT: lbu t0, 4(a0) -; RV64I-NEXT: lbu s8, 5(a0) -; RV64I-NEXT: lbu s9, 6(a0) -; RV64I-NEXT: lbu s10, 7(a0) -; RV64I-NEXT: lbu s2, 8(a0) -; RV64I-NEXT: lbu s4, 9(a0) -; RV64I-NEXT: lbu s5, 10(a0) -; RV64I-NEXT: lbu s6, 11(a0) -; RV64I-NEXT: lbu s7, 12(a0) -; RV64I-NEXT: lbu s11, 13(a0) -; RV64I-NEXT: lbu t1, 14(a0) -; RV64I-NEXT: lbu t3, 15(a0) -; RV64I-NEXT: lbu a3, 16(a0) -; RV64I-NEXT: lbu a6, 17(a0) -; RV64I-NEXT: lbu t4, 18(a0) -; RV64I-NEXT: lbu t5, 19(a0) -; RV64I-NEXT: lbu a4, 20(a0) -; RV64I-NEXT: lbu t6, 21(a0) -; RV64I-NEXT: lbu s0, 22(a0) -; RV64I-NEXT: lbu s1, 23(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: lbu s0, 12(a0) +; RV64I-NEXT: lbu s1, 13(a0) +; RV64I-NEXT: lbu s2, 14(a0) +; RV64I-NEXT: lbu s3, 15(a0) +; RV64I-NEXT: lbu s4, 16(a0) +; RV64I-NEXT: lbu s5, 17(a0) +; RV64I-NEXT: lbu s6, 18(a0) +; RV64I-NEXT: lbu s7, 19(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli 
a6, a6, 24 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: lbu s8, 20(a0) +; RV64I-NEXT: lbu s9, 21(a0) +; RV64I-NEXT: lbu s10, 22(a0) +; RV64I-NEXT: lbu s11, 23(a0) +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: slli t6, t6, 24 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: slli s2, s2, 16 ; RV64I-NEXT: slli s3, s3, 24 -; RV64I-NEXT: slli s8, s8, 8 -; RV64I-NEXT: slli s9, s9, 16 -; RV64I-NEXT: slli s10, s10, 24 -; RV64I-NEXT: or a5, a7, a5 -; RV64I-NEXT: or a7, s3, t2 -; RV64I-NEXT: or t0, s8, t0 -; RV64I-NEXT: or t2, s10, s9 -; RV64I-NEXT: lbu s3, 24(a0) -; RV64I-NEXT: lbu s8, 25(a0) -; RV64I-NEXT: lbu s9, 26(a0) -; RV64I-NEXT: lbu s10, 27(a0) -; RV64I-NEXT: slli s4, s4, 8 -; RV64I-NEXT: slli s5, s5, 16 -; RV64I-NEXT: slli s6, s6, 24 -; RV64I-NEXT: slli s11, s11, 8 -; RV64I-NEXT: or s2, s4, s2 -; RV64I-NEXT: or s4, s6, s5 -; RV64I-NEXT: or s5, s11, s7 -; RV64I-NEXT: lbu s6, 28(a0) -; RV64I-NEXT: lbu s7, 29(a0) -; RV64I-NEXT: lbu s11, 30(a0) +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: or t1, s1, s0 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: lbu t3, 24(a0) +; RV64I-NEXT: lbu t4, 25(a0) +; RV64I-NEXT: lbu t5, 26(a0) +; RV64I-NEXT: lbu t6, 27(a0) +; RV64I-NEXT: slli s5, s5, 8 +; RV64I-NEXT: slli s6, s6, 16 +; RV64I-NEXT: slli s7, s7, 24 +; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: or s0, s5, s4 +; RV64I-NEXT: or s1, s7, s6 +; RV64I-NEXT: or s2, s9, s8 +; RV64I-NEXT: lbu s3, 28(a0) +; RV64I-NEXT: lbu s4, 29(a0) +; RV64I-NEXT: lbu s5, 30(a0) ; RV64I-NEXT: lbu a0, 31(a0) ; RV64I-NEXT: lbu a1, 0(a1) ; RV64I-NEXT: sd zero, 0(sp) ; RV64I-NEXT: sd zero, 8(sp) ; RV64I-NEXT: sd zero, 16(sp) ; RV64I-NEXT: sd zero, 24(sp) -; RV64I-NEXT: slli t1, t1, 16 -; RV64I-NEXT: slli t3, t3, 24 -; RV64I-NEXT: or t1, t3, t1 -; RV64I-NEXT: addi t3, sp, 32 -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: slli t4, t4, 16 -; RV64I-NEXT: slli t5, t5, 24 -; RV64I-NEXT: slli t6, t6, 8 -; RV64I-NEXT: slli s0, s0, 16 -; RV64I-NEXT: slli s1, s1, 24 -; RV64I-NEXT: slli s8, s8, 8 -; RV64I-NEXT: slli s9, s9, 16 -; RV64I-NEXT: slli s10, s10, 24 -; RV64I-NEXT: slli s7, s7, 8 -; RV64I-NEXT: slli s11, s11, 16 +; RV64I-NEXT: slli s10, s10, 16 +; RV64I-NEXT: slli s11, s11, 24 +; RV64I-NEXT: or s6, s11, s10 +; RV64I-NEXT: addi s7, sp, 32 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: slli t6, t6, 24 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: slli s5, s5, 16 ; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: slli a1, a1, 3 -; RV64I-NEXT: or a3, a6, a3 -; RV64I-NEXT: or a6, t5, t4 -; RV64I-NEXT: or a4, t6, a4 -; RV64I-NEXT: or s0, s1, s0 -; RV64I-NEXT: or t4, s8, s3 -; RV64I-NEXT: or t5, s10, s9 -; RV64I-NEXT: or t6, s7, s6 -; RV64I-NEXT: or a0, a0, s11 +; RV64I-NEXT: or t3, t4, t3 +; RV64I-NEXT: or t4, t6, t5 +; RV64I-NEXT: or t5, s4, s3 +; RV64I-NEXT: or a0, a0, s5 ; RV64I-NEXT: andi a1, a1, 24 -; RV64I-NEXT: or a5, a7, a5 -; RV64I-NEXT: or a7, t2, t0 -; RV64I-NEXT: or t0, s4, s2 -; RV64I-NEXT: or t1, t1, s5 -; RV64I-NEXT: or a3, a6, a3 -; RV64I-NEXT: or a4, s0, a4 -; RV64I-NEXT: or a6, t5, t4 -; RV64I-NEXT: or a0, a0, t6 -; RV64I-NEXT: sub t2, t3, a1 -; RV64I-NEXT: slli a7, a7, 32 -; RV64I-NEXT: slli t1, t1, 32 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: or s0, s1, s0 +; RV64I-NEXT: or a7, s6, s2 +; 
RV64I-NEXT: or t0, t4, t3 +; RV64I-NEXT: or a0, a0, t5 +; RV64I-NEXT: sub t1, s7, a1 ; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: slli a7, a7, 32 ; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: or a1, a7, a5 -; RV64I-NEXT: or a5, t1, t0 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: or a0, a0, a6 -; RV64I-NEXT: sd a1, 32(sp) -; RV64I-NEXT: sd a5, 40(sp) -; RV64I-NEXT: sd a3, 48(sp) +; RV64I-NEXT: or a1, a6, a5 +; RV64I-NEXT: or a4, a7, s0 +; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: sd a3, 32(sp) +; RV64I-NEXT: sd a1, 40(sp) +; RV64I-NEXT: sd a4, 48(sp) ; RV64I-NEXT: sd a0, 56(sp) -; RV64I-NEXT: ld a4, 16(t2) -; RV64I-NEXT: ld a0, 8(t2) -; RV64I-NEXT: ld a1, 0(t2) -; RV64I-NEXT: ld a3, 24(t2) +; RV64I-NEXT: ld a4, 16(t1) +; RV64I-NEXT: ld a0, 8(t1) +; RV64I-NEXT: ld a1, 0(t1) +; RV64I-NEXT: ld a3, 24(t1) ; RV64I-NEXT: srli a5, a4, 56 ; RV64I-NEXT: srli a6, a4, 48 ; RV64I-NEXT: srli a7, a4, 40 @@ -4347,25 +4324,25 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV64I-NEXT: srli s5, a1, 48 ; RV64I-NEXT: srli s6, a1, 40 ; RV64I-NEXT: srli s7, a1, 32 -; RV64I-NEXT: srli s8, a1, 24 -; RV64I-NEXT: srli s9, a1, 16 -; RV64I-NEXT: srli s10, a1, 8 -; RV64I-NEXT: srli s11, a0, 56 ; RV64I-NEXT: sb t0, 20(a2) +; RV64I-NEXT: srli t0, a1, 24 ; RV64I-NEXT: sb a7, 21(a2) +; RV64I-NEXT: srli a7, a1, 16 ; RV64I-NEXT: sb a6, 22(a2) +; RV64I-NEXT: srli a6, a1, 8 ; RV64I-NEXT: sb a5, 23(a2) -; RV64I-NEXT: srli a5, a0, 48 +; RV64I-NEXT: srli a5, a0, 56 ; RV64I-NEXT: sb a4, 16(a2) +; RV64I-NEXT: srli a4, a0, 48 ; RV64I-NEXT: sb t3, 17(a2) ; RV64I-NEXT: sb t2, 18(a2) ; RV64I-NEXT: sb t1, 19(a2) -; RV64I-NEXT: srli a4, a0, 40 +; RV64I-NEXT: srli t1, a0, 40 ; RV64I-NEXT: sb s0, 28(a2) ; RV64I-NEXT: sb t6, 29(a2) ; RV64I-NEXT: sb t5, 30(a2) ; RV64I-NEXT: sb t4, 31(a2) -; RV64I-NEXT: srli a6, a0, 32 +; RV64I-NEXT: srli t2, a0, 32 ; RV64I-NEXT: sb a3, 24(a2) ; RV64I-NEXT: sb s3, 25(a2) ; RV64I-NEXT: sb s2, 26(a2) @@ -4375,19 +4352,19 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV64I-NEXT: sb s6, 5(a2) ; RV64I-NEXT: sb s5, 6(a2) ; RV64I-NEXT: sb s4, 7(a2) -; RV64I-NEXT: srli a7, a0, 16 +; RV64I-NEXT: srli t3, a0, 16 ; RV64I-NEXT: sb a1, 0(a2) -; RV64I-NEXT: sb s10, 1(a2) -; RV64I-NEXT: sb s9, 2(a2) -; RV64I-NEXT: sb s8, 3(a2) +; RV64I-NEXT: sb a6, 1(a2) +; RV64I-NEXT: sb a7, 2(a2) +; RV64I-NEXT: sb t0, 3(a2) ; RV64I-NEXT: srli a1, a0, 8 -; RV64I-NEXT: sb a6, 12(a2) -; RV64I-NEXT: sb a4, 13(a2) -; RV64I-NEXT: sb a5, 14(a2) -; RV64I-NEXT: sb s11, 15(a2) +; RV64I-NEXT: sb t2, 12(a2) +; RV64I-NEXT: sb t1, 13(a2) +; RV64I-NEXT: sb a4, 14(a2) +; RV64I-NEXT: sb a5, 15(a2) ; RV64I-NEXT: sb a0, 8(a2) ; RV64I-NEXT: sb a1, 9(a2) -; RV64I-NEXT: sb a7, 10(a2) +; RV64I-NEXT: sb t3, 10(a2) ; RV64I-NEXT: sb a3, 11(a2) ; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload @@ -4406,132 +4383,128 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; ; RV32I-LABEL: shl_32bytes_dwordOff: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -128 -; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: 
sw s7, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv a3, a1 -; RV32I-NEXT: lbu a5, 0(a0) -; RV32I-NEXT: lbu a7, 1(a0) -; RV32I-NEXT: lbu t0, 2(a0) -; RV32I-NEXT: lbu t1, 3(a0) -; RV32I-NEXT: lbu s2, 4(a0) -; RV32I-NEXT: lbu s4, 5(a0) -; RV32I-NEXT: lbu s5, 6(a0) -; RV32I-NEXT: lbu s6, 7(a0) -; RV32I-NEXT: lbu s3, 8(a0) -; RV32I-NEXT: lbu s9, 9(a0) -; RV32I-NEXT: lbu s10, 10(a0) -; RV32I-NEXT: lbu s11, 11(a0) -; RV32I-NEXT: lbu ra, 12(a0) -; RV32I-NEXT: lbu a1, 13(a0) -; RV32I-NEXT: lbu t4, 14(a0) -; RV32I-NEXT: lbu t6, 15(a0) -; RV32I-NEXT: lbu a4, 16(a0) -; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a6, 17(a0) -; RV32I-NEXT: lbu t2, 18(a0) -; RV32I-NEXT: lbu t3, 19(a0) -; RV32I-NEXT: lbu a4, 20(a0) -; RV32I-NEXT: lbu t5, 21(a0) +; RV32I-NEXT: addi sp, sp, -112 +; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t3, 8(a0) +; RV32I-NEXT: lbu t4, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) +; RV32I-NEXT: lbu s2, 12(a0) +; RV32I-NEXT: lbu s3, 13(a0) +; RV32I-NEXT: lbu s4, 14(a0) +; RV32I-NEXT: lbu s5, 15(a0) +; RV32I-NEXT: lbu s6, 16(a0) +; RV32I-NEXT: lbu s7, 17(a0) +; RV32I-NEXT: lbu s8, 18(a0) +; RV32I-NEXT: lbu s9, 19(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: lbu s10, 20(a0) +; RV32I-NEXT: lbu s11, 21(a0) ; RV32I-NEXT: lbu s0, 22(a0) ; RV32I-NEXT: lbu s1, 23(a0) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: slli t0, t0, 16 -; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: slli s4, s4, 8 -; RV32I-NEXT: slli s5, s5, 16 -; RV32I-NEXT: slli s6, s6, 24 -; RV32I-NEXT: or a5, a7, a5 -; RV32I-NEXT: or a7, t1, t0 -; RV32I-NEXT: or t0, s4, s2 -; RV32I-NEXT: or t1, s6, s5 -; RV32I-NEXT: lbu s2, 24(a0) -; RV32I-NEXT: lbu s6, 25(a0) -; RV32I-NEXT: lbu s7, 26(a0) -; RV32I-NEXT: lbu s8, 27(a0) -; RV32I-NEXT: slli s9, s9, 8 -; RV32I-NEXT: slli s10, s10, 16 -; RV32I-NEXT: slli s11, s11, 24 -; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or s3, s9, s3 -; RV32I-NEXT: or s4, s11, s10 -; RV32I-NEXT: or s5, a1, ra -; RV32I-NEXT: lbu s9, 28(a0) -; RV32I-NEXT: lbu a1, 29(a0) -; RV32I-NEXT: lbu s10, 30(a0) +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: slli s4, s4, 16 +; RV32I-NEXT: slli s5, s5, 24 +; RV32I-NEXT: 
or a7, t4, t3 +; RV32I-NEXT: or t0, t6, t5 +; RV32I-NEXT: or t1, s3, s2 +; RV32I-NEXT: or t2, s5, s4 +; RV32I-NEXT: lbu t3, 24(a0) +; RV32I-NEXT: lbu s2, 25(a0) +; RV32I-NEXT: lbu s3, 26(a0) +; RV32I-NEXT: lbu s4, 27(a0) +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: or t4, s7, s6 +; RV32I-NEXT: or t5, s9, s8 +; RV32I-NEXT: or t6, s11, s10 +; RV32I-NEXT: lbu s5, 28(a0) +; RV32I-NEXT: lbu s6, 29(a0) +; RV32I-NEXT: lbu s7, 30(a0) ; RV32I-NEXT: lbu a0, 31(a0) -; RV32I-NEXT: lbu a3, 0(a3) +; RV32I-NEXT: lbu a1, 0(a1) +; RV32I-NEXT: sw zero, 16(sp) +; RV32I-NEXT: sw zero, 20(sp) ; RV32I-NEXT: sw zero, 24(sp) ; RV32I-NEXT: sw zero, 28(sp) -; RV32I-NEXT: sw zero, 32(sp) -; RV32I-NEXT: sw zero, 36(sp) +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: sw zero, 4(sp) ; RV32I-NEXT: sw zero, 8(sp) ; RV32I-NEXT: sw zero, 12(sp) -; RV32I-NEXT: sw zero, 16(sp) -; RV32I-NEXT: sw zero, 20(sp) -; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: or t4, t6, t4 -; RV32I-NEXT: addi t6, sp, 40 -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: slli t2, t2, 16 -; RV32I-NEXT: slli t3, t3, 24 -; RV32I-NEXT: slli t5, t5, 8 ; RV32I-NEXT: slli s0, s0, 16 ; RV32I-NEXT: slli s1, s1, 24 +; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: addi s1, sp, 32 +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s3, s3, 16 +; RV32I-NEXT: slli s4, s4, 24 ; RV32I-NEXT: slli s6, s6, 8 ; RV32I-NEXT: slli s7, s7, 16 -; RV32I-NEXT: slli s8, s8, 24 -; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: slli s10, s10, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: slli a3, a3, 3 -; RV32I-NEXT: lw s11, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: or a6, a6, s11 -; RV32I-NEXT: or t2, t3, t2 -; RV32I-NEXT: or a4, t5, a4 -; RV32I-NEXT: or s0, s1, s0 -; RV32I-NEXT: or t3, s6, s2 -; RV32I-NEXT: or t5, s8, s7 -; RV32I-NEXT: or a1, a1, s9 -; RV32I-NEXT: or a0, a0, s10 -; RV32I-NEXT: andi a3, a3, 24 -; RV32I-NEXT: or a5, a7, a5 -; RV32I-NEXT: or a7, t1, t0 -; RV32I-NEXT: or t0, s4, s3 -; RV32I-NEXT: or t1, t4, s5 -; RV32I-NEXT: or a6, t2, a6 -; RV32I-NEXT: or a4, s0, a4 -; RV32I-NEXT: or t2, t5, t3 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: sub t3, t6, a3 -; RV32I-NEXT: sw a6, 56(sp) -; RV32I-NEXT: sw a4, 60(sp) -; RV32I-NEXT: sw t2, 64(sp) -; RV32I-NEXT: sw a0, 68(sp) +; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: or t3, s2, t3 +; RV32I-NEXT: or s2, s4, s3 +; RV32I-NEXT: or s3, s6, s5 +; RV32I-NEXT: or a0, a0, s7 +; RV32I-NEXT: andi a1, a1, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, t5, t4 +; RV32I-NEXT: or t0, s0, t6 +; RV32I-NEXT: or t1, s2, t3 +; RV32I-NEXT: or a0, a0, s3 +; RV32I-NEXT: sub s1, s1, a1 +; RV32I-NEXT: sw a7, 48(sp) +; RV32I-NEXT: sw t0, 52(sp) +; RV32I-NEXT: sw t1, 56(sp) +; RV32I-NEXT: sw a0, 60(sp) +; RV32I-NEXT: sw a3, 32(sp) +; RV32I-NEXT: sw a4, 36(sp) ; RV32I-NEXT: sw a5, 40(sp) -; RV32I-NEXT: sw a7, 44(sp) -; RV32I-NEXT: sw t0, 48(sp) -; RV32I-NEXT: sw t1, 52(sp) -; RV32I-NEXT: lw a6, 16(t3) -; RV32I-NEXT: lw a5, 20(t3) -; RV32I-NEXT: lw a7, 24(t3) -; RV32I-NEXT: lw a1, 0(t3) -; RV32I-NEXT: lw a0, 4(t3) -; RV32I-NEXT: lw a4, 8(t3) -; RV32I-NEXT: lw a3, 12(t3) -; RV32I-NEXT: lw t0, 28(t3) +; RV32I-NEXT: sw a6, 44(sp) +; RV32I-NEXT: lw a6, 16(s1) +; RV32I-NEXT: lw a5, 20(s1) +; RV32I-NEXT: lw a7, 24(s1) +; RV32I-NEXT: lw a1, 0(s1) +; RV32I-NEXT: lw a0, 4(s1) +; RV32I-NEXT: lw a4, 8(s1) +; RV32I-NEXT: lw a3, 12(s1) +; RV32I-NEXT: lw t0, 28(s1) 
; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 ; RV32I-NEXT: srli t3, a7, 8 @@ -4546,21 +4519,21 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV32I-NEXT: srli s5, a5, 8 ; RV32I-NEXT: srli s6, a4, 24 ; RV32I-NEXT: srli s7, a4, 16 -; RV32I-NEXT: srli s8, a4, 8 -; RV32I-NEXT: srli s9, a3, 24 -; RV32I-NEXT: srli s10, a3, 16 -; RV32I-NEXT: srli s11, a3, 8 ; RV32I-NEXT: sb a7, 24(a2) -; RV32I-NEXT: srli a7, a1, 24 +; RV32I-NEXT: srli a7, a4, 8 ; RV32I-NEXT: sb t3, 25(a2) +; RV32I-NEXT: srli t3, a3, 24 ; RV32I-NEXT: sb t2, 26(a2) +; RV32I-NEXT: srli t2, a3, 16 ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli t1, a1, 16 +; RV32I-NEXT: srli t1, a3, 8 ; RV32I-NEXT: sb t0, 28(a2) +; RV32I-NEXT: srli t0, a1, 24 ; RV32I-NEXT: sb t6, 29(a2) +; RV32I-NEXT: srli t6, a1, 16 ; RV32I-NEXT: sb t5, 30(a2) ; RV32I-NEXT: sb t4, 31(a2) -; RV32I-NEXT: srli t0, a1, 8 +; RV32I-NEXT: srli t4, a1, 8 ; RV32I-NEXT: sb a6, 16(a2) ; RV32I-NEXT: sb s2, 17(a2) ; RV32I-NEXT: sb s1, 18(a2) @@ -4572,36 +4545,35 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV32I-NEXT: sb s3, 23(a2) ; RV32I-NEXT: srli a5, a0, 16 ; RV32I-NEXT: sb a4, 8(a2) -; RV32I-NEXT: sb s8, 9(a2) +; RV32I-NEXT: sb a7, 9(a2) ; RV32I-NEXT: sb s7, 10(a2) ; RV32I-NEXT: sb s6, 11(a2) ; RV32I-NEXT: srli a4, a0, 8 ; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb s11, 13(a2) -; RV32I-NEXT: sb s10, 14(a2) -; RV32I-NEXT: sb s9, 15(a2) +; RV32I-NEXT: sb t1, 13(a2) +; RV32I-NEXT: sb t2, 14(a2) +; RV32I-NEXT: sb t3, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb t0, 1(a2) -; RV32I-NEXT: sb t1, 2(a2) -; RV32I-NEXT: sb a7, 3(a2) +; RV32I-NEXT: sb t4, 1(a2) +; RV32I-NEXT: sb t6, 2(a2) +; RV32I-NEXT: sb t0, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) ; RV32I-NEXT: sb a5, 6(a2) ; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 128 +; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 112 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %dwordOff = load i256, ptr %dwordOff.ptr, align 1 @@ -4846,140 +4818,137 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: ashr_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -128 -; RV32I-NEXT: sw ra, 
124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu t6, 0(a0) +; RV32I-NEXT: addi sp, sp, -112 +; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a0) ; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu t1, 4(a0) -; RV32I-NEXT: lbu t3, 5(a0) -; RV32I-NEXT: lbu t4, 6(a0) -; RV32I-NEXT: lbu t5, 7(a0) -; RV32I-NEXT: lbu t2, 8(a0) -; RV32I-NEXT: lbu s1, 9(a0) -; RV32I-NEXT: lbu s7, 10(a0) -; RV32I-NEXT: lbu s8, 11(a0) -; RV32I-NEXT: lbu s9, 12(a0) -; RV32I-NEXT: lbu s10, 13(a0) -; RV32I-NEXT: lbu s4, 14(a0) -; RV32I-NEXT: lbu s6, 15(a0) -; RV32I-NEXT: lbu s5, 16(a0) -; RV32I-NEXT: lbu s11, 17(a0) -; RV32I-NEXT: lbu ra, 18(a0) -; RV32I-NEXT: lbu a3, 19(a0) -; RV32I-NEXT: lbu s2, 20(a0) -; RV32I-NEXT: lbu s3, 21(a0) -; RV32I-NEXT: lbu a7, 22(a0) -; RV32I-NEXT: lbu t0, 23(a0) +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t3, 8(a0) +; RV32I-NEXT: lbu t4, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) +; RV32I-NEXT: lbu s0, 12(a0) +; RV32I-NEXT: lbu s1, 13(a0) +; RV32I-NEXT: lbu s2, 14(a0) +; RV32I-NEXT: lbu s3, 15(a0) +; RV32I-NEXT: lbu s4, 16(a0) +; RV32I-NEXT: lbu s5, 17(a0) +; RV32I-NEXT: lbu s6, 18(a0) +; RV32I-NEXT: lbu s7, 19(a0) ; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: slli t3, t3, 8 -; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli t5, t5, 24 -; RV32I-NEXT: or a4, a4, t6 -; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or a3, a4, a3 ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t3, t1 -; RV32I-NEXT: or a6, t5, t4 -; RV32I-NEXT: lbu t1, 24(a0) -; RV32I-NEXT: lbu t5, 25(a0) -; RV32I-NEXT: lbu t6, 26(a0) -; RV32I-NEXT: lbu s0, 27(a0) +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: lbu s8, 20(a0) +; RV32I-NEXT: lbu s9, 21(a0) +; RV32I-NEXT: lbu s10, 22(a0) +; RV32I-NEXT: lbu s11, 23(a0) +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t6, t6, 24 ; RV32I-NEXT: slli s1, s1, 8 -; RV32I-NEXT: slli s7, s7, 16 -; RV32I-NEXT: slli s8, s8, 24 -; RV32I-NEXT: slli s10, s10, 8 -; RV32I-NEXT: or t2, s1, t2 -; RV32I-NEXT: or t3, s8, s7 -; 
RV32I-NEXT: or t4, s10, s9 -; RV32I-NEXT: lbu s1, 28(a0) -; RV32I-NEXT: lbu s7, 29(a0) -; RV32I-NEXT: lbu s8, 30(a0) -; RV32I-NEXT: lbu s9, 31(a0) -; RV32I-NEXT: slli s4, s4, 16 -; RV32I-NEXT: slli s6, s6, 24 -; RV32I-NEXT: slli s11, s11, 8 -; RV32I-NEXT: slli ra, ra, 16 -; RV32I-NEXT: slli a3, a3, 24 -; RV32I-NEXT: or a0, s6, s4 -; RV32I-NEXT: or s4, s11, s5 -; RV32I-NEXT: or s5, a3, ra -; RV32I-NEXT: lbu a3, 0(a1) -; RV32I-NEXT: lbu s6, 1(a1) -; RV32I-NEXT: lbu s10, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli s3, s3, 8 -; RV32I-NEXT: or s2, s3, s2 -; RV32I-NEXT: addi s3, sp, 8 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: slli t5, t5, 8 -; RV32I-NEXT: slli t6, t6, 16 -; RV32I-NEXT: slli s0, s0, 24 -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: slli s8, s8, 16 -; RV32I-NEXT: slli s9, s9, 24 -; RV32I-NEXT: slli s6, s6, 8 +; RV32I-NEXT: slli s2, s2, 16 +; RV32I-NEXT: slli s3, s3, 24 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, t6, t5 +; RV32I-NEXT: or t1, s1, s0 +; RV32I-NEXT: or t2, s3, s2 +; RV32I-NEXT: lbu t6, 24(a0) +; RV32I-NEXT: lbu s0, 25(a0) +; RV32I-NEXT: lbu s1, 26(a0) +; RV32I-NEXT: lbu s2, 27(a0) +; RV32I-NEXT: slli s5, s5, 8 +; RV32I-NEXT: slli s6, s6, 16 +; RV32I-NEXT: slli s7, s7, 24 +; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: or t3, s5, s4 +; RV32I-NEXT: or t4, s7, s6 +; RV32I-NEXT: or t5, s9, s8 +; RV32I-NEXT: lbu s3, 28(a0) +; RV32I-NEXT: lbu s4, 29(a0) +; RV32I-NEXT: lbu s5, 30(a0) +; RV32I-NEXT: lbu a0, 31(a0) ; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: slli s11, s11, 24 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: slli s1, s1, 16 +; RV32I-NEXT: slli s2, s2, 24 +; RV32I-NEXT: or s6, s11, s10 +; RV32I-NEXT: or t6, s0, t6 +; RV32I-NEXT: or s0, s2, s1 +; RV32I-NEXT: lbu s1, 0(a1) +; RV32I-NEXT: lbu s2, 1(a1) +; RV32I-NEXT: lbu s7, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: or s3, s4, s3 +; RV32I-NEXT: mv s4, sp +; RV32I-NEXT: slli s5, s5, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s7, s7, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: or t0, t5, t1 +; RV32I-NEXT: or s5, a0, s5 +; RV32I-NEXT: or s1, s2, s1 +; RV32I-NEXT: or a1, a1, s7 +; RV32I-NEXT: srai a0, a0, 31 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, s6, t5 ; RV32I-NEXT: or t1, s0, t6 -; RV32I-NEXT: or t5, s7, s1 -; RV32I-NEXT: or t6, s9, s8 -; RV32I-NEXT: or a3, s6, a3 -; RV32I-NEXT: or a1, a1, s10 -; RV32I-NEXT: srai s0, s9, 31 -; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: or a4, a4, s1 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: or a6, t3, t2 -; RV32I-NEXT: or a0, a0, t4 -; RV32I-NEXT: or t2, s5, s4 -; RV32I-NEXT: or a7, a7, s2 -; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: or t1, t6, t5 -; RV32I-NEXT: or a1, a1, a3 -; RV32I-NEXT: sw s0, 56(sp) -; RV32I-NEXT: sw s0, 60(sp) -; RV32I-NEXT: sw s0, 64(sp) -; RV32I-NEXT: sw s0, 68(sp) -; RV32I-NEXT: sw s0, 40(sp) -; RV32I-NEXT: sw s0, 44(sp) -; RV32I-NEXT: sw s0, 48(sp) -; RV32I-NEXT: sw s0, 52(sp) -; RV32I-NEXT: sw t2, 24(sp) -; RV32I-NEXT: sw a7, 28(sp) -; RV32I-NEXT: sw t0, 32(sp) -; RV32I-NEXT: sw t1, 36(sp) -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) -; RV32I-NEXT: sw a6, 16(sp) -; RV32I-NEXT: sw a0, 20(sp) +; RV32I-NEXT: or t2, s5, s3 +; RV32I-NEXT: or a1, a1, s1 +; RV32I-NEXT: sw a0, 48(sp) +; RV32I-NEXT: sw a0, 52(sp) +; RV32I-NEXT: sw a0, 56(sp) +; 
RV32I-NEXT: sw a0, 60(sp) +; RV32I-NEXT: sw a0, 32(sp) +; RV32I-NEXT: sw a0, 36(sp) +; RV32I-NEXT: sw a0, 40(sp) +; RV32I-NEXT: sw a0, 44(sp) +; RV32I-NEXT: sw a7, 16(sp) +; RV32I-NEXT: sw t0, 20(sp) +; RV32I-NEXT: sw t1, 24(sp) +; RV32I-NEXT: sw t2, 28(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a6, 12(sp) ; RV32I-NEXT: slli t1, a1, 3 ; RV32I-NEXT: andi a1, a1, 28 -; RV32I-NEXT: add a1, s3, a1 +; RV32I-NEXT: add a1, s4, a1 ; RV32I-NEXT: andi a0, t1, 24 -; RV32I-NEXT: xori t0, a0, 31 +; RV32I-NEXT: xori a7, a0, 31 ; RV32I-NEXT: lw a3, 0(a1) ; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: lw a5, 8(a1) ; RV32I-NEXT: lw a6, 12(a1) -; RV32I-NEXT: lw a7, 16(a1) +; RV32I-NEXT: lw t0, 16(a1) ; RV32I-NEXT: lw t2, 20(a1) ; RV32I-NEXT: lw t3, 24(a1) ; RV32I-NEXT: lw t4, 28(a1) @@ -4988,33 +4957,33 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srl a1, a3, t1 ; RV32I-NEXT: slli t6, a4, 1 ; RV32I-NEXT: srl a3, a6, t1 -; RV32I-NEXT: slli s0, a7, 1 +; RV32I-NEXT: slli s0, t0, 1 ; RV32I-NEXT: srl a4, a5, t1 ; RV32I-NEXT: slli s1, a6, 1 ; RV32I-NEXT: srl a5, t2, t1 ; RV32I-NEXT: slli s2, t3, 1 -; RV32I-NEXT: srl a6, a7, t1 +; RV32I-NEXT: srl a6, t0, t1 ; RV32I-NEXT: slli t2, t2, 1 -; RV32I-NEXT: srl a7, t3, t1 +; RV32I-NEXT: srl t0, t3, t1 ; RV32I-NEXT: slli t3, t4, 1 ; RV32I-NEXT: sra t1, t4, t1 -; RV32I-NEXT: sll t4, t5, t0 -; RV32I-NEXT: sll t5, t6, t0 -; RV32I-NEXT: sll t6, s0, t0 -; RV32I-NEXT: sll s0, s1, t0 -; RV32I-NEXT: sll s1, s2, t0 -; RV32I-NEXT: sll t2, t2, t0 -; RV32I-NEXT: sll t3, t3, t0 +; RV32I-NEXT: sll t4, t5, a7 +; RV32I-NEXT: sll t5, t6, a7 +; RV32I-NEXT: sll t6, s0, a7 +; RV32I-NEXT: sll s0, s1, a7 +; RV32I-NEXT: sll s1, s2, a7 +; RV32I-NEXT: sll t2, t2, a7 +; RV32I-NEXT: sll t3, t3, a7 ; RV32I-NEXT: srli s2, t1, 24 ; RV32I-NEXT: srli s3, t1, 16 ; RV32I-NEXT: srli s4, t1, 8 -; RV32I-NEXT: or t0, a0, t4 +; RV32I-NEXT: or a7, a0, t4 ; RV32I-NEXT: or t4, a1, t5 ; RV32I-NEXT: or t5, a3, t6 ; RV32I-NEXT: or s0, a4, s0 ; RV32I-NEXT: or s1, a5, s1 ; RV32I-NEXT: or t2, a6, t2 -; RV32I-NEXT: or t3, a7, t3 +; RV32I-NEXT: or t3, t0, t3 ; RV32I-NEXT: sb t1, 28(a2) ; RV32I-NEXT: sb s4, 29(a2) ; RV32I-NEXT: sb s3, 30(a2) @@ -5031,23 +5000,23 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli s6, s0, 24 ; RV32I-NEXT: srli s7, s0, 16 ; RV32I-NEXT: srli s0, s0, 8 -; RV32I-NEXT: srli s8, t5, 24 -; RV32I-NEXT: srli s9, t5, 16 -; RV32I-NEXT: srli t5, t5, 8 -; RV32I-NEXT: srli s10, t4, 24 -; RV32I-NEXT: srli s11, t4, 16 -; RV32I-NEXT: srli t4, t4, 8 -; RV32I-NEXT: sb a7, 24(a2) +; RV32I-NEXT: sb t0, 24(a2) +; RV32I-NEXT: srli t0, t5, 24 ; RV32I-NEXT: sb t3, 25(a2) +; RV32I-NEXT: srli t3, t5, 16 +; RV32I-NEXT: srli t5, t5, 8 ; RV32I-NEXT: sb t6, 26(a2) +; RV32I-NEXT: srli t6, t4, 24 ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli a7, t0, 24 +; RV32I-NEXT: srli t1, t4, 16 +; RV32I-NEXT: srli t4, t4, 8 ; RV32I-NEXT: sb a6, 16(a2) +; RV32I-NEXT: srli a6, a7, 24 ; RV32I-NEXT: sb t2, 17(a2) ; RV32I-NEXT: sb s3, 18(a2) ; RV32I-NEXT: sb s2, 19(a2) -; RV32I-NEXT: srli a6, t0, 16 -; RV32I-NEXT: srli t0, t0, 8 +; RV32I-NEXT: srli t2, a7, 16 +; RV32I-NEXT: srli a7, a7, 8 ; RV32I-NEXT: sb a5, 20(a2) ; RV32I-NEXT: sb s1, 21(a2) ; RV32I-NEXT: sb s5, 22(a2) @@ -5058,30 +5027,29 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb s6, 11(a2) ; RV32I-NEXT: sb a3, 12(a2) ; RV32I-NEXT: sb t5, 13(a2) -; RV32I-NEXT: sb s9, 14(a2) 
-; RV32I-NEXT: sb s8, 15(a2) +; RV32I-NEXT: sb t3, 14(a2) +; RV32I-NEXT: sb t0, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) ; RV32I-NEXT: sb t4, 1(a2) -; RV32I-NEXT: sb s11, 2(a2) -; RV32I-NEXT: sb s10, 3(a2) +; RV32I-NEXT: sb t1, 2(a2) +; RV32I-NEXT: sb t6, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: sb t0, 5(a2) -; RV32I-NEXT: sb a6, 6(a2) -; RV32I-NEXT: sb a7, 7(a2) -; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 128 +; RV32I-NEXT: sb a7, 5(a2) +; RV32I-NEXT: sb t2, 6(a2) +; RV32I-NEXT: sb a6, 7(a2) +; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 112 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %byteOff = load i256, ptr %byteOff.ptr, align 1 @@ -5327,130 +5295,129 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; ; RV32I-LABEL: ashr_32bytes_wordOff: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -128 -; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a5, 0(a0) -; RV32I-NEXT: lbu a6, 1(a0) -; RV32I-NEXT: lbu a7, 2(a0) -; RV32I-NEXT: lbu t1, 3(a0) -; RV32I-NEXT: lbu s0, 4(a0) -; RV32I-NEXT: lbu s2, 5(a0) -; RV32I-NEXT: lbu s3, 6(a0) -; RV32I-NEXT: lbu s6, 7(a0) -; RV32I-NEXT: lbu s1, 8(a0) -; RV32I-NEXT: lbu s7, 9(a0) -; RV32I-NEXT: lbu s8, 10(a0) -; RV32I-NEXT: lbu s9, 11(a0) -; RV32I-NEXT: lbu s10, 12(a0) -; RV32I-NEXT: lbu s11, 13(a0) -; RV32I-NEXT: lbu s4, 14(a0) -; RV32I-NEXT: lbu s5, 15(a0) -; RV32I-NEXT: lbu a3, 16(a0) -; RV32I-NEXT: lbu t0, 17(a0) -; RV32I-NEXT: lbu t2, 18(a0) -; RV32I-NEXT: lbu t3, 19(a0) -; RV32I-NEXT: lbu a4, 20(a0) -; RV32I-NEXT: lbu t4, 21(a0) -; RV32I-NEXT: lbu t5, 22(a0) -; RV32I-NEXT: lbu t6, 23(a0) -; RV32I-NEXT: slli a6, a6, 8 -; 
RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: slli s2, s2, 8 -; RV32I-NEXT: slli s3, s3, 16 -; RV32I-NEXT: slli s6, s6, 24 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: or a6, t1, a7 -; RV32I-NEXT: or a7, s2, s0 -; RV32I-NEXT: or t1, s6, s3 -; RV32I-NEXT: lbu s0, 24(a0) -; RV32I-NEXT: lbu s6, 25(a0) -; RV32I-NEXT: lbu ra, 26(a0) -; RV32I-NEXT: lbu s2, 27(a0) -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: slli s8, s8, 16 -; RV32I-NEXT: slli s9, s9, 24 -; RV32I-NEXT: slli s11, s11, 8 -; RV32I-NEXT: or s1, s7, s1 -; RV32I-NEXT: or s7, s9, s8 -; RV32I-NEXT: or s3, s11, s10 -; RV32I-NEXT: lbu s8, 28(a0) -; RV32I-NEXT: lbu s9, 29(a0) -; RV32I-NEXT: lbu s10, 30(a0) -; RV32I-NEXT: lbu a0, 31(a0) -; RV32I-NEXT: lbu a1, 0(a1) -; RV32I-NEXT: slli s4, s4, 16 -; RV32I-NEXT: slli s5, s5, 24 -; RV32I-NEXT: or s4, s5, s4 -; RV32I-NEXT: addi s5, sp, 8 +; RV32I-NEXT: addi sp, sp, -112 +; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t3, 8(a0) +; RV32I-NEXT: lbu t4, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) +; RV32I-NEXT: lbu s0, 12(a0) +; RV32I-NEXT: lbu s1, 13(a0) +; RV32I-NEXT: lbu s2, 14(a0) +; RV32I-NEXT: lbu s3, 15(a0) +; RV32I-NEXT: lbu s4, 16(a0) +; RV32I-NEXT: lbu s5, 17(a0) +; RV32I-NEXT: lbu s6, 18(a0) +; RV32I-NEXT: lbu s7, 19(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: slli t2, t2, 16 -; RV32I-NEXT: slli t3, t3, 24 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: lbu s8, 20(a0) +; RV32I-NEXT: lbu s9, 21(a0) +; RV32I-NEXT: lbu s10, 22(a0) +; RV32I-NEXT: lbu s11, 23(a0) ; RV32I-NEXT: slli t4, t4, 8 ; RV32I-NEXT: slli t5, t5, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: slli s6, s6, 8 -; RV32I-NEXT: slli ra, ra, 16 -; RV32I-NEXT: slli s2, s2, 24 +; RV32I-NEXT: slli s1, s1, 8 +; RV32I-NEXT: slli s2, s2, 16 +; RV32I-NEXT: slli s3, s3, 24 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, t6, t5 +; RV32I-NEXT: or t1, s1, s0 +; RV32I-NEXT: or t2, s3, s2 +; RV32I-NEXT: lbu t3, 24(a0) +; RV32I-NEXT: lbu t5, 25(a0) +; RV32I-NEXT: lbu t6, 26(a0) +; RV32I-NEXT: lbu s0, 27(a0) +; RV32I-NEXT: slli s5, s5, 8 +; RV32I-NEXT: slli s6, s6, 16 +; RV32I-NEXT: slli s7, s7, 24 ; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: or t4, s5, s4 +; RV32I-NEXT: or s1, s7, s6 +; RV32I-NEXT: or s2, s9, s8 +; RV32I-NEXT: lbu s3, 28(a0) +; RV32I-NEXT: lbu s4, 29(a0) +; RV32I-NEXT: lbu s5, 30(a0) +; RV32I-NEXT: lbu a0, 31(a0) +; RV32I-NEXT: lbu a1, 0(a1) ; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: slli s11, s11, 24 +; RV32I-NEXT: or s6, s11, 
s10 +; RV32I-NEXT: mv s7, sp +; RV32I-NEXT: slli t5, t5, 8 +; RV32I-NEXT: slli t6, t6, 16 +; RV32I-NEXT: slli s0, s0, 24 +; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: slli s5, s5, 16 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: slli a1, a1, 2 -; RV32I-NEXT: or a3, t0, a3 -; RV32I-NEXT: or t0, t3, t2 -; RV32I-NEXT: or a4, t4, a4 -; RV32I-NEXT: or t2, t6, t5 -; RV32I-NEXT: or t3, s6, s0 -; RV32I-NEXT: or t4, s2, ra -; RV32I-NEXT: or t5, s9, s8 -; RV32I-NEXT: or t6, a0, s10 +; RV32I-NEXT: or t3, t5, t3 +; RV32I-NEXT: or t5, s0, t6 +; RV32I-NEXT: or t6, s4, s3 +; RV32I-NEXT: or s0, a0, s5 ; RV32I-NEXT: srai a0, a0, 31 ; RV32I-NEXT: andi a1, a1, 28 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: or a6, t1, a7 -; RV32I-NEXT: or a7, s7, s1 -; RV32I-NEXT: or t1, s4, s3 -; RV32I-NEXT: or a3, t0, a3 -; RV32I-NEXT: or a4, t2, a4 -; RV32I-NEXT: or t0, t4, t3 -; RV32I-NEXT: or t2, t6, t5 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, s1, t4 +; RV32I-NEXT: or t0, s6, s2 +; RV32I-NEXT: or t1, t5, t3 +; RV32I-NEXT: or t2, s0, t6 +; RV32I-NEXT: sw a0, 48(sp) +; RV32I-NEXT: sw a0, 52(sp) ; RV32I-NEXT: sw a0, 56(sp) ; RV32I-NEXT: sw a0, 60(sp) -; RV32I-NEXT: sw a0, 64(sp) -; RV32I-NEXT: sw a0, 68(sp) +; RV32I-NEXT: sw a0, 32(sp) +; RV32I-NEXT: sw a0, 36(sp) ; RV32I-NEXT: sw a0, 40(sp) ; RV32I-NEXT: sw a0, 44(sp) -; RV32I-NEXT: sw a0, 48(sp) -; RV32I-NEXT: sw a0, 52(sp) -; RV32I-NEXT: add s5, s5, a1 -; RV32I-NEXT: sw a3, 24(sp) -; RV32I-NEXT: sw a4, 28(sp) -; RV32I-NEXT: sw t0, 32(sp) -; RV32I-NEXT: sw t2, 36(sp) +; RV32I-NEXT: add s7, s7, a1 +; RV32I-NEXT: sw a7, 16(sp) +; RV32I-NEXT: sw t0, 20(sp) +; RV32I-NEXT: sw t1, 24(sp) +; RV32I-NEXT: sw t2, 28(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a4, 4(sp) ; RV32I-NEXT: sw a5, 8(sp) ; RV32I-NEXT: sw a6, 12(sp) -; RV32I-NEXT: sw a7, 16(sp) -; RV32I-NEXT: sw t1, 20(sp) -; RV32I-NEXT: lw a6, 16(s5) -; RV32I-NEXT: lw a5, 20(s5) -; RV32I-NEXT: lw a7, 24(s5) -; RV32I-NEXT: lw a1, 0(s5) -; RV32I-NEXT: lw a0, 4(s5) -; RV32I-NEXT: lw a4, 8(s5) -; RV32I-NEXT: lw a3, 12(s5) -; RV32I-NEXT: lw t0, 28(s5) +; RV32I-NEXT: lw a6, 16(s7) +; RV32I-NEXT: lw a5, 20(s7) +; RV32I-NEXT: lw a7, 24(s7) +; RV32I-NEXT: lw a1, 0(s7) +; RV32I-NEXT: lw a0, 4(s7) +; RV32I-NEXT: lw a4, 8(s7) +; RV32I-NEXT: lw a3, 12(s7) +; RV32I-NEXT: lw t0, 28(s7) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 ; RV32I-NEXT: srli t3, a7, 8 @@ -5465,21 +5432,21 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: srli s5, a5, 8 ; RV32I-NEXT: srli s6, a4, 24 ; RV32I-NEXT: srli s7, a4, 16 -; RV32I-NEXT: srli s8, a4, 8 -; RV32I-NEXT: srli s9, a3, 24 -; RV32I-NEXT: srli s10, a3, 16 -; RV32I-NEXT: srli s11, a3, 8 ; RV32I-NEXT: sb a7, 24(a2) -; RV32I-NEXT: srli a7, a1, 24 +; RV32I-NEXT: srli a7, a4, 8 ; RV32I-NEXT: sb t3, 25(a2) +; RV32I-NEXT: srli t3, a3, 24 ; RV32I-NEXT: sb t2, 26(a2) +; RV32I-NEXT: srli t2, a3, 16 ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli t1, a1, 16 +; RV32I-NEXT: srli t1, a3, 8 ; RV32I-NEXT: sb t0, 28(a2) +; RV32I-NEXT: srli t0, a1, 24 ; RV32I-NEXT: sb t6, 29(a2) +; RV32I-NEXT: srli t6, a1, 16 ; RV32I-NEXT: sb t5, 30(a2) ; RV32I-NEXT: sb t4, 31(a2) -; RV32I-NEXT: srli t0, a1, 8 +; RV32I-NEXT: srli t4, a1, 8 ; RV32I-NEXT: sb a6, 16(a2) ; RV32I-NEXT: sb s2, 17(a2) ; RV32I-NEXT: sb s1, 18(a2) @@ -5491,36 +5458,35 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: sb s3, 23(a2) ; RV32I-NEXT: srli a5, 
a0, 16 ; RV32I-NEXT: sb a4, 8(a2) -; RV32I-NEXT: sb s8, 9(a2) +; RV32I-NEXT: sb a7, 9(a2) ; RV32I-NEXT: sb s7, 10(a2) ; RV32I-NEXT: sb s6, 11(a2) ; RV32I-NEXT: srli a4, a0, 8 ; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb s11, 13(a2) -; RV32I-NEXT: sb s10, 14(a2) -; RV32I-NEXT: sb s9, 15(a2) +; RV32I-NEXT: sb t1, 13(a2) +; RV32I-NEXT: sb t2, 14(a2) +; RV32I-NEXT: sb t3, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb t0, 1(a2) -; RV32I-NEXT: sb t1, 2(a2) -; RV32I-NEXT: sb a7, 3(a2) +; RV32I-NEXT: sb t4, 1(a2) +; RV32I-NEXT: sb t6, 2(a2) +; RV32I-NEXT: sb t0, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) ; RV32I-NEXT: sb a5, 6(a2) ; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 128 +; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 112 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %wordOff = load i256, ptr %wordOff.ptr, align 1 @@ -5546,112 +5512,112 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a5, 0(a0) -; RV64I-NEXT: lbu a7, 1(a0) -; RV64I-NEXT: lbu t1, 2(a0) -; RV64I-NEXT: lbu s3, 3(a0) -; RV64I-NEXT: lbu t0, 4(a0) -; RV64I-NEXT: lbu s8, 5(a0) -; RV64I-NEXT: lbu s9, 6(a0) -; RV64I-NEXT: lbu s10, 7(a0) -; RV64I-NEXT: lbu s2, 8(a0) -; RV64I-NEXT: lbu s4, 9(a0) -; RV64I-NEXT: lbu s5, 10(a0) -; RV64I-NEXT: lbu s6, 11(a0) -; RV64I-NEXT: lbu s7, 12(a0) -; RV64I-NEXT: lbu s11, 13(a0) -; RV64I-NEXT: lbu t4, 14(a0) -; RV64I-NEXT: lbu t5, 15(a0) -; RV64I-NEXT: lbu a3, 16(a0) -; RV64I-NEXT: lbu a6, 17(a0) -; RV64I-NEXT: lbu t2, 18(a0) -; RV64I-NEXT: lbu t3, 19(a0) -; RV64I-NEXT: lbu a4, 20(a0) -; RV64I-NEXT: lbu t6, 21(a0) -; RV64I-NEXT: lbu s0, 22(a0) -; RV64I-NEXT: lbu s1, 23(a0) -; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: lbu s0, 12(a0) +; 
RV64I-NEXT: lbu s1, 13(a0) +; RV64I-NEXT: lbu s2, 14(a0) +; RV64I-NEXT: lbu s3, 15(a0) +; RV64I-NEXT: lbu s4, 16(a0) +; RV64I-NEXT: lbu s5, 17(a0) +; RV64I-NEXT: lbu s6, 18(a0) +; RV64I-NEXT: lbu s7, 19(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: lbu s8, 20(a0) +; RV64I-NEXT: lbu s9, 21(a0) +; RV64I-NEXT: lbu s10, 22(a0) +; RV64I-NEXT: lbu s11, 23(a0) +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: slli t6, t6, 24 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: slli s2, s2, 16 ; RV64I-NEXT: slli s3, s3, 24 -; RV64I-NEXT: slli s8, s8, 8 -; RV64I-NEXT: slli s9, s9, 16 -; RV64I-NEXT: slli s10, s10, 24 -; RV64I-NEXT: or a5, a7, a5 -; RV64I-NEXT: or a7, s3, t1 -; RV64I-NEXT: or t0, s8, t0 -; RV64I-NEXT: or t1, s10, s9 -; RV64I-NEXT: lbu s3, 24(a0) -; RV64I-NEXT: lbu s8, 25(a0) -; RV64I-NEXT: lbu s9, 26(a0) -; RV64I-NEXT: lbu s10, 27(a0) -; RV64I-NEXT: slli s4, s4, 8 -; RV64I-NEXT: slli s5, s5, 16 -; RV64I-NEXT: slli s6, s6, 24 -; RV64I-NEXT: slli s11, s11, 8 -; RV64I-NEXT: or s2, s4, s2 -; RV64I-NEXT: or s4, s6, s5 -; RV64I-NEXT: or s5, s11, s7 -; RV64I-NEXT: lbu s6, 28(a0) -; RV64I-NEXT: lbu s7, 29(a0) -; RV64I-NEXT: lbu s11, 30(a0) +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: or t1, s1, s0 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: lbu t3, 24(a0) +; RV64I-NEXT: lbu t4, 25(a0) +; RV64I-NEXT: lbu t5, 26(a0) +; RV64I-NEXT: lbu t6, 27(a0) +; RV64I-NEXT: slli s5, s5, 8 +; RV64I-NEXT: slli s6, s6, 16 +; RV64I-NEXT: slli s7, s7, 24 +; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: or s0, s5, s4 +; RV64I-NEXT: or s1, s7, s6 +; RV64I-NEXT: or s2, s9, s8 +; RV64I-NEXT: lbu s3, 28(a0) +; RV64I-NEXT: lbu s4, 29(a0) +; RV64I-NEXT: lbu s5, 30(a0) ; RV64I-NEXT: lbu a0, 31(a0) ; RV64I-NEXT: lbu a1, 0(a1) -; RV64I-NEXT: slli t4, t4, 16 -; RV64I-NEXT: slli t5, t5, 24 -; RV64I-NEXT: or t4, t5, t4 -; RV64I-NEXT: mv t5, sp -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: slli t2, t2, 16 -; RV64I-NEXT: slli t3, t3, 24 -; RV64I-NEXT: slli t6, t6, 8 -; RV64I-NEXT: slli s0, s0, 16 -; RV64I-NEXT: slli s1, s1, 24 -; RV64I-NEXT: slli s8, s8, 8 -; RV64I-NEXT: slli s9, s9, 16 -; RV64I-NEXT: slli s10, s10, 24 -; RV64I-NEXT: slli s7, s7, 8 -; RV64I-NEXT: slli s11, s11, 16 +; RV64I-NEXT: slli s10, s10, 16 +; RV64I-NEXT: slli s11, s11, 24 +; RV64I-NEXT: or s6, s11, s10 +; RV64I-NEXT: mv s7, sp +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: slli t6, t6, 24 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: slli s5, s5, 16 ; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: slli a1, a1, 3 -; RV64I-NEXT: or a3, a6, a3 -; RV64I-NEXT: or a6, t3, t2 -; RV64I-NEXT: or a4, t6, a4 -; RV64I-NEXT: or s0, s1, s0 -; RV64I-NEXT: or t2, s8, s3 -; RV64I-NEXT: or t3, s10, s9 -; RV64I-NEXT: or t6, s7, s6 -; RV64I-NEXT: or a0, a0, s11 +; RV64I-NEXT: or t3, t4, t3 +; RV64I-NEXT: or t4, t6, t5 +; RV64I-NEXT: or t5, s4, s3 +; RV64I-NEXT: or a0, a0, s5 ; RV64I-NEXT: andi a1, a1, 24 -; RV64I-NEXT: or a5, a7, a5 -; RV64I-NEXT: or a7, t1, t0 -; RV64I-NEXT: or t0, s4, s2 -; RV64I-NEXT: or t1, t4, s5 -; RV64I-NEXT: or a3, a6, a3 -; RV64I-NEXT: or a4, s0, a4 -; RV64I-NEXT: or a6, t3, t2 -; RV64I-NEXT: or a0, a0, t6 -; RV64I-NEXT: add t5, t5, a1 -; RV64I-NEXT: slli a7, a7, 32 -; RV64I-NEXT: slli t1, t1, 32 +; RV64I-NEXT: or a3, 
a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: or s0, s1, s0 +; RV64I-NEXT: or a7, s6, s2 +; RV64I-NEXT: or t0, t4, t3 +; RV64I-NEXT: or a0, a0, t5 +; RV64I-NEXT: add s7, s7, a1 ; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: slli a7, a7, 32 ; RV64I-NEXT: slli a1, a0, 32 ; RV64I-NEXT: sraiw a0, a0, 31 -; RV64I-NEXT: or a5, a7, a5 -; RV64I-NEXT: or a7, t1, t0 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, a7, s0 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: sd a0, 32(sp) ; RV64I-NEXT: sd a0, 40(sp) ; RV64I-NEXT: sd a0, 48(sp) ; RV64I-NEXT: sd a0, 56(sp) -; RV64I-NEXT: sd a5, 0(sp) -; RV64I-NEXT: sd a7, 8(sp) -; RV64I-NEXT: sd a3, 16(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a4, 8(sp) +; RV64I-NEXT: sd a5, 16(sp) ; RV64I-NEXT: sd a1, 24(sp) -; RV64I-NEXT: ld a4, 16(t5) -; RV64I-NEXT: ld a0, 8(t5) -; RV64I-NEXT: ld a1, 0(t5) -; RV64I-NEXT: ld a3, 24(t5) +; RV64I-NEXT: ld a4, 16(s7) +; RV64I-NEXT: ld a0, 8(s7) +; RV64I-NEXT: ld a1, 0(s7) +; RV64I-NEXT: ld a3, 24(s7) ; RV64I-NEXT: srli a5, a4, 56 ; RV64I-NEXT: srli a6, a4, 48 ; RV64I-NEXT: srli a7, a4, 40 @@ -5670,25 +5636,25 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV64I-NEXT: srli s5, a1, 48 ; RV64I-NEXT: srli s6, a1, 40 ; RV64I-NEXT: srli s7, a1, 32 -; RV64I-NEXT: srli s8, a1, 24 -; RV64I-NEXT: srli s9, a1, 16 -; RV64I-NEXT: srli s10, a1, 8 -; RV64I-NEXT: srli s11, a0, 56 ; RV64I-NEXT: sb t0, 20(a2) +; RV64I-NEXT: srli t0, a1, 24 ; RV64I-NEXT: sb a7, 21(a2) +; RV64I-NEXT: srli a7, a1, 16 ; RV64I-NEXT: sb a6, 22(a2) +; RV64I-NEXT: srli a6, a1, 8 ; RV64I-NEXT: sb a5, 23(a2) -; RV64I-NEXT: srli a5, a0, 48 +; RV64I-NEXT: srli a5, a0, 56 ; RV64I-NEXT: sb a4, 16(a2) +; RV64I-NEXT: srli a4, a0, 48 ; RV64I-NEXT: sb t3, 17(a2) ; RV64I-NEXT: sb t2, 18(a2) ; RV64I-NEXT: sb t1, 19(a2) -; RV64I-NEXT: srli a4, a0, 40 +; RV64I-NEXT: srli t1, a0, 40 ; RV64I-NEXT: sb s0, 28(a2) ; RV64I-NEXT: sb t6, 29(a2) ; RV64I-NEXT: sb t5, 30(a2) ; RV64I-NEXT: sb t4, 31(a2) -; RV64I-NEXT: srli a6, a0, 32 +; RV64I-NEXT: srli t2, a0, 32 ; RV64I-NEXT: sb a3, 24(a2) ; RV64I-NEXT: sb s3, 25(a2) ; RV64I-NEXT: sb s2, 26(a2) @@ -5698,19 +5664,19 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV64I-NEXT: sb s6, 5(a2) ; RV64I-NEXT: sb s5, 6(a2) ; RV64I-NEXT: sb s4, 7(a2) -; RV64I-NEXT: srli a7, a0, 16 +; RV64I-NEXT: srli t3, a0, 16 ; RV64I-NEXT: sb a1, 0(a2) -; RV64I-NEXT: sb s10, 1(a2) -; RV64I-NEXT: sb s9, 2(a2) -; RV64I-NEXT: sb s8, 3(a2) +; RV64I-NEXT: sb a6, 1(a2) +; RV64I-NEXT: sb a7, 2(a2) +; RV64I-NEXT: sb t0, 3(a2) ; RV64I-NEXT: srli a1, a0, 8 -; RV64I-NEXT: sb a6, 12(a2) -; RV64I-NEXT: sb a4, 13(a2) -; RV64I-NEXT: sb a5, 14(a2) -; RV64I-NEXT: sb s11, 15(a2) +; RV64I-NEXT: sb t2, 12(a2) +; RV64I-NEXT: sb t1, 13(a2) +; RV64I-NEXT: sb a4, 14(a2) +; RV64I-NEXT: sb a5, 15(a2) ; RV64I-NEXT: sb a0, 8(a2) ; RV64I-NEXT: sb a1, 9(a2) -; RV64I-NEXT: sb a7, 10(a2) +; RV64I-NEXT: sb t3, 10(a2) ; RV64I-NEXT: sb a3, 11(a2) ; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload @@ -5729,130 +5695,129 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; ; RV32I-LABEL: ashr_32bytes_dwordOff: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -128 -; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 116(sp) 
# 4-byte Folded Spill -; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a5, 0(a0) -; RV32I-NEXT: lbu a6, 1(a0) -; RV32I-NEXT: lbu a7, 2(a0) -; RV32I-NEXT: lbu t1, 3(a0) -; RV32I-NEXT: lbu s0, 4(a0) -; RV32I-NEXT: lbu s2, 5(a0) -; RV32I-NEXT: lbu s3, 6(a0) -; RV32I-NEXT: lbu s6, 7(a0) -; RV32I-NEXT: lbu s1, 8(a0) -; RV32I-NEXT: lbu s7, 9(a0) -; RV32I-NEXT: lbu s8, 10(a0) -; RV32I-NEXT: lbu s9, 11(a0) -; RV32I-NEXT: lbu s10, 12(a0) -; RV32I-NEXT: lbu s11, 13(a0) -; RV32I-NEXT: lbu s4, 14(a0) -; RV32I-NEXT: lbu s5, 15(a0) -; RV32I-NEXT: lbu a3, 16(a0) -; RV32I-NEXT: lbu t0, 17(a0) -; RV32I-NEXT: lbu t2, 18(a0) -; RV32I-NEXT: lbu t3, 19(a0) -; RV32I-NEXT: lbu a4, 20(a0) -; RV32I-NEXT: lbu t4, 21(a0) -; RV32I-NEXT: lbu t5, 22(a0) -; RV32I-NEXT: lbu t6, 23(a0) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: slli s2, s2, 8 -; RV32I-NEXT: slli s3, s3, 16 -; RV32I-NEXT: slli s6, s6, 24 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: or a6, t1, a7 -; RV32I-NEXT: or a7, s2, s0 -; RV32I-NEXT: or t1, s6, s3 -; RV32I-NEXT: lbu s0, 24(a0) -; RV32I-NEXT: lbu s6, 25(a0) -; RV32I-NEXT: lbu ra, 26(a0) -; RV32I-NEXT: lbu s2, 27(a0) -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: slli s8, s8, 16 -; RV32I-NEXT: slli s9, s9, 24 -; RV32I-NEXT: slli s11, s11, 8 -; RV32I-NEXT: or s1, s7, s1 -; RV32I-NEXT: or s7, s9, s8 -; RV32I-NEXT: or s3, s11, s10 -; RV32I-NEXT: lbu s8, 28(a0) -; RV32I-NEXT: lbu s9, 29(a0) -; RV32I-NEXT: lbu s10, 30(a0) -; RV32I-NEXT: lbu a0, 31(a0) -; RV32I-NEXT: lbu a1, 0(a1) -; RV32I-NEXT: slli s4, s4, 16 -; RV32I-NEXT: slli s5, s5, 24 -; RV32I-NEXT: or s4, s5, s4 -; RV32I-NEXT: addi s5, sp, 8 +; RV32I-NEXT: addi sp, sp, -112 +; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t3, 8(a0) +; RV32I-NEXT: lbu t4, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) +; RV32I-NEXT: lbu s0, 12(a0) +; RV32I-NEXT: lbu s1, 13(a0) +; RV32I-NEXT: lbu s2, 14(a0) +; RV32I-NEXT: lbu s3, 15(a0) +; RV32I-NEXT: lbu s4, 16(a0) +; RV32I-NEXT: lbu s5, 17(a0) +; RV32I-NEXT: lbu s6, 18(a0) +; RV32I-NEXT: lbu s7, 19(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: slli t2, t2, 16 -; 
RV32I-NEXT: slli t3, t3, 24 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: lbu s8, 20(a0) +; RV32I-NEXT: lbu s9, 21(a0) +; RV32I-NEXT: lbu s10, 22(a0) +; RV32I-NEXT: lbu s11, 23(a0) ; RV32I-NEXT: slli t4, t4, 8 ; RV32I-NEXT: slli t5, t5, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: slli s6, s6, 8 -; RV32I-NEXT: slli ra, ra, 16 -; RV32I-NEXT: slli s2, s2, 24 +; RV32I-NEXT: slli s1, s1, 8 +; RV32I-NEXT: slli s2, s2, 16 +; RV32I-NEXT: slli s3, s3, 24 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, t6, t5 +; RV32I-NEXT: or t1, s1, s0 +; RV32I-NEXT: or t2, s3, s2 +; RV32I-NEXT: lbu t3, 24(a0) +; RV32I-NEXT: lbu t5, 25(a0) +; RV32I-NEXT: lbu t6, 26(a0) +; RV32I-NEXT: lbu s0, 27(a0) +; RV32I-NEXT: slli s5, s5, 8 +; RV32I-NEXT: slli s6, s6, 16 +; RV32I-NEXT: slli s7, s7, 24 ; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: or t4, s5, s4 +; RV32I-NEXT: or s1, s7, s6 +; RV32I-NEXT: or s2, s9, s8 +; RV32I-NEXT: lbu s3, 28(a0) +; RV32I-NEXT: lbu s4, 29(a0) +; RV32I-NEXT: lbu s5, 30(a0) +; RV32I-NEXT: lbu a0, 31(a0) +; RV32I-NEXT: lbu a1, 0(a1) ; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: slli s11, s11, 24 +; RV32I-NEXT: or s6, s11, s10 +; RV32I-NEXT: mv s7, sp +; RV32I-NEXT: slli t5, t5, 8 +; RV32I-NEXT: slli t6, t6, 16 +; RV32I-NEXT: slli s0, s0, 24 +; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: slli s5, s5, 16 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: slli a1, a1, 3 -; RV32I-NEXT: or a3, t0, a3 -; RV32I-NEXT: or t0, t3, t2 -; RV32I-NEXT: or a4, t4, a4 -; RV32I-NEXT: or t2, t6, t5 -; RV32I-NEXT: or t3, s6, s0 -; RV32I-NEXT: or t4, s2, ra -; RV32I-NEXT: or t5, s9, s8 -; RV32I-NEXT: or t6, a0, s10 +; RV32I-NEXT: or t3, t5, t3 +; RV32I-NEXT: or t5, s0, t6 +; RV32I-NEXT: or t6, s4, s3 +; RV32I-NEXT: or s0, a0, s5 ; RV32I-NEXT: srai a0, a0, 31 ; RV32I-NEXT: andi a1, a1, 24 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: or a6, t1, a7 -; RV32I-NEXT: or a7, s7, s1 -; RV32I-NEXT: or t1, s4, s3 -; RV32I-NEXT: or a3, t0, a3 -; RV32I-NEXT: or a4, t2, a4 -; RV32I-NEXT: or t0, t4, t3 -; RV32I-NEXT: or t2, t6, t5 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, s1, t4 +; RV32I-NEXT: or t0, s6, s2 +; RV32I-NEXT: or t1, t5, t3 +; RV32I-NEXT: or t2, s0, t6 +; RV32I-NEXT: sw a0, 48(sp) +; RV32I-NEXT: sw a0, 52(sp) ; RV32I-NEXT: sw a0, 56(sp) ; RV32I-NEXT: sw a0, 60(sp) -; RV32I-NEXT: sw a0, 64(sp) -; RV32I-NEXT: sw a0, 68(sp) +; RV32I-NEXT: sw a0, 32(sp) +; RV32I-NEXT: sw a0, 36(sp) ; RV32I-NEXT: sw a0, 40(sp) ; RV32I-NEXT: sw a0, 44(sp) -; RV32I-NEXT: sw a0, 48(sp) -; RV32I-NEXT: sw a0, 52(sp) -; RV32I-NEXT: add s5, s5, a1 -; RV32I-NEXT: sw a3, 24(sp) -; RV32I-NEXT: sw a4, 28(sp) -; RV32I-NEXT: sw t0, 32(sp) -; RV32I-NEXT: sw t2, 36(sp) +; RV32I-NEXT: add s7, s7, a1 +; RV32I-NEXT: sw a7, 16(sp) +; RV32I-NEXT: sw t0, 20(sp) +; RV32I-NEXT: sw t1, 24(sp) +; RV32I-NEXT: sw t2, 28(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a4, 4(sp) ; RV32I-NEXT: sw a5, 8(sp) ; RV32I-NEXT: sw a6, 12(sp) -; RV32I-NEXT: sw a7, 16(sp) -; RV32I-NEXT: sw t1, 20(sp) -; RV32I-NEXT: lw a6, 16(s5) -; RV32I-NEXT: lw a5, 20(s5) -; RV32I-NEXT: lw a7, 24(s5) -; RV32I-NEXT: lw a1, 0(s5) -; RV32I-NEXT: lw a0, 4(s5) -; RV32I-NEXT: lw a4, 8(s5) -; RV32I-NEXT: lw a3, 12(s5) -; RV32I-NEXT: lw t0, 28(s5) +; RV32I-NEXT: lw a6, 16(s7) +; RV32I-NEXT: lw a5, 20(s7) +; RV32I-NEXT: lw a7, 24(s7) +; RV32I-NEXT: lw a1, 0(s7) +; 
RV32I-NEXT: lw a0, 4(s7) +; RV32I-NEXT: lw a4, 8(s7) +; RV32I-NEXT: lw a3, 12(s7) +; RV32I-NEXT: lw t0, 28(s7) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 ; RV32I-NEXT: srli t3, a7, 8 @@ -5867,21 +5832,21 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: srli s5, a5, 8 ; RV32I-NEXT: srli s6, a4, 24 ; RV32I-NEXT: srli s7, a4, 16 -; RV32I-NEXT: srli s8, a4, 8 -; RV32I-NEXT: srli s9, a3, 24 -; RV32I-NEXT: srli s10, a3, 16 -; RV32I-NEXT: srli s11, a3, 8 ; RV32I-NEXT: sb a7, 24(a2) -; RV32I-NEXT: srli a7, a1, 24 +; RV32I-NEXT: srli a7, a4, 8 ; RV32I-NEXT: sb t3, 25(a2) +; RV32I-NEXT: srli t3, a3, 24 ; RV32I-NEXT: sb t2, 26(a2) +; RV32I-NEXT: srli t2, a3, 16 ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli t1, a1, 16 +; RV32I-NEXT: srli t1, a3, 8 ; RV32I-NEXT: sb t0, 28(a2) +; RV32I-NEXT: srli t0, a1, 24 ; RV32I-NEXT: sb t6, 29(a2) +; RV32I-NEXT: srli t6, a1, 16 ; RV32I-NEXT: sb t5, 30(a2) ; RV32I-NEXT: sb t4, 31(a2) -; RV32I-NEXT: srli t0, a1, 8 +; RV32I-NEXT: srli t4, a1, 8 ; RV32I-NEXT: sb a6, 16(a2) ; RV32I-NEXT: sb s2, 17(a2) ; RV32I-NEXT: sb s1, 18(a2) @@ -5893,36 +5858,35 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: sb s3, 23(a2) ; RV32I-NEXT: srli a5, a0, 16 ; RV32I-NEXT: sb a4, 8(a2) -; RV32I-NEXT: sb s8, 9(a2) +; RV32I-NEXT: sb a7, 9(a2) ; RV32I-NEXT: sb s7, 10(a2) ; RV32I-NEXT: sb s6, 11(a2) ; RV32I-NEXT: srli a4, a0, 8 ; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb s11, 13(a2) -; RV32I-NEXT: sb s10, 14(a2) -; RV32I-NEXT: sb s9, 15(a2) +; RV32I-NEXT: sb t1, 13(a2) +; RV32I-NEXT: sb t2, 14(a2) +; RV32I-NEXT: sb t3, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb t0, 1(a2) -; RV32I-NEXT: sb t1, 2(a2) -; RV32I-NEXT: sb a7, 3(a2) +; RV32I-NEXT: sb t4, 1(a2) +; RV32I-NEXT: sb t6, 2(a2) +; RV32I-NEXT: sb t0, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) ; RV32I-NEXT: sb a5, 6(a2) ; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 128 +; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 112 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %dwordOff = load i256, ptr %dwordOff.ptr, align 1 diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll 
b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll index b2c130c2d7c10..b8952d2cb2b29 100644 --- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll @@ -1530,25 +1530,24 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: lshr_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -128 -; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: addi sp, sp, -112 +; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill ; RV32I-NEXT: lbu a3, 0(a0) ; RV32I-NEXT: lbu a4, 1(a0) -; RV32I-NEXT: lbu a6, 2(a0) -; RV32I-NEXT: lbu a7, 3(a0) -; RV32I-NEXT: lbu a5, 4(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: lbu a7, 4(a0) ; RV32I-NEXT: lbu t0, 5(a0) ; RV32I-NEXT: lbu t1, 6(a0) ; RV32I-NEXT: lbu t2, 7(a0) @@ -1557,107 +1556,105 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu t5, 10(a0) ; RV32I-NEXT: lbu t6, 11(a0) ; RV32I-NEXT: lbu s0, 12(a0) -; RV32I-NEXT: lbu s2, 13(a0) -; RV32I-NEXT: lbu s4, 14(a0) -; RV32I-NEXT: lbu s5, 15(a0) -; RV32I-NEXT: lbu s6, 16(a0) -; RV32I-NEXT: lbu s7, 17(a0) -; RV32I-NEXT: lbu s8, 18(a0) -; RV32I-NEXT: lbu s9, 19(a0) +; RV32I-NEXT: lbu s1, 13(a0) +; RV32I-NEXT: lbu s2, 14(a0) +; RV32I-NEXT: lbu s3, 15(a0) +; RV32I-NEXT: lbu s4, 16(a0) +; RV32I-NEXT: lbu s5, 17(a0) +; RV32I-NEXT: lbu s6, 18(a0) +; RV32I-NEXT: lbu s7, 19(a0) ; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: or a4, a7, a6 -; RV32I-NEXT: lbu s10, 20(a0) -; RV32I-NEXT: lbu s11, 21(a0) -; RV32I-NEXT: lbu ra, 22(a0) -; RV32I-NEXT: lbu a3, 23(a0) +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: slli t0, t0, 8 ; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: lbu s8, 20(a0) +; RV32I-NEXT: lbu s9, 21(a0) +; RV32I-NEXT: lbu s10, 22(a0) +; RV32I-NEXT: lbu s11, 23(a0) ; RV32I-NEXT: slli t4, t4, 8 ; RV32I-NEXT: slli t5, t5, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: slli s1, s1, 8 +; 
RV32I-NEXT: slli s2, s2, 16 +; RV32I-NEXT: slli s3, s3, 24 ; RV32I-NEXT: or a7, t4, t3 ; RV32I-NEXT: or t0, t6, t5 -; RV32I-NEXT: lbu s1, 24(a0) -; RV32I-NEXT: lbu s3, 25(a0) -; RV32I-NEXT: lbu t4, 26(a0) -; RV32I-NEXT: lbu t5, 27(a0) -; RV32I-NEXT: slli s2, s2, 8 -; RV32I-NEXT: slli s4, s4, 16 -; RV32I-NEXT: slli s5, s5, 24 -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: or t1, s2, s0 -; RV32I-NEXT: or t2, s5, s4 -; RV32I-NEXT: or t3, s7, s6 -; RV32I-NEXT: lbu t6, 28(a0) +; RV32I-NEXT: or t1, s1, s0 +; RV32I-NEXT: or t2, s3, s2 +; RV32I-NEXT: lbu t6, 24(a0) +; RV32I-NEXT: lbu s0, 25(a0) +; RV32I-NEXT: lbu s1, 26(a0) +; RV32I-NEXT: lbu s2, 27(a0) +; RV32I-NEXT: slli s5, s5, 8 +; RV32I-NEXT: slli s6, s6, 16 +; RV32I-NEXT: slli s7, s7, 24 +; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: or t3, s5, s4 +; RV32I-NEXT: or t4, s7, s6 +; RV32I-NEXT: or t5, s9, s8 +; RV32I-NEXT: lbu s3, 28(a0) ; RV32I-NEXT: lbu s4, 29(a0) ; RV32I-NEXT: lbu s5, 30(a0) ; RV32I-NEXT: lbu s6, 31(a0) -; RV32I-NEXT: slli s8, s8, 16 -; RV32I-NEXT: slli s9, s9, 24 -; RV32I-NEXT: slli s11, s11, 8 -; RV32I-NEXT: slli ra, ra, 16 -; RV32I-NEXT: slli a3, a3, 24 -; RV32I-NEXT: or a0, s9, s8 -; RV32I-NEXT: or s0, s11, s10 -; RV32I-NEXT: or s2, a3, ra -; RV32I-NEXT: lbu a3, 0(a1) -; RV32I-NEXT: lbu s7, 1(a1) -; RV32I-NEXT: lbu s8, 2(a1) +; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: slli s11, s11, 24 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: slli s1, s1, 16 +; RV32I-NEXT: slli s2, s2, 24 +; RV32I-NEXT: or a0, s11, s10 +; RV32I-NEXT: or t6, s0, t6 +; RV32I-NEXT: or s0, s2, s1 +; RV32I-NEXT: lbu s1, 0(a1) +; RV32I-NEXT: lbu s2, 1(a1) +; RV32I-NEXT: lbu s7, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: sw zero, 48(sp) +; RV32I-NEXT: sw zero, 52(sp) ; RV32I-NEXT: sw zero, 56(sp) ; RV32I-NEXT: sw zero, 60(sp) -; RV32I-NEXT: sw zero, 64(sp) -; RV32I-NEXT: sw zero, 68(sp) +; RV32I-NEXT: sw zero, 32(sp) +; RV32I-NEXT: sw zero, 36(sp) ; RV32I-NEXT: sw zero, 40(sp) ; RV32I-NEXT: sw zero, 44(sp) -; RV32I-NEXT: sw zero, 48(sp) -; RV32I-NEXT: sw zero, 52(sp) -; RV32I-NEXT: slli s3, s3, 8 -; RV32I-NEXT: or s1, s3, s1 -; RV32I-NEXT: addi s3, sp, 8 -; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli t5, t5, 24 ; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: or s3, s4, s3 +; RV32I-NEXT: mv s4, sp ; RV32I-NEXT: slli s5, s5, 16 ; RV32I-NEXT: slli s6, s6, 24 -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s7, s7, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or t4, t5, t4 -; RV32I-NEXT: or t5, s4, t6 -; RV32I-NEXT: or t6, s6, s5 -; RV32I-NEXT: or a3, s7, a3 -; RV32I-NEXT: or a1, a1, s8 -; RV32I-NEXT: lw s4, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: or a4, a4, s4 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: or a6, t0, a7 -; RV32I-NEXT: or a7, t2, t1 -; RV32I-NEXT: or t0, a0, t3 -; RV32I-NEXT: or t1, s2, s0 -; RV32I-NEXT: or t2, t4, s1 -; RV32I-NEXT: or t3, t6, t5 -; RV32I-NEXT: or a0, a1, a3 -; RV32I-NEXT: sw t0, 24(sp) -; RV32I-NEXT: sw t1, 28(sp) -; RV32I-NEXT: sw t2, 32(sp) -; RV32I-NEXT: sw t3, 36(sp) -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) -; RV32I-NEXT: sw a6, 16(sp) -; RV32I-NEXT: sw a7, 20(sp) +; RV32I-NEXT: or s5, s6, s5 +; RV32I-NEXT: or s1, s2, s1 +; RV32I-NEXT: or a1, a1, s7 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, a0, t5 +; RV32I-NEXT: or t1, s0, t6 +; RV32I-NEXT: or t2, s5, s3 +; RV32I-NEXT: or a0, a1, s1 +; RV32I-NEXT: sw a7, 16(sp) +; 
RV32I-NEXT: sw t0, 20(sp) +; RV32I-NEXT: sw t1, 24(sp) +; RV32I-NEXT: sw t2, 28(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a6, 12(sp) ; RV32I-NEXT: srli a1, a0, 3 ; RV32I-NEXT: andi a3, a0, 31 ; RV32I-NEXT: andi a4, a1, 28 ; RV32I-NEXT: xori a1, a3, 31 -; RV32I-NEXT: add a4, s3, a4 +; RV32I-NEXT: add a4, s4, a4 ; RV32I-NEXT: lw a3, 0(a4) ; RV32I-NEXT: lw a5, 4(a4) ; RV32I-NEXT: lw a6, 8(a4) @@ -1717,13 +1714,13 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli s5, a3, 24 ; RV32I-NEXT: srli s6, a3, 16 ; RV32I-NEXT: srli s7, a3, 8 -; RV32I-NEXT: srli s8, a1, 24 -; RV32I-NEXT: srli s9, a1, 16 ; RV32I-NEXT: sb a7, 24(a2) +; RV32I-NEXT: srli a7, a1, 24 ; RV32I-NEXT: sb t2, 25(a2) +; RV32I-NEXT: srli t2, a1, 16 ; RV32I-NEXT: sb t1, 26(a2) ; RV32I-NEXT: sb t0, 27(a2) -; RV32I-NEXT: srli a7, a1, 8 +; RV32I-NEXT: srli t0, a1, 8 ; RV32I-NEXT: sb a6, 16(a2) ; RV32I-NEXT: sb t5, 17(a2) ; RV32I-NEXT: sb t4, 18(a2) @@ -1744,27 +1741,26 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb s6, 14(a2) ; RV32I-NEXT: sb s5, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb a7, 1(a2) -; RV32I-NEXT: sb s9, 2(a2) -; RV32I-NEXT: sb s8, 3(a2) +; RV32I-NEXT: sb t0, 1(a2) +; RV32I-NEXT: sb t2, 2(a2) +; RV32I-NEXT: sb a7, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) ; RV32I-NEXT: sb a5, 6(a2) ; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 128 +; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 112 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %bitOff = load i256, ptr %bitOff.ptr, align 1 @@ -2006,25 +2002,24 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: shl_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -128 -; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 96(sp) # 4-byte 
Folded Spill -; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: addi sp, sp, -112 +; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill ; RV32I-NEXT: lbu a3, 0(a0) ; RV32I-NEXT: lbu a4, 1(a0) -; RV32I-NEXT: lbu a6, 2(a0) -; RV32I-NEXT: lbu a7, 3(a0) -; RV32I-NEXT: lbu a5, 4(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: lbu a7, 4(a0) ; RV32I-NEXT: lbu t0, 5(a0) ; RV32I-NEXT: lbu t1, 6(a0) ; RV32I-NEXT: lbu t2, 7(a0) @@ -2033,107 +2028,105 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu t5, 10(a0) ; RV32I-NEXT: lbu t6, 11(a0) ; RV32I-NEXT: lbu s0, 12(a0) -; RV32I-NEXT: lbu s2, 13(a0) -; RV32I-NEXT: lbu s4, 14(a0) -; RV32I-NEXT: lbu s5, 15(a0) -; RV32I-NEXT: lbu s6, 16(a0) -; RV32I-NEXT: lbu s7, 17(a0) -; RV32I-NEXT: lbu s8, 18(a0) -; RV32I-NEXT: lbu s9, 19(a0) +; RV32I-NEXT: lbu s1, 13(a0) +; RV32I-NEXT: lbu s2, 14(a0) +; RV32I-NEXT: lbu s3, 15(a0) +; RV32I-NEXT: lbu s4, 16(a0) +; RV32I-NEXT: lbu s5, 17(a0) +; RV32I-NEXT: lbu s6, 18(a0) +; RV32I-NEXT: lbu s7, 19(a0) ; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: or a4, a7, a6 -; RV32I-NEXT: lbu s10, 20(a0) -; RV32I-NEXT: lbu s11, 21(a0) -; RV32I-NEXT: lbu ra, 22(a0) -; RV32I-NEXT: lbu a3, 23(a0) +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: slli t0, t0, 8 ; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: lbu s8, 20(a0) +; RV32I-NEXT: lbu s9, 21(a0) +; RV32I-NEXT: lbu s10, 22(a0) +; RV32I-NEXT: lbu s11, 23(a0) ; RV32I-NEXT: slli t4, t4, 8 ; RV32I-NEXT: slli t5, t5, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: slli s1, s1, 8 +; RV32I-NEXT: slli s2, s2, 16 +; RV32I-NEXT: slli s3, s3, 24 ; RV32I-NEXT: or a7, t4, t3 ; RV32I-NEXT: or t0, t6, t5 -; RV32I-NEXT: lbu s1, 24(a0) -; RV32I-NEXT: lbu s3, 25(a0) -; RV32I-NEXT: lbu t4, 26(a0) -; RV32I-NEXT: lbu t5, 27(a0) -; RV32I-NEXT: slli s2, s2, 8 -; RV32I-NEXT: slli s4, s4, 16 -; RV32I-NEXT: slli s5, s5, 24 -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: or t1, s2, s0 -; RV32I-NEXT: or t2, s5, s4 -; RV32I-NEXT: or t3, s7, s6 -; RV32I-NEXT: lbu t6, 28(a0) +; RV32I-NEXT: or t1, s1, s0 +; RV32I-NEXT: or t2, s3, s2 +; RV32I-NEXT: lbu t6, 24(a0) +; RV32I-NEXT: lbu s0, 25(a0) +; RV32I-NEXT: lbu s1, 26(a0) +; RV32I-NEXT: lbu s2, 27(a0) +; RV32I-NEXT: slli s5, s5, 8 +; RV32I-NEXT: slli s6, s6, 16 +; RV32I-NEXT: slli s7, s7, 24 +; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: or t3, s5, s4 +; RV32I-NEXT: or 
t4, s7, s6 +; RV32I-NEXT: or t5, s9, s8 +; RV32I-NEXT: lbu s3, 28(a0) ; RV32I-NEXT: lbu s4, 29(a0) ; RV32I-NEXT: lbu s5, 30(a0) ; RV32I-NEXT: lbu s6, 31(a0) -; RV32I-NEXT: slli s8, s8, 16 -; RV32I-NEXT: slli s9, s9, 24 -; RV32I-NEXT: slli s11, s11, 8 -; RV32I-NEXT: slli ra, ra, 16 -; RV32I-NEXT: slli a3, a3, 24 -; RV32I-NEXT: or a0, s9, s8 -; RV32I-NEXT: or s0, s11, s10 -; RV32I-NEXT: or s2, a3, ra -; RV32I-NEXT: lbu a3, 0(a1) -; RV32I-NEXT: lbu s7, 1(a1) -; RV32I-NEXT: lbu s8, 2(a1) +; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: slli s11, s11, 24 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: slli s1, s1, 16 +; RV32I-NEXT: slli s2, s2, 24 +; RV32I-NEXT: or a0, s11, s10 +; RV32I-NEXT: or t6, s0, t6 +; RV32I-NEXT: or s0, s2, s1 +; RV32I-NEXT: lbu s1, 0(a1) +; RV32I-NEXT: lbu s2, 1(a1) +; RV32I-NEXT: lbu s7, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: sw zero, 16(sp) +; RV32I-NEXT: sw zero, 20(sp) ; RV32I-NEXT: sw zero, 24(sp) ; RV32I-NEXT: sw zero, 28(sp) -; RV32I-NEXT: sw zero, 32(sp) -; RV32I-NEXT: sw zero, 36(sp) +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: sw zero, 4(sp) ; RV32I-NEXT: sw zero, 8(sp) ; RV32I-NEXT: sw zero, 12(sp) -; RV32I-NEXT: sw zero, 16(sp) -; RV32I-NEXT: sw zero, 20(sp) -; RV32I-NEXT: slli s3, s3, 8 -; RV32I-NEXT: or s1, s3, s1 -; RV32I-NEXT: addi s3, sp, 40 -; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli t5, t5, 24 ; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: or s3, s4, s3 +; RV32I-NEXT: addi s4, sp, 32 ; RV32I-NEXT: slli s5, s5, 16 ; RV32I-NEXT: slli s6, s6, 24 -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s7, s7, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or t4, t5, t4 -; RV32I-NEXT: or t5, s4, t6 -; RV32I-NEXT: or t6, s6, s5 -; RV32I-NEXT: or a3, s7, a3 -; RV32I-NEXT: or a1, a1, s8 -; RV32I-NEXT: lw s4, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: or a4, a4, s4 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: or a6, t0, a7 -; RV32I-NEXT: or a7, t2, t1 -; RV32I-NEXT: or t0, a0, t3 -; RV32I-NEXT: or t1, s2, s0 -; RV32I-NEXT: or t2, t4, s1 -; RV32I-NEXT: or t3, t6, t5 -; RV32I-NEXT: or a0, a1, a3 -; RV32I-NEXT: sw t0, 56(sp) -; RV32I-NEXT: sw t1, 60(sp) -; RV32I-NEXT: sw t2, 64(sp) -; RV32I-NEXT: sw t3, 68(sp) -; RV32I-NEXT: sw a4, 40(sp) -; RV32I-NEXT: sw a5, 44(sp) -; RV32I-NEXT: sw a6, 48(sp) -; RV32I-NEXT: sw a7, 52(sp) +; RV32I-NEXT: or s5, s6, s5 +; RV32I-NEXT: or s1, s2, s1 +; RV32I-NEXT: or a1, a1, s7 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, a0, t5 +; RV32I-NEXT: or t1, s0, t6 +; RV32I-NEXT: or t2, s5, s3 +; RV32I-NEXT: or a0, a1, s1 +; RV32I-NEXT: sw a7, 48(sp) +; RV32I-NEXT: sw t0, 52(sp) +; RV32I-NEXT: sw t1, 56(sp) +; RV32I-NEXT: sw t2, 60(sp) +; RV32I-NEXT: sw a3, 32(sp) +; RV32I-NEXT: sw a4, 36(sp) +; RV32I-NEXT: sw a5, 40(sp) +; RV32I-NEXT: sw a6, 44(sp) ; RV32I-NEXT: srli a1, a0, 3 ; RV32I-NEXT: andi a3, a0, 31 ; RV32I-NEXT: andi a4, a1, 28 ; RV32I-NEXT: xori a1, a3, 31 -; RV32I-NEXT: sub a3, s3, a4 +; RV32I-NEXT: sub a3, s4, a4 ; RV32I-NEXT: lw a4, 0(a3) ; RV32I-NEXT: lw a5, 4(a3) ; RV32I-NEXT: lw a6, 8(a3) @@ -2193,13 +2186,13 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli s5, a3, 24 ; RV32I-NEXT: srli s6, a3, 16 ; RV32I-NEXT: srli s7, a3, 8 -; RV32I-NEXT: srli s8, a1, 24 -; RV32I-NEXT: srli s9, a1, 16 ; RV32I-NEXT: sb a7, 24(a2) +; RV32I-NEXT: srli a7, a1, 24 ; RV32I-NEXT: sb t2, 25(a2) +; RV32I-NEXT: 
srli t2, a1, 16 ; RV32I-NEXT: sb t1, 26(a2) ; RV32I-NEXT: sb t0, 27(a2) -; RV32I-NEXT: srli a7, a1, 8 +; RV32I-NEXT: srli t0, a1, 8 ; RV32I-NEXT: sb a6, 28(a2) ; RV32I-NEXT: sb t5, 29(a2) ; RV32I-NEXT: sb t4, 30(a2) @@ -2220,27 +2213,26 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb s6, 10(a2) ; RV32I-NEXT: sb s5, 11(a2) ; RV32I-NEXT: sb a1, 12(a2) -; RV32I-NEXT: sb a7, 13(a2) -; RV32I-NEXT: sb s9, 14(a2) -; RV32I-NEXT: sb s8, 15(a2) +; RV32I-NEXT: sb t0, 13(a2) +; RV32I-NEXT: sb t2, 14(a2) +; RV32I-NEXT: sb a7, 15(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) ; RV32I-NEXT: sb a5, 6(a2) ; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 128 +; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 112 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %bitOff = load i256, ptr %bitOff.ptr, align 1 @@ -2483,25 +2475,24 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: ashr_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -128 -; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: addi sp, sp, -112 +; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 72(sp) # 4-byte 
Folded Spill +; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill ; RV32I-NEXT: lbu a3, 0(a0) ; RV32I-NEXT: lbu a4, 1(a0) -; RV32I-NEXT: lbu a6, 2(a0) -; RV32I-NEXT: lbu a7, 3(a0) -; RV32I-NEXT: lbu a5, 4(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: lbu a7, 4(a0) ; RV32I-NEXT: lbu t0, 5(a0) ; RV32I-NEXT: lbu t1, 6(a0) ; RV32I-NEXT: lbu t2, 7(a0) @@ -2518,100 +2509,98 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu s6, 18(a0) ; RV32I-NEXT: lbu s7, 19(a0) ; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 ; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: or a4, a7, a6 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 ; RV32I-NEXT: lbu s8, 20(a0) ; RV32I-NEXT: lbu s9, 21(a0) ; RV32I-NEXT: lbu s10, 22(a0) ; RV32I-NEXT: lbu s11, 23(a0) -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli t2, t2, 24 ; RV32I-NEXT: slli t4, t4, 8 ; RV32I-NEXT: slli t5, t5, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: or a6, t2, t1 -; RV32I-NEXT: or a7, t4, t3 -; RV32I-NEXT: or t0, t6, t5 -; RV32I-NEXT: lbu ra, 24(a0) -; RV32I-NEXT: lbu a3, 25(a0) -; RV32I-NEXT: lbu t4, 26(a0) -; RV32I-NEXT: lbu t5, 27(a0) ; RV32I-NEXT: slli s1, s1, 8 ; RV32I-NEXT: slli s2, s2, 16 ; RV32I-NEXT: slli s3, s3, 24 -; RV32I-NEXT: slli s5, s5, 8 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, t6, t5 ; RV32I-NEXT: or t1, s1, s0 ; RV32I-NEXT: or t2, s3, s2 -; RV32I-NEXT: or t3, s5, s4 -; RV32I-NEXT: lbu t6, 28(a0) -; RV32I-NEXT: lbu s0, 29(a0) -; RV32I-NEXT: lbu s1, 30(a0) -; RV32I-NEXT: lbu a0, 31(a0) +; RV32I-NEXT: lbu t6, 24(a0) +; RV32I-NEXT: lbu s0, 25(a0) +; RV32I-NEXT: lbu s1, 26(a0) +; RV32I-NEXT: lbu s2, 27(a0) +; RV32I-NEXT: slli s5, s5, 8 ; RV32I-NEXT: slli s6, s6, 16 ; RV32I-NEXT: slli s7, s7, 24 ; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: or t3, s5, s4 +; RV32I-NEXT: or t4, s7, s6 +; RV32I-NEXT: or t5, s9, s8 +; RV32I-NEXT: lbu s3, 28(a0) +; RV32I-NEXT: lbu s4, 29(a0) +; RV32I-NEXT: lbu s5, 30(a0) +; RV32I-NEXT: lbu a0, 31(a0) ; RV32I-NEXT: slli s10, s10, 16 ; RV32I-NEXT: slli s11, s11, 24 -; RV32I-NEXT: or s2, s7, s6 -; RV32I-NEXT: or s3, s9, s8 -; RV32I-NEXT: or s4, s11, s10 -; RV32I-NEXT: lbu s5, 0(a1) -; RV32I-NEXT: lbu s6, 1(a1) -; RV32I-NEXT: lbu s7, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, ra -; RV32I-NEXT: addi s8, sp, 8 -; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli t5, t5, 24 ; RV32I-NEXT: slli s0, s0, 8 ; RV32I-NEXT: slli s1, s1, 16 +; RV32I-NEXT: slli s2, s2, 24 +; RV32I-NEXT: or s6, s11, s10 +; RV32I-NEXT: or t6, s0, t6 +; RV32I-NEXT: or s0, s2, s1 +; RV32I-NEXT: lbu s1, 0(a1) +; RV32I-NEXT: lbu s2, 1(a1) +; RV32I-NEXT: lbu s7, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: or s3, s4, s3 +; RV32I-NEXT: mv s4, sp +; RV32I-NEXT: slli s5, s5, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: slli s6, s6, 8 +; RV32I-NEXT: slli s2, s2, 8 ; RV32I-NEXT: slli s7, s7, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or t4, t5, t4 -; RV32I-NEXT: or t5, s0, t6 -; RV32I-NEXT: or s1, a0, s1 -; RV32I-NEXT: or t6, s6, s5 +; RV32I-NEXT: or s5, a0, s5 +; RV32I-NEXT: or s1, s2, s1 ; 
RV32I-NEXT: or a1, a1, s7 -; RV32I-NEXT: srai s0, a0, 31 -; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: or a4, a4, a0 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: or a6, t0, a7 -; RV32I-NEXT: or a7, t2, t1 -; RV32I-NEXT: or t0, s2, t3 -; RV32I-NEXT: or t1, s4, s3 -; RV32I-NEXT: or a3, t4, a3 -; RV32I-NEXT: or t2, s1, t5 -; RV32I-NEXT: or a0, a1, t6 -; RV32I-NEXT: sw s0, 56(sp) -; RV32I-NEXT: sw s0, 60(sp) -; RV32I-NEXT: sw s0, 64(sp) -; RV32I-NEXT: sw s0, 68(sp) -; RV32I-NEXT: sw s0, 40(sp) -; RV32I-NEXT: sw s0, 44(sp) -; RV32I-NEXT: sw s0, 48(sp) -; RV32I-NEXT: sw s0, 52(sp) -; RV32I-NEXT: sw t0, 24(sp) -; RV32I-NEXT: sw t1, 28(sp) -; RV32I-NEXT: sw a3, 32(sp) -; RV32I-NEXT: sw t2, 36(sp) -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) -; RV32I-NEXT: sw a6, 16(sp) -; RV32I-NEXT: sw a7, 20(sp) +; RV32I-NEXT: srai s2, a0, 31 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, s6, t5 +; RV32I-NEXT: or t1, s0, t6 +; RV32I-NEXT: or t2, s5, s3 +; RV32I-NEXT: or a0, a1, s1 +; RV32I-NEXT: sw s2, 48(sp) +; RV32I-NEXT: sw s2, 52(sp) +; RV32I-NEXT: sw s2, 56(sp) +; RV32I-NEXT: sw s2, 60(sp) +; RV32I-NEXT: sw s2, 32(sp) +; RV32I-NEXT: sw s2, 36(sp) +; RV32I-NEXT: sw s2, 40(sp) +; RV32I-NEXT: sw s2, 44(sp) +; RV32I-NEXT: sw a7, 16(sp) +; RV32I-NEXT: sw t0, 20(sp) +; RV32I-NEXT: sw t1, 24(sp) +; RV32I-NEXT: sw t2, 28(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a6, 12(sp) ; RV32I-NEXT: srli a1, a0, 3 ; RV32I-NEXT: andi a3, a0, 31 ; RV32I-NEXT: andi a4, a1, 28 ; RV32I-NEXT: xori a1, a3, 31 -; RV32I-NEXT: add a4, s8, a4 +; RV32I-NEXT: add a4, s4, a4 ; RV32I-NEXT: lw a3, 0(a4) ; RV32I-NEXT: lw a5, 4(a4) ; RV32I-NEXT: lw a6, 8(a4) @@ -2671,13 +2660,13 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli s5, a3, 24 ; RV32I-NEXT: srli s6, a3, 16 ; RV32I-NEXT: srli s7, a3, 8 -; RV32I-NEXT: srli s8, a1, 24 -; RV32I-NEXT: srli s9, a1, 16 ; RV32I-NEXT: sb a7, 24(a2) +; RV32I-NEXT: srli a7, a1, 24 ; RV32I-NEXT: sb t2, 25(a2) +; RV32I-NEXT: srli t2, a1, 16 ; RV32I-NEXT: sb t1, 26(a2) ; RV32I-NEXT: sb t0, 27(a2) -; RV32I-NEXT: srli a7, a1, 8 +; RV32I-NEXT: srli t0, a1, 8 ; RV32I-NEXT: sb a6, 16(a2) ; RV32I-NEXT: sb t5, 17(a2) ; RV32I-NEXT: sb t4, 18(a2) @@ -2698,27 +2687,26 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb s6, 14(a2) ; RV32I-NEXT: sb s5, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb a7, 1(a2) -; RV32I-NEXT: sb s9, 2(a2) -; RV32I-NEXT: sb s8, 3(a2) +; RV32I-NEXT: sb t0, 1(a2) +; RV32I-NEXT: sb t2, 2(a2) +; RV32I-NEXT: sb a7, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) ; RV32I-NEXT: sb a5, 6(a2) ; RV32I-NEXT: sb a6, 7(a2) -; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded 
Reload -; RV32I-NEXT: addi sp, sp, 128 +; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 112 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %bitOff = load i256, ptr %bitOff.ptr, align 1 diff --git a/llvm/unittests/CodeGen/MFCommon.inc b/llvm/unittests/CodeGen/MFCommon.inc index 2c4b1f36ffd23..67759bd5c4632 100644 --- a/llvm/unittests/CodeGen/MFCommon.inc +++ b/llvm/unittests/CodeGen/MFCommon.inc @@ -50,8 +50,8 @@ public: const char *getRegPressureSetName(unsigned Idx) const override { return "bogus"; } - unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx, - bool RemoveReserved) const override { + unsigned getRegPressureSetLimit(const MachineFunction &MF, + unsigned Idx) const override { return 0; } const int * diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp index 674925c1b2acd..a6f87119aca5b 100644 --- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp +++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp @@ -275,8 +275,7 @@ void RegisterInfoEmitter::EmitRegUnitPressure(raw_ostream &OS, OS << "// Get the register unit pressure limit for this dimension.\n" << "// This limit must be adjusted dynamically for reserved registers.\n" << "unsigned " << ClassName << "::\n" - << "getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx, bool " - "RemoveReserved) const " + << "getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const " "{\n" << " static const " << getMinimalTypeForRange(MaxRegUnitWeight, 32) << " PressureLimitTable[] = {\n"; @@ -1131,7 +1130,7 @@ void RegisterInfoEmitter::runTargetHeader(raw_ostream &OS) { << " unsigned getNumRegPressureSets() const override;\n" << " const char *getRegPressureSetName(unsigned Idx) const override;\n" << " unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned " - "Idx, bool RemoveReserved = true) const override;\n" + "Idx) const override;\n" << " const int *getRegClassPressureSets(" << "const TargetRegisterClass *RC) const override;\n" << " const int *getRegUnitPressureSets("