From e5c19148b23a2e666c79f2f1bd21661b7dbeeb4e Mon Sep 17 00:00:00 2001
From: Rohit Aggarwal
Date: Wed, 9 Apr 2025 19:10:28 +0530
Subject: [PATCH 1/3] [X86][DAGCombiner][SelectionDAG] - Fold Zext Build Vector to Bitcast of Widened Build Vector

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  58 ++++
 llvm/test/CodeGen/X86/WidenBuildVector.ll     | 258 ++++++++++++++++++
 2 files changed, 316 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/WidenBuildVector.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 38376de5783ae..77c659aad0ed2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14195,6 +14195,61 @@ static SDValue widenAbs(SDNode *Extend, SelectionDAG &DAG) {
   return DAG.getZExtOrTrunc(NewAbs, SDLoc(Extend), VT);
 }
 
+// Try to widen the build vector and bitcast it to the type of the zext.
+// This is a special case for 128-bit vector types. The intention is to remove
+// the zext and replace it with a bitcast to the wider type. During lowering
+// the bitcast is removed and the extra computation due to the zext is avoided.
+static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
+
+  assert(Extend->getOpcode() == ISD::ZERO_EXTEND && "Expected zero extend.");
+
+  EVT ExtendVT = Extend->getValueType(0);
+
+  SDValue BV = Extend->getOperand(0);
+  if (BV.getOpcode() != ISD::BUILD_VECTOR || !BV.hasOneUse())
+    return SDValue();
+
+  SDLoc dl(BV);
+  EVT VT = BV.getValueType();
+  EVT EltVT = BV.getOperand(0).getValueType();
+  unsigned NumElts = VT.getVectorNumElements();
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+  SmallVector<SDValue> NewOps(BV->op_begin(), BV->op_end());
+  assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
+  // Fill the new elements with zero.
+  NewOps.append(WidenNumElts - NumElts, DAG.getConstant(0, dl, EltVT));
+  // Compute the step to place the elements in the right position and to
+  // control the iteration.
+  unsigned step = WidenNumElts / NumElts;
+  if (WidenVT.is128BitVector()) {
+    if (Extend->getValueSizeInBits(0) == WidenVT.getSizeInBits()) {
+      for (int i = NumElts - 1, j = WidenNumElts - step; i > 0;
+           i--, j -= step) {
+        SDValue temp = NewOps[i];
+        NewOps[i] = NewOps[j];
+        NewOps[j] = temp;
+      }
+      // Create the new build vector with WidenVT and NewOps.
+      SDValue NewBV = DAG.getBuildVector(WidenVT, dl, NewOps);
+      // Replace the old build vector with the new one. Bitcast the
+      // new build vector to the type of the zext.
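+      // A sketch of the resulting layout, assuming little-endian element
+      // order, e.g. for zext <4 x i8> -> <4 x i32> (so WidenVT = v16i8 and
+      // step = 4): NewOps now holds {x, 0, 0, 0, y, 0, 0, 0, z, 0, 0, 0,
+      // w, 0, 0, 0}, i.e. source element i sits at index i * step with
+      // zeros elsewhere, so the bitcast below reinterprets the widened
+      // build_vector as the zero-extended value.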
+ SDValue NewBVBitcast = DAG.getBitcast(ExtendVT, NewBV); + DAG.ReplaceAllUsesOfValueWith(SDValue(Extend, 0), NewBVBitcast); + LLVM_DEBUG( + dbgs() << DAG.getMachineFunction().getFunction().getName() + << " - Widening buildvector and replace zext with bitcast\n"; + BV.dump(); Extend->dump(); dbgs() << " to \n"; + NewBV.getNode()->dump(); NewBVBitcast->dump();); + return NewBV; + } + } + return SDValue(); +} + SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -14521,6 +14576,9 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { return SDValue(CSENode, 0); } + if (SDValue V = widenBuildVec(N, DAG)) + return V; + return SDValue(); } diff --git a/llvm/test/CodeGen/X86/WidenBuildVector.ll b/llvm/test/CodeGen/X86/WidenBuildVector.ll new file mode 100644 index 0000000000000..d2924d016a1bf --- /dev/null +++ b/llvm/test/CodeGen/X86/WidenBuildVector.ll @@ -0,0 +1,258 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mcpu=znver5 -mtriple=x86_64-unknown-unknown < %s | FileCheck %s + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable +define dso_local i32 @foov8i8(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr { +; CHECK-LABEL: foov8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movzbl (%rdi), %eax +; CHECK-NEXT: leaq (%rsi,%rsi,2), %rcx +; CHECK-NEXT: leaq (%rsi,%rsi,4), %r8 +; CHECK-NEXT: leaq (,%rsi,8), %r9 +; CHECK-NEXT: subq %rsi, %r9 +; CHECK-NEXT: vmovd %eax, %xmm0 +; CHECK-NEXT: vpinsrb $2, (%rdi,%rsi), %xmm0, %xmm0 +; CHECK-NEXT: vpinsrb $4, (%rdi,%rsi,2), %xmm0, %xmm0 +; CHECK-NEXT: vpinsrb $6, (%rdi,%rcx), %xmm0, %xmm0 +; CHECK-NEXT: vpinsrb $8, (%rdi,%rsi,4), %xmm0, %xmm0 +; CHECK-NEXT: vpinsrb $10, (%rdi,%r8), %xmm0, %xmm0 +; CHECK-NEXT: vpinsrb $12, (%rdi,%rcx,2), %xmm0, %xmm0 +; CHECK-NEXT: vpinsrb $14, (%rdi,%r9), %xmm0, %xmm0 +; CHECK-NEXT: vpmaddwd (%rdx), %xmm0, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: retq +entry: + %var0 = load i8, ptr %a, align 1 + %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride + %var1 = load i8, ptr %arrayidx.1, align 1 + %mul.2 = shl nsw i64 %a_stride, 1 + %arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 %mul.2 + %var2 = load i8, ptr %arrayidx.2, align 1 + %mul.3 = mul nsw i64 %a_stride, 3 + %arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 %mul.3 + %var3 = load i8, ptr %arrayidx.3, align 1 + %mul.4 = shl nsw i64 %a_stride, 2 + %arrayidx.4 = getelementptr inbounds i8, ptr %a, i64 %mul.4 + %var4 = load i8, ptr %arrayidx.4, align 1 + %mul.5 = mul nsw i64 %a_stride, 5 + %arrayidx.5 = getelementptr inbounds i8, ptr %a, i64 %mul.5 + %var5 = load i8, ptr %arrayidx.5, align 1 + %mul.6 = mul nsw i64 %a_stride, 6 + %arrayidx.6 = getelementptr inbounds i8, ptr %a, i64 %mul.6 + %var6 = load i8, ptr %arrayidx.6, align 1 + %mul.7 = mul nsw i64 %a_stride, 7 + %arrayidx.7 = getelementptr inbounds i8, ptr %a, i64 %mul.7 + %var7 = load i8, ptr %arrayidx.7, align 1 + %var8 = insertelement <8 x i8> poison, i8 %var0, i64 0 + %var9 = insertelement <8 x i8> %var8, i8 %var1, i64 1 + %var10 = insertelement <8 x i8> %var9, i8 %var2, i64 2 + %var11 = insertelement <8 x i8> %var10, i8 %var3, i64 3 + %var12 = insertelement <8 x i8> %var11, i8 
%var4, i64 4 + %var13 = insertelement <8 x i8> %var12, i8 %var5, i64 5 + %var14 = insertelement <8 x i8> %var13, i8 %var6, i64 6 + %var15 = insertelement <8 x i8> %var14, i8 %var7, i64 7 + %var16 = zext <8 x i8> %var15 to <8 x i32> + %var17 = load <8 x i16>, ptr %b, align 2 + %var18 = sext <8 x i16> %var17 to <8 x i32> + %var19 = mul nsw <8 x i32> %var18, %var16 + %var20 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %var19) + ret i32 %var20 +} + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable +define dso_local i32 @foov4i8(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr { +; CHECK-LABEL: foov4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movzbl (%rdi), %eax +; CHECK-NEXT: leaq (%rsi,%rsi,2), %rcx +; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; CHECK-NEXT: vmovd %eax, %xmm0 +; CHECK-NEXT: vpinsrb $4, (%rdi,%rsi), %xmm0, %xmm0 +; CHECK-NEXT: vpinsrb $8, (%rdi,%rsi,2), %xmm0, %xmm0 +; CHECK-NEXT: vpinsrb $12, (%rdi,%rcx), %xmm0, %xmm0 +; CHECK-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: retq +entry: + %var0 = load i8, ptr %a, align 1 + %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride + %var1 = load i8, ptr %arrayidx.1, align 1 + %mul.2 = shl nsw i64 %a_stride, 1 + %arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 %mul.2 + %var2 = load i8, ptr %arrayidx.2, align 1 + %mul.3 = mul nsw i64 %a_stride, 3 + %arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 %mul.3 + %var3 = load i8, ptr %arrayidx.3, align 1 + %var8 = insertelement <4 x i8> poison, i8 %var0, i64 0 + %var9 = insertelement <4 x i8> %var8, i8 %var1, i64 1 + %var10 = insertelement <4 x i8> %var9, i8 %var2, i64 2 + %var11 = insertelement <4 x i8> %var10, i8 %var3, i64 3 + %var16 = zext <4 x i8> %var11 to <4 x i32> + %var17 = load <4 x i16>, ptr %b, align 2 + %var18 = sext <4 x i16> %var17 to <4 x i32> + %var19 = mul nsw <4 x i32> %var18, %var16 + %var20 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %var19) + ret i32 %var20 +} + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable +define dso_local i32 @foov2i8(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr { +; CHECK-LABEL: foov2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: movzbl (%rdi), %eax +; CHECK-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovd %eax, %xmm0 +; CHECK-NEXT: vpinsrb $4, (%rdi,%rsi), %xmm0, %xmm0 +; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; CHECK-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: retq + %var0 = load i8, ptr %a, align 1 + %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride + %var1 = load i8, ptr %arrayidx.1, align 1 + %var8 = insertelement <2 x i8> poison, i8 %var0, i64 0 + %var9 = insertelement <2 x i8> %var8, i8 %var1, i64 1 + %var16 = zext <2 x i8> %var9 to <2 x i32> + %var17 = load <2 x i16>, ptr %b, align 2 + %var18 = sext <2 x i16> %var17 to <2 x i32> + %var19 = mul nsw <2 x i32> %var18, %var16 + %var20 = tail call i32 @llvm.vector.reduce.add.v2i32(<2 x 
i32> %var19) + ret i32 %var20 +} + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable +define dso_local i64 @foov2i8_v2i64(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr { +; CHECK-LABEL: foov2i8_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: movzbl (%rdi), %eax +; CHECK-NEXT: vpmovsxbq (%rdx), %xmm1 +; CHECK-NEXT: vmovd %eax, %xmm0 +; CHECK-NEXT: vpinsrb $8, (%rdi,%rsi), %xmm0, %xmm0 +; CHECK-NEXT: vpmuldq %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovq %xmm0, %rax +; CHECK-NEXT: retq + %var0 = load i8, ptr %a, align 1 + %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride + %var1 = load i8, ptr %arrayidx.1, align 1 + %var8 = insertelement <2 x i8> poison, i8 %var0, i64 0 + %var9 = insertelement <2 x i8> %var8, i8 %var1, i64 1 + %var16 = zext <2 x i8> %var9 to <2 x i64> + %var17 = load <2 x i8>, ptr %b, align 2 + %var18 = sext <2 x i8> %var17 to <2 x i64> + %var19 = mul nsw <2 x i64> %var18, %var16 + %var20 = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %var19) + ret i64 %var20 +} + + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable +define dso_local i32 @foov4i16(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr { +; CHECK-LABEL: foov4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movzwl (%rdi), %eax +; CHECK-NEXT: leaq (%rsi,%rsi,2), %rcx +; CHECK-NEXT: vpmovsxwd (%rdx), %xmm1 +; CHECK-NEXT: vmovd %eax, %xmm0 +; CHECK-NEXT: vpinsrw $2, (%rdi,%rsi), %xmm0, %xmm0 +; CHECK-NEXT: vpinsrw $4, (%rdi,%rsi,2), %xmm0, %xmm0 +; CHECK-NEXT: vpinsrw $6, (%rdi,%rcx), %xmm0, %xmm0 +; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: retq +entry: + %var0 = load i16, ptr %a, align 1 + %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride + %var1 = load i16, ptr %arrayidx.1, align 1 + %mul.2 = shl nsw i64 %a_stride, 1 + %arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 %mul.2 + %var2 = load i16, ptr %arrayidx.2, align 1 + %mul.3 = mul nsw i64 %a_stride, 3 + %arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 %mul.3 + %var3 = load i16, ptr %arrayidx.3, align 1 + %var8 = insertelement <4 x i16> poison, i16 %var0, i64 0 + %var9 = insertelement <4 x i16> %var8, i16 %var1, i64 1 + %var10 = insertelement <4 x i16> %var9, i16 %var2, i64 2 + %var11 = insertelement <4 x i16> %var10, i16 %var3, i64 3 + %var16 = zext <4 x i16> %var11 to <4 x i32> + %var17 = load <4 x i16>, ptr %b, align 2 + %var18 = sext <4 x i16> %var17 to <4 x i32> + %var19 = mul nsw <4 x i32> %var18, %var16 + %var20 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %var19) + ret i32 %var20 +} + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable +define dso_local i32 @foov2i16(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr { +; CHECK-LABEL: foov2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: movzwl (%rdi), %eax +; CHECK-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovd %eax, %xmm0 +; CHECK-NEXT: vpinsrw $1, (%rdi,%rsi), %xmm0, %xmm0 +; CHECK-NEXT: vpmovsxwd %xmm1, %xmm1 +; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: retq + %var0 = load i16, ptr %a, align 1 + %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride + %var1 = load i16, ptr %arrayidx.1, align 1 + %var8 = insertelement <2 x i16> poison, i16 %var0, i64 0 + %var9 = insertelement <2 x i16> %var8, i16 %var1, i64 1 + %var16 = zext <2 x i16> %var9 to <2 x i32> + %var17 = load <2 x i16>, ptr %b, align 2 + %var18 = sext <2 x i16> %var17 to <2 x i32> + %var19 = mul nsw <2 x i32> %var18, %var16 + %var20 = tail call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %var19) + ret i32 %var20 +} + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable +define dso_local i64 @foov2i32(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr { +; CHECK-LABEL: foov2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-NEXT: vpmovsxdq (%rdx), %xmm1 +; CHECK-NEXT: vpmullq %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovq %xmm0, %rax +; CHECK-NEXT: retq + %var0 = load i32, ptr %a, align 1 + %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride + %var1 = load i32, ptr %arrayidx.1, align 1 + %var8 = insertelement <2 x i32> poison, i32 %var0, i64 0 + %var9 = insertelement <2 x i32> %var8, i32 %var1, i64 1 + %var16 = zext <2 x i32> %var9 to <2 x i64> + %var17 = load <2 x i32>, ptr %b, align 2 + %var18 = sext <2 x i32> %var17 to <2 x i64> + %var19 = mul nsw <2 x i64> %var18, %var16 + %var20 = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %var19) + ret i64 %var20 +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) #1 +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #1 +declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) #1 +declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) #1 From 483c27322e50799ff99c4142f1e1f087b7fb810f Mon Sep 17 00:00:00 2001 From: Rohit Aggarwal Date: Wed, 16 Apr 2025 19:17:52 +0530 Subject: [PATCH 2/3] Fix for test cases failure --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 7 +- llvm/test/CodeGen/PowerPC/custom-stov.ll | 16 +-- llvm/test/CodeGen/PowerPC/pre-inc-disable.ll | 104 ++++++++++-------- llvm/test/CodeGen/SystemZ/vec-mul-07.ll | 30 ++++- llvm/test/CodeGen/SystemZ/vec-mul-09.ll | 30 ++++- llvm/test/CodeGen/WebAssembly/interleave.ll | 63 ++++++----- llvm/test/CodeGen/X86/buildvec-insertvec.ll | 8 +- llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll | 53 ++------- 8 files changed, 166 insertions(+), 145 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 77c659aad0ed2..96b5f666ba9e5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -14215,6 +14215,11 @@ static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) { unsigned NumElts = VT.getVectorNumElements(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + if (TLI.getTypeAction(*DAG.getContext(), VT) != + TargetLowering::TypeWidenVector) + return SDValue(); + EVT 
WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); unsigned WidenNumElts = WidenVT.getVectorNumElements(); @@ -14226,7 +14231,7 @@ static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) { // iteration. unsigned step = WidenNumElts / NumElts; if (WidenVT.is128BitVector()) { - if (Extend->getValueSizeInBits(0) == WidenVT.getSizeInBits()) { + if (step > 1 && Extend->getValueSizeInBits(0) == WidenVT.getSizeInBits()) { for (int i = NumElts - 1, j = WidenNumElts - step; i > 0; i--, j -= step) { SDValue temp = NewOps[i]; diff --git a/llvm/test/CodeGen/PowerPC/custom-stov.ll b/llvm/test/CodeGen/PowerPC/custom-stov.ll index 0642fa900b0e5..d1bcc73fd212a 100644 --- a/llvm/test/CodeGen/PowerPC/custom-stov.ll +++ b/llvm/test/CodeGen/PowerPC/custom-stov.ll @@ -15,18 +15,18 @@ define void @_blah() { ; CHECK-NEXT: vperm v2, v4, v3, v2 ; CHECK-NEXT: lwz r4, 16(0) ; CHECK-NEXT: stvx v2, 0, r5 -; CHECK-NEXT: lhz r5, -64(r1) -; CHECK-NEXT: lhz r6, -58(r1) -; CHECK-NEXT: lhz r7, -52(r1) -; CHECK-NEXT: sth r4, -34(r1) -; CHECK-NEXT: sth r3, -36(r1) +; CHECK-NEXT: sth r3, -34(r1) +; CHECK-NEXT: sth r3, -38(r1) +; CHECK-NEXT: sth r3, -42(r1) +; CHECK-NEXT: sth r3, -46(r1) +; CHECK-NEXT: lhz r3, -52(r1) ; CHECK-NEXT: sth r3, -40(r1) +; CHECK-NEXT: lhz r3, -58(r1) ; CHECK-NEXT: sth r3, -44(r1) +; CHECK-NEXT: lhz r3, -64(r1) +; CHECK-NEXT: sth r4, -36(r1) ; CHECK-NEXT: sth r3, -48(r1) ; CHECK-NEXT: addi r3, r1, -48 -; CHECK-NEXT: sth r7, -38(r1) -; CHECK-NEXT: sth r6, -42(r1) -; CHECK-NEXT: sth r5, -46(r1) ; CHECK-NEXT: lvx v2, 0, r3 ; CHECK-NEXT: addi r3, r1, -32 ; CHECK-NEXT: vsldoi v3, v2, v2, 8 diff --git a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll index 4435484ae0b94..d668868d41aa0 100644 --- a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll +++ b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll @@ -327,9 +327,9 @@ define void @test16(ptr nocapture readonly %sums, i32 signext %delta, i32 signex ; P9BE-AIX32-NEXT: sth 3, -32(1) ; P9BE-AIX32-NEXT: lwz 3, L..C3(2) # %const.0 ; P9BE-AIX32-NEXT: lxv 3, -32(1) -; P9BE-AIX32-NEXT: vmrghh 4, 2, 4 +; P9BE-AIX32-NEXT: vmrghh 4, 4, 2 ; P9BE-AIX32-NEXT: lxv 0, 0(3) -; P9BE-AIX32-NEXT: vmrghh 3, 2, 3 +; P9BE-AIX32-NEXT: vmrghh 3, 3, 2 ; P9BE-AIX32-NEXT: vsplth 2, 2, 0 ; P9BE-AIX32-NEXT: xxmrghw 2, 2, 4 ; P9BE-AIX32-NEXT: xxperm 3, 2, 0 @@ -403,25 +403,29 @@ define void @test8(ptr nocapture readonly %sums, i32 signext %delta, i32 signext ; P9BE-LABEL: test8: ; P9BE: # %bb.0: # %entry ; P9BE-NEXT: add 6, 3, 4 -; P9BE-NEXT: li 7, 8 -; P9BE-NEXT: lxsibzx 3, 3, 4 +; P9BE-NEXT: lxsibzx 2, 3, 4 +; P9BE-NEXT: addis 3, 2, .LCPI3_0@toc@ha +; P9BE-NEXT: addi 3, 3, .LCPI3_0@toc@l +; P9BE-NEXT: lxv 0, 0(3) +; P9BE-NEXT: li 3, 0 +; P9BE-NEXT: mtvsrwz 3, 3 +; P9BE-NEXT: li 3, 8 +; P9BE-NEXT: vspltb 4, 3, 7 +; P9BE-NEXT: xxperm 2, 3, 0 +; P9BE-NEXT: lxsibzx 0, 6, 3 ; P9BE-NEXT: addis 3, 2, .LCPI3_1@toc@ha -; P9BE-NEXT: lxsibzx 0, 6, 7 -; P9BE-NEXT: addis 6, 2, .LCPI3_0@toc@ha ; P9BE-NEXT: addi 3, 3, .LCPI3_1@toc@l -; P9BE-NEXT: addi 6, 6, .LCPI3_0@toc@l -; P9BE-NEXT: lxv 1, 0(6) -; P9BE-NEXT: li 6, 0 -; P9BE-NEXT: mtvsrwz 2, 6 -; P9BE-NEXT: xxperm 0, 2, 1 -; P9BE-NEXT: xxperm 3, 2, 1 -; P9BE-NEXT: vspltb 2, 2, 7 -; P9BE-NEXT: vmrghh 3, 3, 2 -; P9BE-NEXT: xxspltw 1, 2, 0 -; P9BE-NEXT: xxmrghw 3, 3, 0 +; P9BE-NEXT: vmrghh 2, 4, 2 +; P9BE-NEXT: lxv 1, 0(3) +; P9BE-NEXT: addis 3, 2, .LCPI3_2@toc@ha +; P9BE-NEXT: addi 3, 3, .LCPI3_2@toc@l +; P9BE-NEXT: xxmrghw 2, 4, 2 +; P9BE-NEXT: xxperm 3, 0, 1 ; P9BE-NEXT: lxv 0, 0(3) ; P9BE-NEXT: 
li 3, 0 -; P9BE-NEXT: xxperm 3, 1, 0 +; P9BE-NEXT: vmrghh 3, 4, 3 +; P9BE-NEXT: xxmrghw 3, 3, 4 +; P9BE-NEXT: xxperm 3, 2, 0 ; P9BE-NEXT: xxspltw 2, 3, 1 ; P9BE-NEXT: vadduwm 2, 3, 2 ; P9BE-NEXT: vextuwlx 3, 3, 2 @@ -432,23 +436,26 @@ define void @test8(ptr nocapture readonly %sums, i32 signext %delta, i32 signext ; P9BE-AIX-LABEL: test8: ; P9BE-AIX: # %bb.0: # %entry ; P9BE-AIX-NEXT: add 6, 3, 4 -; P9BE-AIX-NEXT: li 7, 8 -; P9BE-AIX-NEXT: lxsibzx 3, 3, 4 -; P9BE-AIX-NEXT: ld 3, L..C5(2) # %const.1 -; P9BE-AIX-NEXT: lxsibzx 0, 6, 7 -; P9BE-AIX-NEXT: ld 6, L..C6(2) # %const.0 -; P9BE-AIX-NEXT: lxv 1, 0(6) -; P9BE-AIX-NEXT: li 6, 0 -; P9BE-AIX-NEXT: mtvsrwz 2, 6 -; P9BE-AIX-NEXT: xxperm 0, 2, 1 -; P9BE-AIX-NEXT: xxperm 3, 2, 1 -; P9BE-AIX-NEXT: vspltb 2, 2, 7 -; P9BE-AIX-NEXT: vmrghh 3, 3, 2 -; P9BE-AIX-NEXT: xxspltw 1, 2, 0 -; P9BE-AIX-NEXT: xxmrghw 3, 3, 0 +; P9BE-AIX-NEXT: lxsibzx 2, 3, 4 +; P9BE-AIX-NEXT: ld 3, L..C5(2) # %const.0 +; P9BE-AIX-NEXT: lxv 0, 0(3) +; P9BE-AIX-NEXT: li 3, 0 +; P9BE-AIX-NEXT: mtvsrwz 3, 3 +; P9BE-AIX-NEXT: li 3, 8 +; P9BE-AIX-NEXT: vspltb 4, 3, 7 +; P9BE-AIX-NEXT: xxperm 2, 3, 0 +; P9BE-AIX-NEXT: lxsibzx 0, 6, 3 +; P9BE-AIX-NEXT: ld 3, L..C6(2) # %const.1 +; P9BE-AIX-NEXT: vmrghh 2, 4, 2 +; P9BE-AIX-NEXT: lxv 1, 0(3) +; P9BE-AIX-NEXT: ld 3, L..C7(2) # %const.2 +; P9BE-AIX-NEXT: xxmrghw 2, 4, 2 +; P9BE-AIX-NEXT: xxperm 3, 0, 1 ; P9BE-AIX-NEXT: lxv 0, 0(3) ; P9BE-AIX-NEXT: li 3, 0 -; P9BE-AIX-NEXT: xxperm 3, 1, 0 +; P9BE-AIX-NEXT: vmrghh 3, 4, 3 +; P9BE-AIX-NEXT: xxmrghw 3, 3, 4 +; P9BE-AIX-NEXT: xxperm 3, 2, 0 ; P9BE-AIX-NEXT: xxspltw 2, 3, 1 ; P9BE-AIX-NEXT: vadduwm 2, 3, 2 ; P9BE-AIX-NEXT: vextuwlx 3, 3, 2 @@ -459,22 +466,25 @@ define void @test8(ptr nocapture readonly %sums, i32 signext %delta, i32 signext ; P9BE-AIX32-LABEL: test8: ; P9BE-AIX32: # %bb.0: # %entry ; P9BE-AIX32-NEXT: add 6, 3, 4 -; P9BE-AIX32-NEXT: li 7, 8 -; P9BE-AIX32-NEXT: lxsibzx 3, 3, 4 -; P9BE-AIX32-NEXT: lwz 3, L..C4(2) # %const.1 -; P9BE-AIX32-NEXT: lxsibzx 0, 6, 7 -; P9BE-AIX32-NEXT: lwz 6, L..C5(2) # %const.0 -; P9BE-AIX32-NEXT: lxv 1, 0(6) -; P9BE-AIX32-NEXT: li 6, 0 -; P9BE-AIX32-NEXT: mtvsrwz 2, 6 -; P9BE-AIX32-NEXT: xxperm 0, 2, 1 -; P9BE-AIX32-NEXT: xxperm 3, 2, 1 -; P9BE-AIX32-NEXT: vspltb 2, 2, 7 -; P9BE-AIX32-NEXT: vmrghh 3, 3, 2 -; P9BE-AIX32-NEXT: xxspltw 1, 2, 0 -; P9BE-AIX32-NEXT: xxmrghw 3, 3, 0 +; P9BE-AIX32-NEXT: lxsibzx 2, 3, 4 +; P9BE-AIX32-NEXT: lwz 3, L..C4(2) # %const.0 ; P9BE-AIX32-NEXT: lxv 0, 0(3) -; P9BE-AIX32-NEXT: xxperm 3, 1, 0 +; P9BE-AIX32-NEXT: li 3, 0 +; P9BE-AIX32-NEXT: mtvsrwz 3, 3 +; P9BE-AIX32-NEXT: li 3, 8 +; P9BE-AIX32-NEXT: vspltb 4, 3, 7 +; P9BE-AIX32-NEXT: xxperm 2, 3, 0 +; P9BE-AIX32-NEXT: lxsibzx 0, 6, 3 +; P9BE-AIX32-NEXT: lwz 3, L..C5(2) # %const.1 +; P9BE-AIX32-NEXT: vmrghh 2, 4, 2 +; P9BE-AIX32-NEXT: lxv 1, 0(3) +; P9BE-AIX32-NEXT: lwz 3, L..C6(2) # %const.2 +; P9BE-AIX32-NEXT: xxmrghw 2, 4, 2 +; P9BE-AIX32-NEXT: xxperm 3, 0, 1 +; P9BE-AIX32-NEXT: lxv 0, 0(3) +; P9BE-AIX32-NEXT: vmrghh 3, 4, 3 +; P9BE-AIX32-NEXT: xxmrghw 3, 3, 4 +; P9BE-AIX32-NEXT: xxperm 3, 2, 0 ; P9BE-AIX32-NEXT: xxspltw 2, 3, 1 ; P9BE-AIX32-NEXT: vadduwm 2, 3, 2 ; P9BE-AIX32-NEXT: stxv 2, -16(1) diff --git a/llvm/test/CodeGen/SystemZ/vec-mul-07.ll b/llvm/test/CodeGen/SystemZ/vec-mul-07.ll index 73c7a8dec5dfc..ca9e8412d95bd 100644 --- a/llvm/test/CodeGen/SystemZ/vec-mul-07.ll +++ b/llvm/test/CodeGen/SystemZ/vec-mul-07.ll @@ -7,7 +7,11 @@ define <8 x i16> @f1(<16 x i8> %val1, <16 x i8> %val2) { ; CHECK-LABEL: f1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmleb %v24, %v24, %v26 
+; CHECK-NEXT: larl %r1, .LCPI0_0 +; CHECK-NEXT: vl %v0, 0(%r1), 3 +; CHECK-NEXT: vperm %v1, %v24, %v0, %v0 +; CHECK-NEXT: vperm %v0, %v26, %v0, %v0 +; CHECK-NEXT: vmlhw %v24, %v1, %v0 ; CHECK-NEXT: br %r14 %shuf1 = shufflevector <16 x i8> %val1, <16 x i8> poison, <8 x i32> %zext1 = zext <8 x i8> %shuf1 to <8 x i16> @@ -21,7 +25,12 @@ define <8 x i16> @f1(<16 x i8> %val1, <16 x i8> %val2) { define <8 x i16> @f2(<16 x i8> %val1, <16 x i8> %val2) { ; CHECK-LABEL: f2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmlob %v24, %v24, %v26 +; CHECK-NEXT: larl %r1, .LCPI1_0 +; CHECK-NEXT: vl %v0, 0(%r1), 3 +; CHECK-NEXT: vgbm %v1, 0 +; CHECK-NEXT: vperm %v2, %v24, %v1, %v0 +; CHECK-NEXT: vperm %v0, %v26, %v1, %v0 +; CHECK-NEXT: vmlhw %v24, %v2, %v0 ; CHECK-NEXT: br %r14 %shuf1 = shufflevector <16 x i8> %val1, <16 x i8> poison, <8 x i32> %zext1 = zext <8 x i8> %shuf1 to <8 x i16> @@ -63,7 +72,11 @@ define <8 x i16> @f4(<16 x i8> %val1, <16 x i8> %val2) { define <4 x i32> @f5(<8 x i16> %val1, <8 x i16> %val2) { ; CHECK-LABEL: f5: ; CHECK: # %bb.0: -; CHECK-NEXT: vmleh %v24, %v24, %v26 +; CHECK-NEXT: larl %r1, .LCPI4_0 +; CHECK-NEXT: vl %v0, 0(%r1), 3 +; CHECK-NEXT: vperm %v1, %v24, %v0, %v0 +; CHECK-NEXT: vperm %v0, %v26, %v0, %v0 +; CHECK-NEXT: vmlf %v24, %v1, %v0 ; CHECK-NEXT: br %r14 %shuf1 = shufflevector <8 x i16> %val1, <8 x i16> poison, <4 x i32> %zext1 = zext <4 x i16> %shuf1 to <4 x i32> @@ -77,7 +90,12 @@ define <4 x i32> @f5(<8 x i16> %val1, <8 x i16> %val2) { define <4 x i32> @f6(<8 x i16> %val1, <8 x i16> %val2) { ; CHECK-LABEL: f6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmloh %v24, %v24, %v26 +; CHECK-NEXT: larl %r1, .LCPI5_0 +; CHECK-NEXT: vl %v0, 0(%r1), 3 +; CHECK-NEXT: vgbm %v1, 0 +; CHECK-NEXT: vperm %v2, %v24, %v1, %v0 +; CHECK-NEXT: vperm %v0, %v26, %v1, %v0 +; CHECK-NEXT: vmlf %v24, %v2, %v0 ; CHECK-NEXT: br %r14 %shuf1 = shufflevector <8 x i16> %val1, <8 x i16> poison, <4 x i32> %zext1 = zext <4 x i16> %shuf1 to <4 x i32> @@ -119,7 +137,7 @@ define <4 x i32> @f8(<8 x i16> %val1, <8 x i16> %val2) { define <2 x i64> @f9(<4 x i32> %val1, <4 x i32> %val2) { ; CHECK-LABEL: f9: ; CHECK: # %bb.0: -; CHECK-NEXT: vmlef %v24, %v24, %v26 +; CHECK-NEXT: vgbm %v24, 0 ; CHECK-NEXT: br %r14 %shuf1 = shufflevector <4 x i32> %val1, <4 x i32> poison, <2 x i32> %zext1 = zext <2 x i32> %shuf1 to <2 x i64> @@ -133,7 +151,7 @@ define <2 x i64> @f9(<4 x i32> %val1, <4 x i32> %val2) { define <2 x i64> @f10(<4 x i32> %val1, <4 x i32> %val2) { ; CHECK-LABEL: f10: ; CHECK: # %bb.0: -; CHECK-NEXT: vmlof %v24, %v24, %v26 +; CHECK-NEXT: vgbm %v24, 0 ; CHECK-NEXT: br %r14 %shuf1 = shufflevector <4 x i32> %val1, <4 x i32> poison, <2 x i32> %zext1 = zext <2 x i32> %shuf1 to <2 x i64> diff --git a/llvm/test/CodeGen/SystemZ/vec-mul-09.ll b/llvm/test/CodeGen/SystemZ/vec-mul-09.ll index def57ca03bb0c..e1e52bed7a143 100644 --- a/llvm/test/CodeGen/SystemZ/vec-mul-09.ll +++ b/llvm/test/CodeGen/SystemZ/vec-mul-09.ll @@ -7,7 +7,11 @@ define <8 x i16> @f1(<16 x i8> %val1, <16 x i8> %val2, <8 x i16> %val3) { ; CHECK-LABEL: f1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmaleb %v24, %v24, %v26, %v28 +; CHECK-NEXT: larl %r1, .LCPI0_0 +; CHECK-NEXT: vl %v0, 0(%r1), 3 +; CHECK-NEXT: vperm %v1, %v24, %v0, %v0 +; CHECK-NEXT: vperm %v0, %v26, %v0, %v0 +; CHECK-NEXT: vmalhw %v24, %v1, %v0, %v28 ; CHECK-NEXT: br %r14 %shuf1 = shufflevector <16 x i8> %val1, <16 x i8> poison, <8 x i32> %zext1 = zext <8 x i8> %shuf1 to <8 x i16> @@ -22,7 +26,12 @@ define <8 x i16> @f1(<16 x i8> %val1, <16 x i8> %val2, <8 x i16> %val3) { define <8 x i16> @f2(<16 x i8> %val1, 
<16 x i8> %val2, <8 x i16> %val3) { ; CHECK-LABEL: f2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmalob %v24, %v24, %v26, %v28 +; CHECK-NEXT: larl %r1, .LCPI1_0 +; CHECK-NEXT: vl %v0, 0(%r1), 3 +; CHECK-NEXT: vgbm %v1, 0 +; CHECK-NEXT: vperm %v2, %v24, %v1, %v0 +; CHECK-NEXT: vperm %v0, %v26, %v1, %v0 +; CHECK-NEXT: vmalhw %v24, %v2, %v0, %v28 ; CHECK-NEXT: br %r14 %shuf1 = shufflevector <16 x i8> %val1, <16 x i8> poison, <8 x i32> %zext1 = zext <8 x i8> %shuf1 to <8 x i16> @@ -67,7 +76,11 @@ define <8 x i16> @f4(<16 x i8> %val1, <16 x i8> %val2, <8 x i16> %val3) { define <4 x i32> @f5(<8 x i16> %val1, <8 x i16> %val2, <4 x i32> %val3) { ; CHECK-LABEL: f5: ; CHECK: # %bb.0: -; CHECK-NEXT: vmaleh %v24, %v24, %v26, %v28 +; CHECK-NEXT: larl %r1, .LCPI4_0 +; CHECK-NEXT: vl %v0, 0(%r1), 3 +; CHECK-NEXT: vperm %v1, %v24, %v0, %v0 +; CHECK-NEXT: vperm %v0, %v26, %v0, %v0 +; CHECK-NEXT: vmalf %v24, %v1, %v0, %v28 ; CHECK-NEXT: br %r14 %shuf1 = shufflevector <8 x i16> %val1, <8 x i16> poison, <4 x i32> %zext1 = zext <4 x i16> %shuf1 to <4 x i32> @@ -82,7 +95,12 @@ define <4 x i32> @f5(<8 x i16> %val1, <8 x i16> %val2, <4 x i32> %val3) { define <4 x i32> @f6(<8 x i16> %val1, <8 x i16> %val2, <4 x i32> %val3) { ; CHECK-LABEL: f6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmaloh %v24, %v24, %v26, %v28 +; CHECK-NEXT: larl %r1, .LCPI5_0 +; CHECK-NEXT: vl %v0, 0(%r1), 3 +; CHECK-NEXT: vgbm %v1, 0 +; CHECK-NEXT: vperm %v2, %v24, %v1, %v0 +; CHECK-NEXT: vperm %v0, %v26, %v1, %v0 +; CHECK-NEXT: vmalf %v24, %v2, %v0, %v28 ; CHECK-NEXT: br %r14 %shuf1 = shufflevector <8 x i16> %val1, <8 x i16> poison, <4 x i32> %zext1 = zext <4 x i16> %shuf1 to <4 x i32> @@ -127,7 +145,7 @@ define <4 x i32> @f8(<8 x i16> %val1, <8 x i16> %val2, <4 x i32> %val3) { define <2 x i64> @f9(<4 x i32> %val1, <4 x i32> %val2, <2 x i64> %val3) { ; CHECK-LABEL: f9: ; CHECK: # %bb.0: -; CHECK-NEXT: vmalef %v24, %v24, %v26, %v28 +; CHECK-NEXT: vlr %v24, %v28 ; CHECK-NEXT: br %r14 %shuf1 = shufflevector <4 x i32> %val1, <4 x i32> poison, <2 x i32> %zext1 = zext <2 x i32> %shuf1 to <2 x i64> @@ -142,7 +160,7 @@ define <2 x i64> @f9(<4 x i32> %val1, <4 x i32> %val2, <2 x i64> %val3) { define <2 x i64> @f10(<4 x i32> %val1, <4 x i32> %val2, <2 x i64> %val3) { ; CHECK-LABEL: f10: ; CHECK: # %bb.0: -; CHECK-NEXT: vmalof %v24, %v24, %v26, %v28 +; CHECK-NEXT: vlr %v24, %v28 ; CHECK-NEXT: br %r14 %shuf1 = shufflevector <4 x i32> %val1, <4 x i32> poison, <2 x i32> %zext1 = zext <2 x i32> %shuf1 to <2 x i64> diff --git a/llvm/test/CodeGen/WebAssembly/interleave.ll b/llvm/test/CodeGen/WebAssembly/interleave.ll index c20b5e42c4850..eada6cc8c6813 100644 --- a/llvm/test/CodeGen/WebAssembly/interleave.ll +++ b/llvm/test/CodeGen/WebAssembly/interleave.ll @@ -17,13 +17,12 @@ define hidden void @accumulate8x2(ptr dead_on_unwind noalias writable sret(%stru ; CHECK-LABEL: accumulate8x2: ; CHECK: loop ; CHECK: v128.load64_zero -; CHECK: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK: i16x8.extend_low_i8x16_u -; CHECK: i32x4.extend_low_i16x8_u +; CHECK: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: local.tee 10 +; CHECK: i8x16.shuffle 1, 17, 18, 19, 3, 21, 22, 23, 5, 25, 26, 27, 7, 29, 30, 31 ; CHECK: i32x4.add -; CHECK: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK: i16x8.extend_low_i8x16_u -; CHECK: i32x4.extend_low_i16x8_u +; CHECK: local.get 10 +; CHECK: i8x16.shuffle 0, 17, 18, 19, 2, 21, 22, 23, 4, 25, 26, 27, 6, 29, 30, 31 ; CHECK: i32x4.add %4 = load i32, ptr %0, align 4 %5 = icmp eq i32 %2, 0 @@ -65,21 
+64,18 @@ define hidden void @accumulate8x4(ptr dead_on_unwind noalias writable sret(%stru ; CHECK-LABEL: accumulate8x4 ; CHECK: loop ; CHECK: v128.load -; CHECK: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK: i16x8.extend_low_i8x16_u -; CHECK: i32x4.extend_low_i16x8_u +; CHECK: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: local.tee 14 +; CHECK: i8x16.shuffle 3, 17, 18, 19, 7, 21, 22, 23, 11, 25, 26, 27, 15, 29, 30, 31 ; CHECK: i32x4.add -; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK: i16x8.extend_low_i8x16_u -; CHECK: i32x4.extend_low_i16x8_u +; CHECK: local.get 14 +; CHECK: i8x16.shuffle 2, 17, 18, 19, 6, 21, 22, 23, 10, 25, 26, 27, 14, 29, 30, 31 ; CHECK: i32x4.add -; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK: i16x8.extend_low_i8x16_u -; CHECK: i32x4.extend_low_i16x8_u +; CHECK: local.get 14 +; CHECK: i8x16.shuffle 1, 17, 18, 19, 5, 21, 22, 23, 9, 25, 26, 27, 13, 29, 30, 31 ; CHECK: i32x4.add -; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK: i16x8.extend_low_i8x16_u -; CHECK: i32x4.extend_low_i16x8_u +; CHECK: local.get 14 +; CHECK: i8x16.shuffle 0, 17, 18, 19, 4, 21, 22, 23, 8, 25, 26, 27, 12, 29, 30, 31 ; CHECK: i32x4.add %4 = load i32, ptr %0, align 4 %5 = icmp eq i32 %2, 0 @@ -137,11 +133,12 @@ define hidden void @accumulate16x2(ptr dead_on_unwind noalias writable sret(%str ; CHECK-LABEL: accumulate16x2 ; CHECK: loop ; CHECK: v128.load -; CHECK: i8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 -; CHECK: i32x4.extend_low_i16x8_u +; CHECK: v128.const 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: local.tee 10 +; CHECK: i8x16.shuffle 2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31 ; CHECK: i32x4.add -; CHECK: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1 -; CHECK: i32x4.extend_low_i16x8_u +; CHECK: local.get 10 +; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31 ; CHECK: i32x4.add %4 = load i32, ptr %0, align 4 %5 = icmp eq i32 %2, 0 @@ -184,17 +181,23 @@ define hidden void @accumulate16x4(ptr dead_on_unwind noalias writable sret(%str ; CHECK: loop ; CHECK: v128.load 0:p2align=1 ; CHECK: v128.load 16:p2align=1 -; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 -; CHECK: i32x4.extend_low_i16x8_u +; CHECK: i8x16.shuffle 6, 7, 0, 1, 14, 15, 0, 1, 22, 23, 0, 1, 30, 31, 0, 1 +; CHECK: v128.const 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: local.tee 15 +; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31 + ; CHECK: i32x4.add -; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 -; CHECK: i32x4.extend_low_i16x8_u +; CHECK: i8x16.shuffle 4, 5, 0, 1, 12, 13, 0, 1, 20, 21, 0, 1, 28, 29, 0, 1 +; CHECK: local.get 15 +; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31 ; CHECK: i32x4.add -; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 -; CHECK: i32x4.extend_low_i16x8_u +; CHECK: i8x16.shuffle 2, 3, 0, 1, 10, 11, 0, 1, 18, 19, 0, 1, 26, 27, 0, 1 +; CHECK: local.get 15 +; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31 ; CHECK: i32x4.add -; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 -; CHECK: i32x4.extend_low_i16x8_u +; CHECK: i8x16.shuffle 0, 1, 0, 1, 8, 9, 0, 1, 16, 17, 0, 1, 24, 25, 0, 1 +; CHECK: local.get 15 +; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31 ; CHECK: i32x4.add %4 
= load i32, ptr %0, align 4 %5 = icmp eq i32 %2, 0 diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll index 4b0e5441b4abf..ea037c1173ae3 100644 --- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll +++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll @@ -799,7 +799,9 @@ define i32 @PR46586(ptr %p, <4 x i32> %v) { ; ; SSE41-LABEL: PR46586: ; SSE41: # %bb.0: -; SSE41-NEXT: movzbl 3(%rdi), %eax +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pinsrb $12, 3(%rdi), %xmm1 +; SSE41-NEXT: pextrd $3, %xmm1, %eax ; SSE41-NEXT: extractps $3, %xmm0, %ecx ; SSE41-NEXT: xorl %edx, %edx ; SSE41-NEXT: divl %ecx @@ -808,7 +810,9 @@ define i32 @PR46586(ptr %p, <4 x i32> %v) { ; ; AVX-LABEL: PR46586: ; AVX: # %bb.0: -; AVX-NEXT: movzbl 3(%rdi), %eax +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $12, 3(%rdi), %xmm1, %xmm1 +; AVX-NEXT: vpextrd $3, %xmm1, %eax ; AVX-NEXT: vextractps $3, %xmm0, %ecx ; AVX-NEXT: xorl %edx, %edx ; AVX-NEXT: divl %ecx diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll index ec442c185706c..b0b148b0cd50a 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -382,51 +382,14 @@ define <32 x i8> @trunc_shuffle_v32i16_v32i8_ofs1(<32 x i16> %a0) { } define <4 x double> @PR34175(ptr %p) { -; AVX512F-LABEL: PR34175: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqu (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm1 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: PR34175: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqu (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqu 32(%rdi), %xmm1 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512VL-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: PR34175: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] -; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512BW-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: PR34175: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] -; AVX512BWVL-NEXT: vpermw (%rdi), %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512BWVL-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX512BWVL-NEXT: retq -; -; AVX512VBMI-LABEL: PR34175: -; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] -; AVX512VBMI-NEXT: vpermw (%rdi), %zmm0, %zmm0 -; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512VBMI-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX512VBMI-NEXT: retq +; AVX512-LABEL: PR34175: +; AVX512: 
# %bb.0:
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12]
+; AVX512-NEXT:    vpermd (%rdi), %zmm0, %zmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX512-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; AVX512-NEXT:    retq
   %v = load <32 x i16>, ptr %p, align 2
   %shuf = shufflevector <32 x i16> %v, <32 x i16> undef, <4 x i32>
   %tofp = uitofp <4 x i16> %shuf to <4 x double>

From ff669c677e307ca6a57066008bbb073211aced5c Mon Sep 17 00:00:00 2001
From: Rohit Aggarwal
Date: Fri, 25 Apr 2025 15:54:47 +0530
Subject: [PATCH 3/3] Fix for the test case failures

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  15 +-
 llvm/test/CodeGen/PowerPC/pre-inc-disable.ll  | 104 ++++---
 llvm/test/CodeGen/X86/WidenBuildVector.ll     | 258 ------------------
 llvm/test/CodeGen/X86/buildvec-insertvec.ll   |   8 +-
 .../CodeGen/X86/buildvec-widen-dotproduct.ll  | 231 ++++++----------
 5 files changed, 149 insertions(+), 467 deletions(-)
 delete mode 100644 llvm/test/CodeGen/X86/WidenBuildVector.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b2ca646679838..b90182c6fdc3e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14299,6 +14299,9 @@ static SDValue widenAbs(SDNode *Extend, SelectionDAG &DAG) {
 // This is a special case for 128-bit vector types. The intention is to remove
 // the zext and replace it with a bitcast to the wider type. During lowering
 // the bitcast is removed and the extra computation due to the zext is avoided.
+// For example:
+// zext v4i16 (v4i8 build_vector (x, y, z, w)) -> bitcast v4i16 (v8i8
+// build_vector (x, 0, y, 0, z, 0, w, 0))
 static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
 
   assert(Extend->getOpcode() == ISD::ZERO_EXTEND && "Expected zero extend.");
@@ -14309,6 +14312,13 @@ static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
   if (BV.getOpcode() != ISD::BUILD_VECTOR || !BV.hasOneUse())
     return SDValue();
 
+  if (any_of(BV->op_values(), [](SDValue Op) { return Op.isUndef(); })) {
+    // If the build vector has undef elements, we cannot widen it.
+    // The widening would create a vector with more undef elements, which
+    // is not valid.
+    return SDValue();
+  }
+
   SDLoc dl(BV);
   EVT VT = BV.getValueType();
   EVT EltVT = BV.getOperand(0).getValueType();
@@ -14344,11 +14354,6 @@ static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
       // new build vector to the type of the zext.
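       // A sketch of the resulting layout, assuming little-endian element
       // order, e.g. for zext <4 x i8> -> <4 x i32> (so WidenVT = v16i8 and
       // step = 4): NewOps now holds {x, 0, 0, 0, y, 0, 0, 0, z, 0, 0, 0,
       // w, 0, 0, 0}, i.e. source element i sits at index i * step with
       // zeros elsewhere, so the bitcast below reinterprets the widened
       // build_vector as the zero-extended value.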
SDValue NewBVBitcast = DAG.getBitcast(ExtendVT, NewBV); DAG.ReplaceAllUsesOfValueWith(SDValue(Extend, 0), NewBVBitcast); - LLVM_DEBUG( - dbgs() << DAG.getMachineFunction().getFunction().getName() - << " - Widening buildvector and replace zext with bitcast\n"; - BV.dump(); Extend->dump(); dbgs() << " to \n"; - NewBV.getNode()->dump(); NewBVBitcast->dump();); return NewBV; } } diff --git a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll index d668868d41aa0..4435484ae0b94 100644 --- a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll +++ b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll @@ -327,9 +327,9 @@ define void @test16(ptr nocapture readonly %sums, i32 signext %delta, i32 signex ; P9BE-AIX32-NEXT: sth 3, -32(1) ; P9BE-AIX32-NEXT: lwz 3, L..C3(2) # %const.0 ; P9BE-AIX32-NEXT: lxv 3, -32(1) -; P9BE-AIX32-NEXT: vmrghh 4, 4, 2 +; P9BE-AIX32-NEXT: vmrghh 4, 2, 4 ; P9BE-AIX32-NEXT: lxv 0, 0(3) -; P9BE-AIX32-NEXT: vmrghh 3, 3, 2 +; P9BE-AIX32-NEXT: vmrghh 3, 2, 3 ; P9BE-AIX32-NEXT: vsplth 2, 2, 0 ; P9BE-AIX32-NEXT: xxmrghw 2, 2, 4 ; P9BE-AIX32-NEXT: xxperm 3, 2, 0 @@ -403,29 +403,25 @@ define void @test8(ptr nocapture readonly %sums, i32 signext %delta, i32 signext ; P9BE-LABEL: test8: ; P9BE: # %bb.0: # %entry ; P9BE-NEXT: add 6, 3, 4 -; P9BE-NEXT: lxsibzx 2, 3, 4 -; P9BE-NEXT: addis 3, 2, .LCPI3_0@toc@ha -; P9BE-NEXT: addi 3, 3, .LCPI3_0@toc@l -; P9BE-NEXT: lxv 0, 0(3) -; P9BE-NEXT: li 3, 0 -; P9BE-NEXT: mtvsrwz 3, 3 -; P9BE-NEXT: li 3, 8 -; P9BE-NEXT: vspltb 4, 3, 7 -; P9BE-NEXT: xxperm 2, 3, 0 -; P9BE-NEXT: lxsibzx 0, 6, 3 +; P9BE-NEXT: li 7, 8 +; P9BE-NEXT: lxsibzx 3, 3, 4 ; P9BE-NEXT: addis 3, 2, .LCPI3_1@toc@ha +; P9BE-NEXT: lxsibzx 0, 6, 7 +; P9BE-NEXT: addis 6, 2, .LCPI3_0@toc@ha ; P9BE-NEXT: addi 3, 3, .LCPI3_1@toc@l -; P9BE-NEXT: vmrghh 2, 4, 2 -; P9BE-NEXT: lxv 1, 0(3) -; P9BE-NEXT: addis 3, 2, .LCPI3_2@toc@ha -; P9BE-NEXT: addi 3, 3, .LCPI3_2@toc@l -; P9BE-NEXT: xxmrghw 2, 4, 2 -; P9BE-NEXT: xxperm 3, 0, 1 +; P9BE-NEXT: addi 6, 6, .LCPI3_0@toc@l +; P9BE-NEXT: lxv 1, 0(6) +; P9BE-NEXT: li 6, 0 +; P9BE-NEXT: mtvsrwz 2, 6 +; P9BE-NEXT: xxperm 0, 2, 1 +; P9BE-NEXT: xxperm 3, 2, 1 +; P9BE-NEXT: vspltb 2, 2, 7 +; P9BE-NEXT: vmrghh 3, 3, 2 +; P9BE-NEXT: xxspltw 1, 2, 0 +; P9BE-NEXT: xxmrghw 3, 3, 0 ; P9BE-NEXT: lxv 0, 0(3) ; P9BE-NEXT: li 3, 0 -; P9BE-NEXT: vmrghh 3, 4, 3 -; P9BE-NEXT: xxmrghw 3, 3, 4 -; P9BE-NEXT: xxperm 3, 2, 0 +; P9BE-NEXT: xxperm 3, 1, 0 ; P9BE-NEXT: xxspltw 2, 3, 1 ; P9BE-NEXT: vadduwm 2, 3, 2 ; P9BE-NEXT: vextuwlx 3, 3, 2 @@ -436,26 +432,23 @@ define void @test8(ptr nocapture readonly %sums, i32 signext %delta, i32 signext ; P9BE-AIX-LABEL: test8: ; P9BE-AIX: # %bb.0: # %entry ; P9BE-AIX-NEXT: add 6, 3, 4 -; P9BE-AIX-NEXT: lxsibzx 2, 3, 4 -; P9BE-AIX-NEXT: ld 3, L..C5(2) # %const.0 -; P9BE-AIX-NEXT: lxv 0, 0(3) -; P9BE-AIX-NEXT: li 3, 0 -; P9BE-AIX-NEXT: mtvsrwz 3, 3 -; P9BE-AIX-NEXT: li 3, 8 -; P9BE-AIX-NEXT: vspltb 4, 3, 7 -; P9BE-AIX-NEXT: xxperm 2, 3, 0 -; P9BE-AIX-NEXT: lxsibzx 0, 6, 3 -; P9BE-AIX-NEXT: ld 3, L..C6(2) # %const.1 -; P9BE-AIX-NEXT: vmrghh 2, 4, 2 -; P9BE-AIX-NEXT: lxv 1, 0(3) -; P9BE-AIX-NEXT: ld 3, L..C7(2) # %const.2 -; P9BE-AIX-NEXT: xxmrghw 2, 4, 2 -; P9BE-AIX-NEXT: xxperm 3, 0, 1 +; P9BE-AIX-NEXT: li 7, 8 +; P9BE-AIX-NEXT: lxsibzx 3, 3, 4 +; P9BE-AIX-NEXT: ld 3, L..C5(2) # %const.1 +; P9BE-AIX-NEXT: lxsibzx 0, 6, 7 +; P9BE-AIX-NEXT: ld 6, L..C6(2) # %const.0 +; P9BE-AIX-NEXT: lxv 1, 0(6) +; P9BE-AIX-NEXT: li 6, 0 +; P9BE-AIX-NEXT: mtvsrwz 2, 6 +; P9BE-AIX-NEXT: xxperm 0, 2, 1 +; P9BE-AIX-NEXT: xxperm 3, 
2, 1 +; P9BE-AIX-NEXT: vspltb 2, 2, 7 +; P9BE-AIX-NEXT: vmrghh 3, 3, 2 +; P9BE-AIX-NEXT: xxspltw 1, 2, 0 +; P9BE-AIX-NEXT: xxmrghw 3, 3, 0 ; P9BE-AIX-NEXT: lxv 0, 0(3) ; P9BE-AIX-NEXT: li 3, 0 -; P9BE-AIX-NEXT: vmrghh 3, 4, 3 -; P9BE-AIX-NEXT: xxmrghw 3, 3, 4 -; P9BE-AIX-NEXT: xxperm 3, 2, 0 +; P9BE-AIX-NEXT: xxperm 3, 1, 0 ; P9BE-AIX-NEXT: xxspltw 2, 3, 1 ; P9BE-AIX-NEXT: vadduwm 2, 3, 2 ; P9BE-AIX-NEXT: vextuwlx 3, 3, 2 @@ -466,25 +459,22 @@ define void @test8(ptr nocapture readonly %sums, i32 signext %delta, i32 signext ; P9BE-AIX32-LABEL: test8: ; P9BE-AIX32: # %bb.0: # %entry ; P9BE-AIX32-NEXT: add 6, 3, 4 -; P9BE-AIX32-NEXT: lxsibzx 2, 3, 4 -; P9BE-AIX32-NEXT: lwz 3, L..C4(2) # %const.0 -; P9BE-AIX32-NEXT: lxv 0, 0(3) -; P9BE-AIX32-NEXT: li 3, 0 -; P9BE-AIX32-NEXT: mtvsrwz 3, 3 -; P9BE-AIX32-NEXT: li 3, 8 -; P9BE-AIX32-NEXT: vspltb 4, 3, 7 -; P9BE-AIX32-NEXT: xxperm 2, 3, 0 -; P9BE-AIX32-NEXT: lxsibzx 0, 6, 3 -; P9BE-AIX32-NEXT: lwz 3, L..C5(2) # %const.1 -; P9BE-AIX32-NEXT: vmrghh 2, 4, 2 -; P9BE-AIX32-NEXT: lxv 1, 0(3) -; P9BE-AIX32-NEXT: lwz 3, L..C6(2) # %const.2 -; P9BE-AIX32-NEXT: xxmrghw 2, 4, 2 -; P9BE-AIX32-NEXT: xxperm 3, 0, 1 +; P9BE-AIX32-NEXT: li 7, 8 +; P9BE-AIX32-NEXT: lxsibzx 3, 3, 4 +; P9BE-AIX32-NEXT: lwz 3, L..C4(2) # %const.1 +; P9BE-AIX32-NEXT: lxsibzx 0, 6, 7 +; P9BE-AIX32-NEXT: lwz 6, L..C5(2) # %const.0 +; P9BE-AIX32-NEXT: lxv 1, 0(6) +; P9BE-AIX32-NEXT: li 6, 0 +; P9BE-AIX32-NEXT: mtvsrwz 2, 6 +; P9BE-AIX32-NEXT: xxperm 0, 2, 1 +; P9BE-AIX32-NEXT: xxperm 3, 2, 1 +; P9BE-AIX32-NEXT: vspltb 2, 2, 7 +; P9BE-AIX32-NEXT: vmrghh 3, 3, 2 +; P9BE-AIX32-NEXT: xxspltw 1, 2, 0 +; P9BE-AIX32-NEXT: xxmrghw 3, 3, 0 ; P9BE-AIX32-NEXT: lxv 0, 0(3) -; P9BE-AIX32-NEXT: vmrghh 3, 4, 3 -; P9BE-AIX32-NEXT: xxmrghw 3, 3, 4 -; P9BE-AIX32-NEXT: xxperm 3, 2, 0 +; P9BE-AIX32-NEXT: xxperm 3, 1, 0 ; P9BE-AIX32-NEXT: xxspltw 2, 3, 1 ; P9BE-AIX32-NEXT: vadduwm 2, 3, 2 ; P9BE-AIX32-NEXT: stxv 2, -16(1) diff --git a/llvm/test/CodeGen/X86/WidenBuildVector.ll b/llvm/test/CodeGen/X86/WidenBuildVector.ll deleted file mode 100644 index d2924d016a1bf..0000000000000 --- a/llvm/test/CodeGen/X86/WidenBuildVector.ll +++ /dev/null @@ -1,258 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mcpu=znver5 -mtriple=x86_64-unknown-unknown < %s | FileCheck %s - -; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable -define dso_local i32 @foov8i8(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr { -; CHECK-LABEL: foov8i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movzbl (%rdi), %eax -; CHECK-NEXT: leaq (%rsi,%rsi,2), %rcx -; CHECK-NEXT: leaq (%rsi,%rsi,4), %r8 -; CHECK-NEXT: leaq (,%rsi,8), %r9 -; CHECK-NEXT: subq %rsi, %r9 -; CHECK-NEXT: vmovd %eax, %xmm0 -; CHECK-NEXT: vpinsrb $2, (%rdi,%rsi), %xmm0, %xmm0 -; CHECK-NEXT: vpinsrb $4, (%rdi,%rsi,2), %xmm0, %xmm0 -; CHECK-NEXT: vpinsrb $6, (%rdi,%rcx), %xmm0, %xmm0 -; CHECK-NEXT: vpinsrb $8, (%rdi,%rsi,4), %xmm0, %xmm0 -; CHECK-NEXT: vpinsrb $10, (%rdi,%r8), %xmm0, %xmm0 -; CHECK-NEXT: vpinsrb $12, (%rdi,%rcx,2), %xmm0, %xmm0 -; CHECK-NEXT: vpinsrb $14, (%rdi,%r9), %xmm0, %xmm0 -; CHECK-NEXT: vpmaddwd (%rdx), %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: retq -entry: - %var0 = load i8, ptr %a, align 1 
- %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride - %var1 = load i8, ptr %arrayidx.1, align 1 - %mul.2 = shl nsw i64 %a_stride, 1 - %arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 %mul.2 - %var2 = load i8, ptr %arrayidx.2, align 1 - %mul.3 = mul nsw i64 %a_stride, 3 - %arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 %mul.3 - %var3 = load i8, ptr %arrayidx.3, align 1 - %mul.4 = shl nsw i64 %a_stride, 2 - %arrayidx.4 = getelementptr inbounds i8, ptr %a, i64 %mul.4 - %var4 = load i8, ptr %arrayidx.4, align 1 - %mul.5 = mul nsw i64 %a_stride, 5 - %arrayidx.5 = getelementptr inbounds i8, ptr %a, i64 %mul.5 - %var5 = load i8, ptr %arrayidx.5, align 1 - %mul.6 = mul nsw i64 %a_stride, 6 - %arrayidx.6 = getelementptr inbounds i8, ptr %a, i64 %mul.6 - %var6 = load i8, ptr %arrayidx.6, align 1 - %mul.7 = mul nsw i64 %a_stride, 7 - %arrayidx.7 = getelementptr inbounds i8, ptr %a, i64 %mul.7 - %var7 = load i8, ptr %arrayidx.7, align 1 - %var8 = insertelement <8 x i8> poison, i8 %var0, i64 0 - %var9 = insertelement <8 x i8> %var8, i8 %var1, i64 1 - %var10 = insertelement <8 x i8> %var9, i8 %var2, i64 2 - %var11 = insertelement <8 x i8> %var10, i8 %var3, i64 3 - %var12 = insertelement <8 x i8> %var11, i8 %var4, i64 4 - %var13 = insertelement <8 x i8> %var12, i8 %var5, i64 5 - %var14 = insertelement <8 x i8> %var13, i8 %var6, i64 6 - %var15 = insertelement <8 x i8> %var14, i8 %var7, i64 7 - %var16 = zext <8 x i8> %var15 to <8 x i32> - %var17 = load <8 x i16>, ptr %b, align 2 - %var18 = sext <8 x i16> %var17 to <8 x i32> - %var19 = mul nsw <8 x i32> %var18, %var16 - %var20 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %var19) - ret i32 %var20 -} - -; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable -define dso_local i32 @foov4i8(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr { -; CHECK-LABEL: foov4i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movzbl (%rdi), %eax -; CHECK-NEXT: leaq (%rsi,%rsi,2), %rcx -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; CHECK-NEXT: vmovd %eax, %xmm0 -; CHECK-NEXT: vpinsrb $4, (%rdi,%rsi), %xmm0, %xmm0 -; CHECK-NEXT: vpinsrb $8, (%rdi,%rsi,2), %xmm0, %xmm0 -; CHECK-NEXT: vpinsrb $12, (%rdi,%rcx), %xmm0, %xmm0 -; CHECK-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: retq -entry: - %var0 = load i8, ptr %a, align 1 - %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride - %var1 = load i8, ptr %arrayidx.1, align 1 - %mul.2 = shl nsw i64 %a_stride, 1 - %arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 %mul.2 - %var2 = load i8, ptr %arrayidx.2, align 1 - %mul.3 = mul nsw i64 %a_stride, 3 - %arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 %mul.3 - %var3 = load i8, ptr %arrayidx.3, align 1 - %var8 = insertelement <4 x i8> poison, i8 %var0, i64 0 - %var9 = insertelement <4 x i8> %var8, i8 %var1, i64 1 - %var10 = insertelement <4 x i8> %var9, i8 %var2, i64 2 - %var11 = insertelement <4 x i8> %var10, i8 %var3, i64 3 - %var16 = zext <4 x i8> %var11 to <4 x i32> - %var17 = load <4 x i16>, ptr %b, align 2 - %var18 = sext <4 x i16> %var17 to <4 x i32> - %var19 = mul nsw <4 x i32> %var18, %var16 - %var20 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %var19) - ret i32 %var20 -} 
- -; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable -define dso_local i32 @foov2i8(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr { -; CHECK-LABEL: foov2i8: -; CHECK: # %bb.0: -; CHECK-NEXT: movzbl (%rdi), %eax -; CHECK-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: vmovd %eax, %xmm0 -; CHECK-NEXT: vpinsrb $4, (%rdi,%rsi), %xmm0, %xmm0 -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; CHECK-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: retq - %var0 = load i8, ptr %a, align 1 - %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride - %var1 = load i8, ptr %arrayidx.1, align 1 - %var8 = insertelement <2 x i8> poison, i8 %var0, i64 0 - %var9 = insertelement <2 x i8> %var8, i8 %var1, i64 1 - %var16 = zext <2 x i8> %var9 to <2 x i32> - %var17 = load <2 x i16>, ptr %b, align 2 - %var18 = sext <2 x i16> %var17 to <2 x i32> - %var19 = mul nsw <2 x i32> %var18, %var16 - %var20 = tail call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %var19) - ret i32 %var20 -} - -; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable -define dso_local i64 @foov2i8_v2i64(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr { -; CHECK-LABEL: foov2i8_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: movzbl (%rdi), %eax -; CHECK-NEXT: vpmovsxbq (%rdx), %xmm1 -; CHECK-NEXT: vmovd %eax, %xmm0 -; CHECK-NEXT: vpinsrb $8, (%rdi,%rsi), %xmm0, %xmm0 -; CHECK-NEXT: vpmuldq %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vmovq %xmm0, %rax -; CHECK-NEXT: retq - %var0 = load i8, ptr %a, align 1 - %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride - %var1 = load i8, ptr %arrayidx.1, align 1 - %var8 = insertelement <2 x i8> poison, i8 %var0, i64 0 - %var9 = insertelement <2 x i8> %var8, i8 %var1, i64 1 - %var16 = zext <2 x i8> %var9 to <2 x i64> - %var17 = load <2 x i8>, ptr %b, align 2 - %var18 = sext <2 x i8> %var17 to <2 x i64> - %var19 = mul nsw <2 x i64> %var18, %var16 - %var20 = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %var19) - ret i64 %var20 -} - - -; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable -define dso_local i32 @foov4i16(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr { -; CHECK-LABEL: foov4i16: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movzwl (%rdi), %eax -; CHECK-NEXT: leaq (%rsi,%rsi,2), %rcx -; CHECK-NEXT: vpmovsxwd (%rdx), %xmm1 -; CHECK-NEXT: vmovd %eax, %xmm0 -; CHECK-NEXT: vpinsrw $2, (%rdi,%rsi), %xmm0, %xmm0 -; CHECK-NEXT: vpinsrw $4, (%rdi,%rsi,2), %xmm0, %xmm0 -; CHECK-NEXT: vpinsrw $6, (%rdi,%rcx), %xmm0, %xmm0 -; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: retq -entry: - %var0 = load i16, ptr %a, align 1 - %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride - %var1 = load i16, ptr %arrayidx.1, align 1 - %mul.2 = shl nsw i64 %a_stride, 1 - %arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 %mul.2 
- %var2 = load i16, ptr %arrayidx.2, align 1
- %mul.3 = mul nsw i64 %a_stride, 3
- %arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 %mul.3
- %var3 = load i16, ptr %arrayidx.3, align 1
- %var8 = insertelement <4 x i16> poison, i16 %var0, i64 0
- %var9 = insertelement <4 x i16> %var8, i16 %var1, i64 1
- %var10 = insertelement <4 x i16> %var9, i16 %var2, i64 2
- %var11 = insertelement <4 x i16> %var10, i16 %var3, i64 3
- %var16 = zext <4 x i16> %var11 to <4 x i32>
- %var17 = load <4 x i16>, ptr %b, align 2
- %var18 = sext <4 x i16> %var17 to <4 x i32>
- %var19 = mul nsw <4 x i32> %var18, %var16
- %var20 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %var19)
- ret i32 %var20
-}
-
-; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
-define dso_local i32 @foov2i16(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
-; CHECK-LABEL: foov2i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movzwl (%rdi), %eax
-; CHECK-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmovd %eax, %xmm0
-; CHECK-NEXT: vpinsrw $1, (%rdi,%rsi), %xmm0, %xmm0
-; CHECK-NEXT: vpmovsxwd %xmm1, %xmm1
-; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %eax
-; CHECK-NEXT: retq
- %var0 = load i16, ptr %a, align 1
- %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
- %var1 = load i16, ptr %arrayidx.1, align 1
- %var8 = insertelement <2 x i16> poison, i16 %var0, i64 0
- %var9 = insertelement <2 x i16> %var8, i16 %var1, i64 1
- %var16 = zext <2 x i16> %var9 to <2 x i32>
- %var17 = load <2 x i16>, ptr %b, align 2
- %var18 = sext <2 x i16> %var17 to <2 x i32>
- %var19 = mul nsw <2 x i32> %var18, %var16
- %var20 = tail call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %var19)
- ret i32 %var20
-}
-
-; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
-define dso_local i64 @foov2i32(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
-; CHECK-LABEL: foov2i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; CHECK-NEXT: vpmovsxdq (%rdx), %xmm1
-; CHECK-NEXT: vpmullq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vmovq %xmm0, %rax
-; CHECK-NEXT: retq
- %var0 = load i32, ptr %a, align 1
- %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
- %var1 = load i32, ptr %arrayidx.1, align 1
- %var8 = insertelement <2 x i32> poison, i32 %var0, i64 0
- %var9 = insertelement <2 x i32> %var8, i32 %var1, i64 1
- %var16 = zext <2 x i32> %var9 to <2 x i64>
- %var17 = load <2 x i32>, ptr %b, align 2
- %var18 = sext <2 x i32> %var17 to <2 x i64>
- %var19 = mul nsw <2 x i64> %var18, %var16
- %var20 = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %var19)
- ret i64 %var20
-}
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) #1
-declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #1
-declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) #1
-declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) #1
diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
index ea037c1173ae3..4b0e5441b4abf 100644
--- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll
+++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
@@ -799,9 +799,7 @@ define i32 @PR46586(ptr %p, <4 x i32> %v) {
 ;
 ; SSE41-LABEL: PR46586:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pinsrb $12, 3(%rdi), %xmm1
-; SSE41-NEXT: pextrd $3, %xmm1, %eax
+; SSE41-NEXT: movzbl 3(%rdi), %eax
 ; SSE41-NEXT: extractps $3, %xmm0, %ecx
 ; SSE41-NEXT: xorl %edx, %edx
 ; SSE41-NEXT: divl %ecx
@@ -810,9 +808,7 @@ define i32 @PR46586(ptr %p, <4 x i32> %v) {
 ;
 ; AVX-LABEL: PR46586:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpinsrb $12, 3(%rdi), %xmm1, %xmm1
-; AVX-NEXT: vpextrd $3, %xmm1, %eax
+; AVX-NEXT: movzbl 3(%rdi), %eax
 ; AVX-NEXT: vextractps $3, %xmm0, %ecx
 ; AVX-NEXT: xorl %edx, %edx
 ; AVX-NEXT: divl %ecx
diff --git a/llvm/test/CodeGen/X86/buildvec-widen-dotproduct.ll b/llvm/test/CodeGen/X86/buildvec-widen-dotproduct.ll
index 8c85dfa09fd2d..345014edd0e9d 100644
--- a/llvm/test/CodeGen/X86/buildvec-widen-dotproduct.ll
+++ b/llvm/test/CodeGen/X86/buildvec-widen-dotproduct.ll
@@ -31,88 +31,62 @@ define i32 @dot_ext_v8i8_v8i32(ptr %a, i64 %a_stride, ptr %b) nounwind {
 ; SSE2-NEXT: pinsrw $6, %r9d, %xmm0
 ; SSE2-NEXT: pinsrw $7, %esi, %xmm0
 ; SSE2-NEXT: movdqu (%rdx), %xmm1
-; SSE2-NEXT: pmaddwd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE2-NEXT: pmaddwd %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
 ; SSE2-NEXT: popq %rbx
 ; SSE2-NEXT: popq %r14
 ; SSE2-NEXT: retq
 ;
 ; SSE4-LABEL: dot_ext_v8i8_v8i32:
 ; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: leaq (%rsi,%rsi,4), %rax
-; SSE4-NEXT: leaq (%rsi,%rsi,2), %rcx
-; SSE4-NEXT: leaq (,%rsi,8), %r8
-; SSE4-NEXT: movzbl (%rdi), %r9d
-; SSE4-NEXT: movd %r9d, %xmm0
-; SSE4-NEXT: pinsrb $1, (%rdi,%rsi), %xmm0
-; SSE4-NEXT: pinsrb $2, (%rdi,%rsi,2), %xmm0
-; SSE4-NEXT: pinsrb $3, (%rdi,%rcx), %xmm0
-; SSE4-NEXT: pinsrb $4, (%rdi,%rsi,4), %xmm0
-; SSE4-NEXT: pinsrb $5, (%rdi,%rax), %xmm0
-; SSE4-NEXT: pinsrb $6, (%rdi,%rcx,2), %xmm0
-; SSE4-NEXT: subq %rsi, %r8
-; SSE4-NEXT: pinsrb $7, (%rdi,%r8), %xmm0
+; SSE4-NEXT: movzbl (%rdi), %eax
+; SSE4-NEXT: leaq (%rsi,%rsi,4), %rcx
+; SSE4-NEXT: leaq (%rsi,%rsi,2), %r8
+; SSE4-NEXT: leaq (,%rsi,8), %r9
+; SSE4-NEXT: subq %rsi, %r9
+; SSE4-NEXT: movd %eax, %xmm0
+; SSE4-NEXT: pinsrb $2, (%rdi,%rsi), %xmm0
+; SSE4-NEXT: pinsrb $4, (%rdi,%rsi,2), %xmm0
+; SSE4-NEXT: pinsrb $6, (%rdi,%r8), %xmm0
+; SSE4-NEXT: pinsrb $8, (%rdi,%rsi,4), %xmm0
+; SSE4-NEXT: pinsrb $10, (%rdi,%rcx), %xmm0
+; SSE4-NEXT: pinsrb $12, (%rdi,%r8,2), %xmm0
+; SSE4-NEXT: pinsrb $14, (%rdi,%r9), %xmm0
 ; SSE4-NEXT: movdqu (%rdx), %xmm1
-; SSE4-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE4-NEXT: pmaddwd %xmm1, %xmm0
-; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SSE4-NEXT: paddd %xmm0, %xmm1
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE4-NEXT: pmaddwd %xmm0, %xmm1
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE4-NEXT: paddd %xmm1, %xmm0
-; SSE4-NEXT: movd %xmm0, %eax
+; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE4-NEXT: paddd %xmm0, %xmm1
+; SSE4-NEXT: movd %xmm1, %eax
 ; SSE4-NEXT: retq
 ;
-; AVX2-LABEL: dot_ext_v8i8_v8i32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: leaq (%rsi,%rsi,4), %rax
-; AVX2-NEXT: leaq (%rsi,%rsi,2), %rcx
-; AVX2-NEXT: leaq (,%rsi,8), %r8
-; AVX2-NEXT: subq %rsi, %r8
-; AVX2-NEXT: movzbl (%rdi), %r9d
-; AVX2-NEXT: vmovd %r9d, %xmm0
-; AVX2-NEXT: vpinsrb $1, (%rdi,%rsi), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $2, (%rdi,%rsi,2), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $3, (%rdi,%rcx), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $4, (%rdi,%rsi,4), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $5, (%rdi,%rax), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $6, (%rdi,%rcx,2), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $7, (%rdi,%r8), %xmm0, %xmm0
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpmaddwd (%rdx), %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: dot_ext_v8i8_v8i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: leaq (%rsi,%rsi,4), %rax
-; AVX512-NEXT: leaq (%rsi,%rsi,2), %rcx
-; AVX512-NEXT: leaq (,%rsi,8), %r8
-; AVX512-NEXT: movzbl (%rdi), %r9d
-; AVX512-NEXT: vmovd %r9d, %xmm0
-; AVX512-NEXT: vpinsrb $1, (%rdi,%rsi), %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $2, (%rdi,%rsi,2), %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $3, (%rdi,%rcx), %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $4, (%rdi,%rsi,4), %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $5, (%rdi,%rax), %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $6, (%rdi,%rcx,2), %xmm0, %xmm0
-; AVX512-NEXT: subq %rsi, %r8
-; AVX512-NEXT: vpinsrb $7, (%rdi,%r8), %xmm0, %xmm0
-; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT: vpmaddwd (%rdx), %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: retq
+; AVX-LABEL: dot_ext_v8i8_v8i32:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: movzbl (%rdi), %eax
+; AVX-NEXT: leaq (%rsi,%rsi,2), %rcx
+; AVX-NEXT: leaq (%rsi,%rsi,4), %r8
+; AVX-NEXT: leaq (,%rsi,8), %r9
+; AVX-NEXT: subq %rsi, %r9
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpinsrb $2, (%rdi,%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $4, (%rdi,%rsi,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $6, (%rdi,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $8, (%rdi,%rsi,4), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $10, (%rdi,%r8), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $12, (%rdi,%rcx,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $14, (%rdi,%r9), %xmm0, %xmm0
+; AVX-NEXT: vpmaddwd (%rdx), %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
 entry:
 %var0 = load i8, ptr %a, align 1
 %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
@@ -175,14 +149,13 @@ define i32 @dot_ext_v4i8_v4i32(ptr %a, i64 %a_stride, ptr %b) nounwind {
 ;
 ; SSE4-LABEL: dot_ext_v4i8_v4i32:
 ; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: leaq (%rsi,%rsi,2), %rax
-; SSE4-NEXT: movzbl (%rdi), %ecx
-; SSE4-NEXT: movd %ecx, %xmm0
-; SSE4-NEXT: pinsrb $1, (%rdi,%rsi), %xmm0
-; SSE4-NEXT: pinsrb $2, (%rdi,%rsi,2), %xmm0
-; SSE4-NEXT: pinsrb $3, (%rdi,%rax), %xmm0
+; SSE4-NEXT: movzbl (%rdi), %eax
+; SSE4-NEXT: leaq (%rsi,%rsi,2), %rcx
+; SSE4-NEXT: movd %eax, %xmm0
+; SSE4-NEXT: pinsrb $4, (%rdi,%rsi), %xmm0
+; SSE4-NEXT: pinsrb $8, (%rdi,%rsi,2), %xmm0
+; SSE4-NEXT: pinsrb $12, (%rdi,%rcx), %xmm0
 ; SSE4-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; SSE4-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; SSE4-NEXT: pmaddwd %xmm0, %xmm1
 ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE4-NEXT: paddd %xmm1, %xmm0
@@ -194,12 +167,11 @@ define i32 @dot_ext_v4i8_v4i32(ptr %a, i64 %a_stride, ptr %b) nounwind {
 ; AVX-LABEL: dot_ext_v4i8_v4i32:
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: movzbl (%rdi), %eax
+; AVX-NEXT: leaq (%rsi,%rsi,2), %rcx
 ; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpinsrb $1, (%rdi,%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $2, (%rdi,%rsi,2), %xmm0, %xmm0
-; AVX-NEXT: leaq (%rsi,%rsi,2), %rax
-; AVX-NEXT: vpinsrb $3, (%rdi,%rax), %xmm0, %xmm0
-; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX-NEXT: vpinsrb $4, (%rdi,%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $8, (%rdi,%rsi,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $12, (%rdi,%rcx), %xmm0, %xmm0
 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
 ; AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -311,8 +283,7 @@ define i64 @dot_ext_v2i8_v2i64(ptr %a, i64 %a_stride, ptr %b) nounwind {
 ; SSE4: # %bb.0:
 ; SSE4-NEXT: movzbl (%rdi), %eax
 ; SSE4-NEXT: movd %eax, %xmm0
-; SSE4-NEXT: pinsrb $1, (%rdi,%rsi), %xmm0
-; SSE4-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE4-NEXT: pinsrb $8, (%rdi,%rsi), %xmm0
 ; SSE4-NEXT: pmovsxbq (%rdx), %xmm1
 ; SSE4-NEXT: pmuldq %xmm0, %xmm1
 ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
@@ -324,8 +295,7 @@ define i64 @dot_ext_v2i8_v2i64(ptr %a, i64 %a_stride, ptr %b) nounwind {
 ; AVX: # %bb.0:
 ; AVX-NEXT: movzbl (%rdi), %eax
 ; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpinsrb $1, (%rdi,%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpinsrb $8, (%rdi,%rsi), %xmm0, %xmm0
 ; AVX-NEXT: vpmovsxbq (%rdx), %xmm1
 ; AVX-NEXT: vpmuldq %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -374,14 +344,13 @@ define i32 @dot_ext_v4i16_v4i32(ptr %a, i64 %a_stride, ptr %b) nounwind {
 ;
 ; SSE4-LABEL: dot_ext_v4i16_v4i32:
 ; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: leaq (%rsi,%rsi,2), %rax
-; SSE4-NEXT: movzwl (%rdi), %ecx
-; SSE4-NEXT: movd %ecx, %xmm0
-; SSE4-NEXT: pinsrw $1, (%rdi,%rsi), %xmm0
-; SSE4-NEXT: pinsrw $2, (%rdi,%rsi,2), %xmm0
-; SSE4-NEXT: pinsrw $3, (%rdi,%rax), %xmm0
+; SSE4-NEXT: movzwl (%rdi), %eax
+; SSE4-NEXT: leaq (%rsi,%rsi,2), %rcx
+; SSE4-NEXT: movd %eax, %xmm0
+; SSE4-NEXT: pinsrw $2, (%rdi,%rsi), %xmm0
+; SSE4-NEXT: pinsrw $4, (%rdi,%rsi,2), %xmm0
+; SSE4-NEXT: pinsrw $6, (%rdi,%rcx), %xmm0
 ; SSE4-NEXT: pmovsxwd (%rdx), %xmm1
-; SSE4-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; SSE4-NEXT: pmulld %xmm0, %xmm1
 ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE4-NEXT: paddd %xmm1, %xmm0
@@ -390,41 +359,22 @@ define i32 @dot_ext_v4i16_v4i32(ptr %a, i64 %a_stride, ptr %b) nounwind {
 ; SSE4-NEXT: movd %xmm1, %eax
 ; SSE4-NEXT: retq
 ;
-; AVX2-LABEL: dot_ext_v4i16_v4i32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: leaq (%rsi,%rsi,2), %rax
-; AVX2-NEXT: movzwl (%rdi), %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm0
-; AVX2-NEXT: vpinsrw $1, (%rdi,%rsi), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $2, (%rdi,%rsi,2), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $3, (%rdi,%rax), %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxwd (%rdx), %xmm1
-; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: dot_ext_v4i16_v4i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: leaq (%rsi,%rsi,2), %rax
-; AVX512-NEXT: movzwl (%rdi), %ecx
-; AVX512-NEXT: vmovd %ecx, %xmm0
-; AVX512-NEXT: vpinsrw $1, (%rdi,%rsi), %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $2, (%rdi,%rsi,2), %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $3, (%rdi,%rax), %xmm0, %xmm0
-; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT: vpmovsxwd (%rdx), %xmm1
-; AVX512-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: retq
+; AVX-LABEL: dot_ext_v4i16_v4i32:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: movzwl (%rdi), %eax
+; AVX-NEXT: leaq (%rsi,%rsi,2), %rcx
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpinsrw $2, (%rdi,%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $4, (%rdi,%rsi,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $6, (%rdi,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpmovsxwd (%rdx), %xmm1
+; AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
 entry:
 %var0 = load i16, ptr %a, align 1
 %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
@@ -509,16 +459,15 @@ define i64 @dot_ext_v2i32_v2i64(ptr %a, i64 %a_stride, ptr %b) nounwind {
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: paddq %xmm0, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: psllq $32, %xmm2
+; SSE2-NEXT: pmuludq %xmm0, %xmm1
+; SSE2-NEXT: paddq %xmm2, %xmm1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT: paddq %xmm1, %xmm0
 ; SSE2-NEXT: movq %xmm0, %rax
@@ -560,8 +509,8 @@ define i64 @dot_ext_v2i32_v2i64(ptr %a, i64 %a_stride, ptr %b) nounwind {
 ; AVX512-LABEL: dot_ext_v2i32_v2i64:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpinsrd $1, (%rdi,%rsi), %xmm0, %xmm0
-; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX512-NEXT: vpmovsxdq (%rdx), %xmm1
 ; AVX512-NEXT: vpmullq %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]