diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index ca7b737f4437c..bad598d607273 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4313,60 +4313,604 @@ AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
   }};
 }
 
-std::pair<Register, unsigned>
-AMDGPUInstructionSelector::selectVOP3PModsImpl(
-    Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
+enum class SrcStatus {
+  IS_SAME,
+  IS_UPPER_HALF,
+  IS_LOWER_HALF,
+  IS_UPPER_HALF_NEG,
+  // This means current op = [op_upper, op_lower] and src = -op_lower.
+  IS_LOWER_HALF_NEG,
+  // This means current op = [op_upper, op_lower] and src = [-op_upper,
+  // op_lower].
+  IS_HI_NEG,
+  // This means current op = [op_upper, op_lower] and src = [op_upper,
+  // -op_lower].
+  IS_LO_NEG,
+  IS_BOTH_NEG,
+  INVALID,
+  NEG_START = IS_UPPER_HALF_NEG,
+  NEG_END = IS_BOTH_NEG,
+  HALF_START = IS_UPPER_HALF,
+  HALF_END = IS_LOWER_HALF_NEG
+};
+
+static bool isTruncHalf(const MachineInstr *MI,
+                        const MachineRegisterInfo &MRI) {
+  if (MI->getOpcode() != AMDGPU::G_TRUNC)
+    return false;
+
+  unsigned DstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
+  unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
+  return DstSize * 2 == SrcSize;
+}
+
+static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
+  if (MI->getOpcode() != AMDGPU::G_LSHR)
+    return false;
+
+  Register ShiftSrc;
+  std::optional<ValueAndVReg> ShiftAmt;
+  if (mi_match(MI->getOperand(0).getReg(), MRI,
+               m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
+    unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
+    unsigned Shift = ShiftAmt->Value.getZExtValue();
+    return Shift * 2 == SrcSize;
+  }
+  return false;
+}
+
+static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
+  if (MI->getOpcode() != AMDGPU::G_SHL)
+    return false;
+
+  Register ShiftSrc;
+  std::optional<ValueAndVReg> ShiftAmt;
+  if (mi_match(MI->getOperand(0).getReg(), MRI,
+               m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
+    unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
+    unsigned Shift = ShiftAmt->Value.getZExtValue();
+    return Shift * 2 == SrcSize;
+  }
+  return false;
+}
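The three predicates above encode a single arithmetic fact: a shift by half the bit width moves one half of a packed value into the other position, and a trunc to half the width keeps the lower half. A standalone sketch of that invariant (plain C++, not patch code; the uint32_t value stands in for a 32-bit virtual register):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t V = 0xAAAABBBB; // viewed as [upper = 0xAAAA, lower = 0xBBBB]
  // lshr by half the width yields the upper half (what isLshrHalf matches).
  assert((V >> 16) == 0xAAAAu);
  // shl by half the width moves the lower half up (what isShlHalf matches).
  assert((V << 16) == 0xBBBB0000u);
  // trunc to half the width keeps the lower half (what isTruncHalf matches).
  assert(static_cast<uint16_t>(V) == 0xBBBBu);
  return 0;
}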
+
+static std::optional<std::pair<const MachineOperand *, SrcStatus>>
+retOpStat(const MachineOperand *Op, SrcStatus Stat,
+          std::pair<const MachineOperand *, SrcStatus> &Curr) {
+  if (Stat != SrcStatus::INVALID &&
+      ((Op->isReg() && !(Op->getReg().isPhysical())) || Op->isImm() ||
+       Op->isCImm() || Op->isFPImm())) {
+    return std::optional<std::pair<const MachineOperand *, SrcStatus>>(
+        {Op, Stat});
+  }
+
+  return std::nullopt;
+}
+
+enum class TypeClass { VECTOR_OF_TWO, SCALAR, NONE_OF_LISTED };
+
+static TypeClass isVectorOfTwoOrScalar(const MachineOperand *Op,
+                                       const MachineRegisterInfo &MRI) {
+  LLT OpTy = MRI.getType(Op->getReg());
+  if (OpTy.isScalar())
+    return TypeClass::SCALAR;
+  if (OpTy.isVector() && OpTy.getNumElements() == 2)
+    return TypeClass::VECTOR_OF_TWO;
+  return TypeClass::NONE_OF_LISTED;
+}
+
+static SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S,
+                              const MachineRegisterInfo &MRI) {
+  TypeClass NegType = isVectorOfTwoOrScalar(Op, MRI);
+  if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR)
+    return SrcStatus::INVALID;
+
+  switch (S) {
+  case SrcStatus::IS_SAME:
+    if (NegType == TypeClass::VECTOR_OF_TWO) {
+      // Vector of 2:
+      // [SrcHi, SrcLo] = [CurrHi, CurrLo]
+      // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
+      // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
+      // [SrcHi, SrcLo] = [-OpHi, -OpLo]
+      return SrcStatus::IS_BOTH_NEG;
+    }
+    if (NegType == TypeClass::SCALAR) {
+      // Scalar:
+      // [SrcHi, SrcLo] = [CurrHi, CurrLo]
+      // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
+      // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
+      // [SrcHi, SrcLo] = [-OpHi, OpLo]
+      return SrcStatus::IS_HI_NEG;
+    }
+    break;
+  case SrcStatus::IS_HI_NEG:
+    if (NegType == TypeClass::VECTOR_OF_TWO) {
+      // Vector of 2:
+      // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
+      // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
+      // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
+      // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo]
+      return SrcStatus::IS_LO_NEG;
+    }
+    if (NegType == TypeClass::SCALAR) {
+      // Scalar:
+      // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
+      // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
+      // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
+      // [SrcHi, SrcLo] = [-(-OpHi), OpLo] = [OpHi, OpLo]
+      return SrcStatus::IS_SAME;
+    }
+    break;
+  case SrcStatus::IS_LO_NEG:
+    if (NegType == TypeClass::VECTOR_OF_TWO) {
+      // Vector of 2:
+      // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
+      // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
+      // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
+      // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo]
+      return SrcStatus::IS_HI_NEG;
+    }
+    if (NegType == TypeClass::SCALAR) {
+      // Scalar:
+      // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
+      // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
+      // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
+      // [SrcHi, SrcLo] = [-OpHi, -OpLo]
+      return SrcStatus::IS_BOTH_NEG;
+    }
+    break;
+  case SrcStatus::IS_BOTH_NEG:
+    if (NegType == TypeClass::VECTOR_OF_TWO) {
+      // Vector of 2:
+      // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
+      // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
+      // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
+      // [SrcHi, SrcLo] = [OpHi, OpLo]
+      return SrcStatus::IS_SAME;
+    }
+    if (NegType == TypeClass::SCALAR) {
+      // Scalar:
+      // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
+      // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
+      // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
+      // [SrcHi, SrcLo] = [OpHi, -OpLo]
+      return SrcStatus::IS_LO_NEG;
+    }
+    break;
+  case SrcStatus::IS_UPPER_HALF:
+    // Vector of 2:
+    // Src = CurrUpper
+    // Curr = [CurrUpper, CurrLower]
+    // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
+    // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
+    // Src = -OpUpper
+    //
+    // Scalar:
+    // Src = CurrUpper
+    // Curr = [CurrUpper, CurrLower]
+    // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
+    // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
+    // Src = -OpUpper
+    return SrcStatus::IS_UPPER_HALF_NEG;
+  case SrcStatus::IS_LOWER_HALF:
+    if (NegType == TypeClass::VECTOR_OF_TWO) {
+      // Vector of 2:
+      // Src = CurrLower
+      // Curr = [CurrUpper, CurrLower]
+      // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
+      // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
+      // Src = -OpLower
+      return SrcStatus::IS_LOWER_HALF_NEG;
+    }
+    if (NegType == TypeClass::SCALAR) {
+      // Scalar:
+      // Src = CurrLower
+      // Curr = [CurrUpper, CurrLower]
+      // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
+      // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
+      // Src = OpLower
+      return SrcStatus::IS_LOWER_HALF;
+    }
+    break;
+  case SrcStatus::IS_UPPER_HALF_NEG:
+    // Vector of 2:
+    // Src = -CurrUpper
+    // Curr = [CurrUpper, CurrLower]
+    // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
+    // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
+    // Src = -(-OpUpper) = OpUpper
+    //
+    // Scalar:
+    // Src = -CurrUpper
+    // Curr = [CurrUpper, CurrLower]
+    // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
+    // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
+    // Src = -(-OpUpper) = OpUpper
+    return SrcStatus::IS_UPPER_HALF;
+  case SrcStatus::IS_LOWER_HALF_NEG:
+    if (NegType == TypeClass::VECTOR_OF_TWO) {
+      // Vector of 2:
+      // Src = -CurrLower
+      // Curr = [CurrUpper, CurrLower]
+      // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
+      // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
+      // Src = -(-OpLower) = OpLower
+      return SrcStatus::IS_LOWER_HALF;
+    }
+    if (NegType == TypeClass::SCALAR) {
+      // Scalar:
+      // Src = -CurrLower
+      // Curr = [CurrUpper, CurrLower]
+      // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
+      // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
+      // Src = -OpLower
+      return SrcStatus::IS_LOWER_HALF_NEG;
+    }
+    break;
+  default:
+    llvm_unreachable("unexpected SrcStatus");
+  }
+  // Unreachable in practice: for the two TypeClass values allowed past the
+  // guard above, every case returns. Keep a return for all control paths.
+  return SrcStatus::INVALID;
+}
+
+static std::optional<std::pair<const MachineOperand *, SrcStatus>>
+calcNextStatus(std::pair<const MachineOperand *, SrcStatus> Curr,
+               const MachineRegisterInfo &MRI) {
+  if (!Curr.first->isReg())
+    return std::nullopt;
+
+  const MachineInstr *MI = Curr.first->isDef()
+                               ? Curr.first->getParent()
+                               : MRI.getVRegDef(Curr.first->getReg());
+
+  unsigned Opc = MI->getOpcode();
+
+  // Handle general Opc cases.
+  switch (Opc) {
+  case AMDGPU::G_CONSTANT:
+  case AMDGPU::G_FCONSTANT:
+    return retOpStat(&MI->getOperand(1), Curr.second, Curr);
+  case AMDGPU::G_BITCAST:
+  case AMDGPU::COPY:
+    return retOpStat(&MI->getOperand(1), Curr.second, Curr);
+  case AMDGPU::G_FNEG:
+    return retOpStat(&MI->getOperand(1),
+                     getNegStatus(Curr.first, Curr.second, MRI), Curr);
+  default:
+    break;
+  }
+
+  // Compute the next status from the current one.
+  switch (Curr.second) {
+  case SrcStatus::IS_SAME:
+    if (isTruncHalf(MI, MRI))
+      return retOpStat(&MI->getOperand(1), SrcStatus::IS_LOWER_HALF, Curr);
+    break;
+  case SrcStatus::IS_HI_NEG:
+    if (isTruncHalf(MI, MRI)) {
+      // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
+      // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower
+      //                  = [OpLowerHi, OpLowerLo]
+      // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo]
+      //     = [-OpLowerHi, OpLowerLo]
+      //     = -OpLower
+      return retOpStat(&MI->getOperand(1), SrcStatus::IS_LOWER_HALF_NEG, Curr);
+    }
+    break;
+  case SrcStatus::IS_UPPER_HALF:
+    if (isShlHalf(MI, MRI))
+      return retOpStat(&MI->getOperand(1), SrcStatus::IS_LOWER_HALF, Curr);
+    break;
+  case SrcStatus::IS_LOWER_HALF:
+    if (isLshrHalf(MI, MRI))
+      return retOpStat(&MI->getOperand(1), SrcStatus::IS_UPPER_HALF, Curr);
+    break;
+  case SrcStatus::IS_UPPER_HALF_NEG:
+    if (isShlHalf(MI, MRI))
+      return retOpStat(&MI->getOperand(1), SrcStatus::IS_LOWER_HALF_NEG, Curr);
+    break;
+  case SrcStatus::IS_LOWER_HALF_NEG:
+    if (isLshrHalf(MI, MRI))
+      return retOpStat(&MI->getOperand(1), SrcStatus::IS_UPPER_HALF_NEG, Curr);
+    break;
+  default:
+    break;
+  }
+  return std::nullopt;
+}
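calcNextStatus composes these transitions one def at a time, and the fneg rows of getNegStatus behave like independent sign toggles on the two halves. A self-contained model of that toggle view, under assumed names (illustrative code only, not the LLVM API):

#include <cassert>

enum class Status { Same, HiNeg, LoNeg, BothNeg };

// A <2 x Type> fneg toggles both halves; a scalar fneg of the packed value
// toggles only the high half (it flips the top sign bit).
Status throughFneg(Status S, bool IsVectorOfTwo) {
  bool Hi = (S == Status::HiNeg || S == Status::BothNeg);
  bool Lo = (S == Status::LoNeg || S == Status::BothNeg);
  Hi = !Hi;
  if (IsVectorOfTwo)
    Lo = !Lo;
  if (Hi && Lo)
    return Status::BothNeg;
  if (Hi)
    return Status::HiNeg;
  if (Lo)
    return Status::LoNeg;
  return Status::Same;
}

int main() {
  // Two scalar fnegs cancel on the high half, as in the IS_HI_NEG -> IS_SAME
  // row of getNegStatus.
  assert(throughFneg(throughFneg(Status::Same, false), false) == Status::Same);
  // A scalar fneg followed by a vector fneg leaves only the low half negated,
  // matching the IS_HI_NEG -> IS_LO_NEG row.
  assert(throughFneg(throughFneg(Status::Same, false), true) == Status::LoNeg);
  return 0;
}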
+
+class searchOptions {
+private:
+  bool HasNeg = false;
+  // Assume all complex patterns of VOP3P have opsel.
+  bool HasOpsel = true;
+
+public:
+  searchOptions(const MachineOperand *RootOp, const MachineRegisterInfo &MRI) {
+    const MachineInstr *MI = RootOp->getParent();
+    unsigned Opc = MI->getOpcode();
+
+    if (Opc < TargetOpcode::GENERIC_OP_END) {
+      // Keep the same behavior for generic ops.
+      HasNeg = true;
+    } else if (Opc == TargetOpcode::G_INTRINSIC) {
+      Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID();
+      // Only floating-point intrinsics have the neg & neg_hi bits.
+      if (IntrinsicID == Intrinsic::amdgcn_fdot2)
+        HasNeg = true;
+    }
+  }
+  bool checkOptions(SrcStatus Stat) const {
+    if (!HasNeg &&
+        (Stat >= SrcStatus::NEG_START && Stat <= SrcStatus::NEG_END)) {
+      return false;
+    }
+    if (!HasOpsel &&
+        (Stat >= SrcStatus::HALF_START && Stat <= SrcStatus::HALF_END)) {
+      return false;
+    }
+    return true;
+  }
+};
+
+static SmallVector<std::pair<const MachineOperand *, SrcStatus>>
+getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI,
+            searchOptions SearchOptions, int MaxDepth = 6) {
+  int Depth = 0;
+  auto Curr = calcNextStatus({Op, SrcStatus::IS_SAME}, MRI);
+  SmallVector<std::pair<const MachineOperand *, SrcStatus>> Statlist;
+
+  while (Depth <= MaxDepth && Curr.has_value()) {
+    Depth++;
+    if (SearchOptions.checkOptions(Curr.value().second))
+      Statlist.push_back(Curr.value());
+    Curr = calcNextStatus(Curr.value(), MRI);
+  }
+
+  return Statlist;
+}
+
+static std::pair<const MachineOperand *, SrcStatus>
+getLastSameOrNeg(const MachineOperand *Op, const MachineRegisterInfo &MRI,
+                 searchOptions SearchOptions, int MaxDepth = 6) {
+  int Depth = 0;
+  std::pair<const MachineOperand *, SrcStatus> LastSameOrNeg = {
+      Op, SrcStatus::IS_SAME};
+  auto Curr = calcNextStatus(LastSameOrNeg, MRI);
+
+  while (Depth <= MaxDepth && Curr.has_value()) {
+    Depth++;
+    if (SearchOptions.checkOptions(Curr.value().second)) {
+      if (Curr.value().second == SrcStatus::IS_SAME ||
+          Curr.value().second == SrcStatus::IS_HI_NEG ||
+          Curr.value().second == SrcStatus::IS_LO_NEG ||
+          Curr.value().second == SrcStatus::IS_BOTH_NEG)
+        LastSameOrNeg = Curr.value();
+    }
+    Curr = calcNextStatus(Curr.value(), MRI);
+  }
+
+  return LastSameOrNeg;
+}
+
+static bool isInlinableFPConstant(const MachineOperand &Op,
+                                  const SIInstrInfo &TII) {
+  return Op.isFPImm() && TII.isInlineConstant(Op.getFPImm()->getValueAPF());
+}
+
+static bool isSameBitWidth(const MachineOperand *Op1, const MachineOperand *Op2,
+                           const MachineRegisterInfo &MRI) {
+  unsigned Width1 = MRI.getType(Op1->getReg()).getSizeInBits();
+  unsigned Width2 = MRI.getType(Op2->getReg()).getSizeInBits();
+  return Width1 == Width2;
+}
+
+static bool isSameOperand(const MachineOperand *Op1,
+                          const MachineOperand *Op2) {
+  if (Op1->isReg())
+    return Op2->isReg() && Op1->getReg() == Op2->getReg();
+
+  return Op1->isIdenticalTo(*Op2);
+}
+
+static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
+  // SrcStatus::IS_LOWER_HALF remains 0.
+  if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
+    Mods ^= SISrcMods::NEG_HI;
+    Mods |= SISrcMods::OP_SEL_1;
+  } else if (HiStat == SrcStatus::IS_UPPER_HALF)
+    Mods |= SISrcMods::OP_SEL_1;
+  else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG)
+    Mods ^= SISrcMods::NEG_HI;
+  else if (HiStat == SrcStatus::IS_HI_NEG)
+    Mods ^= SISrcMods::NEG_HI;
+
+  if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
+    Mods ^= SISrcMods::NEG;
+    Mods |= SISrcMods::OP_SEL_0;
+  } else if (LoStat == SrcStatus::IS_UPPER_HALF)
+    Mods |= SISrcMods::OP_SEL_0;
+  else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG)
+    Mods |= SISrcMods::NEG;
+  else if (LoStat == SrcStatus::IS_HI_NEG)
+    Mods ^= SISrcMods::NEG;
+
+  return Mods;
+}
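For intuition, here is a standalone mirror of the low-lane half of updateMods. The SrcMods constants below are assumed illustrative values, not the authoritative ones; the real definitions are the SISrcMods enum in SIDefines.h:

#include <cassert>

enum class S { Same, UpperHalf, LowerHalf, UpperHalfNeg, LowerHalfNeg, HiNeg };

namespace SrcMods { // assumed values for illustration only
constexpr unsigned NEG = 1u << 0;      // negate the low lane
constexpr unsigned NEG_HI = 1u << 1;   // negate the high lane
constexpr unsigned OP_SEL_0 = 1u << 2; // low lane reads the upper half
constexpr unsigned OP_SEL_1 = 1u << 3; // high lane reads the upper half
} // namespace SrcMods

// Mirror of the LoStat handling: OP_SEL_0 selects the upper half for the low
// lane, NEG negates it, and IS_LOWER_HALF deliberately contributes nothing.
unsigned loMods(S LoStat, unsigned Mods) {
  if (LoStat == S::UpperHalfNeg) {
    Mods ^= SrcMods::NEG;
    Mods |= SrcMods::OP_SEL_0;
  } else if (LoStat == S::UpperHalf) {
    Mods |= SrcMods::OP_SEL_0;
  } else if (LoStat == S::LowerHalfNeg) {
    Mods |= SrcMods::NEG;
  } else if (LoStat == S::HiNeg) {
    Mods ^= SrcMods::NEG;
  }
  return Mods;
}

int main() {
  // A swapped <hi, lo> pair, as in the v_sdot2_shuffle10_a test below: the
  // low lane must read the upper half, so OP_SEL_0 is set -> op_sel:[1,0,0].
  assert(loMods(S::UpperHalf, 0) == SrcMods::OP_SEL_0);
  // The natural lower half needs no bits at all.
  assert(loMods(S::LowerHalf, 0) == 0);
  return 0;
}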
+
+static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat,
+                          const MachineOperand *NewOp,
+                          const MachineOperand *RootOp, const SIInstrInfo &TII,
+                          const MachineRegisterInfo &MRI) {
+  if (NewOp->isReg()) {
+    auto IsHalfState = [](SrcStatus S) {
+      return S == SrcStatus::IS_UPPER_HALF ||
+             S == SrcStatus::IS_UPPER_HALF_NEG ||
+             S == SrcStatus::IS_LOWER_HALF || S == SrcStatus::IS_LOWER_HALF_NEG;
+    };
+    return isSameBitWidth(NewOp, RootOp, MRI) && IsHalfState(LoStat) &&
+           IsHalfState(HiStat);
+  }
+  return (HiStat == SrcStatus::IS_SAME || HiStat == SrcStatus::IS_HI_NEG) &&
+         (LoStat == SrcStatus::IS_SAME || LoStat == SrcStatus::IS_HI_NEG) &&
+         isInlinableFPConstant(*NewOp, TII);
+}
+
+std::pair<const MachineOperand *, unsigned>
+AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *RootOp,
+                                               const MachineRegisterInfo &MRI,
+                                               bool IsDOT) const {
   unsigned Mods = 0;
-  MachineInstr *MI = MRI.getVRegDef(Src);
+  const MachineOperand *Op = RootOp;
+  // No modifiers if the root type is not of the form <2 x Type>.
+  if (isVectorOfTwoOrScalar(Op, MRI) != TypeClass::VECTOR_OF_TWO) {
+    Mods |= SISrcMods::OP_SEL_1;
+    return {Op, Mods};
+  }
+
+  searchOptions SearchOptions(Op, MRI);
 
-  if (MI->getOpcode() == AMDGPU::G_FNEG &&
-      // It's possible to see an f32 fneg here, but unlikely.
-      // TODO: Treat f32 fneg as only high bit.
-      MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
+  std::pair<const MachineOperand *, SrcStatus> Stat =
+      getLastSameOrNeg(Op, MRI, SearchOptions);
+  if (!Stat.first->isReg()) {
+    Mods |= SISrcMods::OP_SEL_1;
+    return {Op, Mods};
+  }
+  if (Stat.second == SrcStatus::IS_BOTH_NEG)
     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
-    Src = MI->getOperand(1).getReg();
-    MI = MRI.getVRegDef(Src);
+  else if (Stat.second == SrcStatus::IS_HI_NEG)
+    Mods ^= SISrcMods::NEG_HI;
+  else if (Stat.second == SrcStatus::IS_LO_NEG)
+    Mods ^= SISrcMods::NEG;
+
+  Op = Stat.first;
+  MachineInstr *MI = MRI.getVRegDef(Op->getReg());
+
+  if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 ||
+      (IsDOT && Subtarget->hasDOTOpSelHazard())) {
+    Mods |= SISrcMods::OP_SEL_1;
+    return {Op, Mods};
   }
 
-  // TODO: Handle G_FSUB 0 as fneg
+  SmallVector<std::pair<const MachineOperand *, SrcStatus>> StatlistHi =
+      getSrcStats(&MI->getOperand(2), MRI, SearchOptions);
 
-  // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
-  (void)IsDOT; // DOTs do not use OPSEL on gfx942+, check ST.hasDOTOpSelHazard()
+  if (StatlistHi.empty()) {
+    Mods |= SISrcMods::OP_SEL_1;
+    return {Op, Mods};
+  }
+
+  SmallVector<std::pair<const MachineOperand *, SrcStatus>> StatlistLo =
+      getSrcStats(&MI->getOperand(1), MRI, SearchOptions);
+
+  if (StatlistLo.empty()) {
+    Mods |= SISrcMods::OP_SEL_1;
+    return {Op, Mods};
+  }
+
+  for (int I = StatlistHi.size() - 1; I >= 0; I--) {
+    for (int J = StatlistLo.size() - 1; J >= 0; J--) {
+      if (isSameOperand(StatlistHi[I].first, StatlistLo[J].first) &&
+          isValidToPack(StatlistHi[I].second, StatlistLo[J].second,
+                        StatlistHi[I].first, RootOp, TII, MRI))
+        return {StatlistHi[I].first,
+                updateMods(StatlistHi[I].second, StatlistLo[J].second, Mods)};
+    }
+  }
 
   // Packed instructions do not have abs modifiers.
   Mods |= SISrcMods::OP_SEL_1;
 
-  return std::pair(Src, Mods);
+  return {Op, Mods};
 }
 
-InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
-  MachineRegisterInfo &MRI
-    = Root.getParent()->getParent()->getParent()->getRegInfo();
+static int64_t getAllKindImm(const MachineOperand *Op) {
+  switch (Op->getType()) {
+  case MachineOperand::MachineOperandType::MO_Immediate:
+    return Op->getImm();
+  case MachineOperand::MachineOperandType::MO_CImmediate:
+    return Op->getCImm()->getSExtValue();
+  case MachineOperand::MachineOperandType::MO_FPImmediate:
+    return Op->getFPImm()->getValueAPF().bitcastToAPInt().getSExtValue();
+  default:
+    llvm_unreachable("not an imm type");
+  }
+}
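The MO_FPImmediate case yields the raw IEEE bit pattern of the constant, sign-extended to int64_t. A plain-C++ illustration of that computation (not the MachineOperand API; fpBitsAsImm is a hypothetical name):

#include <cassert>
#include <cstdint>
#include <cstring>

int64_t fpBitsAsImm(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits)); // a bit_cast, as bitcastToAPInt does
  return static_cast<int64_t>(static_cast<int32_t>(Bits)); // sign-extend
}

int main() {
  assert(fpBitsAsImm(1.0f) == 0x3F800000); // positive pattern stays positive
  assert(fpBitsAsImm(-0.0f) < 0);          // sign bit makes the imm negative
  return 0;
}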
+
+static bool checkRB(const MachineOperand *Op, unsigned int RBNo,
+                    const AMDGPURegisterBankInfo &RBI,
+                    const MachineRegisterInfo &MRI,
+                    const TargetRegisterInfo &TRI) {
+  const RegisterBank *RB = RBI.getRegBank(Op->getReg(), MRI, TRI);
+  return RB->getID() == RBNo;
+}
+
+// This function is used to get the correct register bank for the returned
+// register. Assume:
+// 1. VOP3P is always legal for VGPR.
+// 2. RootOp's regbank is legal.
+// Thus
+// 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR.
+// 2. If RootOp is VGPR, then NewOp must be VGPR.
+static const MachineOperand *
+getLegalRegBank(const MachineOperand *NewOp, const MachineOperand *RootOp,
+                const AMDGPURegisterBankInfo &RBI, MachineRegisterInfo &MRI,
+                const TargetRegisterInfo &TRI, const SIInstrInfo &TII) {
+  // RootOp can only be VGPR or SGPR (some hand-written cases, such as
+  // inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
+  if (checkRB(RootOp, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
+      checkRB(NewOp, AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
+    return NewOp;
+
+  MachineInstr *MI = MRI.getVRegDef(RootOp->getReg());
+  if (MI->getOpcode() == AMDGPU::COPY &&
+      isSameOperand(NewOp, &MI->getOperand(1))) {
+    // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp.
+    return RootOp;
+  }
+
+  MachineBasicBlock *BB = MI->getParent();
+  const TargetRegisterClass *DstRC =
+      TRI.getConstrainedRegClassForOperand(*RootOp, MRI);
+  Register DstReg = MRI.createVirtualRegister(DstRC);
 
-  Register Src;
+  MachineInstrBuilder MIB =
+      BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
+          .addReg(NewOp->getReg());
+
+  // Only a VGPR result is acceptable here, so return the fresh VGPR copy.
+  return &MIB->getOperand(0);
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root,
+                                                bool IsDOT) const {
+  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
+  const MachineOperand *Op;
   unsigned Mods;
-  std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
+  std::tie(Op, Mods) = selectVOP3PModsImpl(&Root, MRI, IsDOT);
+  if (!(Op->isReg()))
+    return {{
+        [=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(Op)); },
+        [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+    }};
+  Op = getLegalRegBank(Op, &Root, RBI, MRI, TRI, TII);
 
   return {{
-      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
-      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+      [=](MachineInstrBuilder &MIB) { MIB.addReg(Op->getReg()); },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
   }};
 }
 
 InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
-  MachineRegisterInfo &MRI
-    = Root.getParent()->getParent()->getParent()->getRegInfo();
+AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
-
-  Register Src;
-  unsigned Mods;
-  std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
+  return selectVOP3PRetHelper(Root);
+}
 
-  return {{
-      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
-      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
-  }};
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
+  return selectVOP3PRetHelper(Root, true);
 }
 
 InstructionSelector::ComplexRendererFns
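A compact model of the decision getLegalRegBank encodes (the Bank enum and function name are assumptions, not the RegisterBank API): an SGPR root tolerates either bank on the rewritten operand, while a VGPR root forces the operand to VGPR, reusing the root when it is already a COPY of the new operand and otherwise requiring a fresh COPY.

#include <cassert>

enum class Bank { SGPR, VGPR };

// Returns true when a COPY to VGPR has to be materialized for NewOp.
bool needsCopyToVGPR(Bank RootBank, Bank NewBank, bool RootIsCopyOfNew) {
  if (RootBank == Bank::SGPR || NewBank == Bank::VGPR)
    return false; // NewOp is usable as-is
  // Root is VGPR and NewOp is not; reuse the root if it already copies NewOp,
  // otherwise a fresh COPY is required.
  return !RootIsCopyOfNew;
}

int main() {
  assert(!needsCopyToVGPR(Bank::SGPR, Bank::SGPR, false));
  assert(!needsCopyToVGPR(Bank::VGPR, Bank::VGPR, false));
  assert(!needsCopyToVGPR(Bank::VGPR, Bank::SGPR, true)); // RootOp = COPY NewOp
  assert(needsCopyToVGPR(Bank::VGPR, Bank::SGPR, false));
  return 0;
}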
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 6c3f3026e877a..a224f39f88996 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -187,9 +187,11 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
 
   ComplexRendererFns selectVOP3NoMods(MachineOperand &Root) const;
 
-  std::pair<Register, unsigned>
-  selectVOP3PModsImpl(Register Src, const MachineRegisterInfo &MRI,
+  std::pair<const MachineOperand *, unsigned>
+  selectVOP3PModsImpl(const MachineOperand *Op, const MachineRegisterInfo &MRI,
                       bool IsDOT = false) const;
+  InstructionSelector::ComplexRendererFns
+  selectVOP3PRetHelper(MachineOperand &Root, bool IsDOT = false) const;
 
   InstructionSelector::ComplexRendererFns
   selectVOP3PMods(MachineOperand &Root) const;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
index 543f8e413abd8..534b454775502 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
@@ -106,6 +106,169 @@ define <2 x half> @v_fmul_v2f16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b)
   ret <2 x half> %mul
 }
 
+define <2 x half> @v_fmul_v2f16_partial_neg(<2 x half> %a, <2 x half> %b) {
+; GFX9-LABEL: v_fmul_v2f16_partial_neg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_mul_f16 v0, v1, v0 neg_hi:[1,0]
+; GFX9-NEXT:    v_pk_mul_f16 v0, v1, v0 neg_lo:[1,0]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v2f16_partial_neg:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80008000, v1
+; GFX8-NEXT:    v_mul_f16_e32 v3, v1, v0
+; GFX8-NEXT:    v_mul_f16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX8-NEXT:    v_mul_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_v2f16_partial_neg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_mul_f16 v0, v1, v0 neg_hi:[1,0]
+; GFX10-NEXT:    v_pk_mul_f16 v0, v1, v0 neg_lo:[1,0]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %b1 = bitcast <2 x half> %b to float
+  %b2 = fneg float %b1
+  %b3 = bitcast float %b2 to <2 x half>
+  %b4 = fneg <2 x half> %b3
+  %mul1 = fmul <2 x half> %b3, %a
+  %mul2 = fmul <2 x half> %b4, %mul1
+  ret <2 x half> %mul2
+}
+
+define <2 x half> @fmul_v2_half_neg_hi(<2 x half> %a, <2 x half> %b) #0 {
+; GFX9-LABEL: fmul_v2_half_neg_hi:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1 neg_hi:[0,1]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: fmul_v2_half_neg_hi:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
+; GFX8-NEXT:    v_mul_f16_e32 v1, v0, v1
+; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_v2_half_neg_hi:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v1 neg_hi:[0,1]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %b1 = bitcast <2 x half> %b to float
+  %b2 = fneg float %b1
+  %b3 = bitcast float %b2 to <2 x half>
+  %b4 = extractelement <2 x half> %b3, i64 1
+  %tmp = insertelement <2 x half> poison, half %b4, i64 0
+  %k = shufflevector <2 x half> %tmp, <2 x half> %b, <2 x i32> <i32 2, i32 0>
+  %mul = fmul <2 x half> %a, %k
+  ret <2 x half> %mul
+}
+
+define <2 x half> @fmul_v2_half_neg_hi1(<2 x half> %a, <2 x half> %b) #0 {
+; GFX9-LABEL: fmul_v2_half_neg_hi1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: fmul_v2_half_neg_hi1:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX8-NEXT:    v_mul_f16_e32 v1, v0, v1
+; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_v2_half_neg_hi1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %b1 = bitcast <2 x half> %b to float
+  %b2 = fneg float %b1
+  %b3 = bitcast float %b2 to <2 x half>
+  %b4 = fneg <2 x half> %b3
+  %b5 = extractelement <2 x half> %b4, i64 1
+  %tmp = insertelement <2 x half> poison, half %b5, i64 0
+  %k = shufflevector <2 x half> %tmp, <2 x half> %b, <2 x i32> <i32 2, i32 0>
+  %mul = fmul <2 x half> %a, %k
+  ret <2 x half> %mul
+}
+
+define <2 x half> @fmul_v2_half_neg_lo(<2 x half> %a, <2 x half> %b) #0 {
+; GFX9-LABEL: fmul_v2_half_neg_lo:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1 op_sel_hi:[1,0] neg_hi:[0,1]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: fmul_v2_half_neg_lo:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX8-NEXT:    v_mul_f16_e32 v1, v0, v1
+; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_v2_half_neg_lo:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v1 op_sel_hi:[1,0] neg_hi:[0,1]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %b1 = bitcast <2 x half> %b to float
+  %b2 = fneg float %b1
+  %b3 = bitcast float %b2 to <2 x half>
+  %b4 = fneg <2 x half> %b3
+  %b5 = extractelement <2 x half> %b4, i64 0
+  %tmp = insertelement <2 x half> poison, half %b5, i64 0
+  %k = shufflevector <2 x half> %tmp, <2 x half> %b, <2 x i32> <i32 2, i32 0>
+  %mul = fmul <2 x half> %a, %k
+  ret <2 x half> %mul
+}
+
+define <2 x half> @fmul_v2_half_neg_lo1(<2 x half> %a, <2 x half> %b) #0 {
+; GFX9-LABEL: fmul_v2_half_neg_lo1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1 op_sel_hi:[1,0]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: fmul_v2_half_neg_lo1:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
+; GFX8-NEXT:    v_mul_f16_e32 v1, v0, v1
+; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_v2_half_neg_lo1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v1 op_sel_hi:[1,0]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %b1 = bitcast <2 x half> %b to float
+  %b2 = fneg float %b1
+  %b3 = bitcast float %b2 to <2 x half>
+  %b4 = extractelement <2 x half> %b3, i64 0
+  %tmp = insertelement <2 x half> poison, half %b4, i64 0
+  %k = shufflevector <2 x half> %tmp, <2 x half> %b, <2 x i32> <i32 2, i32 0>
+  %mul = fmul <2 x half> %a, %k
+  ret <2 x half> %mul
+}
+
 define <3 x half> @v_fmul_v3f16(<3 x half> %a, <3 x half> %b) {
 ; GFX9-LABEL: v_fmul_v3f16:
 ; GFX9:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
index 744a5b7feb48d..8f0ae8c47098a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
@@ -304,8 +304,7 @@ define i32 @v_sdot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
 ; GFX906-LABEL: v_sdot2_shuffle10_a:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_alignbit_b32 v0, v0, v0, 16
-; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_shuffle10_a:
@@ -319,8 +318,7 @@ define i32 @v_sdot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
 ; GFX10-LABEL: v_sdot2_shuffle10_a:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v0, v0, v0, 16
-; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %shuf.a = shufflevector <2 x i16> %a, <2 x i16> poison, <2 x i32> <i32 1, i32 0>
   %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false)
@@ -331,8 +329,7 @@ define i32 @v_sdot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
 ; GFX906-LABEL: v_sdot2_shuffle10_b:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_alignbit_b32 v1, v1, v1, 16
-; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_shuffle10_b:
@@ -346,8 +343,7 @@ define i32 @v_sdot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
 ; GFX10-LABEL: v_sdot2_shuffle10_b:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v1, v1, v1, 16
-; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %shuf.b = shufflevector <2 x i16> %b, <2 x i16> poison, <2 x i32> <i32 1, i32 0>
   %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
index 9e623494a5a04..287a009ca1405 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
@@ -289,22 +289,19 @@ define i32 @v_udot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
 ; GFX906-LABEL: v_udot2_shuffle10_a:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_alignbit_b32 v0, v0, v0, 16
-; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_shuffle10_a:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_alignbit_b32 v0, v0, v0, 16
-; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_shuffle10_a:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v0, v0, v0, 16
-; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %shuf.a = shufflevector <2 x i16> %a, <2 x i16> poison, <2 x i32> <i32 1, i32 0>
   %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false)
@@ -315,22 +312,19 @@ define i32 @v_udot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
 ; GFX906-LABEL: v_udot2_shuffle10_b:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_alignbit_b32 v1, v1, v1, 16
-; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_shuffle10_b:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_alignbit_b32 v1, v1, v1, 16
-; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_shuffle10_b:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v1, v1, v1, 16
-; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %shuf.b = shufflevector <2 x i16> %b, <2 x i16> poison, <2 x i32> <i32 1, i32 0>
   %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
index 2f1dfa11fd34d..141b86a24c1c4 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
@@ -742,9 +742,8 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half>
 ;
 ; GFX9-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_xor_b32 s0, s3, 0x80008000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-GISEL-NEXT:    v_pk_add_f16 v0, s2, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-GISEL-NEXT:    v_pk_add_f16 v0, s2, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-GISEL-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-SDAG-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
@@ -784,8 +783,7 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half>
 ;
 ; GFX10-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_xor_b32 s0, s3, 0x80008000
-; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, s2, s0
+; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, s2, s3 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX10-GISEL-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-SDAG-TRUE16-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
@@ -810,8 +808,7 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half>
 ;
 ; GFX11-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_xor_b32 s0, s3, 0x80008000
-; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, s2, s0
+; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, s2, s3 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-GISEL-NEXT:    ; return to shader part epilog
 ; GFX10PLUS-SDAG-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
 ; GFX10PLUS-SDAG:       ; %bb.0:
@@ -824,8 +821,7 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half>
 ; GFX10PLUS-SDAG-NEXT:    ; return to shader part epilog
 ; GFX10PLUS-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
 ; GFX10PLUS-GISEL:       ; %bb.0:
-; GFX10PLUS-GISEL-NEXT:    s_xor_b32 s0, s3, 0x80008000
-; GFX10PLUS-GISEL-NEXT:    v_pk_add_f16 v0, s2, s0
+; GFX10PLUS-GISEL-NEXT:    v_pk_add_f16 v0, s2, s3 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX10PLUS-GISEL-NEXT:    ; return to shader part epilog
   %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <2 x half> %val