//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "x86tti"

/// Return a constant boolean vector that has true elements in all positions
/// where the input constant data vector has an element with the sign bit set.
static Constant *getNegativeIsTrueBoolVec(Constant *V) {
  VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
  V = ConstantExpr::getBitCast(V, IntTy);
  V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy),
                            V);
  return V;
}

/// Convert the x86 XMM integer vector mask to a vector of bools based on
/// each element's most significant bit (the sign bit).
static Value *getBoolVecFromMask(Value *Mask) {
  // Fold Constant Mask.
  if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
    return getNegativeIsTrueBoolVec(ConstantMask);

  // Mask was extended from a boolean vector.
  Value *ExtMask;
  if (PatternMatch::match(
          Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
      ExtMask->getType()->isIntOrIntVectorTy(1))
    return ExtMask;

  return nullptr;
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Constant *ZeroVec = Constant::getNullValue(II.getType());

  // Zero Mask - masked load instruction creates a zero vector.
  if (isa<ConstantAggregateZero>(Mask))
    return IC.replaceInstUsesWith(II, ZeroVec);

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  if (Value *BoolMask = getBoolVecFromMask(Mask)) {
    // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
    // the LLVM intrinsic definition for the pointer argument.
    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

    // The pass-through vector for an x86 masked load is a zero vector.
    CallInst *NewMaskedLoad =
        IC.Builder.CreateMaskedLoad(PtrCast, Align(1), BoolMask, ZeroVec);
    return IC.replaceInstUsesWith(II, NewMaskedLoad);
  }

  return nullptr;
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Value *Vec = II.getOperand(2);

  // Zero Mask - this masked store instruction does nothing.
  if (isa<ConstantAggregateZero>(Mask)) {
    IC.eraseInstFromFunction(II);
    return true;
  }

  // The SSE2 version is too weird (e.g., unaligned but non-temporal) to do
  // anything else at this level.
  if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
    return false;

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  if (Value *BoolMask = getBoolVecFromMask(Mask)) {
    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

    IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);

    // 'Replace uses' doesn't work for stores. Erase the original masked store.
    IC.eraseInstFromFunction(II);
    return true;
  }

  return false;
}

static Value *simplifyX86immShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;
  bool IsImm = false;

  switch (II.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
    IsImm = true;
    LLVM_FALLTHROUGH;
  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
    IsImm = true;
    LLVM_FALLTHROUGH;
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    IsImm = true;
    LLVM_FALLTHROUGH;
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  auto Vec = II.getArgOperand(0);
  auto Amt = II.getArgOperand(1);
  auto VT = cast<FixedVectorType>(Vec->getType());
  auto SVT = VT->getElementType();
  auto AmtVT = Amt->getType();
  unsigned VWidth = VT->getNumElements();
  unsigned BitWidth = SVT->getPrimitiveSizeInBits();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift.
  // If it's guaranteed to be out of range, logical shifts combine to zero and
  // arithmetic shifts are clamped to (BitWidth - 1).
  if (IsImm) {
    assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
    KnownBits KnownAmtBits =
        llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
    if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
      Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
      Amt = Builder.CreateVectorSplat(VWidth, Amt);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
    if (KnownAmtBits.getMinValue().uge(BitWidth)) {
      if (LogicalShift)
        return ConstantAggregateZero::get(VT);
      Amt = ConstantInt::get(SVT, BitWidth - 1);
      return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
    }
  } else {
    // Ensure the first element has an in-range value and the rest of the
    // elements in the bottom 64 bits are zero.
    assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
           cast<VectorType>(AmtVT)->getElementType() == SVT &&
           "Unexpected shift-by-scalar type");
    unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
    APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
    APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
    KnownBits KnownLowerBits = llvm::computeKnownBits(
        Amt, DemandedLower, II.getModule()->getDataLayout());
    KnownBits KnownUpperBits = llvm::computeKnownBits(
        Amt, DemandedUpper, II.getModule()->getDataLayout());
    if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
        (DemandedUpper.isNullValue() || KnownUpperBits.isZero())) {
      SmallVector<int, 16> ZeroSplat(VWidth, 0);
      Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
  }

  // Simplify if count is constant vector.
  auto CDV = dyn_cast<ConstantDataVector>(Amt);
  if (!CDV)
    return nullptr;

  // SSE2/AVX2 uses only the first 64 bits of the 128-bit vector
  // operand to compute the shift amount.
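  // For example (an illustrative constant case, not taken from a test): with
  // psrl.w and an amount vector of <i16 2, i16 0, i16 0, i16 0, ...>, the low
  // 64 bits concatenate to a count of 2, so the intrinsic becomes:
  //   %res = lshr <8 x i16> %v, <i16 2, i16 2, ..., i16 2>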
  assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
         cast<VectorType>(AmtVT)->getElementType() == SVT &&
         "Unexpected shift-by-scalar type");

  // Concatenate the sub-elements to create the 64-bit value.
  APInt Count(64, 0);
  for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
    unsigned SubEltIdx = (NumSubElts - 1) - i;
    auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
    Count <<= BitWidth;
    Count |= SubElt->getValue().zextOrTrunc(64);
  }

  // If shift-by-zero then just return the original value.
  if (Count.isNullValue())
    return Vec;

  // Handle cases when Shift >= BitWidth.
  if (Count.uge(BitWidth)) {
    // If LogicalShift - just return zero.
    if (LogicalShift)
      return ConstantAggregateZero::get(VT);

    // If ArithmeticShift - clamp Shift to (BitWidth - 1).
    Count = APInt(64, BitWidth - 1);
  }

  // Get a constant vector of the same type as the first operand.
  auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
  auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
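// For example (illustrative IR, assuming every amount is known in-range):
//   %r = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %x,
//                                              <4 x i32> <i32 0, i32 1, i32 2, i32 3>)
// can be rewritten as:
//   %r = lshr <4 x i32> %x, <i32 0, i32 1, i32 2, i32 3>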
static Value *simplifyX86varShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;

  switch (II.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  auto Vec = II.getArgOperand(0);
  auto Amt = II.getArgOperand(1);
  auto VT = cast<FixedVectorType>(II.getType());
  auto SVT = VT->getElementType();
  int NumElts = VT->getNumElements();
  int BitWidth = SVT->getIntegerBitWidth();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift.
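  // (For example, for 32-bit elements any amount whose bits above bit 4 are
  // known to be zero must lie in [0, 31]; a preceding 'and %amt, 31' is a
  // typical way this becomes provable.)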
  APInt UpperBits =
      APInt::getHighBitsSet(BitWidth, BitWidth - Log2_32(BitWidth));
  if (llvm::MaskedValueIsZero(Amt, UpperBits,
                              II.getModule()->getDataLayout())) {
    return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                      : Builder.CreateLShr(Vec, Amt))
                         : Builder.CreateAShr(Vec, Amt));
  }

  // Simplify if all shift amounts are constant/undef.
  auto *CShift = dyn_cast<Constant>(Amt);
  if (!CShift)
    return nullptr;

  // Collect each element's shift amount.
  // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
  bool AnyOutOfRange = false;
  SmallVector<int, 8> ShiftAmts;
  for (int I = 0; I < NumElts; ++I) {
    auto *CElt = CShift->getAggregateElement(I);
    if (isa_and_nonnull<UndefValue>(CElt)) {
      ShiftAmts.push_back(-1);
      continue;
    }

    auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
    if (!COp)
      return nullptr;

    // Handle out of range shifts.
    // If LogicalShift - set to BitWidth (special case).
    // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
    APInt ShiftVal = COp->getValue();
    if (ShiftVal.uge(BitWidth)) {
      AnyOutOfRange = LogicalShift;
      ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
      continue;
    }

    ShiftAmts.push_back((int)ShiftVal.getZExtValue());
  }

  // If all elements out of range or UNDEF, return vector of zeros/undefs.
  // ArithmeticShift should only hit this if they are all UNDEF.
  auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
  if (llvm::all_of(ShiftAmts, OutOfRange)) {
    SmallVector<Constant *, 8> ConstantVec;
    for (int Idx : ShiftAmts) {
      if (Idx < 0) {
        ConstantVec.push_back(UndefValue::get(SVT));
      } else {
        assert(LogicalShift && "Logical shift expected");
        ConstantVec.push_back(ConstantInt::getNullValue(SVT));
      }
    }
    return ConstantVector::get(ConstantVec);
  }

  // We can't handle only some out of range values with generic logical shifts.
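  // (A generic IR lshr/shl by an amount >= BitWidth is poison rather than
  // zero, so the x86 out-of-range semantics can't be expressed per-element.)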
  if (AnyOutOfRange)
    return nullptr;

  // Build the shift amount constant vector.
  SmallVector<Constant *, 8> ShiftVecAmts;
  for (int Idx : ShiftAmts) {
    if (Idx < 0)
      ShiftVecAmts.push_back(UndefValue::get(SVT));
    else
      ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
  }
  auto ShiftVec = ConstantVector::get(ShiftVecAmts);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

static Value *simplifyX86pack(IntrinsicInst &II,
                              InstCombiner::BuilderTy &Builder, bool IsSigned) {
  Value *Arg0 = II.getArgOperand(0);
  Value *Arg1 = II.getArgOperand(1);
  Type *ResTy = II.getType();

  // Fast all undef handling.
  if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
    return UndefValue::get(ResTy);

  auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
  unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
  unsigned NumSrcElts = ArgTy->getNumElements();
  assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
         "Unexpected packing types");

  unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
  unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
  unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
  assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
         "Unexpected packing types");

  // Constant folding.
  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
    return nullptr;

  // Clamp Values - signed/unsigned both use signed clamp values, but they
  // differ on the min/max values.
  APInt MinValue, MaxValue;
  if (IsSigned) {
    // PACKSS: Truncate signed value with signed saturation.
    // Source values less than dst minint are saturated to minint.
    // Source values greater than dst maxint are saturated to maxint.
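    // For example, for packssdw (i32 -> i16): a source value of 70000 clamps
    // to 32767 and -70000 clamps to -32768 before the truncation below.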
    MinValue =
        APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
    MaxValue =
        APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
  } else {
    // PACKUS: Truncate signed value with unsigned saturation.
    // Source values less than zero are saturated to zero.
    // Source values greater than dst maxuint are saturated to maxuint.
    MinValue = APInt::getNullValue(SrcScalarSizeInBits);
    MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
  }

  auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
  auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);

  // Shuffle clamped args together at the lane level.
  SmallVector<int, 32> PackMask;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
  }
  auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);

  // Truncate to dst size.
  return Builder.CreateTrunc(Shuffle, ResTy);
}

static Value *simplifyX86movmsk(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Value *Arg = II.getArgOperand(0);
  Type *ResTy = II.getType();

  // movmsk(undef) -> zero as we must ensure the upper bits are zero.
  if (isa<UndefValue>(Arg))
    return Constant::getNullValue(ResTy);

  auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
  // We can't easily peek through x86_mmx types.
  if (!ArgTy)
    return nullptr;

  // Expand MOVMSK to compare/bitcast/zext:
  // e.g. PMOVMSKB(v16i8 x):
  //   %cmp = icmp slt <16 x i8> %x, zeroinitializer
  //   %int = bitcast <16 x i1> %cmp to i16
  //   %res = zext i16 %int to i32
  unsigned NumElts = ArgTy->getNumElements();
  Type *IntegerVecTy = VectorType::getInteger(ArgTy);
  Type *IntegerTy = Builder.getIntNTy(NumElts);

  Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy);
  Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy));
  Res = Builder.CreateBitCast(Res, IntegerTy);
  Res = Builder.CreateZExtOrTrunc(Res, ResTy);
  return Res;
}

static Value *simplifyX86addcarry(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  Value *CarryIn = II.getArgOperand(0);
  Value *Op1 = II.getArgOperand(1);
  Value *Op2 = II.getArgOperand(2);
  Type *RetTy = II.getType();
  Type *OpTy = Op1->getType();
  assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
         RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
         "Unexpected types for x86 addcarry");

  // If carry-in is zero, this is just an unsigned add with overflow.
  if (match(CarryIn, PatternMatch::m_ZeroInt())) {
    Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
                                          {Op1, Op2});
    // The types have to be adjusted to match the x86 call types.
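    // (uadd.with.overflow returns { iN, i1 } while the x86 intrinsic returns
    // { i8 carry-out, iN result }, so the i1 is widened to i8 and the two
    // fields swap positions below.)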
    Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
    Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
                                       Builder.getInt8Ty());
    Value *Res = UndefValue::get(RetTy);
    Res = Builder.CreateInsertValue(Res, UAddOV, 0);
    return Builder.CreateInsertValue(Res, UAddResult, 1);
  }

  return nullptr;
}

static Value *simplifyX86insertps(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
  if (!CInt)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");

  // The immediate permute control byte looks like this:
  //  [3:0] - zero mask for each 32-bit lane
  //  [5:4] - select one 32-bit destination lane
  //  [7:6] - select one 32-bit source lane

  uint8_t Imm = CInt->getZExtValue();
  uint8_t ZMask = Imm & 0xf;
  uint8_t DestLane = (Imm >> 4) & 0x3;
  uint8_t SourceLane = (Imm >> 6) & 0x3;

  ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);

  // If all zero mask bits are set, this was just a weird way to
  // generate a zero vector.
  if (ZMask == 0xf)
    return ZeroVector;

  // Initialize by passing all of the first source bits through.
  int ShuffleMask[4] = {0, 1, 2, 3};

  // We may replace the second operand with the zero vector.
  Value *V1 = II.getArgOperand(1);

  if (ZMask) {
    // If the zero mask is being used with a single input or the zero mask
    // overrides the destination lane, this is a shuffle with the zero vector.
    if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
        (ZMask & (1 << DestLane))) {
      V1 = ZeroVector;
      // We may still move 32-bits of the first source vector from one lane
      // to another.
      ShuffleMask[DestLane] = SourceLane;
      // The zero mask may override the previous insert operation.
      for (unsigned i = 0; i < 4; ++i)
        if ((ZMask >> i) & 0x1)
          ShuffleMask[i] = i + 4;
    } else {
      // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
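      // (One possible sketch: insert the selected source lane first, then
      // shuffle the result against a zero vector to apply the remaining
      // ZMask lanes.)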
      return nullptr;
    }
  } else {
    // Replace the selected destination lane with the selected source lane.
    ShuffleMask[DestLane] = SourceLane + 4;
  }

  return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
}

/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
/// or conversion to a shuffle vector.
static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
                               ConstantInt *CILength, ConstantInt *CIIndex,
                               InstCombiner::BuilderTy &Builder) {
  auto LowConstantHighUndef = [&](uint64_t Val) {
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  };

  // See if we're dealing with constant values.
  Constant *C0 = dyn_cast<Constant>(Op0);
  ConstantInt *CI0 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;

  // Attempt to constant fold.
  if (CILength && CIIndex) {
    // From AMD documentation: "The bit index and field length are each six
    // bits in length other bits of the field are ignored."
    APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
    APInt APLength = CILength->getValue().zextOrTrunc(6);

    unsigned Index = APIndex.getZExtValue();

    // From AMD documentation: "a value of zero in the field length is
    // defined as length of 64".
    unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

    // From AMD documentation: "If the sum of the bit index + length field
    // is greater than 64, the results are undefined".
    unsigned End = Index + Length;

    // Note that both field index and field length are 8-bit quantities.
    // Since variables 'Index' and 'Length' are unsigned values
    // obtained from zero-extending field index and field length
    // respectively, their sum should never wrap around.
    if (End > 64)
      return UndefValue::get(II.getType());

    // If we are extracting whole bytes, we can convert this to a shuffle.
    // Lowering can recognize EXTRQI shuffle masks.
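    // For example (illustrative values, Index = 8 and Length = 16): source
    // bytes 1-2 move to result bytes 0-1, result bytes 2-7 come from the
    // zero vector, and the upper 8 bytes are left undef.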
    if ((Length % 8) == 0 && (Index % 8) == 0) {
      // Convert bit indices to byte indices.
      Length /= 8;
      Index /= 8;

      Type *IntTy8 = Type::getInt8Ty(II.getContext());
      auto *ShufTy = FixedVectorType::get(IntTy8, 16);

      SmallVector<int, 16> ShuffleMask;
      for (int i = 0; i != (int)Length; ++i)
        ShuffleMask.push_back(i + Index);
      for (int i = Length; i != 8; ++i)
        ShuffleMask.push_back(i + 16);
      for (int i = 8; i != 16; ++i)
        ShuffleMask.push_back(-1);

      Value *SV = Builder.CreateShuffleVector(
          Builder.CreateBitCast(Op0, ShufTy),
          ConstantAggregateZero::get(ShufTy), ShuffleMask);
      return Builder.CreateBitCast(SV, II.getType());
    }

    // Constant Fold - shift Index'th bit to lowest position and mask off
    // Length bits.
    if (CI0) {
      APInt Elt = CI0->getValue();
      Elt.lshrInPlace(Index);
      Elt = Elt.zextOrTrunc(Length);
      return LowConstantHighUndef(Elt.getZExtValue());
    }

    // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
    if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
      Value *Args[] = {Op0, CILength, CIIndex};
      Module *M = II.getModule();
      Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
      return Builder.CreateCall(F, Args);
    }
  }

  // Constant Fold - extraction from zero is always {zero, undef}.
  if (CI0 && CI0->isZero())
    return LowConstantHighUndef(0);

  return nullptr;
}

/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
/// folding or conversion to a shuffle vector.
static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
                                 APInt APLength, APInt APIndex,
                                 InstCombiner::BuilderTy &Builder) {
  // From AMD documentation: "The bit index and field length are each six bits
  // in length other bits of the field are ignored."
  APIndex = APIndex.zextOrTrunc(6);
  APLength = APLength.zextOrTrunc(6);

  // Attempt to constant fold.
  unsigned Index = APIndex.getZExtValue();

  // From AMD documentation: "a value of zero in the field length is
  // defined as length of 64".
  unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

  // From AMD documentation: "If the sum of the bit index + length field
  // is greater than 64, the results are undefined".
  unsigned End = Index + Length;

  // Note that both field index and field length are 8-bit quantities.
  // Since variables 'Index' and 'Length' are unsigned values
  // obtained from zero-extending field index and field length
  // respectively, their sum should never wrap around.
  if (End > 64)
    return UndefValue::get(II.getType());

  // If we are inserting whole bytes, we can convert this to a shuffle.
  // Lowering can recognize INSERTQI shuffle masks.
  if ((Length % 8) == 0 && (Index % 8) == 0) {
    // Convert bit indices to byte indices.
    Length /= 8;
    Index /= 8;

    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    auto *ShufTy = FixedVectorType::get(IntTy8, 16);

    SmallVector<int, 16> ShuffleMask;
    for (int i = 0; i != (int)Index; ++i)
      ShuffleMask.push_back(i);
    for (int i = 0; i != (int)Length; ++i)
      ShuffleMask.push_back(i + 16);
    for (int i = Index + Length; i != 8; ++i)
      ShuffleMask.push_back(i);
    for (int i = 8; i != 16; ++i)
      ShuffleMask.push_back(-1);

    Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
                                            Builder.CreateBitCast(Op1, ShufTy),
                                            ShuffleMask);
    return Builder.CreateBitCast(SV, II.getType());
  }

  // See if we're dealing with constant values.
  Constant *C0 = dyn_cast<Constant>(Op0);
  Constant *C1 = dyn_cast<Constant>(Op1);
  ConstantInt *CI00 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;
  ConstantInt *CI10 =
      C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
         : nullptr;

  // Constant Fold - insert bottom Length bits starting at the Index'th bit.
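  // For example, with Index = 16 and Length = 8 the fold below computes
  //   (V00 & ~(0xFF << 16)) | ((V10 & 0xFF) << 16).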
  if (CI00 && CI10) {
    APInt V00 = CI00->getValue();
    APInt V10 = CI10->getValue();
    APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
    V00 = V00 & ~Mask;
    V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
    APInt Val = V00 | V10;
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  }

  // If we were an INSERTQ call, we'll save demanded elements if we convert to
  // INSERTQI.
  if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    Constant *CILength = ConstantInt::get(IntTy8, Length, false);
    Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);

    Value *Args[] = {Op0, Op1, CILength, CIIndex};
    Module *M = II.getModule();
    Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
    return Builder.CreateCall(F, Args);
  }

  return nullptr;
}

/// Attempt to convert pshufb* to shufflevector if the mask is constant.
static Value *simplifyX86pshufb(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
         "Unexpected number of elements in shuffle mask!");

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[64];

  // Each byte in the shuffle control mask forms an index to permute the
  // corresponding byte in the destination operand.
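  // For example, a control byte of 0x02 selects byte 2 of the source's
  // 128-bit lane, while any control byte with bit 7 set (0x80..0xFF) selects
  // a zero byte from the null vector used as the second shuffle operand.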
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();

    // If the most significant bit (bit[7]) of each byte of the shuffle
    // control mask is set, then zero is written in the result byte.
    // The zero vector is in the right-hand side of the resulting
    // shufflevector.

    // The value of each index for the high 128-bit lane is the least
    // significant 4 bits of the respective shuffle control byte.
    Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
    Indexes[I] = Index;
  }

  auto V1 = II.getArgOperand(0);
  auto V2 = Constant::getNullValue(VecTy);
  return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts));
}

/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
                                    InstCombiner::BuilderTy &Builder) {
  Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  bool IsPD = VecTy->getScalarType()->isDoubleTy();
  unsigned NumLaneElts = IsPD ? 2 : 4;
  assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[16];

  // The intrinsics only read one or two bits, clear the rest.
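  // (vpermilvar.ps selects with bits [1:0] of each mask element, while
  // vpermilvar.pd selects with bit [1]; the shift below normalizes the PD
  // form to a plain element index.)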
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    APInt Index = cast<ConstantInt>(COp)->getValue();
    Index = Index.zextOrTrunc(32).getLoBits(2);

    // The PD variants use bit 1 to select the per-lane element index, so
    // shift down to convert to a generic shuffle mask index.
    if (IsPD)
      Index.lshrInPlace(1);

    // The _256 variants are a bit trickier since the mask bits always index
    // into the corresponding 128-bit half. In order to convert to a generic
    // shuffle, we have to make that explicit.
    Index += APInt(32, (I / NumLaneElts) * NumLaneElts);

    Indexes[I] = Index.getZExtValue();
  }

  auto V1 = II.getArgOperand(0);
  return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, NumElts));
}

/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
static Value *simplifyX86vpermv(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned Size = VecTy->getNumElements();
  assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
         "Unexpected shuffle mask size");

  // Construct a shuffle mask from constant integers or UNDEFs.
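  // For example (illustrative), a vpermd mask of <7, 6, 5, 4, 3, 2, 1, 0>
  // becomes a shufflevector that reverses the eight dwords; each index is
  // masked by (Size - 1) below, matching the hardware's modulo behaviour.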
  int Indexes[64];

  for (unsigned I = 0; I < Size; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
    Index &= Size - 1;
    Indexes[I] = Index;
  }

  auto V1 = II.getArgOperand(0);
  return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, Size));
}

Optional<Instruction *>
X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
                                             unsigned DemandedWidth) {
    APInt UndefElts(Width, 0);
    APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
    return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
  };

  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::x86_bmi_bextr_32:
  case Intrinsic::x86_bmi_bextr_64:
  case Intrinsic::x86_tbm_bextri_u32:
  case Intrinsic::x86_tbm_bextri_u64:
    // If the RHS is a constant we can try some simplifications.
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      uint64_t Shift = C->getZExtValue();
      uint64_t Length = (Shift >> 8) & 0xff;
      Shift &= 0xff;
      unsigned BitWidth = II.getType()->getIntegerBitWidth();
      // If the length is 0 or the shift is out of range, replace with zero.
      if (Length == 0 || Shift >= BitWidth) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      // If the LHS is also a constant, we can completely constant fold this.
      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Result = InC->getZExtValue() >> Shift;
        if (Length > BitWidth)
          Length = BitWidth;
        Result &= maskTrailingOnes<uint64_t>(Length);
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
      // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
      // are only masking bits that a shift already cleared?
    }
    break;

  case Intrinsic::x86_bmi_bzhi_32:
  case Intrinsic::x86_bmi_bzhi_64:
    // If the RHS is a constant we can try some simplifications.
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      uint64_t Index = C->getZExtValue() & 0xff;
      unsigned BitWidth = II.getType()->getIntegerBitWidth();
      if (Index >= BitWidth) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }
      if (Index == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      // If the LHS is also a constant, we can completely constant fold this.
      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Result = InC->getZExtValue();
        Result &= maskTrailingOnes<uint64_t>(Index);
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
      // TODO: Should we convert this to an AND if the RHS is constant?
    }
    break;
  case Intrinsic::x86_bmi_pext_32:
  case Intrinsic::x86_bmi_pext_64:
    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      if (MaskC->isNullValue()) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      if (MaskC->isAllOnesValue()) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }

      if (MaskC->getValue().isShiftedMask()) {
        // Any single contiguous sequence of 1s anywhere in the mask simply
        // describes a subset of the input bits shifted to the appropriate
        // position. Replace with the straightforward IR.
        unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
        Value *Input = II.getArgOperand(0);
        Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
        Value *Shifted = IC.Builder.CreateLShr(Masked,
                                               ConstantInt::get(II.getType(),
                                                                ShiftAmount));
        return IC.replaceInstUsesWith(II, Shifted);
      }

      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Src = SrcC->getZExtValue();
        uint64_t Mask = MaskC->getZExtValue();
        uint64_t Result = 0;
        uint64_t BitToSet = 1;

        while (Mask) {
          // Isolate lowest set bit.
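          // For example (illustrative values): Mask = 0b1010 gives
          // Mask & -Mask == 0b0010, the lowest set bit.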
          uint64_t BitToTest = Mask & -Mask;
          if (BitToTest & Src)
            Result |= BitToSet;

          BitToSet <<= 1;
          // Clear lowest set bit.
          Mask &= Mask - 1;
        }

        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
    }
    break;
  case Intrinsic::x86_bmi_pdep_32:
  case Intrinsic::x86_bmi_pdep_64:
    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      if (MaskC->isNullValue()) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      if (MaskC->isAllOnesValue()) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }
      if (MaskC->getValue().isShiftedMask()) {
        // Any single contiguous sequence of 1s anywhere in the mask simply
        // describes a subset of the input bits shifted to the appropriate
        // position. Replace with the straightforward IR.
        unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
        Value *Input = II.getArgOperand(0);
        Value *Shifted = IC.Builder.CreateShl(Input,
                                              ConstantInt::get(II.getType(),
                                                               ShiftAmount));
        Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
        return IC.replaceInstUsesWith(II, Masked);
      }

      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Src = SrcC->getZExtValue();
        uint64_t Mask = MaskC->getZExtValue();
        uint64_t Result = 0;
        uint64_t BitToTest = 1;

        while (Mask) {
          // Isolate lowest set bit.
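          // For example (illustrative values): PDEP with Src = 0b11 and
          // Mask = 0b1010 deposits the two low source bits at the mask's set
          // positions, producing 0b1010.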
          uint64_t BitToSet = Mask & -Mask;
          if (BitToTest & Src)
            Result |= BitToSet;

          BitToTest <<= 1;
          // Clear lowest set bit.
          Mask &= Mask - 1;
        }

        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
    }
    break;

  case Intrinsic::x86_sse_cvtss2si:
  case Intrinsic::x86_sse_cvtss2si64:
  case Intrinsic::x86_sse_cvttss2si:
  case Intrinsic::x86_sse_cvttss2si64:
  case Intrinsic::x86_sse2_cvtsd2si:
  case Intrinsic::x86_sse2_cvtsd2si64:
  case Intrinsic::x86_sse2_cvttsd2si:
  case Intrinsic::x86_sse2_cvttsd2si64:
  case Intrinsic::x86_avx512_vcvtss2si32:
  case Intrinsic::x86_avx512_vcvtss2si64:
  case Intrinsic::x86_avx512_vcvtss2usi32:
  case Intrinsic::x86_avx512_vcvtss2usi64:
  case Intrinsic::x86_avx512_vcvtsd2si32:
  case Intrinsic::x86_avx512_vcvtsd2si64:
  case Intrinsic::x86_avx512_vcvtsd2usi32:
  case Intrinsic::x86_avx512_vcvtsd2usi64:
  case Intrinsic::x86_avx512_cvttss2si:
  case Intrinsic::x86_avx512_cvttss2si64:
  case Intrinsic::x86_avx512_cvttss2usi:
  case Intrinsic::x86_avx512_cvttss2usi64:
  case Intrinsic::x86_avx512_cvttsd2si:
  case Intrinsic::x86_avx512_cvttsd2si64:
  case Intrinsic::x86_avx512_cvttsd2usi:
  case Intrinsic::x86_avx512_cvttsd2usi64: {
    // These intrinsics only demand the 0th element of their input vectors. If
    // we can simplify the input based on that, do so now.
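    // For example (illustrative IR): in
    //   %v = insertelement <4 x float> %w, float %x, i32 2
    //   %r = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %v)
    // only lane 0 is read, so the operand simplifies to %w.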
    Value *Arg = II.getArgOperand(0);
    unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }

  case Intrinsic::x86_mmx_pmovmskb:
  case Intrinsic::x86_sse_movmsk_ps:
  case Intrinsic::x86_sse2_movmsk_pd:
  case Intrinsic::x86_sse2_pmovmskb_128:
  case Intrinsic::x86_avx_movmsk_pd_256:
  case Intrinsic::x86_avx_movmsk_ps_256:
  case Intrinsic::x86_avx2_pmovmskb:
    if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse_comieq_ss:
  case Intrinsic::x86_sse_comige_ss:
  case Intrinsic::x86_sse_comigt_ss:
  case Intrinsic::x86_sse_comile_ss:
  case Intrinsic::x86_sse_comilt_ss:
  case Intrinsic::x86_sse_comineq_ss:
  case Intrinsic::x86_sse_ucomieq_ss:
  case Intrinsic::x86_sse_ucomige_ss:
  case Intrinsic::x86_sse_ucomigt_ss:
  case Intrinsic::x86_sse_ucomile_ss:
  case Intrinsic::x86_sse_ucomilt_ss:
  case Intrinsic::x86_sse_ucomineq_ss:
  case Intrinsic::x86_sse2_comieq_sd:
  case Intrinsic::x86_sse2_comige_sd:
  case Intrinsic::x86_sse2_comigt_sd:
  case Intrinsic::x86_sse2_comile_sd:
  case Intrinsic::x86_sse2_comilt_sd:
  case Intrinsic::x86_sse2_comineq_sd:
  case Intrinsic::x86_sse2_ucomieq_sd:
  case Intrinsic::x86_sse2_ucomige_sd:
  case Intrinsic::x86_sse2_ucomigt_sd:
  case Intrinsic::x86_sse2_ucomile_sd:
  case Intrinsic::x86_sse2_ucomilt_sd:
  case Intrinsic::x86_sse2_ucomineq_sd:
  case Intrinsic::x86_avx512_vcomi_ss:
  case Intrinsic::x86_avx512_vcomi_sd:
  case Intrinsic::x86_avx512_mask_cmp_ss:
  case Intrinsic::x86_avx512_mask_cmp_sd: {
    // These intrinsics only demand the 0th element of their input vectors. If
    // we can simplify the input based on that, do so now.
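    // For example (illustrative): comieq_ss compares only %a[0] with %b[0],
    // so shuffles or inserts that only affect the upper lanes of either
    // operand can be simplified away.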
    bool MadeChange = false;
    Value *Arg0 = II.getArgOperand(0);
    Value *Arg1 = II.getArgOperand(1);
    unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }

  case Intrinsic::x86_avx512_add_ps_512:
  case Intrinsic::x86_avx512_div_ps_512:
  case Intrinsic::x86_avx512_mul_ps_512:
  case Intrinsic::x86_avx512_sub_ps_512:
  case Intrinsic::x86_avx512_add_pd_512:
  case Intrinsic::x86_avx512_div_pd_512:
  case Intrinsic::x86_avx512_mul_pd_512:
  case Intrinsic::x86_avx512_sub_pd_512:
    // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
    // IR operations.
    if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
      if (R->getValue() == 4) {
        Value *Arg0 = II.getArgOperand(0);
        Value *Arg1 = II.getArgOperand(1);

        Value *V;
        switch (IID) {
        default:
          llvm_unreachable("Case stmts out of sync!");
        case Intrinsic::x86_avx512_add_ps_512:
        case Intrinsic::x86_avx512_add_pd_512:
          V = IC.Builder.CreateFAdd(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_sub_ps_512:
        case Intrinsic::x86_avx512_sub_pd_512:
          V = IC.Builder.CreateFSub(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_mul_ps_512:
        case Intrinsic::x86_avx512_mul_pd_512:
          V = IC.Builder.CreateFMul(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_div_ps_512:
        case Intrinsic::x86_avx512_div_pd_512:
          V = IC.Builder.CreateFDiv(Arg0, Arg1);
          break;
        }

        return IC.replaceInstUsesWith(II, V);
      }
    }
    break;

  case Intrinsic::x86_avx512_mask_add_ss_round:
  case Intrinsic::x86_avx512_mask_div_ss_round:
  case Intrinsic::x86_avx512_mask_mul_ss_round:
  case Intrinsic::x86_avx512_mask_sub_ss_round:
  case Intrinsic::x86_avx512_mask_add_sd_round:
  case Intrinsic::x86_avx512_mask_div_sd_round:
  case Intrinsic::x86_avx512_mask_mul_sd_round:
  case Intrinsic::x86_avx512_mask_sub_sd_round:
    // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
    // IR operations.
    if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
      if (R->getValue() == 4) {
        // Extract the elements as scalars.
        Value *Arg0 = II.getArgOperand(0);
        Value *Arg1 = II.getArgOperand(1);
        Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
        Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);

        Value *V;
        switch (IID) {
        default:
          llvm_unreachable("Case stmts out of sync!");
        case Intrinsic::x86_avx512_mask_add_ss_round:
        case Intrinsic::x86_avx512_mask_add_sd_round:
          V = IC.Builder.CreateFAdd(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_sub_ss_round:
        case Intrinsic::x86_avx512_mask_sub_sd_round:
          V = IC.Builder.CreateFSub(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_mul_ss_round:
        case Intrinsic::x86_avx512_mask_mul_sd_round:
          V = IC.Builder.CreateFMul(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_div_ss_round:
        case Intrinsic::x86_avx512_mask_div_sd_round:
          V = IC.Builder.CreateFDiv(LHS, RHS);
          break;
        }

        // Handle the masking aspect of the intrinsic.
        Value *Mask = II.getArgOperand(3);
        auto *C = dyn_cast<ConstantInt>(Mask);
        // We don't need a select if we know the mask bit is a 1.
        if (!C || !C->getValue()[0]) {
          // Cast the mask to an i1 vector and then extract the lowest element.
          auto *MaskTy = FixedVectorType::get(
              IC.Builder.getInt1Ty(),
              cast<IntegerType>(Mask->getType())->getBitWidth());
          Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
          Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
          // Extract the lowest element from the passthru operand.
          Value *Passthru =
              IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
          V = IC.Builder.CreateSelect(Mask, V, Passthru);
        }

        // Insert the result back into the original argument 0.
        V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);

        return IC.replaceInstUsesWith(II, V);
      }
    }
    break;

  // Constant fold ashr( <A x Bi>, Ci ).
  // Constant fold lshr( <A x Bi>, Ci ).
  // Constant fold shl( <A x Bi>, Ci ).
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    if (Value *V = simplifyX86immShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512: {
    if (Value *V = simplifyX86immShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // SSE2/AVX2 use only the first 64 bits of the 128-bit vector
    // operand to compute the shift amount.
    Value *Arg1 = II.getArgOperand(1);
    assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
           "Unexpected packed shift size");
    unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();

    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
      return IC.replaceOperand(II, 1, V);
    }
    break;
  }

  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    if (Value *V = simplifyX86varShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_packssdw_128:
  case Intrinsic::x86_sse2_packsswb_128:
  case Intrinsic::x86_avx2_packssdw:
  case Intrinsic::x86_avx2_packsswb:
  case Intrinsic::x86_avx512_packssdw_512:
  case Intrinsic::x86_avx512_packsswb_512:
    if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_packuswb_128:
  case Intrinsic::x86_sse41_packusdw:
  case Intrinsic::x86_avx2_packusdw:
  case Intrinsic::x86_avx2_packuswb:
  case Intrinsic::x86_avx512_packusdw_512:
  case Intrinsic::x86_avx512_packuswb_512:
    if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_pclmulqdq:
  case Intrinsic::x86_pclmulqdq_256:
  case Intrinsic::x86_pclmulqdq_512: {
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
      unsigned Imm = C->getZExtValue();

      bool MadeChange = false;
      Value *Arg0 = II.getArgOperand(0);
      Value *Arg1 = II.getArgOperand(1);
      unsigned VWidth =
          cast<FixedVectorType>(Arg0->getType())->getNumElements();

      APInt UndefElts1(VWidth, 0);
      APInt DemandedElts1 =
          APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
      if (Value *V =
              IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
        IC.replaceOperand(II, 0, V);
        MadeChange = true;
      }

      APInt UndefElts2(VWidth, 0);
      APInt DemandedElts2 =
          APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
      if (Value *V =
              IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
        IC.replaceOperand(II, 1, V);
        MadeChange = true;
      }

      // If all of the demanded elements of either input are undef, the
      // result is zero.
      if (DemandedElts1.isSubsetOf(UndefElts1) ||
          DemandedElts2.isSubsetOf(UndefElts2)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantAggregateZero::get(II.getType()));
      }

      if (MadeChange) {
        return &II;
      }
    }
    break;
  }

  case Intrinsic::x86_sse41_insertps:
    if (Value *V = simplifyX86insertps(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse4a_extrq: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
    unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
           VWidth1 == 16 && "Unexpected operand sizes");

    // See if we're dealing with constant values.
    Constant *C1 = dyn_cast<Constant>(Op1);
    ConstantInt *CILength =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
           : nullptr;
    ConstantInt *CIIndex =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
           : nullptr;

    // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
    if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // EXTRQ only uses the lowest 64 bits of the first 128-bit vector
    // operand and the lowest 16 bits of the second.
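    // For example (illustrative values): byte 0 of Op1 holds the field length
    // and byte 1 the start index, so a constant Op1 beginning
    // <i8 4, i8 8, ...> extracts a 4-bit field starting at bit 8 of Op0's
    // low 64 bits.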
    bool MadeChange = false;
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }

  case Intrinsic::x86_sse4a_extrqi: {
    // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
    // bits of the lower 64 bits. The upper 64 bits are undefined.
    Value *Op0 = II.getArgOperand(0);
    unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
           "Unexpected operand size");

    // See if we're dealing with constant values.
    ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
    ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));

    // Attempt to simplify to a constant or shuffle vector.
    if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // EXTRQI only uses the lowest 64 bits of the first 128-bit vector
    // operand.
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }

  case Intrinsic::x86_sse4a_insertq: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
           cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
           "Unexpected operand size");

    // See if we're dealing with constant values.
    Constant *C1 = dyn_cast<Constant>(Op1);
    ConstantInt *CI11 =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
           : nullptr;

    // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
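    // For example (illustrative values): the upper i64 element of Op1 encodes
    // the field length in bits [5:0] and the insertion index in bits [13:8],
    // so V11 = 0x0804 below decodes to Len = 4, Idx = 8.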
    if (CI11) {
      const APInt &V11 = CI11->getValue();
      APInt Len = V11.zextOrTrunc(6);
      APInt Idx = V11.lshr(8).zextOrTrunc(6);
      if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
        return IC.replaceInstUsesWith(II, V);
      }
    }

    // INSERTQ only uses the lowest 64 bits of the first 128-bit vector
    // operand.
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }

  case Intrinsic::x86_sse4a_insertqi: {
    // INSERTQI: Extract lowest Length bits from lower half of second source
    // and insert over first source starting at Index bit. The upper 64 bits
    // are undefined.
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
    unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
           VWidth1 == 2 && "Unexpected operand sizes");

    // See if we're dealing with constant values.
    ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
    ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));

    // Attempt to simplify to a constant or shuffle vector.
    if (CILength && CIIndex) {
      APInt Len = CILength->getValue().zextOrTrunc(6);
      APInt Idx = CIIndex->getValue().zextOrTrunc(6);
      if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
        return IC.replaceInstUsesWith(II, V);
      }
    }

    // INSERTQI only uses the lowest 64 bits of the first two 128-bit vector
    // operands.
    bool MadeChange = false;
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }

  case Intrinsic::x86_sse41_pblendvb:
  case Intrinsic::x86_sse41_blendvps:
  case Intrinsic::x86_sse41_blendvpd:
  case Intrinsic::x86_avx_blendv_ps_256:
  case Intrinsic::x86_avx_blendv_pd_256:
  case Intrinsic::x86_avx2_pblendvb: {
    // fold (blend A, A, Mask) -> A
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Mask = II.getArgOperand(2);
    if (Op0 == Op1) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // Zero Mask - select 1st argument.
    if (isa<ConstantAggregateZero>(Mask)) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
    if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
      Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
      return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
    }

    // Convert to a vector select if we can bypass casts and find a boolean
    // vector condition value.
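    // For example (illustrative IR): if the mask is
    //   %m = sext <4 x i1> %b to <4 x i32>
    // (possibly behind a bitcast), the blend becomes
    //   %r = select <4 x i1> %b, <4 x float> %op1, <4 x float> %op0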
    Value *BoolVec;
    Mask = InstCombiner::peekThroughBitcast(Mask);
    if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
        BoolVec->getType()->isVectorTy() &&
        BoolVec->getType()->getScalarSizeInBits() == 1) {
      assert(Mask->getType()->getPrimitiveSizeInBits() ==
                 II.getType()->getPrimitiveSizeInBits() &&
             "Not expecting mask and operands with different sizes");

      unsigned NumMaskElts =
          cast<FixedVectorType>(Mask->getType())->getNumElements();
      unsigned NumOperandElts =
          cast<FixedVectorType>(II.getType())->getNumElements();
      if (NumMaskElts == NumOperandElts) {
        return SelectInst::Create(BoolVec, Op1, Op0);
      }

      // If the mask has fewer elements than the operands, each mask bit maps
      // to multiple elements of the operands. Bitcast back and forth.
      if (NumMaskElts < NumOperandElts) {
        Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType());
        Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType());
        Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
        return new BitCastInst(Sel, II.getType());
      }
    }

    break;
  }

  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512:
    if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512:
    if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps:
  case Intrinsic::x86_avx512_permvar_df_256:
  case Intrinsic::x86_avx512_permvar_df_512:
  case Intrinsic::x86_avx512_permvar_di_256:
  case Intrinsic::x86_avx512_permvar_di_512:
  case Intrinsic::x86_avx512_permvar_hi_128:
  case Intrinsic::x86_avx512_permvar_hi_256:
  case Intrinsic::x86_avx512_permvar_hi_512:
  case Intrinsic::x86_avx512_permvar_qi_128:
  case Intrinsic::x86_avx512_permvar_qi_256:
  case Intrinsic::x86_avx512_permvar_qi_512:
  case Intrinsic::x86_avx512_permvar_sf_512:
  case Intrinsic::x86_avx512_permvar_si_512:
    if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx_maskload_ps:
  case Intrinsic::x86_avx_maskload_pd:
  case Intrinsic::x86_avx_maskload_ps_256:
  case Intrinsic::x86_avx_maskload_pd_256:
  case Intrinsic::x86_avx2_maskload_d:
  case Intrinsic::x86_avx2_maskload_q:
  case Intrinsic::x86_avx2_maskload_d_256:
  case Intrinsic::x86_avx2_maskload_q_256:
    if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
      return I;
    }
    break;

  case Intrinsic::x86_sse2_maskmov_dqu:
  case Intrinsic::x86_avx_maskstore_ps:
  case Intrinsic::x86_avx_maskstore_pd:
  case Intrinsic::x86_avx_maskstore_ps_256:
  case Intrinsic::x86_avx_maskstore_pd_256:
  case Intrinsic::x86_avx2_maskstore_d:
  case Intrinsic::x86_avx2_maskstore_q:
  case Intrinsic::x86_avx2_maskstore_d_256:
  case Intrinsic::x86_avx2_maskstore_q_256:
    if (simplifyX86MaskedStore(II, IC)) {
      return nullptr;
    }
    break;

  case Intrinsic::x86_addcarry_32:
  case Intrinsic::x86_addcarry_64:
    if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  default:
    break;
  }
  return None;
}

Optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
    bool &KnownBitsComputed) const {
  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::x86_mmx_pmovmskb:
  case Intrinsic::x86_sse_movmsk_ps:
  case Intrinsic::x86_sse2_movmsk_pd:
  case Intrinsic::x86_sse2_pmovmskb_128:
  case Intrinsic::x86_avx_movmsk_ps_256:
  case Intrinsic::x86_avx_movmsk_pd_256:
  case Intrinsic::x86_avx2_pmovmskb: {
    // MOVMSK copies the vector elements' sign bits to the low bits
    // and zeros the high bits.
    unsigned ArgWidth;
    if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
      ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
    } else {
      auto Arg = II.getArgOperand(0);
      auto ArgType = cast<FixedVectorType>(Arg->getType());
      ArgWidth = ArgType->getNumElements();
    }

    // If we don't need any of the low bits, return zero; we already know
    // that DemandedMask is non-zero.
    APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
    Type *VTy = II.getType();
    if (DemandedElts.isNullValue()) {
      return ConstantInt::getNullValue(VTy);
    }

    // We know that the upper bits are set to zero.
    Known.Zero.setBitsFrom(ArgWidth);
    KnownBitsComputed = true;
    break;
  }
  }
  return None;
}

Optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        simplifyAndSetOp) const {
  unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::x86_xop_vfrcz_ss:
  case Intrinsic::x86_xop_vfrcz_sd:
    // The instructions for these intrinsics are specified to zero the upper
    // bits, not pass them through like other scalar intrinsics. So we
    // shouldn't just use Arg0 if DemandedElts[0] is clear like we do for
    // other intrinsics. Instead we should return a zero vector.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return ConstantAggregateZero::get(II.getType());
    }

    // Only the lower element is used.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // Only the lower element is undefined. The high elements are zero.
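    // i.e. keep only bit 0 of UndefElts; lanes 1..N-1 are known zero rather
    // than undef.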
    UndefElts = UndefElts[0];
    break;

  // Unary scalar-as-vector operations that work column-wise.
  case Intrinsic::x86_sse_rcp_ss:
  case Intrinsic::x86_sse_rsqrt_ss:
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // If the lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }
    // TODO: If only the low element is used, lower SQRT to FSQRT (with
    // rounding/exception checks).
    break;

  // Binary scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0. The low element is a function of both
  // operands.
  case Intrinsic::x86_sse_min_ss:
  case Intrinsic::x86_sse_max_ss:
  case Intrinsic::x86_sse_cmp_ss:
  case Intrinsic::x86_sse2_min_sd:
  case Intrinsic::x86_sse2_max_sd:
  case Intrinsic::x86_sse2_cmp_sd: {
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // If the lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }

    // Only the lower element is used for operand 1.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);

    // The lower element is undefined if both lower elements are undefined.
    // Consider things like undef & 0. The result is known zero, not undef.
    if (!UndefElts2[0])
      UndefElts.clearBit(0);

    break;
  }

  // Binary scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0 and the low element comes from operand 1.
  case Intrinsic::x86_sse41_round_ss:
  case Intrinsic::x86_sse41_round_sd: {
    // Don't use the low element of operand 0.
    APInt DemandedElts2 = DemandedElts;
    DemandedElts2.clearBit(0);
    simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);

    // If the lowest element of a scalar op isn't used then use Arg0.
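    // For example (illustrative): round_ss takes lanes 1..3 from operand 0
    // and lane 0 from operand 1, so if lane 0 of the result isn't demanded
    // the whole call folds to operand 0 below.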
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }

    // Only the lower element is used for operand 1.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);

    // Take the high undef elements from operand 0 and take the lower element
    // from operand 1.
    UndefElts.clearBit(0);
    UndefElts |= UndefElts2[0];
    break;
  }

  // Three input scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0 and the low element is a function of all
  // three inputs.
  case Intrinsic::x86_avx512_mask_add_ss_round:
  case Intrinsic::x86_avx512_mask_div_ss_round:
  case Intrinsic::x86_avx512_mask_mul_ss_round:
  case Intrinsic::x86_avx512_mask_sub_ss_round:
  case Intrinsic::x86_avx512_mask_max_ss_round:
  case Intrinsic::x86_avx512_mask_min_ss_round:
  case Intrinsic::x86_avx512_mask_add_sd_round:
  case Intrinsic::x86_avx512_mask_div_sd_round:
  case Intrinsic::x86_avx512_mask_mul_sd_round:
  case Intrinsic::x86_avx512_mask_sub_sd_round:
  case Intrinsic::x86_avx512_mask_max_sd_round:
  case Intrinsic::x86_avx512_mask_min_sd_round:
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // If the lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }

    // Only the lower element is used for operands 1 and 2.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);

    // The lower element is undefined if all three lower elements are
    // undefined. Consider things like undef & 0. The result is known zero,
    // not undef.
    if (!UndefElts2[0] || !UndefElts3[0])
      UndefElts.clearBit(0);
    break;

  // TODO: Add fmaddsub support?
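  // ADDSUB subtracts in the even lanes and adds in the odd lanes, e.g.
  // addsub_ps(X, Y) = <X[0]-Y[0], X[1]+Y[1], X[2]-Y[2], X[3]+Y[3]>, which is
  // why demanding only even lanes folds to FSub and only odd lanes to FAdd
  // below.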
  case Intrinsic::x86_sse3_addsub_pd:
  case Intrinsic::x86_sse3_addsub_ps:
  case Intrinsic::x86_avx_addsub_pd_256:
  case Intrinsic::x86_avx_addsub_ps_256: {
    // If none of the even or none of the odd lanes are required, turn this
    // into a generic FP math instruction.
    APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
    APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
    bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
    bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
    if (IsSubOnly || IsAddOnly) {
      assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
      IRBuilderBase::InsertPointGuard Guard(IC.Builder);
      IC.Builder.SetInsertPoint(&II);
      Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
      return IC.Builder.CreateBinOp(
          IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
    }

    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    UndefElts &= UndefElts2;
    break;
  }

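  // A worked example of the demanded-element bookkeeping in the PACK case
  // below, for the single-lane v8i16 PACKSSDW(v4i32 X, v4i32 Y) form: result
  // elements 0..3 are saturated from X and elements 4..7 from Y, so demanding
  // only result elements 4..7 demands all four elements of Y and none of X.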
  case Intrinsic::x86_sse2_packssdw_128:
  case Intrinsic::x86_sse2_packsswb_128:
  case Intrinsic::x86_sse2_packuswb_128:
  case Intrinsic::x86_sse41_packusdw:
  case Intrinsic::x86_avx2_packssdw:
  case Intrinsic::x86_avx2_packsswb:
  case Intrinsic::x86_avx2_packusdw:
  case Intrinsic::x86_avx2_packuswb:
  case Intrinsic::x86_avx512_packssdw_512:
  case Intrinsic::x86_avx512_packsswb_512:
  case Intrinsic::x86_avx512_packusdw_512:
  case Intrinsic::x86_avx512_packuswb_512: {
    auto *Ty0 = II.getArgOperand(0)->getType();
    unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
    assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");

    unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
    unsigned VWidthPerLane = VWidth / NumLanes;
    unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;

    // Per lane, pack the elements of the first input and then the second.
    // e.g.
    // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
    // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
    for (int OpNum = 0; OpNum != 2; ++OpNum) {
      APInt OpDemandedElts(InnerVWidth, 0);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        unsigned LaneIdx = Lane * VWidthPerLane;
        for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
          unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
          if (DemandedElts[Idx])
            OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
        }
      }

      // Demand elements from the operand.
      APInt OpUndefElts(InnerVWidth, 0);
      simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);

      // Pack the operand's UNDEF elements, one lane at a time.
      OpUndefElts = OpUndefElts.zext(VWidth);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
        LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
        LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
        UndefElts |= LaneElts;
      }
    }
    break;
  }

  // PSHUFB
  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512:
  // PERMILVAR
  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512:
  // PERMV
  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps: {
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
    break;
  }

  // SSE4A instructions leave the upper 64-bits of the 128-bit result
  // in an undefined state.
  case Intrinsic::x86_sse4a_extrq:
  case Intrinsic::x86_sse4a_extrqi:
  case Intrinsic::x86_sse4a_insertq:
  case Intrinsic::x86_sse4a_insertqi:
    UndefElts.setHighBits(VWidth / 2);
    break;
  }
  return None;
}