//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the X86-specific parts of InstCombine. It uses the
/// target's detailed information to fold X86 vector intrinsics into simpler,
/// target-independent IR where possible, while letting the generic
/// InstCombine transforms handle the rest.
///
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "x86tti"

/// Return a constant boolean vector that has true elements in all positions
/// where the input constant data vector has an element with the sign bit set.
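/// For example (illustrative), <2 x i64> <i64 -1, i64 1> maps to
/// <2 x i1> <i1 true, i1 false>.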
static Constant *getNegativeIsTrueBoolVec(Constant *V) {
  VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
  V = ConstantExpr::getBitCast(V, IntTy);
  V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy),
                            V);
  return V;
}

/// Convert the x86 XMM integer vector mask to a vector of bools based on
/// each element's most significant bit (the sign bit).
static Value *getBoolVecFromMask(Value *Mask) {
  // Fold Constant Mask.
  if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
    return getNegativeIsTrueBoolVec(ConstantMask);

  // Mask was extended from a boolean vector.
  Value *ExtMask;
  if (PatternMatch::match(
          Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
      ExtMask->getType()->isIntOrIntVectorTy(1))
    return ExtMask;

  return nullptr;
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
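// An illustrative example of the transform (assuming a constant mask):
//   %ld = call <4 x float> @llvm.x86.avx.maskload.ps(
//             i8* %ptr, <4 x i32> <i32 -1, i32 0, i32 -1, i32 0>)
// becomes
//   %castvec = bitcast i8* %ptr to <4 x float>*
//   %ld = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(
//             <4 x float>* %castvec, i32 1,
//             <4 x i1> <i1 true, i1 false, i1 true, i1 false>,
//             <4 x float> zeroinitializer)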
static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Constant *ZeroVec = Constant::getNullValue(II.getType());

  // Zero Mask - masked load instruction creates a zero vector.
  if (isa<ConstantAggregateZero>(Mask))
    return IC.replaceInstUsesWith(II, ZeroVec);

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  if (Value *BoolMask = getBoolVecFromMask(Mask)) {
    // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
    // the LLVM intrinsic definition for the pointer argument.
    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

    // The pass-through vector for an x86 masked load is a zero vector.
    CallInst *NewMaskedLoad =
        IC.Builder.CreateMaskedLoad(PtrCast, Align(1), BoolMask, ZeroVec);
    return IC.replaceInstUsesWith(II, NewMaskedLoad);
  }

  return nullptr;
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
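// The store analogue of the example above (illustrative), assuming %mask was
// sign-extended from a <4 x i1> %boolmask:
//   call void @llvm.x86.avx.maskstore.ps(i8* %ptr, <4 x i32> %mask,
//                                        <4 x float> %v)
// becomes
//   call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %v,
//             <4 x float>* %castvec, i32 1, <4 x i1> %boolmask)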
static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Value *Vec = II.getOperand(2);

  // Zero Mask - this masked store instruction does nothing.
  if (isa<ConstantAggregateZero>(Mask)) {
    IC.eraseInstFromFunction(II);
    return true;
  }

  // The SSE2 version is too weird (e.g., unaligned but non-temporal) to do
  // anything else at this level.
  if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
    return false;

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  if (Value *BoolMask = getBoolVecFromMask(Mask)) {
    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

    IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);

    // 'Replace uses' doesn't work for stores. Erase the original masked store.
    IC.eraseInstFromFunction(II);
    return true;
  }

  return false;
}

static Value *simplifyX86immShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;
  bool IsImm = false;

  switch (II.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
    IsImm = true;
    LLVM_FALLTHROUGH;
  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
    IsImm = true;
    LLVM_FALLTHROUGH;
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    IsImm = true;
    LLVM_FALLTHROUGH;
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  auto Vec = II.getArgOperand(0);
  auto Amt = II.getArgOperand(1);
  auto VT = cast<FixedVectorType>(Vec->getType());
  auto SVT = VT->getElementType();
  auto AmtVT = Amt->getType();
  unsigned VWidth = VT->getNumElements();
  unsigned BitWidth = SVT->getPrimitiveSizeInBits();

  // If the shift amount is guaranteed to be in-range, we can replace it with a
  // generic shift. If it's guaranteed to be out of range, logical shifts
  // combine to zero and arithmetic shifts are clamped to (BitWidth - 1).
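  // For example (illustrative), with a known in-range immediate:
  //   %r = call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %v, i32 3)
  // becomes
  //   %r = ashr <4 x i32> %v, <i32 3, i32 3, i32 3, i32 3>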
  if (IsImm) {
    assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
    KnownBits KnownAmtBits =
        llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
    if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
      Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
      Amt = Builder.CreateVectorSplat(VWidth, Amt);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
    if (KnownAmtBits.getMinValue().uge(BitWidth)) {
      if (LogicalShift)
        return ConstantAggregateZero::get(VT);
      Amt = ConstantInt::get(SVT, BitWidth - 1);
      return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
    }
  } else {
    // Ensure the first element has an in-range value and the rest of the
    // elements in the bottom 64 bits are zero.
    assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
           cast<VectorType>(AmtVT)->getElementType() == SVT &&
           "Unexpected shift-by-scalar type");
    unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
    APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
    APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
    KnownBits KnownLowerBits = llvm::computeKnownBits(
        Amt, DemandedLower, II.getModule()->getDataLayout());
    KnownBits KnownUpperBits = llvm::computeKnownBits(
        Amt, DemandedUpper, II.getModule()->getDataLayout());
    if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
        (DemandedUpper.isNullValue() || KnownUpperBits.isZero())) {
      SmallVector<int, 16> ZeroSplat(VWidth, 0);
      Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
  }

  // Simplify if count is constant vector.
  auto CDV = dyn_cast<ConstantDataVector>(Amt);
  if (!CDV)
    return nullptr;

  // SSE2/AVX2 shifts by scalar use the first 64 bits of the 128-bit vector
  // operand to compute the shift amount.
  assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
         cast<VectorType>(AmtVT)->getElementType() == SVT &&
         "Unexpected shift-by-scalar type");

  // Concatenate the sub-elements to create the 64-bit value.
  APInt Count(64, 0);
  for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
    unsigned SubEltIdx = (NumSubElts - 1) - i;
    auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
    Count <<= BitWidth;
    Count |= SubElt->getValue().zextOrTrunc(64);
  }

  // If shift-by-zero then just return the original value.
  if (Count.isNullValue())
    return Vec;

  // Handle cases when Shift >= BitWidth.
  if (Count.uge(BitWidth)) {
    // If LogicalShift - just return zero.
    if (LogicalShift)
      return ConstantAggregateZero::get(VT);

    // If ArithmeticShift - clamp Shift to (BitWidth - 1).
    Count = APInt(64, BitWidth - 1);
  }

  // Get a constant vector of the same type as the first operand.
  auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
  auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
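// For example (illustrative), with constant in-range per-element amounts:
//   %r = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %v,
//                                 <4 x i32> <i32 0, i32 1, i32 2, i32 3>)
// becomes
//   %r = lshr <4 x i32> %v, <i32 0, i32 1, i32 2, i32 3>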
static Value *simplifyX86varShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;

  switch (II.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  auto Vec = II.getArgOperand(0);
  auto Amt = II.getArgOperand(1);
  auto VT = cast<FixedVectorType>(II.getType());
  auto SVT = VT->getElementType();
  int NumElts = VT->getNumElements();
  int BitWidth = SVT->getIntegerBitWidth();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift.
  APInt UpperBits =
      APInt::getHighBitsSet(BitWidth, BitWidth - Log2_32(BitWidth));
  if (llvm::MaskedValueIsZero(Amt, UpperBits,
                              II.getModule()->getDataLayout())) {
    return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                      : Builder.CreateLShr(Vec, Amt))
                         : Builder.CreateAShr(Vec, Amt));
  }

  // Simplify if all shift amounts are constant/undef.
  auto *CShift = dyn_cast<Constant>(Amt);
  if (!CShift)
    return nullptr;

  // Collect each element's shift amount.
  // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
  bool AnyOutOfRange = false;
  SmallVector<int, 8> ShiftAmts;
  for (int I = 0; I < NumElts; ++I) {
    auto *CElt = CShift->getAggregateElement(I);
    if (isa_and_nonnull<UndefValue>(CElt)) {
      ShiftAmts.push_back(-1);
      continue;
    }

    auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
    if (!COp)
      return nullptr;

    // Handle out of range shifts.
    // If LogicalShift - set to BitWidth (special case).
    // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
    APInt ShiftVal = COp->getValue();
    if (ShiftVal.uge(BitWidth)) {
      AnyOutOfRange = LogicalShift;
      ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
      continue;
    }

    ShiftAmts.push_back((int)ShiftVal.getZExtValue());
  }

  // If all elements are out of range or UNDEF, return a vector of zeros/undefs.
  // ArithmeticShift should only hit this if they are all UNDEF.
  auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
  if (llvm::all_of(ShiftAmts, OutOfRange)) {
    SmallVector<Constant *, 8> ConstantVec;
    for (int Idx : ShiftAmts) {
      if (Idx < 0) {
        ConstantVec.push_back(UndefValue::get(SVT));
      } else {
        assert(LogicalShift && "Logical shift expected");
        ConstantVec.push_back(ConstantInt::getNullValue(SVT));
      }
    }
    return ConstantVector::get(ConstantVec);
  }

  // We can't handle only some out of range values with generic logical shifts.
  if (AnyOutOfRange)
    return nullptr;

  // Build the shift amount constant vector.
  SmallVector<Constant *, 8> ShiftVecAmts;
  for (int Idx : ShiftAmts) {
    if (Idx < 0)
      ShiftVecAmts.push_back(UndefValue::get(SVT));
    else
      ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
  }
  auto ShiftVec = ConstantVector::get(ShiftVecAmts);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

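// Fold the x86 pack intrinsics (PACKSS/PACKUS) when both arguments are
// constant: clamp each source element to the destination range, interleave
// the two sources per 128-bit lane, then truncate. For example
// (illustrative), packsswb clamps an i16 source value of 300 to 127 before
// truncating it to i8.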
static Value *simplifyX86pack(IntrinsicInst &II,
                              InstCombiner::BuilderTy &Builder, bool IsSigned) {
  Value *Arg0 = II.getArgOperand(0);
  Value *Arg1 = II.getArgOperand(1);
  Type *ResTy = II.getType();

  // Fast all undef handling.
  if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
    return UndefValue::get(ResTy);

  auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
  unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
  unsigned NumSrcElts = ArgTy->getNumElements();
  assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
         "Unexpected packing types");

  unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
  unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
  unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
  assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
         "Unexpected packing types");

  // Constant folding.
  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
    return nullptr;

  // Clamp Values - signed/unsigned both use signed clamp values, but they
  // differ on the min/max values.
  APInt MinValue, MaxValue;
  if (IsSigned) {
    // PACKSS: Truncate signed value with signed saturation.
    // Source values less than dst minint are saturated to minint.
    // Source values greater than dst maxint are saturated to maxint.
    MinValue =
        APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
    MaxValue =
        APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
  } else {
    // PACKUS: Truncate signed value with unsigned saturation.
    // Source values less than zero are saturated to zero.
    // Source values greater than dst maxuint are saturated to maxuint.
    MinValue = APInt::getNullValue(SrcScalarSizeInBits);
    MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
  }

  auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
  auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);

  // Shuffle clamped args together at the lane level.
  SmallVector<int, 32> PackMask;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
  }
  auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);

  // Truncate to dst size.
  return Builder.CreateTrunc(Shuffle, ResTy);
}

static Value *simplifyX86movmsk(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Value *Arg = II.getArgOperand(0);
  Type *ResTy = II.getType();

  // movmsk(undef) -> zero as we must ensure the upper bits are zero.
  if (isa<UndefValue>(Arg))
    return Constant::getNullValue(ResTy);

  auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
  // We can't easily peek through x86_mmx types.
  if (!ArgTy)
    return nullptr;

  // Expand MOVMSK to compare/bitcast/zext:
  // e.g. PMOVMSKB(v16i8 x):
  // %cmp = icmp slt <16 x i8> %x, zeroinitializer
  // %int = bitcast <16 x i1> %cmp to i16
  // %res = zext i16 %int to i32
  unsigned NumElts = ArgTy->getNumElements();
  Type *IntegerVecTy = VectorType::getInteger(ArgTy);
  Type *IntegerTy = Builder.getIntNTy(NumElts);

  Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy);
  Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy));
  Res = Builder.CreateBitCast(Res, IntegerTy);
  Res = Builder.CreateZExtOrTrunc(Res, ResTy);
  return Res;
}

static Value *simplifyX86addcarry(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  Value *CarryIn = II.getArgOperand(0);
  Value *Op1 = II.getArgOperand(1);
  Value *Op2 = II.getArgOperand(2);
  Type *RetTy = II.getType();
  Type *OpTy = Op1->getType();
  assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
         RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
         "Unexpected types for x86 addcarry");

  // If carry-in is zero, this is just an unsigned add with overflow.
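  // For example (illustrative):
  //   %r = call { i8, i64 } @llvm.x86.addcarry.64(i8 0, i64 %a, i64 %b)
  // becomes a call to @llvm.uadd.with.overflow.i64 whose { i64, i1 } result
  // is repacked to match the x86 { i8, i64 } return type.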
  if (match(CarryIn, PatternMatch::m_ZeroInt())) {
    Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
                                          {Op1, Op2});
    // The types have to be adjusted to match the x86 call types.
    Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
    Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
                                       Builder.getInt8Ty());
    Value *Res = UndefValue::get(RetTy);
    Res = Builder.CreateInsertValue(Res, UAddOV, 0);
    return Builder.CreateInsertValue(Res, UAddResult, 1);
  }

  return nullptr;
}

static Value *simplifyX86insertps(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
  if (!CInt)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");

  // The immediate permute control byte looks like this:
  //    [3:0] - zero mask for each 32-bit lane
  //    [5:4] - select one 32-bit destination lane
  //    [7:6] - select one 32-bit source lane
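  //
  // For example (illustrative), Imm = 0x60 (SourceLane = 1, DestLane = 2,
  // ZMask = 0) becomes
  //   shufflevector <4 x float> %a, <4 x float> %b,
  //                 <4 x i32> <i32 0, i32 1, i32 5, i32 3>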

  uint8_t Imm = CInt->getZExtValue();
  uint8_t ZMask = Imm & 0xf;
  uint8_t DestLane = (Imm >> 4) & 0x3;
  uint8_t SourceLane = (Imm >> 6) & 0x3;

  ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);

  // If all zero mask bits are set, this was just a weird way to
  // generate a zero vector.
  if (ZMask == 0xf)
    return ZeroVector;

  // Initialize by passing all of the first source bits through.
  int ShuffleMask[4] = {0, 1, 2, 3};

  // We may replace the second operand with the zero vector.
  Value *V1 = II.getArgOperand(1);

  if (ZMask) {
    // If the zero mask is being used with a single input or the zero mask
    // overrides the destination lane, this is a shuffle with the zero vector.
    if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
        (ZMask & (1 << DestLane))) {
      V1 = ZeroVector;
      // We may still move 32-bits of the first source vector from one lane
      // to another.
      ShuffleMask[DestLane] = SourceLane;
      // The zero mask may override the previous insert operation.
      for (unsigned i = 0; i < 4; ++i)
        if ((ZMask >> i) & 0x1)
          ShuffleMask[i] = i + 4;
    } else {
      // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
      return nullptr;
    }
  } else {
    // Replace the selected destination lane with the selected source lane.
    ShuffleMask[DestLane] = SourceLane + 4;
  }

  return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
}

/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
/// or conversion to a shuffle vector.
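/// For example (illustrative), extrqi(%x, 16, 8), i.e. Length = 16 bits and
/// Index = 8 bits, extracts bytes 1..2 of %x into the low 16 bits of the
/// result and zeroes the rest of the low 64 bits via a <16 x i8> shuffle
/// with a zero vector.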
static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
                               ConstantInt *CILength, ConstantInt *CIIndex,
                               InstCombiner::BuilderTy &Builder) {
  auto LowConstantHighUndef = [&](uint64_t Val) {
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  };

  // See if we're dealing with constant values.
  Constant *C0 = dyn_cast<Constant>(Op0);
  ConstantInt *CI0 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;

  // Attempt to constant fold.
  if (CILength && CIIndex) {
    // From AMD documentation: "The bit index and field length are each six
    // bits in length; other bits of the field are ignored."
    APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
    APInt APLength = CILength->getValue().zextOrTrunc(6);

    unsigned Index = APIndex.getZExtValue();

    // From AMD documentation: "a value of zero in the field length is
    // defined as length of 64".
    unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

    // From AMD documentation: "If the sum of the bit index + length field
    // is greater than 64, the results are undefined".
    unsigned End = Index + Length;

    // Note that both field index and field length are 8-bit quantities.
    // Since variables 'Index' and 'Length' are unsigned values
    // obtained from zero-extending field index and field length
    // respectively, their sum should never wrap around.
    if (End > 64)
      return UndefValue::get(II.getType());

    // If we are extracting whole bytes, we can convert this to a shuffle.
    // Lowering can recognize EXTRQI shuffle masks.
    if ((Length % 8) == 0 && (Index % 8) == 0) {
      // Convert bit indices to byte indices.
      Length /= 8;
      Index /= 8;

      Type *IntTy8 = Type::getInt8Ty(II.getContext());
      auto *ShufTy = FixedVectorType::get(IntTy8, 16);

      SmallVector<int, 16> ShuffleMask;
      for (int i = 0; i != (int)Length; ++i)
        ShuffleMask.push_back(i + Index);
      for (int i = Length; i != 8; ++i)
        ShuffleMask.push_back(i + 16);
      for (int i = 8; i != 16; ++i)
        ShuffleMask.push_back(-1);

      Value *SV = Builder.CreateShuffleVector(
          Builder.CreateBitCast(Op0, ShufTy),
          ConstantAggregateZero::get(ShufTy), ShuffleMask);
      return Builder.CreateBitCast(SV, II.getType());
    }

    // Constant Fold - shift Index'th bit to lowest position and mask off
    // Length bits.
    if (CI0) {
      APInt Elt = CI0->getValue();
      Elt.lshrInPlace(Index);
      Elt = Elt.zextOrTrunc(Length);
      return LowConstantHighUndef(Elt.getZExtValue());
    }

    // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
    if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
      Value *Args[] = {Op0, CILength, CIIndex};
      Module *M = II.getModule();
      Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
      return Builder.CreateCall(F, Args);
    }
  }

  // Constant Fold - extraction from zero is always {zero, undef}.
  if (CI0 && CI0->isZero())
    return LowConstantHighUndef(0);

  return nullptr;
}

/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
/// folding or conversion to a shuffle vector.
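/// For example (illustrative), insertqi(%x, %y, 8, 0), i.e. Length = 8 and
/// Index = 0, replaces byte 0 of %x with byte 0 of %y via a <16 x i8>
/// shuffle selecting element 16 (the first byte of %y) followed by bytes
/// 1..7 of %x.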
static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
                                 APInt APLength, APInt APIndex,
                                 InstCombiner::BuilderTy &Builder) {
  // From AMD documentation: "The bit index and field length are each six bits
  // in length; other bits of the field are ignored."
  APIndex = APIndex.zextOrTrunc(6);
  APLength = APLength.zextOrTrunc(6);

  // Attempt to constant fold.
  unsigned Index = APIndex.getZExtValue();

  // From AMD documentation: "a value of zero in the field length is
  // defined as length of 64".
  unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

  // From AMD documentation: "If the sum of the bit index + length field
  // is greater than 64, the results are undefined".
  unsigned End = Index + Length;

  // Note that both field index and field length are 8-bit quantities.
  // Since variables 'Index' and 'Length' are unsigned values
  // obtained from zero-extending field index and field length
  // respectively, their sum should never wrap around.
  if (End > 64)
    return UndefValue::get(II.getType());

  // If we are inserting whole bytes, we can convert this to a shuffle.
  // Lowering can recognize INSERTQI shuffle masks.
  if ((Length % 8) == 0 && (Index % 8) == 0) {
    // Convert bit indices to byte indices.
    Length /= 8;
    Index /= 8;

    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    auto *ShufTy = FixedVectorType::get(IntTy8, 16);

    SmallVector<int, 16> ShuffleMask;
    for (int i = 0; i != (int)Index; ++i)
      ShuffleMask.push_back(i);
    for (int i = 0; i != (int)Length; ++i)
      ShuffleMask.push_back(i + 16);
    for (int i = Index + Length; i != 8; ++i)
      ShuffleMask.push_back(i);
    for (int i = 8; i != 16; ++i)
      ShuffleMask.push_back(-1);

    Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
                                            Builder.CreateBitCast(Op1, ShufTy),
                                            ShuffleMask);
    return Builder.CreateBitCast(SV, II.getType());
  }

  // See if we're dealing with constant values.
  Constant *C0 = dyn_cast<Constant>(Op0);
  Constant *C1 = dyn_cast<Constant>(Op1);
  ConstantInt *CI00 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;
  ConstantInt *CI10 =
      C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
         : nullptr;

  // Constant Fold - insert bottom Length bits starting at the Index'th bit.
  if (CI00 && CI10) {
    APInt V00 = CI00->getValue();
    APInt V10 = CI10->getValue();
    APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
    V00 = V00 & ~Mask;
    V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
    APInt Val = V00 | V10;
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  }

  // If we were an INSERTQ call, we'll save demanded elements if we convert to
  // INSERTQI.
  if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    Constant *CILength = ConstantInt::get(IntTy8, Length, false);
    Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);

    Value *Args[] = {Op0, Op1, CILength, CIIndex};
    Module *M = II.getModule();
    Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
    return Builder.CreateCall(F, Args);
  }

  return nullptr;
}

/// Attempt to convert pshufb* to shufflevector if the mask is constant.
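/// For example (illustrative), a control byte with its sign bit set writes
/// zero, so
///   pshufb(%x, <16 x i8> <i8 1, i8 0, i8 -128, i8 3, ...>)
/// becomes a shufflevector of %x and a zero vector with mask
///   <i32 1, i32 0, i32 16, i32 3, ...>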
static Value *simplifyX86pshufb(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
         "Unexpected number of elements in shuffle mask!");

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[64];

  // Each byte in the shuffle control mask forms an index to permute the
  // corresponding byte in the destination operand.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();

    // If the most significant bit (bit[7]) of each byte of the shuffle
    // control mask is set, then zero is written in the result byte.
    // The zero vector is in the right-hand side of the resulting
    // shufflevector.

    // Each index is taken from the least significant 4 bits of the
    // corresponding shuffle control byte, offset into its own 128-bit lane.
    Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
    Indexes[I] = Index;
  }

  auto V1 = II.getArgOperand(0);
  auto V2 = Constant::getNullValue(VecTy);
  return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts));
}

/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
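/// For example (illustrative), vpermilvar.ps.256 indexes within each 128-bit
/// lane, so a control of <8 x i32> <0, 1, 2, 3, 0, 1, 2, 3> is the identity
/// shuffle mask <0, 1, 2, 3, 4, 5, 6, 7>.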
849*e8d8bef9SDimitry Andric static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
850*e8d8bef9SDimitry Andric                                     InstCombiner::BuilderTy &Builder) {
851*e8d8bef9SDimitry Andric   Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
852*e8d8bef9SDimitry Andric   if (!V)
853*e8d8bef9SDimitry Andric     return nullptr;
854*e8d8bef9SDimitry Andric 
855*e8d8bef9SDimitry Andric   auto *VecTy = cast<FixedVectorType>(II.getType());
856*e8d8bef9SDimitry Andric   unsigned NumElts = VecTy->getNumElements();
857*e8d8bef9SDimitry Andric   bool IsPD = VecTy->getScalarType()->isDoubleTy();
858*e8d8bef9SDimitry Andric   unsigned NumLaneElts = IsPD ? 2 : 4;
859*e8d8bef9SDimitry Andric   assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
860*e8d8bef9SDimitry Andric 
861*e8d8bef9SDimitry Andric   // Construct a shuffle mask from constant integers or UNDEFs.
862*e8d8bef9SDimitry Andric   int Indexes[16];
863*e8d8bef9SDimitry Andric 
864*e8d8bef9SDimitry Andric   // The intrinsics only read one or two bits; clear the rest.
865*e8d8bef9SDimitry Andric   for (unsigned I = 0; I < NumElts; ++I) {
866*e8d8bef9SDimitry Andric     Constant *COp = V->getAggregateElement(I);
867*e8d8bef9SDimitry Andric     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
868*e8d8bef9SDimitry Andric       return nullptr;
869*e8d8bef9SDimitry Andric 
870*e8d8bef9SDimitry Andric     if (isa<UndefValue>(COp)) {
871*e8d8bef9SDimitry Andric       Indexes[I] = -1;
872*e8d8bef9SDimitry Andric       continue;
873*e8d8bef9SDimitry Andric     }
874*e8d8bef9SDimitry Andric 
875*e8d8bef9SDimitry Andric     APInt Index = cast<ConstantInt>(COp)->getValue();
876*e8d8bef9SDimitry Andric     Index = Index.zextOrTrunc(32).getLoBits(2);
877*e8d8bef9SDimitry Andric 
878*e8d8bef9SDimitry Andric     // The PD variants use bit 1 to select the per-lane element index, so
879*e8d8bef9SDimitry Andric     // shift down to convert to a generic shuffle mask index.
880*e8d8bef9SDimitry Andric     if (IsPD)
881*e8d8bef9SDimitry Andric       Index.lshrInPlace(1);
882*e8d8bef9SDimitry Andric 
883*e8d8bef9SDimitry Andric     // The _256 variants are a bit trickier since the mask bits always index
884*e8d8bef9SDimitry Andric     // into the corresponding 128-bit half. In order to convert to a generic
885*e8d8bef9SDimitry Andric     // shuffle, we have to make that explicit.
886*e8d8bef9SDimitry Andric     Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
887*e8d8bef9SDimitry Andric 
888*e8d8bef9SDimitry Andric     Indexes[I] = Index.getZExtValue();
889*e8d8bef9SDimitry Andric   }
890*e8d8bef9SDimitry Andric 
891*e8d8bef9SDimitry Andric   auto V1 = II.getArgOperand(0);
892*e8d8bef9SDimitry Andric   return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, NumElts));
893*e8d8bef9SDimitry Andric }
894*e8d8bef9SDimitry Andric 
895*e8d8bef9SDimitry Andric /// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
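/// Each mask element is taken modulo the element count (Index &= Size - 1),
/// so e.g. an illustrative vpermd index of 15 on an 8-element vector selects
/// element 7.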
896*e8d8bef9SDimitry Andric static Value *simplifyX86vpermv(const IntrinsicInst &II,
897*e8d8bef9SDimitry Andric                                 InstCombiner::BuilderTy &Builder) {
898*e8d8bef9SDimitry Andric   auto *V = dyn_cast<Constant>(II.getArgOperand(1));
899*e8d8bef9SDimitry Andric   if (!V)
900*e8d8bef9SDimitry Andric     return nullptr;
901*e8d8bef9SDimitry Andric 
902*e8d8bef9SDimitry Andric   auto *VecTy = cast<FixedVectorType>(II.getType());
903*e8d8bef9SDimitry Andric   unsigned Size = VecTy->getNumElements();
904*e8d8bef9SDimitry Andric   assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
905*e8d8bef9SDimitry Andric          "Unexpected shuffle mask size");
906*e8d8bef9SDimitry Andric 
907*e8d8bef9SDimitry Andric   // Construct a shuffle mask from constant integers or UNDEFs.
908*e8d8bef9SDimitry Andric   int Indexes[64];
909*e8d8bef9SDimitry Andric 
910*e8d8bef9SDimitry Andric   for (unsigned I = 0; I < Size; ++I) {
911*e8d8bef9SDimitry Andric     Constant *COp = V->getAggregateElement(I);
912*e8d8bef9SDimitry Andric     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
913*e8d8bef9SDimitry Andric       return nullptr;
914*e8d8bef9SDimitry Andric 
915*e8d8bef9SDimitry Andric     if (isa<UndefValue>(COp)) {
916*e8d8bef9SDimitry Andric       Indexes[I] = -1;
917*e8d8bef9SDimitry Andric       continue;
918*e8d8bef9SDimitry Andric     }
919*e8d8bef9SDimitry Andric 
920*e8d8bef9SDimitry Andric     uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
921*e8d8bef9SDimitry Andric     Index &= Size - 1;
922*e8d8bef9SDimitry Andric     Indexes[I] = Index;
923*e8d8bef9SDimitry Andric   }
924*e8d8bef9SDimitry Andric 
925*e8d8bef9SDimitry Andric   auto V1 = II.getArgOperand(0);
926*e8d8bef9SDimitry Andric   return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, Size));
927*e8d8bef9SDimitry Andric }
928*e8d8bef9SDimitry Andric 
929*e8d8bef9SDimitry Andric Optional<Instruction *>
930*e8d8bef9SDimitry Andric X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
931*e8d8bef9SDimitry Andric   auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
932*e8d8bef9SDimitry Andric                                              unsigned DemandedWidth) {
933*e8d8bef9SDimitry Andric     APInt UndefElts(Width, 0);
934*e8d8bef9SDimitry Andric     APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
935*e8d8bef9SDimitry Andric     return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
936*e8d8bef9SDimitry Andric   };
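  // For illustration: SimplifyDemandedVectorEltsLow(Op, 4, 1) asks whether Op
  // can be simplified given that only element 0 of its 4 elements is used.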
937*e8d8bef9SDimitry Andric 
938*e8d8bef9SDimitry Andric   Intrinsic::ID IID = II.getIntrinsicID();
939*e8d8bef9SDimitry Andric   switch (IID) {
940*e8d8bef9SDimitry Andric   case Intrinsic::x86_bmi_bextr_32:
941*e8d8bef9SDimitry Andric   case Intrinsic::x86_bmi_bextr_64:
942*e8d8bef9SDimitry Andric   case Intrinsic::x86_tbm_bextri_u32:
943*e8d8bef9SDimitry Andric   case Intrinsic::x86_tbm_bextri_u64:
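    // BEXTR extracts Length bits starting at bit Shift, with Shift packed in
    // bits [7:0] of the control operand and Length in bits [15:8]. A worked
    // example with illustrative values:
    //   bextr(0x12345678, 0x0404) == (0x12345678 >> 4) & 0xF == 0x7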
944*e8d8bef9SDimitry Andric     // If the RHS is a constant we can try some simplifications.
945*e8d8bef9SDimitry Andric     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
946*e8d8bef9SDimitry Andric       uint64_t Shift = C->getZExtValue();
947*e8d8bef9SDimitry Andric       uint64_t Length = (Shift >> 8) & 0xff;
948*e8d8bef9SDimitry Andric       Shift &= 0xff;
949*e8d8bef9SDimitry Andric       unsigned BitWidth = II.getType()->getIntegerBitWidth();
950*e8d8bef9SDimitry Andric       // If the length is 0 or the shift is out of range, replace with zero.
951*e8d8bef9SDimitry Andric       if (Length == 0 || Shift >= BitWidth) {
952*e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
953*e8d8bef9SDimitry Andric       }
954*e8d8bef9SDimitry Andric       // If the LHS is also a constant, we can completely constant fold this.
955*e8d8bef9SDimitry Andric       if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
956*e8d8bef9SDimitry Andric         uint64_t Result = InC->getZExtValue() >> Shift;
957*e8d8bef9SDimitry Andric         if (Length > BitWidth)
958*e8d8bef9SDimitry Andric           Length = BitWidth;
959*e8d8bef9SDimitry Andric         Result &= maskTrailingOnes<uint64_t>(Length);
960*e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II,
961*e8d8bef9SDimitry Andric                                       ConstantInt::get(II.getType(), Result));
962*e8d8bef9SDimitry Andric       }
963*e8d8bef9SDimitry Andric       // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
964*e8d8bef9SDimitry Andric       // are only masking bits that a shift already cleared?
965*e8d8bef9SDimitry Andric     }
966*e8d8bef9SDimitry Andric     break;
967*e8d8bef9SDimitry Andric 
968*e8d8bef9SDimitry Andric   case Intrinsic::x86_bmi_bzhi_32:
969*e8d8bef9SDimitry Andric   case Intrinsic::x86_bmi_bzhi_64:
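    // BZHI clears all bits at positions >= the index held in the low byte of
    // the RHS. Illustrative example: bzhi(0xFFFFFFFF, 8) == 0xFF.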
970*e8d8bef9SDimitry Andric     // If the RHS is a constant we can try some simplifications.
971*e8d8bef9SDimitry Andric     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
972*e8d8bef9SDimitry Andric       uint64_t Index = C->getZExtValue() & 0xff;
973*e8d8bef9SDimitry Andric       unsigned BitWidth = II.getType()->getIntegerBitWidth();
974*e8d8bef9SDimitry Andric       if (Index >= BitWidth) {
975*e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
976*e8d8bef9SDimitry Andric       }
977*e8d8bef9SDimitry Andric       if (Index == 0) {
978*e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
979*e8d8bef9SDimitry Andric       }
980*e8d8bef9SDimitry Andric       // If the LHS is also a constant, we can completely constant fold this.
981*e8d8bef9SDimitry Andric       if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
982*e8d8bef9SDimitry Andric         uint64_t Result = InC->getZExtValue();
983*e8d8bef9SDimitry Andric         Result &= maskTrailingOnes<uint64_t>(Index);
984*e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II,
985*e8d8bef9SDimitry Andric                                       ConstantInt::get(II.getType(), Result));
986*e8d8bef9SDimitry Andric       }
987*e8d8bef9SDimitry Andric       // TODO should we convert this to an AND if the RHS is constant?
988*e8d8bef9SDimitry Andric     }
989*e8d8bef9SDimitry Andric     break;
990*e8d8bef9SDimitry Andric   case Intrinsic::x86_bmi_pext_32:
991*e8d8bef9SDimitry Andric   case Intrinsic::x86_bmi_pext_64:
992*e8d8bef9SDimitry Andric     if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
993*e8d8bef9SDimitry Andric       if (MaskC->isNullValue()) {
994*e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
995*e8d8bef9SDimitry Andric       }
996*e8d8bef9SDimitry Andric       if (MaskC->isAllOnesValue()) {
997*e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
998*e8d8bef9SDimitry Andric       }
999*e8d8bef9SDimitry Andric 
1000*e8d8bef9SDimitry Andric       if (MaskC->getValue().isShiftedMask()) {
1001*e8d8bef9SDimitry Andric         // Any single contiguous sequence of 1s anywhere in the mask simply
1002*e8d8bef9SDimitry Andric         // describes a subset of the input bits shifted to the appropriate
1003*e8d8bef9SDimitry Andric         // position. Replace with the straightforward IR.
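        // Illustrative example: pext(x, 0x000000F0) == (x & 0xF0) >> 4.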
1004*e8d8bef9SDimitry Andric         unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
1005*e8d8bef9SDimitry Andric         Value *Input = II.getArgOperand(0);
1006*e8d8bef9SDimitry Andric         Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
1007*e8d8bef9SDimitry Andric         Value *Shifted = IC.Builder.CreateLShr(Masked,
1008*e8d8bef9SDimitry Andric                                                ConstantInt::get(II.getType(),
1009*e8d8bef9SDimitry Andric                                                                 ShiftAmount));
1010*e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II, Shifted);
1011*e8d8bef9SDimitry Andric       }
1012*e8d8bef9SDimitry Andric 
1014*e8d8bef9SDimitry Andric       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
1015*e8d8bef9SDimitry Andric         uint64_t Src = SrcC->getZExtValue();
1016*e8d8bef9SDimitry Andric         uint64_t Mask = MaskC->getZExtValue();
1017*e8d8bef9SDimitry Andric         uint64_t Result = 0;
1018*e8d8bef9SDimitry Andric         uint64_t BitToSet = 1;
1019*e8d8bef9SDimitry Andric 
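        // Illustrative example: pext(0b110100, 0b011010) gathers the source
        // bits at mask positions 1, 3 and 4 (values 0, 0 and 1) into the low
        // bits, giving 0b100.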
1020*e8d8bef9SDimitry Andric         while (Mask) {
1021*e8d8bef9SDimitry Andric           // Isolate lowest set bit.
1022*e8d8bef9SDimitry Andric           uint64_t BitToTest = Mask & -Mask;
1023*e8d8bef9SDimitry Andric           if (BitToTest & Src)
1024*e8d8bef9SDimitry Andric             Result |= BitToSet;
1025*e8d8bef9SDimitry Andric 
1026*e8d8bef9SDimitry Andric           BitToSet <<= 1;
1027*e8d8bef9SDimitry Andric           // Clear lowest set bit.
1028*e8d8bef9SDimitry Andric           Mask &= Mask - 1;
1029*e8d8bef9SDimitry Andric         }
1030*e8d8bef9SDimitry Andric 
1031*e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II,
1032*e8d8bef9SDimitry Andric                                       ConstantInt::get(II.getType(), Result));
1033*e8d8bef9SDimitry Andric       }
1034*e8d8bef9SDimitry Andric     }
1035*e8d8bef9SDimitry Andric     break;
1036*e8d8bef9SDimitry Andric   case Intrinsic::x86_bmi_pdep_32:
1037*e8d8bef9SDimitry Andric   case Intrinsic::x86_bmi_pdep_64:
1038*e8d8bef9SDimitry Andric     if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
1039*e8d8bef9SDimitry Andric       if (MaskC->isNullValue()) {
1040*e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
1041*e8d8bef9SDimitry Andric       }
1042*e8d8bef9SDimitry Andric       if (MaskC->isAllOnesValue()) {
1043*e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
1044*e8d8bef9SDimitry Andric       }
1045*e8d8bef9SDimitry Andric       if (MaskC->getValue().isShiftedMask()) {
1046*e8d8bef9SDimitry Andric         // Any single contiguous sequence of 1s anywhere in the mask simply
1047*e8d8bef9SDimitry Andric         // describes a subset of the input bits shifted to the appropriate
1048*e8d8bef9SDimitry Andric         // position. Replace with the straightforward IR.
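        // Illustrative example: pdep(x, 0x000000F0) == (x << 4) & 0xF0.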
1049*e8d8bef9SDimitry Andric         unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
1050*e8d8bef9SDimitry Andric         Value *Input = II.getArgOperand(0);
1051*e8d8bef9SDimitry Andric         Value *Shifted = IC.Builder.CreateShl(Input,
1052*e8d8bef9SDimitry Andric                                               ConstantInt::get(II.getType(),
1053*e8d8bef9SDimitry Andric                                                                ShiftAmount));
1054*e8d8bef9SDimitry Andric         Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
1055*e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II, Masked);
1056*e8d8bef9SDimitry Andric       }
1057*e8d8bef9SDimitry Andric 
1058*e8d8bef9SDimitry Andric       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
1059*e8d8bef9SDimitry Andric         uint64_t Src = SrcC->getZExtValue();
1060*e8d8bef9SDimitry Andric         uint64_t Mask = MaskC->getZExtValue();
1061*e8d8bef9SDimitry Andric         uint64_t Result = 0;
1062*e8d8bef9SDimitry Andric         uint64_t BitToTest = 1;
1063*e8d8bef9SDimitry Andric 
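        // Illustrative example: pdep(0b101, 0b011010) scatters the low source
        // bits to mask positions 1, 3 and 4, giving 0b010010.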
1064*e8d8bef9SDimitry Andric         while (Mask) {
1065*e8d8bef9SDimitry Andric           // Isolate lowest set bit.
1066*e8d8bef9SDimitry Andric           uint64_t BitToSet = Mask & -Mask;
1067*e8d8bef9SDimitry Andric           if (BitToTest & Src)
1068*e8d8bef9SDimitry Andric             Result |= BitToSet;
1069*e8d8bef9SDimitry Andric 
1070*e8d8bef9SDimitry Andric           BitToTest <<= 1;
1071*e8d8bef9SDimitry Andric           // Clear lowest set bit.
1072*e8d8bef9SDimitry Andric           Mask &= Mask - 1;
1073*e8d8bef9SDimitry Andric         }
1074*e8d8bef9SDimitry Andric 
1075*e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II,
1076*e8d8bef9SDimitry Andric                                       ConstantInt::get(II.getType(), Result));
1077*e8d8bef9SDimitry Andric       }
1078*e8d8bef9SDimitry Andric     }
1079*e8d8bef9SDimitry Andric     break;
1080*e8d8bef9SDimitry Andric 
1081*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse_cvtss2si:
1082*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse_cvtss2si64:
1083*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse_cvttss2si:
1084*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse_cvttss2si64:
1085*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_cvtsd2si:
1086*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_cvtsd2si64:
1087*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_cvttsd2si:
1088*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_cvttsd2si64:
1089*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_vcvtss2si32:
1090*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_vcvtss2si64:
1091*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_vcvtss2usi32:
1092*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_vcvtss2usi64:
1093*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_vcvtsd2si32:
1094*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_vcvtsd2si64:
1095*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_vcvtsd2usi32:
1096*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_vcvtsd2usi64:
1097*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_cvttss2si:
1098*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_cvttss2si64:
1099*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_cvttss2usi:
1100*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_cvttss2usi64:
1101*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_cvttsd2si:
1102*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_cvttsd2si64:
1103*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_cvttsd2usi:
1104*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_cvttsd2usi64: {
1105*e8d8bef9SDimitry Andric     // These intrinsics only demand the 0th element of their input vectors. If
1106*e8d8bef9SDimitry Andric     // we can simplify the input based on that, do so now.
1107*e8d8bef9SDimitry Andric     Value *Arg = II.getArgOperand(0);
1108*e8d8bef9SDimitry Andric     unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
1109*e8d8bef9SDimitry Andric     if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
1110*e8d8bef9SDimitry Andric       return IC.replaceOperand(II, 0, V);
1111*e8d8bef9SDimitry Andric     }
1112*e8d8bef9SDimitry Andric     break;
1113*e8d8bef9SDimitry Andric   }
1114*e8d8bef9SDimitry Andric 
1115*e8d8bef9SDimitry Andric   case Intrinsic::x86_mmx_pmovmskb:
1116*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse_movmsk_ps:
1117*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_movmsk_pd:
1118*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_pmovmskb_128:
1119*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx_movmsk_pd_256:
1120*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx_movmsk_ps_256:
1121*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_pmovmskb:
1122*e8d8bef9SDimitry Andric     if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
1123*e8d8bef9SDimitry Andric       return IC.replaceInstUsesWith(II, V);
1124*e8d8bef9SDimitry Andric     }
1125*e8d8bef9SDimitry Andric     break;
1126*e8d8bef9SDimitry Andric 
1127*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse_comieq_ss:
1128*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse_comige_ss:
1129*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse_comigt_ss:
1130*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse_comile_ss:
1131*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse_comilt_ss:
1132*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse_comineq_ss:
1133*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse_ucomieq_ss:
1134*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse_ucomige_ss:
1135*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse_ucomigt_ss:
1136*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse_ucomile_ss:
1137*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse_ucomilt_ss:
1138*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse_ucomineq_ss:
1139*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_comieq_sd:
1140*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_comige_sd:
1141*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_comigt_sd:
1142*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_comile_sd:
1143*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_comilt_sd:
1144*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_comineq_sd:
1145*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_ucomieq_sd:
1146*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_ucomige_sd:
1147*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_ucomigt_sd:
1148*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_ucomile_sd:
1149*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_ucomilt_sd:
1150*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_ucomineq_sd:
1151*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_vcomi_ss:
1152*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_vcomi_sd:
1153*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_mask_cmp_ss:
1154*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_mask_cmp_sd: {
1155*e8d8bef9SDimitry Andric     // These intrinsics only demand the 0th element of their input vectors. If
1156*e8d8bef9SDimitry Andric     // we can simplify the input based on that, do so now.
1157*e8d8bef9SDimitry Andric     bool MadeChange = false;
1158*e8d8bef9SDimitry Andric     Value *Arg0 = II.getArgOperand(0);
1159*e8d8bef9SDimitry Andric     Value *Arg1 = II.getArgOperand(1);
1160*e8d8bef9SDimitry Andric     unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
1161*e8d8bef9SDimitry Andric     if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
1162*e8d8bef9SDimitry Andric       IC.replaceOperand(II, 0, V);
1163*e8d8bef9SDimitry Andric       MadeChange = true;
1164*e8d8bef9SDimitry Andric     }
1165*e8d8bef9SDimitry Andric     if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
1166*e8d8bef9SDimitry Andric       IC.replaceOperand(II, 1, V);
1167*e8d8bef9SDimitry Andric       MadeChange = true;
1168*e8d8bef9SDimitry Andric     }
1169*e8d8bef9SDimitry Andric     if (MadeChange) {
1170*e8d8bef9SDimitry Andric       return &II;
1171*e8d8bef9SDimitry Andric     }
1172*e8d8bef9SDimitry Andric     break;
1173*e8d8bef9SDimitry Andric   }
1174*e8d8bef9SDimitry Andric 
1175*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_add_ps_512:
1176*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_div_ps_512:
1177*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_mul_ps_512:
1178*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_sub_ps_512:
1179*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_add_pd_512:
1180*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_div_pd_512:
1181*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_mul_pd_512:
1182*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_sub_pd_512:
1183*e8d8bef9SDimitry Andric     // If the rounding mode is CUR_DIRECTION (4), we can turn these into
1184*e8d8bef9SDimitry Andric     // regular IR operations.
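    // An illustrative sketch of the rewrite:
    //   %r = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a,
    //                                                      <16 x float> %b,
    //                                                      i32 4)
    // becomes
    //   %r = fadd <16 x float> %a, %b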
1185*e8d8bef9SDimitry Andric     if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
1186*e8d8bef9SDimitry Andric       if (R->getValue() == 4) {
1187*e8d8bef9SDimitry Andric         Value *Arg0 = II.getArgOperand(0);
1188*e8d8bef9SDimitry Andric         Value *Arg1 = II.getArgOperand(1);
1189*e8d8bef9SDimitry Andric 
1190*e8d8bef9SDimitry Andric         Value *V;
1191*e8d8bef9SDimitry Andric         switch (IID) {
1192*e8d8bef9SDimitry Andric         default:
1193*e8d8bef9SDimitry Andric           llvm_unreachable("Case stmts out of sync!");
1194*e8d8bef9SDimitry Andric         case Intrinsic::x86_avx512_add_ps_512:
1195*e8d8bef9SDimitry Andric         case Intrinsic::x86_avx512_add_pd_512:
1196*e8d8bef9SDimitry Andric           V = IC.Builder.CreateFAdd(Arg0, Arg1);
1197*e8d8bef9SDimitry Andric           break;
1198*e8d8bef9SDimitry Andric         case Intrinsic::x86_avx512_sub_ps_512:
1199*e8d8bef9SDimitry Andric         case Intrinsic::x86_avx512_sub_pd_512:
1200*e8d8bef9SDimitry Andric           V = IC.Builder.CreateFSub(Arg0, Arg1);
1201*e8d8bef9SDimitry Andric           break;
1202*e8d8bef9SDimitry Andric         case Intrinsic::x86_avx512_mul_ps_512:
1203*e8d8bef9SDimitry Andric         case Intrinsic::x86_avx512_mul_pd_512:
1204*e8d8bef9SDimitry Andric           V = IC.Builder.CreateFMul(Arg0, Arg1);
1205*e8d8bef9SDimitry Andric           break;
1206*e8d8bef9SDimitry Andric         case Intrinsic::x86_avx512_div_ps_512:
1207*e8d8bef9SDimitry Andric         case Intrinsic::x86_avx512_div_pd_512:
1208*e8d8bef9SDimitry Andric           V = IC.Builder.CreateFDiv(Arg0, Arg1);
1209*e8d8bef9SDimitry Andric           break;
1210*e8d8bef9SDimitry Andric         }
1211*e8d8bef9SDimitry Andric 
1212*e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II, V);
1213*e8d8bef9SDimitry Andric       }
1214*e8d8bef9SDimitry Andric     }
1215*e8d8bef9SDimitry Andric     break;
1216*e8d8bef9SDimitry Andric 
1217*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_mask_add_ss_round:
1218*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_mask_div_ss_round:
1219*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_mask_mul_ss_round:
1220*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_mask_sub_ss_round:
1221*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_mask_add_sd_round:
1222*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_mask_div_sd_round:
1223*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_mask_mul_sd_round:
1224*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_mask_sub_sd_round:
1225*e8d8bef9SDimitry Andric     // If the rounding mode is CUR_DIRECTION (4), we can turn these into
1226*e8d8bef9SDimitry Andric     // regular IR operations.
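    // A minimal sketch of the rewrite: extract element 0 of both sources,
    // apply the scalar FP operation, select between the result and element 0
    // of the passthru operand under bit 0 of the mask, and insert the result
    // back into element 0 of the first source.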
1227*e8d8bef9SDimitry Andric     if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
1228*e8d8bef9SDimitry Andric       if (R->getValue() == 4) {
1229*e8d8bef9SDimitry Andric         // Extract the elements as scalars.
1230*e8d8bef9SDimitry Andric         Value *Arg0 = II.getArgOperand(0);
1231*e8d8bef9SDimitry Andric         Value *Arg1 = II.getArgOperand(1);
1232*e8d8bef9SDimitry Andric         Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
1233*e8d8bef9SDimitry Andric         Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
1234*e8d8bef9SDimitry Andric 
1235*e8d8bef9SDimitry Andric         Value *V;
1236*e8d8bef9SDimitry Andric         switch (IID) {
1237*e8d8bef9SDimitry Andric         default:
1238*e8d8bef9SDimitry Andric           llvm_unreachable("Case stmts out of sync!");
1239*e8d8bef9SDimitry Andric         case Intrinsic::x86_avx512_mask_add_ss_round:
1240*e8d8bef9SDimitry Andric         case Intrinsic::x86_avx512_mask_add_sd_round:
1241*e8d8bef9SDimitry Andric           V = IC.Builder.CreateFAdd(LHS, RHS);
1242*e8d8bef9SDimitry Andric           break;
1243*e8d8bef9SDimitry Andric         case Intrinsic::x86_avx512_mask_sub_ss_round:
1244*e8d8bef9SDimitry Andric         case Intrinsic::x86_avx512_mask_sub_sd_round:
1245*e8d8bef9SDimitry Andric           V = IC.Builder.CreateFSub(LHS, RHS);
1246*e8d8bef9SDimitry Andric           break;
1247*e8d8bef9SDimitry Andric         case Intrinsic::x86_avx512_mask_mul_ss_round:
1248*e8d8bef9SDimitry Andric         case Intrinsic::x86_avx512_mask_mul_sd_round:
1249*e8d8bef9SDimitry Andric           V = IC.Builder.CreateFMul(LHS, RHS);
1250*e8d8bef9SDimitry Andric           break;
1251*e8d8bef9SDimitry Andric         case Intrinsic::x86_avx512_mask_div_ss_round:
1252*e8d8bef9SDimitry Andric         case Intrinsic::x86_avx512_mask_div_sd_round:
1253*e8d8bef9SDimitry Andric           V = IC.Builder.CreateFDiv(LHS, RHS);
1254*e8d8bef9SDimitry Andric           break;
1255*e8d8bef9SDimitry Andric         }
1256*e8d8bef9SDimitry Andric 
1257*e8d8bef9SDimitry Andric         // Handle the masking aspect of the intrinsic.
1258*e8d8bef9SDimitry Andric         Value *Mask = II.getArgOperand(3);
1259*e8d8bef9SDimitry Andric         auto *C = dyn_cast<ConstantInt>(Mask);
1260*e8d8bef9SDimitry Andric         // We don't need a select if we know the mask bit is a 1.
1261*e8d8bef9SDimitry Andric         if (!C || !C->getValue()[0]) {
1262*e8d8bef9SDimitry Andric           // Cast the mask to an i1 vector and then extract the lowest element.
1263*e8d8bef9SDimitry Andric           auto *MaskTy = FixedVectorType::get(
1264*e8d8bef9SDimitry Andric               IC.Builder.getInt1Ty(),
1265*e8d8bef9SDimitry Andric               cast<IntegerType>(Mask->getType())->getBitWidth());
1266*e8d8bef9SDimitry Andric           Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
1267*e8d8bef9SDimitry Andric           Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
1268*e8d8bef9SDimitry Andric           // Extract the lowest element from the passthru operand.
1269*e8d8bef9SDimitry Andric           Value *Passthru =
1270*e8d8bef9SDimitry Andric               IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
1271*e8d8bef9SDimitry Andric           V = IC.Builder.CreateSelect(Mask, V, Passthru);
1272*e8d8bef9SDimitry Andric         }
1273*e8d8bef9SDimitry Andric 
1274*e8d8bef9SDimitry Andric         // Insert the result back into the original argument 0.
1275*e8d8bef9SDimitry Andric         V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
1276*e8d8bef9SDimitry Andric 
1277*e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II, V);
1278*e8d8bef9SDimitry Andric       }
1279*e8d8bef9SDimitry Andric     }
1280*e8d8bef9SDimitry Andric     break;
1281*e8d8bef9SDimitry Andric 
1282*e8d8bef9SDimitry Andric   // Constant fold ashr( <A x Bi>, Ci ).
1283*e8d8bef9SDimitry Andric   // Constant fold lshr( <A x Bi>, Ci ).
1284*e8d8bef9SDimitry Andric   // Constant fold shl( <A x Bi>, Ci ).
1285*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_psrai_d:
1286*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_psrai_w:
1287*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_psrai_d:
1288*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_psrai_w:
1289*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psrai_q_128:
1290*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psrai_q_256:
1291*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psrai_d_512:
1292*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psrai_q_512:
1293*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psrai_w_512:
1294*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_psrli_d:
1295*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_psrli_q:
1296*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_psrli_w:
1297*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_psrli_d:
1298*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_psrli_q:
1299*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_psrli_w:
1300*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psrli_d_512:
1301*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psrli_q_512:
1302*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psrli_w_512:
1303*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_pslli_d:
1304*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_pslli_q:
1305*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_pslli_w:
1306*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_pslli_d:
1307*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_pslli_q:
1308*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_pslli_w:
1309*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_pslli_d_512:
1310*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_pslli_q_512:
1311*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_pslli_w_512:
1312*e8d8bef9SDimitry Andric     if (Value *V = simplifyX86immShift(II, IC.Builder)) {
1313*e8d8bef9SDimitry Andric       return IC.replaceInstUsesWith(II, V);
1314*e8d8bef9SDimitry Andric     }
1315*e8d8bef9SDimitry Andric     break;
1316*e8d8bef9SDimitry Andric 
1317*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_psra_d:
1318*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_psra_w:
1319*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_psra_d:
1320*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_psra_w:
1321*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psra_q_128:
1322*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psra_q_256:
1323*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psra_d_512:
1324*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psra_q_512:
1325*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psra_w_512:
1326*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_psrl_d:
1327*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_psrl_q:
1328*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_psrl_w:
1329*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_psrl_d:
1330*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_psrl_q:
1331*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_psrl_w:
1332*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psrl_d_512:
1333*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psrl_q_512:
1334*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psrl_w_512:
1335*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_psll_d:
1336*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_psll_q:
1337*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_psll_w:
1338*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_psll_d:
1339*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_psll_q:
1340*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_psll_w:
1341*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psll_d_512:
1342*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psll_q_512:
1343*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psll_w_512: {
1344*e8d8bef9SDimitry Andric     if (Value *V = simplifyX86immShift(II, IC.Builder)) {
1345*e8d8bef9SDimitry Andric       return IC.replaceInstUsesWith(II, V);
1346*e8d8bef9SDimitry Andric     }
1347*e8d8bef9SDimitry Andric 
1348*e8d8bef9SDimitry Andric     // SSE2/AVX2 shifts use only the first 64 bits of the 128-bit vector
1349*e8d8bef9SDimitry Andric     // operand to compute the shift amount.
1350*e8d8bef9SDimitry Andric     Value *Arg1 = II.getArgOperand(1);
1351*e8d8bef9SDimitry Andric     assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
1352*e8d8bef9SDimitry Andric            "Unexpected packed shift size");
1353*e8d8bef9SDimitry Andric     unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();
1354*e8d8bef9SDimitry Andric 
1355*e8d8bef9SDimitry Andric     if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
1356*e8d8bef9SDimitry Andric       return IC.replaceOperand(II, 1, V);
1357*e8d8bef9SDimitry Andric     }
1358*e8d8bef9SDimitry Andric     break;
1359*e8d8bef9SDimitry Andric   }
1360*e8d8bef9SDimitry Andric 
1361*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_psllv_d:
1362*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_psllv_d_256:
1363*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_psllv_q:
1364*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_psllv_q_256:
1365*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psllv_d_512:
1366*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psllv_q_512:
1367*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psllv_w_128:
1368*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psllv_w_256:
1369*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psllv_w_512:
1370*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_psrav_d:
1371*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_psrav_d_256:
1372*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psrav_q_128:
1373*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psrav_q_256:
1374*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psrav_d_512:
1375*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psrav_q_512:
1376*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psrav_w_128:
1377*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psrav_w_256:
1378*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psrav_w_512:
1379*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_psrlv_d:
1380*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_psrlv_d_256:
1381*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_psrlv_q:
1382*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_psrlv_q_256:
1383*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psrlv_d_512:
1384*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psrlv_q_512:
1385*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psrlv_w_128:
1386*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psrlv_w_256:
1387*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_psrlv_w_512:
1388*e8d8bef9SDimitry Andric     if (Value *V = simplifyX86varShift(II, IC.Builder)) {
1389*e8d8bef9SDimitry Andric       return IC.replaceInstUsesWith(II, V);
1390*e8d8bef9SDimitry Andric     }
1391*e8d8bef9SDimitry Andric     break;
1392*e8d8bef9SDimitry Andric 
1393*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_packssdw_128:
1394*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_packsswb_128:
1395*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_packssdw:
1396*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_packsswb:
1397*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_packssdw_512:
1398*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_packsswb_512:
1399*e8d8bef9SDimitry Andric     if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
1400*e8d8bef9SDimitry Andric       return IC.replaceInstUsesWith(II, V);
1401*e8d8bef9SDimitry Andric     }
1402*e8d8bef9SDimitry Andric     break;
1403*e8d8bef9SDimitry Andric 
1404*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_packuswb_128:
1405*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse41_packusdw:
1406*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_packusdw:
1407*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_packuswb:
1408*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_packusdw_512:
1409*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_packuswb_512:
1410*e8d8bef9SDimitry Andric     if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
1411*e8d8bef9SDimitry Andric       return IC.replaceInstUsesWith(II, V);
1412*e8d8bef9SDimitry Andric     }
1413*e8d8bef9SDimitry Andric     break;
1414*e8d8bef9SDimitry Andric 
1415*e8d8bef9SDimitry Andric   case Intrinsic::x86_pclmulqdq:
1416*e8d8bef9SDimitry Andric   case Intrinsic::x86_pclmulqdq_256:
1417*e8d8bef9SDimitry Andric   case Intrinsic::x86_pclmulqdq_512: {
1418*e8d8bef9SDimitry Andric     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
1419*e8d8bef9SDimitry Andric       unsigned Imm = C->getZExtValue();
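      // PCLMULQDQ multiplies one 64-bit element from each operand per 128-bit
      // lane: immediate bit 0 selects the low/high element of Arg0 and bit 4
      // selects the low/high element of Arg1, so only those elements are
      // demanded below.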
1420*e8d8bef9SDimitry Andric 
1421*e8d8bef9SDimitry Andric       bool MadeChange = false;
1422*e8d8bef9SDimitry Andric       Value *Arg0 = II.getArgOperand(0);
1423*e8d8bef9SDimitry Andric       Value *Arg1 = II.getArgOperand(1);
1424*e8d8bef9SDimitry Andric       unsigned VWidth =
1425*e8d8bef9SDimitry Andric           cast<FixedVectorType>(Arg0->getType())->getNumElements();
1426*e8d8bef9SDimitry Andric 
1427*e8d8bef9SDimitry Andric       APInt UndefElts1(VWidth, 0);
1428*e8d8bef9SDimitry Andric       APInt DemandedElts1 =
1429*e8d8bef9SDimitry Andric           APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
1430*e8d8bef9SDimitry Andric       if (Value *V =
1431*e8d8bef9SDimitry Andric               IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
1432*e8d8bef9SDimitry Andric         IC.replaceOperand(II, 0, V);
1433*e8d8bef9SDimitry Andric         MadeChange = true;
1434*e8d8bef9SDimitry Andric       }
1435*e8d8bef9SDimitry Andric 
1436*e8d8bef9SDimitry Andric       APInt UndefElts2(VWidth, 0);
1437*e8d8bef9SDimitry Andric       APInt DemandedElts2 =
1438*e8d8bef9SDimitry Andric           APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
1439*e8d8bef9SDimitry Andric       if (Value *V =
1440*e8d8bef9SDimitry Andric               IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
1441*e8d8bef9SDimitry Andric         IC.replaceOperand(II, 1, V);
1442*e8d8bef9SDimitry Andric         MadeChange = true;
1443*e8d8bef9SDimitry Andric       }
1444*e8d8bef9SDimitry Andric 
1445*e8d8bef9SDimitry Andric       // If the demanded elements of either input are undef, the result is zero.
1446*e8d8bef9SDimitry Andric       if (DemandedElts1.isSubsetOf(UndefElts1) ||
1447*e8d8bef9SDimitry Andric           DemandedElts2.isSubsetOf(UndefElts2)) {
1448*e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II,
1449*e8d8bef9SDimitry Andric                                       ConstantAggregateZero::get(II.getType()));
1450*e8d8bef9SDimitry Andric       }
1451*e8d8bef9SDimitry Andric 
1452*e8d8bef9SDimitry Andric       if (MadeChange) {
1453*e8d8bef9SDimitry Andric         return &II;
1454*e8d8bef9SDimitry Andric       }
1455*e8d8bef9SDimitry Andric     }
1456*e8d8bef9SDimitry Andric     break;
1457*e8d8bef9SDimitry Andric   }
1458*e8d8bef9SDimitry Andric 
1459*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse41_insertps:
1460*e8d8bef9SDimitry Andric     if (Value *V = simplifyX86insertps(II, IC.Builder)) {
1461*e8d8bef9SDimitry Andric       return IC.replaceInstUsesWith(II, V);
1462*e8d8bef9SDimitry Andric     }
1463*e8d8bef9SDimitry Andric     break;
1464*e8d8bef9SDimitry Andric 
1465*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse4a_extrq: {
1466*e8d8bef9SDimitry Andric     Value *Op0 = II.getArgOperand(0);
1467*e8d8bef9SDimitry Andric     Value *Op1 = II.getArgOperand(1);
1468*e8d8bef9SDimitry Andric     unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
1469*e8d8bef9SDimitry Andric     unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
1470*e8d8bef9SDimitry Andric     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
1471*e8d8bef9SDimitry Andric            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
1472*e8d8bef9SDimitry Andric            VWidth1 == 16 && "Unexpected operand sizes");
1473*e8d8bef9SDimitry Andric 
1474*e8d8bef9SDimitry Andric     // See if we're dealing with constant values.
1475*e8d8bef9SDimitry Andric     Constant *C1 = dyn_cast<Constant>(Op1);
1476*e8d8bef9SDimitry Andric     ConstantInt *CILength =
1477*e8d8bef9SDimitry Andric         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
1478*e8d8bef9SDimitry Andric            : nullptr;
1479*e8d8bef9SDimitry Andric     ConstantInt *CIIndex =
1480*e8d8bef9SDimitry Andric         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
1481*e8d8bef9SDimitry Andric            : nullptr;
1482*e8d8bef9SDimitry Andric 
1483*e8d8bef9SDimitry Andric     // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
1484*e8d8bef9SDimitry Andric     if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
1485*e8d8bef9SDimitry Andric       return IC.replaceInstUsesWith(II, V);
1486*e8d8bef9SDimitry Andric     }
1487*e8d8bef9SDimitry Andric 
1488*e8d8bef9SDimitry Andric     // EXTRQ only uses the lowest 64 bits of the first 128-bit vector
1489*e8d8bef9SDimitry Andric     // operand and the lowest 16 bits of the second.
1490*e8d8bef9SDimitry Andric     bool MadeChange = false;
1491*e8d8bef9SDimitry Andric     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
1492*e8d8bef9SDimitry Andric       IC.replaceOperand(II, 0, V);
1493*e8d8bef9SDimitry Andric       MadeChange = true;
1494*e8d8bef9SDimitry Andric     }
1495*e8d8bef9SDimitry Andric     if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
1496*e8d8bef9SDimitry Andric       IC.replaceOperand(II, 1, V);
1497*e8d8bef9SDimitry Andric       MadeChange = true;
1498*e8d8bef9SDimitry Andric     }
1499*e8d8bef9SDimitry Andric     if (MadeChange) {
1500*e8d8bef9SDimitry Andric       return &II;
1501*e8d8bef9SDimitry Andric     }
1502*e8d8bef9SDimitry Andric     break;
1503*e8d8bef9SDimitry Andric   }
1504*e8d8bef9SDimitry Andric 
1505*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse4a_extrqi: {
1506*e8d8bef9SDimitry Andric     // EXTRQI: Extract Length bits starting from Index. Zero-pad the remaining
1507*e8d8bef9SDimitry Andric     // bits of the lower 64 bits. The upper 64 bits are undefined.
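    // Illustrative example: extrqi(x, 8, 16) returns bits [23:16] of x in
    // bits [7:0] of the result, with bits [63:8] zeroed.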
1508*e8d8bef9SDimitry Andric     Value *Op0 = II.getArgOperand(0);
1509*e8d8bef9SDimitry Andric     unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
1510*e8d8bef9SDimitry Andric     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
1511*e8d8bef9SDimitry Andric            "Unexpected operand size");
1512*e8d8bef9SDimitry Andric 
1513*e8d8bef9SDimitry Andric     // See if we're dealing with constant values.
1514*e8d8bef9SDimitry Andric     ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
1515*e8d8bef9SDimitry Andric     ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
1516*e8d8bef9SDimitry Andric 
1517*e8d8bef9SDimitry Andric     // Attempt to simplify to a constant or shuffle vector.
1518*e8d8bef9SDimitry Andric     if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
1519*e8d8bef9SDimitry Andric       return IC.replaceInstUsesWith(II, V);
1520*e8d8bef9SDimitry Andric     }
1521*e8d8bef9SDimitry Andric 
1522*e8d8bef9SDimitry Andric     // EXTRQI only uses the lowest 64 bits of the first 128-bit vector
1523*e8d8bef9SDimitry Andric     // operand.
1524*e8d8bef9SDimitry Andric     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
1525*e8d8bef9SDimitry Andric       return IC.replaceOperand(II, 0, V);
1526*e8d8bef9SDimitry Andric     }
1527*e8d8bef9SDimitry Andric     break;
1528*e8d8bef9SDimitry Andric   }
1529*e8d8bef9SDimitry Andric 
1530*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse4a_insertq: {
1531*e8d8bef9SDimitry Andric     Value *Op0 = II.getArgOperand(0);
1532*e8d8bef9SDimitry Andric     Value *Op1 = II.getArgOperand(1);
1533*e8d8bef9SDimitry Andric     unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
1534*e8d8bef9SDimitry Andric     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
1535*e8d8bef9SDimitry Andric            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
1536*e8d8bef9SDimitry Andric            cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
1537*e8d8bef9SDimitry Andric            "Unexpected operand size");
1538*e8d8bef9SDimitry Andric 
1539*e8d8bef9SDimitry Andric     // See if we're dealing with constant values.
1540*e8d8bef9SDimitry Andric     Constant *C1 = dyn_cast<Constant>(Op1);
1541*e8d8bef9SDimitry Andric     ConstantInt *CI11 =
1542*e8d8bef9SDimitry Andric         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
1543*e8d8bef9SDimitry Andric            : nullptr;
1544*e8d8bef9SDimitry Andric 
1545*e8d8bef9SDimitry Andric     // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
1546*e8d8bef9SDimitry Andric     if (CI11) {
1547*e8d8bef9SDimitry Andric       const APInt &V11 = CI11->getValue();
1548*e8d8bef9SDimitry Andric       APInt Len = V11.zextOrTrunc(6);
1549*e8d8bef9SDimitry Andric       APInt Idx = V11.lshr(8).zextOrTrunc(6);
1550*e8d8bef9SDimitry Andric       if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
1551*e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II, V);
1552*e8d8bef9SDimitry Andric       }
1553*e8d8bef9SDimitry Andric     }
1554*e8d8bef9SDimitry Andric 
1555*e8d8bef9SDimitry Andric     // INSERTQ only uses the lowest 64 bits of the first 128-bit vector
1556*e8d8bef9SDimitry Andric     // operand.
1557*e8d8bef9SDimitry Andric     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
1558*e8d8bef9SDimitry Andric       return IC.replaceOperand(II, 0, V);
1559*e8d8bef9SDimitry Andric     }
1560*e8d8bef9SDimitry Andric     break;
1561*e8d8bef9SDimitry Andric   }
1562*e8d8bef9SDimitry Andric 
1563*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse4a_insertqi: {
1564*e8d8bef9SDimitry Andric     // INSERTQI: Extract the lowest Length bits from the lower half of the
1565*e8d8bef9SDimitry Andric     // second source and insert them over the first source starting at bit
1566*e8d8bef9SDimitry Andric     // Index. The upper 64 bits are undefined.
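    // Illustrative example: insertqi(x, y, 8, 16) replaces bits [23:16] of x
    // with bits [7:0] of y.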
1567*e8d8bef9SDimitry Andric     Value *Op0 = II.getArgOperand(0);
1568*e8d8bef9SDimitry Andric     Value *Op1 = II.getArgOperand(1);
1569*e8d8bef9SDimitry Andric     unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
1570*e8d8bef9SDimitry Andric     unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
1571*e8d8bef9SDimitry Andric     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
1572*e8d8bef9SDimitry Andric            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
1573*e8d8bef9SDimitry Andric            VWidth1 == 2 && "Unexpected operand sizes");
1574*e8d8bef9SDimitry Andric 
1575*e8d8bef9SDimitry Andric     // See if we're dealing with constant values.
1576*e8d8bef9SDimitry Andric     ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
1577*e8d8bef9SDimitry Andric     ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
1578*e8d8bef9SDimitry Andric 
1579*e8d8bef9SDimitry Andric     // Attempt to simplify to a constant or shuffle vector.
1580*e8d8bef9SDimitry Andric     if (CILength && CIIndex) {
1581*e8d8bef9SDimitry Andric       APInt Len = CILength->getValue().zextOrTrunc(6);
1582*e8d8bef9SDimitry Andric       APInt Idx = CIIndex->getValue().zextOrTrunc(6);
1583*e8d8bef9SDimitry Andric       if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
1584*e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II, V);
1585*e8d8bef9SDimitry Andric       }
1586*e8d8bef9SDimitry Andric     }
1587*e8d8bef9SDimitry Andric 
1588*e8d8bef9SDimitry Andric     // INSERTQI only uses the lowest 64 bits of the first two 128-bit vector
1589*e8d8bef9SDimitry Andric     // operands.
1590*e8d8bef9SDimitry Andric     bool MadeChange = false;
1591*e8d8bef9SDimitry Andric     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
1592*e8d8bef9SDimitry Andric       IC.replaceOperand(II, 0, V);
1593*e8d8bef9SDimitry Andric       MadeChange = true;
1594*e8d8bef9SDimitry Andric     }
1595*e8d8bef9SDimitry Andric     if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
1596*e8d8bef9SDimitry Andric       IC.replaceOperand(II, 1, V);
1597*e8d8bef9SDimitry Andric       MadeChange = true;
1598*e8d8bef9SDimitry Andric     }
1599*e8d8bef9SDimitry Andric     if (MadeChange) {
1600*e8d8bef9SDimitry Andric       return &II;
1601*e8d8bef9SDimitry Andric     }
1602*e8d8bef9SDimitry Andric     break;
1603*e8d8bef9SDimitry Andric   }
1604*e8d8bef9SDimitry Andric 
1605*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse41_pblendvb:
1606*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse41_blendvps:
1607*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse41_blendvpd:
1608*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx_blendv_ps_256:
1609*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx_blendv_pd_256:
1610*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_pblendvb: {
1611*e8d8bef9SDimitry Andric     // fold (blend A, A, Mask) -> A
1612*e8d8bef9SDimitry Andric     Value *Op0 = II.getArgOperand(0);
1613*e8d8bef9SDimitry Andric     Value *Op1 = II.getArgOperand(1);
1614*e8d8bef9SDimitry Andric     Value *Mask = II.getArgOperand(2);
1615*e8d8bef9SDimitry Andric     if (Op0 == Op1) {
1616*e8d8bef9SDimitry Andric       return IC.replaceInstUsesWith(II, Op0);
1617*e8d8bef9SDimitry Andric     }
1618*e8d8bef9SDimitry Andric 
1619*e8d8bef9SDimitry Andric     // Zero Mask - select 1st argument.
1620*e8d8bef9SDimitry Andric     if (isa<ConstantAggregateZero>(Mask)) {
1621*e8d8bef9SDimitry Andric       return IC.replaceInstUsesWith(II, Op0);
1622*e8d8bef9SDimitry Andric     }
1623*e8d8bef9SDimitry Andric 
1624*e8d8bef9SDimitry Andric     // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
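    // Illustrative example: blendvps(%a, %b, <-1, 0, -1, 0>) yields lanes
    // <b0, a1, b2, a3>, i.e. select <i1 1, i1 0, i1 1, i1 0>, %b, %a.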
1625*e8d8bef9SDimitry Andric     if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
1626*e8d8bef9SDimitry Andric       Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
1627*e8d8bef9SDimitry Andric       return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
1628*e8d8bef9SDimitry Andric     }
1629*e8d8bef9SDimitry Andric 
1630*e8d8bef9SDimitry Andric     // Convert to a vector select if we can bypass casts and find a boolean
1631*e8d8bef9SDimitry Andric     // vector condition value.
1632*e8d8bef9SDimitry Andric     Value *BoolVec;
1633*e8d8bef9SDimitry Andric     Mask = InstCombiner::peekThroughBitcast(Mask);
1634*e8d8bef9SDimitry Andric     if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
1635*e8d8bef9SDimitry Andric         BoolVec->getType()->isVectorTy() &&
1636*e8d8bef9SDimitry Andric         BoolVec->getType()->getScalarSizeInBits() == 1) {
1637*e8d8bef9SDimitry Andric       assert(Mask->getType()->getPrimitiveSizeInBits() ==
1638*e8d8bef9SDimitry Andric                  II.getType()->getPrimitiveSizeInBits() &&
1639*e8d8bef9SDimitry Andric              "Not expecting mask and operands with different sizes");
1640*e8d8bef9SDimitry Andric 
1641*e8d8bef9SDimitry Andric       unsigned NumMaskElts =
1642*e8d8bef9SDimitry Andric           cast<FixedVectorType>(Mask->getType())->getNumElements();
1643*e8d8bef9SDimitry Andric       unsigned NumOperandElts =
1644*e8d8bef9SDimitry Andric           cast<FixedVectorType>(II.getType())->getNumElements();
1645*e8d8bef9SDimitry Andric       if (NumMaskElts == NumOperandElts) {
1646*e8d8bef9SDimitry Andric         return SelectInst::Create(BoolVec, Op1, Op0);
1647*e8d8bef9SDimitry Andric       }
1648*e8d8bef9SDimitry Andric 
1649*e8d8bef9SDimitry Andric       // If the mask has fewer elements than the operands, each mask bit maps
1650*e8d8bef9SDimitry Andric       // to multiple elements of the operands, so bitcast back and forth.
1651*e8d8bef9SDimitry Andric       if (NumMaskElts < NumOperandElts) {
1652*e8d8bef9SDimitry Andric         Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType());
1653*e8d8bef9SDimitry Andric         Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType());
1654*e8d8bef9SDimitry Andric         Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
1655*e8d8bef9SDimitry Andric         return new BitCastInst(Sel, II.getType());
1656*e8d8bef9SDimitry Andric       }
1657*e8d8bef9SDimitry Andric     }
1658*e8d8bef9SDimitry Andric 
1659*e8d8bef9SDimitry Andric     break;
1660*e8d8bef9SDimitry Andric   }
1661*e8d8bef9SDimitry Andric 
1662*e8d8bef9SDimitry Andric   case Intrinsic::x86_ssse3_pshuf_b_128:
1663*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_pshuf_b:
1664*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_pshuf_b_512:
1665*e8d8bef9SDimitry Andric     if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
1666*e8d8bef9SDimitry Andric       return IC.replaceInstUsesWith(II, V);
1667*e8d8bef9SDimitry Andric     }
1668*e8d8bef9SDimitry Andric     break;
1669*e8d8bef9SDimitry Andric 
1670*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx_vpermilvar_ps:
1671*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx_vpermilvar_ps_256:
1672*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_vpermilvar_ps_512:
1673*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx_vpermilvar_pd:
1674*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx_vpermilvar_pd_256:
1675*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_vpermilvar_pd_512:
1676*e8d8bef9SDimitry Andric     if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
1677*e8d8bef9SDimitry Andric       return IC.replaceInstUsesWith(II, V);
1678*e8d8bef9SDimitry Andric     }
1679*e8d8bef9SDimitry Andric     break;
1680*e8d8bef9SDimitry Andric 
1681*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_permd:
1682*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_permps:
1683*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_permvar_df_256:
1684*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_permvar_df_512:
1685*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_permvar_di_256:
1686*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_permvar_di_512:
1687*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_permvar_hi_128:
1688*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_permvar_hi_256:
1689*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_permvar_hi_512:
1690*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_permvar_qi_128:
1691*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_permvar_qi_256:
1692*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_permvar_qi_512:
1693*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_permvar_sf_512:
1694*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_permvar_si_512:
1695*e8d8bef9SDimitry Andric     if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
1696*e8d8bef9SDimitry Andric       return IC.replaceInstUsesWith(II, V);
1697*e8d8bef9SDimitry Andric     }
1698*e8d8bef9SDimitry Andric     break;
1699*e8d8bef9SDimitry Andric 
1700*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx_maskload_ps:
1701*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx_maskload_pd:
1702*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx_maskload_ps_256:
1703*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx_maskload_pd_256:
1704*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_maskload_d:
1705*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_maskload_q:
1706*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_maskload_d_256:
1707*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_maskload_q_256:
1708*e8d8bef9SDimitry Andric     if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
1709*e8d8bef9SDimitry Andric       return I;
1710*e8d8bef9SDimitry Andric     }
1711*e8d8bef9SDimitry Andric     break;
1712*e8d8bef9SDimitry Andric 
1713*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_maskmov_dqu:
1714*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx_maskstore_ps:
1715*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx_maskstore_pd:
1716*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx_maskstore_ps_256:
1717*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx_maskstore_pd_256:
1718*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_maskstore_d:
1719*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_maskstore_q:
1720*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_maskstore_d_256:
1721*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_maskstore_q_256:
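    // Unlike the loads above, a successful store rewrite erases the original
    // intrinsic itself, so a nullptr result signals "handled" with nothing
    // left to replace.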
1722*e8d8bef9SDimitry Andric     if (simplifyX86MaskedStore(II, IC)) {
1723*e8d8bef9SDimitry Andric       return nullptr;
1724*e8d8bef9SDimitry Andric     }
1725*e8d8bef9SDimitry Andric     break;
1726*e8d8bef9SDimitry Andric 
1727*e8d8bef9SDimitry Andric   case Intrinsic::x86_addcarry_32:
1728*e8d8bef9SDimitry Andric   case Intrinsic::x86_addcarry_64:
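    // e.g. with a known-zero carry-in, addcarry(0, a, b) reduces to a plain
    // uadd.with.overflow.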
1729*e8d8bef9SDimitry Andric     if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
1730*e8d8bef9SDimitry Andric       return IC.replaceInstUsesWith(II, V);
1731*e8d8bef9SDimitry Andric     }
1732*e8d8bef9SDimitry Andric     break;
1733*e8d8bef9SDimitry Andric 
1734*e8d8bef9SDimitry Andric   default:
1735*e8d8bef9SDimitry Andric     break;
1736*e8d8bef9SDimitry Andric   }
1737*e8d8bef9SDimitry Andric   return None;
1738*e8d8bef9SDimitry Andric }
1739*e8d8bef9SDimitry Andric 
1740*e8d8bef9SDimitry Andric Optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
1741*e8d8bef9SDimitry Andric     InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
1742*e8d8bef9SDimitry Andric     bool &KnownBitsComputed) const {
1743*e8d8bef9SDimitry Andric   switch (II.getIntrinsicID()) {
1744*e8d8bef9SDimitry Andric   default:
1745*e8d8bef9SDimitry Andric     break;
1746*e8d8bef9SDimitry Andric   case Intrinsic::x86_mmx_pmovmskb:
1747*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse_movmsk_ps:
1748*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_movmsk_pd:
1749*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_pmovmskb_128:
1750*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx_movmsk_ps_256:
1751*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx_movmsk_pd_256:
1752*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_pmovmskb: {
1753*e8d8bef9SDimitry Andric     // MOVMSK copies the vector elements' sign bits to the low bits
1754*e8d8bef9SDimitry Andric     // and zeros the high bits.
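    // e.g. movmskps on a <4 x float> argument returns the four sign bits in
    // bits 3..0 of the i32 result and always leaves bits 31..4 zero.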
1755*e8d8bef9SDimitry Andric     unsigned ArgWidth;
1756*e8d8bef9SDimitry Andric     if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
1757*e8d8bef9SDimitry Andric       ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
1758*e8d8bef9SDimitry Andric     } else {
1759*e8d8bef9SDimitry Andric       auto Arg = II.getArgOperand(0);
1760*e8d8bef9SDimitry Andric       auto ArgType = cast<FixedVectorType>(Arg->getType());
1761*e8d8bef9SDimitry Andric       ArgWidth = ArgType->getNumElements();
1762*e8d8bef9SDimitry Andric     }
1763*e8d8bef9SDimitry Andric 
1764*e8d8bef9SDimitry Andric     // If we don't need any of the low bits, return zero;
1765*e8d8bef9SDimitry Andric     // we already know that DemandedMask is non-zero.
1766*e8d8bef9SDimitry Andric     APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
1767*e8d8bef9SDimitry Andric     Type *VTy = II.getType();
1768*e8d8bef9SDimitry Andric     if (DemandedElts.isNullValue()) {
1769*e8d8bef9SDimitry Andric       return ConstantInt::getNullValue(VTy);
1770*e8d8bef9SDimitry Andric     }
1771*e8d8bef9SDimitry Andric 
1772*e8d8bef9SDimitry Andric     // We know that the upper bits are set to zero.
1773*e8d8bef9SDimitry Andric     Known.Zero.setBitsFrom(ArgWidth);
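    // (For movmskps, ArgWidth == 4, so bits 4..31 become known zero.)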
1774*e8d8bef9SDimitry Andric     KnownBitsComputed = true;
1775*e8d8bef9SDimitry Andric     break;
1776*e8d8bef9SDimitry Andric   }
1777*e8d8bef9SDimitry Andric   }
1778*e8d8bef9SDimitry Andric   return None;
1779*e8d8bef9SDimitry Andric }
1780*e8d8bef9SDimitry Andric 
1781*e8d8bef9SDimitry Andric Optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
1782*e8d8bef9SDimitry Andric     InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1783*e8d8bef9SDimitry Andric     APInt &UndefElts2, APInt &UndefElts3,
1784*e8d8bef9SDimitry Andric     std::function<void(Instruction *, unsigned, APInt, APInt &)>
1785*e8d8bef9SDimitry Andric         simplifyAndSetOp) const {
1786*e8d8bef9SDimitry Andric   unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
1787*e8d8bef9SDimitry Andric   switch (II.getIntrinsicID()) {
1788*e8d8bef9SDimitry Andric   default:
1789*e8d8bef9SDimitry Andric     break;
1790*e8d8bef9SDimitry Andric   case Intrinsic::x86_xop_vfrcz_ss:
1791*e8d8bef9SDimitry Andric   case Intrinsic::x86_xop_vfrcz_sd:
1792*e8d8bef9SDimitry Andric     // The instructions for these intrinsics are specified to zero the upper
1793*e8d8bef9SDimitry Andric     // bits rather than pass them through like other scalar intrinsics, so we
1794*e8d8bef9SDimitry Andric     // shouldn't just use Arg0 if DemandedElts[0] is clear, as we do for other
1795*e8d8bef9SDimitry Andric     // intrinsics. Instead we should return a zero vector.
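    // e.g. if only the upper element of a vfrcz_sd result is demanded, that
    // element is known zero, so the whole call folds to a zero vector.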
1796*e8d8bef9SDimitry Andric     if (!DemandedElts[0]) {
1797*e8d8bef9SDimitry Andric       IC.addToWorklist(&II);
1798*e8d8bef9SDimitry Andric       return ConstantAggregateZero::get(II.getType());
1799*e8d8bef9SDimitry Andric     }
1800*e8d8bef9SDimitry Andric 
1801*e8d8bef9SDimitry Andric     // Only the lower element is used.
1802*e8d8bef9SDimitry Andric     DemandedElts = 1;
1803*e8d8bef9SDimitry Andric     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1804*e8d8bef9SDimitry Andric 
1805*e8d8bef9SDimitry Andric     // Only the lower element can be undefined. The high elements are zero.
1806*e8d8bef9SDimitry Andric     UndefElts = UndefElts[0];
1807*e8d8bef9SDimitry Andric     break;
1808*e8d8bef9SDimitry Andric 
1809*e8d8bef9SDimitry Andric   // Unary scalar-as-vector operations that work column-wise.
1810*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse_rcp_ss:
1811*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse_rsqrt_ss:
1812*e8d8bef9SDimitry Andric     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1813*e8d8bef9SDimitry Andric 
1814*e8d8bef9SDimitry Andric     // If the lowest element of a scalar op isn't used then use Arg0.
1815*e8d8bef9SDimitry Andric     if (!DemandedElts[0]) {
1816*e8d8bef9SDimitry Andric       IC.addToWorklist(&II);
1817*e8d8bef9SDimitry Andric       return II.getArgOperand(0);
1818*e8d8bef9SDimitry Andric     }
1819*e8d8bef9SDimitry Andric     // TODO: If only the low element is demanded, lower SQRT to FSQRT (with
1820*e8d8bef9SDimitry Andric     // rounding/exception checks).
1821*e8d8bef9SDimitry Andric     break;
1822*e8d8bef9SDimitry Andric 
1823*e8d8bef9SDimitry Andric   // Binary scalar-as-vector operations that work column-wise. The high
1824*e8d8bef9SDimitry Andric   // elements come from operand 0. The low element is a function of both
1825*e8d8bef9SDimitry Andric   // operands.
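  // e.g. MINSS: result = { min(Op0[0], Op1[0]), Op0[1], Op0[2], Op0[3] },
  // so only the low lane of operand 1 is ever read.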
1826*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse_min_ss:
1827*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse_max_ss:
1828*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse_cmp_ss:
1829*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_min_sd:
1830*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_max_sd:
1831*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_cmp_sd: {
1832*e8d8bef9SDimitry Andric     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1833*e8d8bef9SDimitry Andric 
1834*e8d8bef9SDimitry Andric     // If the lowest element of a scalar op isn't used then use Arg0.
1835*e8d8bef9SDimitry Andric     if (!DemandedElts[0]) {
1836*e8d8bef9SDimitry Andric       IC.addToWorklist(&II);
1837*e8d8bef9SDimitry Andric       return II.getArgOperand(0);
1838*e8d8bef9SDimitry Andric     }
1839*e8d8bef9SDimitry Andric 
1840*e8d8bef9SDimitry Andric     // Only the lower element is used for operand 1.
1841*e8d8bef9SDimitry Andric     DemandedElts = 1;
1842*e8d8bef9SDimitry Andric     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1843*e8d8bef9SDimitry Andric 
1844*e8d8bef9SDimitry Andric     // The lower element is undefined only if both lower elements are undefined.
1845*e8d8bef9SDimitry Andric     // Consider things like undef&0: the result is known zero, not undef.
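    // e.g. the low lane of min_sd(<undef, ...>, <0.0, ...>) can never exceed
    // 0.0, so it is constrained rather than fully undef.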
1846*e8d8bef9SDimitry Andric     if (!UndefElts2[0])
1847*e8d8bef9SDimitry Andric       UndefElts.clearBit(0);
1848*e8d8bef9SDimitry Andric 
1849*e8d8bef9SDimitry Andric     break;
1850*e8d8bef9SDimitry Andric   }
1851*e8d8bef9SDimitry Andric 
1852*e8d8bef9SDimitry Andric   // Binary scalar-as-vector operations that work column-wise. The high
1853*e8d8bef9SDimitry Andric   // elements come from operand 0 and the low element comes from operand 1.
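  // e.g. ROUNDSS: result = { round(Op1[0]), Op0[1], Op0[2], Op0[3] }, which
  // is why the low lane of operand 0 is never demanded.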
1854*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse41_round_ss:
1855*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse41_round_sd: {
1856*e8d8bef9SDimitry Andric     // Don't use the low element of operand 0.
1857*e8d8bef9SDimitry Andric     APInt DemandedElts2 = DemandedElts;
1858*e8d8bef9SDimitry Andric     DemandedElts2.clearBit(0);
1859*e8d8bef9SDimitry Andric     simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
1860*e8d8bef9SDimitry Andric 
1861*e8d8bef9SDimitry Andric     // If the lowest element of a scalar op isn't used then use Arg0.
1862*e8d8bef9SDimitry Andric     if (!DemandedElts[0]) {
1863*e8d8bef9SDimitry Andric       IC.addToWorklist(&II);
1864*e8d8bef9SDimitry Andric       return II.getArgOperand(0);
1865*e8d8bef9SDimitry Andric     }
1866*e8d8bef9SDimitry Andric 
1867*e8d8bef9SDimitry Andric     // Only the lower element is used for operand 1.
1868*e8d8bef9SDimitry Andric     DemandedElts = 1;
1869*e8d8bef9SDimitry Andric     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1870*e8d8bef9SDimitry Andric 
1871*e8d8bef9SDimitry Andric     // Keep the high undef elements from operand 0 and take the low element's
1872*e8d8bef9SDimitry Andric     // undef state from operand 1.
1873*e8d8bef9SDimitry Andric     UndefElts.clearBit(0);
1874*e8d8bef9SDimitry Andric     UndefElts |= UndefElts2[0];
1875*e8d8bef9SDimitry Andric     break;
1876*e8d8bef9SDimitry Andric   }
1877*e8d8bef9SDimitry Andric 
1878*e8d8bef9SDimitry Andric   // Three input scalar-as-vector operations that work column-wise. The high
1879*e8d8bef9SDimitry Andric   // elements come from operand 0 and the low element is a function of all
1880*e8d8bef9SDimitry Andric   // three inputs.
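  // e.g. mask_add_ss_round computes, roughly,
  //   result = { Mask ? Op0[0] + Op1[0] : Op2[0], Op0[1], Op0[2], Op0[3] },
  // so operands 1 and 2 feed only the low lane.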
1881*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_mask_add_ss_round:
1882*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_mask_div_ss_round:
1883*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_mask_mul_ss_round:
1884*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_mask_sub_ss_round:
1885*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_mask_max_ss_round:
1886*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_mask_min_ss_round:
1887*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_mask_add_sd_round:
1888*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_mask_div_sd_round:
1889*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_mask_mul_sd_round:
1890*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_mask_sub_sd_round:
1891*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_mask_max_sd_round:
1892*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_mask_min_sd_round:
1893*e8d8bef9SDimitry Andric     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1894*e8d8bef9SDimitry Andric 
1895*e8d8bef9SDimitry Andric     // If the lowest element of a scalar op isn't used then use Arg0.
1896*e8d8bef9SDimitry Andric     if (!DemandedElts[0]) {
1897*e8d8bef9SDimitry Andric       IC.addToWorklist(&II);
1898*e8d8bef9SDimitry Andric       return II.getArgOperand(0);
1899*e8d8bef9SDimitry Andric     }
1900*e8d8bef9SDimitry Andric 
1901*e8d8bef9SDimitry Andric     // Only the lower element is used for operands 1 and 2.
1902*e8d8bef9SDimitry Andric     DemandedElts = 1;
1903*e8d8bef9SDimitry Andric     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1904*e8d8bef9SDimitry Andric     simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);
1905*e8d8bef9SDimitry Andric 
1906*e8d8bef9SDimitry Andric     // The lower element is undefined only if all three lower elements are undefined.
1907*e8d8bef9SDimitry Andric     // Consider things like undef&0: the result is known zero, not undef.
1908*e8d8bef9SDimitry Andric     if (!UndefElts2[0] || !UndefElts3[0])
1909*e8d8bef9SDimitry Andric       UndefElts.clearBit(0);
1910*e8d8bef9SDimitry Andric     break;
1911*e8d8bef9SDimitry Andric 
1912*e8d8bef9SDimitry Andric   // TODO: Add fmaddsub support?
1913*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse3_addsub_pd:
1914*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse3_addsub_ps:
1915*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx_addsub_pd_256:
1916*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx_addsub_ps_256: {
1917*e8d8bef9SDimitry Andric     // If none of the even or none of the odd lanes are required, turn this
1918*e8d8bef9SDimitry Andric     // into a generic FP math instruction.
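    // e.g. if only the odd (add) lanes of addsub_ps are demanded, the call
    // behaves exactly like fadd on those lanes and can be replaced by one.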
1919*e8d8bef9SDimitry Andric     APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
1920*e8d8bef9SDimitry Andric     APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
1921*e8d8bef9SDimitry Andric     bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
1922*e8d8bef9SDimitry Andric     bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
1923*e8d8bef9SDimitry Andric     if (IsSubOnly || IsAddOnly) {
1924*e8d8bef9SDimitry Andric       assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
1925*e8d8bef9SDimitry Andric       IRBuilderBase::InsertPointGuard Guard(IC.Builder);
1926*e8d8bef9SDimitry Andric       IC.Builder.SetInsertPoint(&II);
1927*e8d8bef9SDimitry Andric       Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
1928*e8d8bef9SDimitry Andric       return IC.Builder.CreateBinOp(
1929*e8d8bef9SDimitry Andric           IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
1930*e8d8bef9SDimitry Andric     }
1931*e8d8bef9SDimitry Andric 
1932*e8d8bef9SDimitry Andric     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1933*e8d8bef9SDimitry Andric     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1934*e8d8bef9SDimitry Andric     UndefElts &= UndefElts2;
1935*e8d8bef9SDimitry Andric     break;
1936*e8d8bef9SDimitry Andric   }
1937*e8d8bef9SDimitry Andric 
1938*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_packssdw_128:
1939*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_packsswb_128:
1940*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_packuswb_128:
1941*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse41_packusdw:
1942*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_packssdw:
1943*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_packsswb:
1944*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_packusdw:
1945*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_packuswb:
1946*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_packssdw_512:
1947*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_packsswb_512:
1948*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_packusdw_512:
1949*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_packuswb_512: {
1950*e8d8bef9SDimitry Andric     auto *Ty0 = II.getArgOperand(0)->getType();
1951*e8d8bef9SDimitry Andric     unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
1952*e8d8bef9SDimitry Andric     assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
1953*e8d8bef9SDimitry Andric 
1954*e8d8bef9SDimitry Andric     unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
1955*e8d8bef9SDimitry Andric     unsigned VWidthPerLane = VWidth / NumLanes;
1956*e8d8bef9SDimitry Andric     unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
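    // e.g. v32i8 PACK(v16i16, v16i16): NumLanes = 2, VWidthPerLane = 16,
    // InnerVWidthPerLane = 8.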
1957*e8d8bef9SDimitry Andric 
1958*e8d8bef9SDimitry Andric     // Per lane, pack the elements of the first input and then the second.
1959*e8d8bef9SDimitry Andric     // e.g.
1960*e8d8bef9SDimitry Andric     // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
1961*e8d8bef9SDimitry Andric     // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
1962*e8d8bef9SDimitry Andric     for (int OpNum = 0; OpNum != 2; ++OpNum) {
1963*e8d8bef9SDimitry Andric       APInt OpDemandedElts(InnerVWidth, 0);
1964*e8d8bef9SDimitry Andric       for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1965*e8d8bef9SDimitry Andric         unsigned LaneIdx = Lane * VWidthPerLane;
1966*e8d8bef9SDimitry Andric         for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
1967*e8d8bef9SDimitry Andric           unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
1968*e8d8bef9SDimitry Andric           if (DemandedElts[Idx])
1969*e8d8bef9SDimitry Andric             OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
1970*e8d8bef9SDimitry Andric         }
1971*e8d8bef9SDimitry Andric       }
1972*e8d8bef9SDimitry Andric 
1973*e8d8bef9SDimitry Andric       // Demand elements from the operand.
1974*e8d8bef9SDimitry Andric       APInt OpUndefElts(InnerVWidth, 0);
1975*e8d8bef9SDimitry Andric       simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);
1976*e8d8bef9SDimitry Andric 
1977*e8d8bef9SDimitry Andric       // Pack the operand's UNDEF elements, one lane at a time.
1978*e8d8bef9SDimitry Andric       OpUndefElts = OpUndefElts.zext(VWidth);
1979*e8d8bef9SDimitry Andric       for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1980*e8d8bef9SDimitry Andric         APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
1981*e8d8bef9SDimitry Andric         LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
1982*e8d8bef9SDimitry Andric         LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
1983*e8d8bef9SDimitry Andric         UndefElts |= LaneElts;
1984*e8d8bef9SDimitry Andric       }
1985*e8d8bef9SDimitry Andric     }
1986*e8d8bef9SDimitry Andric     break;
1987*e8d8bef9SDimitry Andric   }
1988*e8d8bef9SDimitry Andric 
1989*e8d8bef9SDimitry Andric   // PSHUFB
1990*e8d8bef9SDimitry Andric   case Intrinsic::x86_ssse3_pshuf_b_128:
1991*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_pshuf_b:
1992*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_pshuf_b_512:
1993*e8d8bef9SDimitry Andric   // PERMILVAR
1994*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx_vpermilvar_ps:
1995*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx_vpermilvar_ps_256:
1996*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_vpermilvar_ps_512:
1997*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx_vpermilvar_pd:
1998*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx_vpermilvar_pd_256:
1999*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_vpermilvar_pd_512:
2000*e8d8bef9SDimitry Andric   // PERMV
2001*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_permd:
2002*e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_permps: {
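    // Result lane i reads only lane i of the shuffle-mask operand (operand
    // 1), so demanded result lanes map one-to-one onto demanded mask lanes.
    // The data operand is left untouched: any of its lanes may be selected.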
2003*e8d8bef9SDimitry Andric     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
2004*e8d8bef9SDimitry Andric     break;
2005*e8d8bef9SDimitry Andric   }
2006*e8d8bef9SDimitry Andric 
2007*e8d8bef9SDimitry Andric   // SSE4A instructions leave the upper 64 bits of the 128-bit result
2008*e8d8bef9SDimitry Andric   // in an undefined state.
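  // e.g. the <2 x i64> result of extrq has a well-defined element 0, but
  // element 1 is undefined, so the top half is marked undef below.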
2009*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse4a_extrq:
2010*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse4a_extrqi:
2011*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse4a_insertq:
2012*e8d8bef9SDimitry Andric   case Intrinsic::x86_sse4a_insertqi:
2013*e8d8bef9SDimitry Andric     UndefElts.setHighBits(VWidth / 2);
2014*e8d8bef9SDimitry Andric     break;
2015*e8d8bef9SDimitry Andric   }
2016*e8d8bef9SDimitry Andric   return None;
2017*e8d8bef9SDimitry Andric }
2018