1 //===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the X86-specific part of the InstCombine pass: it
10 /// folds or simplifies calls to X86 target intrinsics into generic IR or
11 /// constants where possible, while letting the target-independent
12 /// InstCombine transforms handle everything else.
13 ///
14 //===----------------------------------------------------------------------===//
15 
16 #include "X86TargetTransformInfo.h"
17 #include "llvm/IR/IntrinsicInst.h"
18 #include "llvm/IR/IntrinsicsX86.h"
19 #include "llvm/Support/KnownBits.h"
20 #include "llvm/Transforms/InstCombine/InstCombiner.h"
21 #include <optional>
22 
23 using namespace llvm;
24 
25 #define DEBUG_TYPE "x86tti"
26 
27 /// Return a constant boolean vector that has true elements in all positions
28 /// where the input constant data vector has an element with the sign bit set.
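/// For example (illustrative), <2 x i32> <i32 -7, i32 3> maps to
/// <2 x i1> <i1 true, i1 false>.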
29 static Constant *getNegativeIsTrueBoolVec(Constant *V, const DataLayout &DL) {
30   VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
31   V = ConstantExpr::getBitCast(V, IntTy);
32   V = ConstantFoldCompareInstOperands(CmpInst::ICMP_SGT,
33                                       Constant::getNullValue(IntTy), V, DL);
34   assert(V && "Vector must be foldable");
35   return V;
36 }
37 
38 /// Convert the x86 XMM integer vector mask to a vector of bools based on
39 /// each element's most significant bit (the sign bit).
40 static Value *getBoolVecFromMask(Value *Mask, const DataLayout &DL) {
41   // Fold Constant Mask.
42   if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
43     return getNegativeIsTrueBoolVec(ConstantMask, DL);
44 
45   // Mask was extended from a boolean vector.
46   Value *ExtMask;
47   if (PatternMatch::match(
48           Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
49       ExtMask->getType()->isIntOrIntVectorTy(1))
50     return ExtMask;
51 
52   return nullptr;
53 }
54 
55 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
56 // XMM register mask efficiently, we could transform all x86 masked intrinsics
57 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
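//
// For example (a sketch; value names and intrinsic mangling are illustrative),
//   %v = call <4 x float> @llvm.x86.avx.maskload.ps(ptr %p, <4 x i32> %mask)
// where %mask = sext <4 x i1> %b to <4 x i32> becomes
//   %v = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %p, i32 1, <4 x i1> %b,
//                                                    <4 x float> zeroinitializer)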
58 static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
59   Value *Ptr = II.getOperand(0);
60   Value *Mask = II.getOperand(1);
61   Constant *ZeroVec = Constant::getNullValue(II.getType());
62 
63   // Zero Mask - masked load instruction creates a zero vector.
64   if (isa<ConstantAggregateZero>(Mask))
65     return IC.replaceInstUsesWith(II, ZeroVec);
66 
67   // The mask is constant or extended from a bool vector. Convert this x86
68   // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
69   if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
70     // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
71     // the LLVM intrinsic definition for the pointer argument.
72     unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
73     PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
74     Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
75 
76     // The pass-through vector for an x86 masked load is a zero vector.
77     CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
78         II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);
79     return IC.replaceInstUsesWith(II, NewMaskedLoad);
80   }
81 
82   return nullptr;
83 }
84 
85 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
86 // XMM register mask efficiently, we could transform all x86 masked intrinsics
87 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
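//
// For example (a sketch; value names and intrinsic mangling are illustrative),
//   call void @llvm.x86.avx.maskstore.ps(ptr %p, <4 x i32> %mask, <4 x float> %v)
// where %mask = sext <4 x i1> %b to <4 x i32> becomes
//   call void @llvm.masked.store.v4f32.p0(<4 x float> %v, ptr %p, i32 1, <4 x i1> %b)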
88 static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
89   Value *Ptr = II.getOperand(0);
90   Value *Mask = II.getOperand(1);
91   Value *Vec = II.getOperand(2);
92 
93   // Zero Mask - this masked store instruction does nothing.
94   if (isa<ConstantAggregateZero>(Mask)) {
95     IC.eraseInstFromFunction(II);
96     return true;
97   }
98 
99   // The SSE2 version is too weird (e.g., unaligned but non-temporal) to do
100   // anything else at this level.
101   if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
102     return false;
103 
104   // The mask is constant or extended from a bool vector. Convert this x86
105   // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
106   if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
107     unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
108     PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
109     Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
110 
111     IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);
112 
113     // 'Replace uses' doesn't work for stores. Erase the original masked store.
114     IC.eraseInstFromFunction(II);
115     return true;
116   }
117 
118   return false;
119 }
120 
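// Attempt to simplify SSE2/AVX2/AVX512 shift-by-immediate and shift-by-scalar
// intrinsics to generic IR shifts when the shift amount is known to be in
// range, and to fold them when it is known to be out of range (logical shifts
// become zero, arithmetic shifts clamp to BitWidth - 1). For example (a
// sketch, names illustrative):
//   %r = call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %v, i32 3)
// becomes
//   %r = lshr <4 x i32> %v, <i32 3, i32 3, i32 3, i32 3>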
121 static Value *simplifyX86immShift(const IntrinsicInst &II,
122                                   InstCombiner::BuilderTy &Builder) {
123   bool LogicalShift = false;
124   bool ShiftLeft = false;
125   bool IsImm = false;
126 
127   switch (II.getIntrinsicID()) {
128   default:
129     llvm_unreachable("Unexpected intrinsic!");
130   case Intrinsic::x86_sse2_psrai_d:
131   case Intrinsic::x86_sse2_psrai_w:
132   case Intrinsic::x86_avx2_psrai_d:
133   case Intrinsic::x86_avx2_psrai_w:
134   case Intrinsic::x86_avx512_psrai_q_128:
135   case Intrinsic::x86_avx512_psrai_q_256:
136   case Intrinsic::x86_avx512_psrai_d_512:
137   case Intrinsic::x86_avx512_psrai_q_512:
138   case Intrinsic::x86_avx512_psrai_w_512:
139     IsImm = true;
140     [[fallthrough]];
141   case Intrinsic::x86_sse2_psra_d:
142   case Intrinsic::x86_sse2_psra_w:
143   case Intrinsic::x86_avx2_psra_d:
144   case Intrinsic::x86_avx2_psra_w:
145   case Intrinsic::x86_avx512_psra_q_128:
146   case Intrinsic::x86_avx512_psra_q_256:
147   case Intrinsic::x86_avx512_psra_d_512:
148   case Intrinsic::x86_avx512_psra_q_512:
149   case Intrinsic::x86_avx512_psra_w_512:
150     LogicalShift = false;
151     ShiftLeft = false;
152     break;
153   case Intrinsic::x86_sse2_psrli_d:
154   case Intrinsic::x86_sse2_psrli_q:
155   case Intrinsic::x86_sse2_psrli_w:
156   case Intrinsic::x86_avx2_psrli_d:
157   case Intrinsic::x86_avx2_psrli_q:
158   case Intrinsic::x86_avx2_psrli_w:
159   case Intrinsic::x86_avx512_psrli_d_512:
160   case Intrinsic::x86_avx512_psrli_q_512:
161   case Intrinsic::x86_avx512_psrli_w_512:
162     IsImm = true;
163     [[fallthrough]];
164   case Intrinsic::x86_sse2_psrl_d:
165   case Intrinsic::x86_sse2_psrl_q:
166   case Intrinsic::x86_sse2_psrl_w:
167   case Intrinsic::x86_avx2_psrl_d:
168   case Intrinsic::x86_avx2_psrl_q:
169   case Intrinsic::x86_avx2_psrl_w:
170   case Intrinsic::x86_avx512_psrl_d_512:
171   case Intrinsic::x86_avx512_psrl_q_512:
172   case Intrinsic::x86_avx512_psrl_w_512:
173     LogicalShift = true;
174     ShiftLeft = false;
175     break;
176   case Intrinsic::x86_sse2_pslli_d:
177   case Intrinsic::x86_sse2_pslli_q:
178   case Intrinsic::x86_sse2_pslli_w:
179   case Intrinsic::x86_avx2_pslli_d:
180   case Intrinsic::x86_avx2_pslli_q:
181   case Intrinsic::x86_avx2_pslli_w:
182   case Intrinsic::x86_avx512_pslli_d_512:
183   case Intrinsic::x86_avx512_pslli_q_512:
184   case Intrinsic::x86_avx512_pslli_w_512:
185     IsImm = true;
186     [[fallthrough]];
187   case Intrinsic::x86_sse2_psll_d:
188   case Intrinsic::x86_sse2_psll_q:
189   case Intrinsic::x86_sse2_psll_w:
190   case Intrinsic::x86_avx2_psll_d:
191   case Intrinsic::x86_avx2_psll_q:
192   case Intrinsic::x86_avx2_psll_w:
193   case Intrinsic::x86_avx512_psll_d_512:
194   case Intrinsic::x86_avx512_psll_q_512:
195   case Intrinsic::x86_avx512_psll_w_512:
196     LogicalShift = true;
197     ShiftLeft = true;
198     break;
199   }
200   assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
201 
202   Value *Vec = II.getArgOperand(0);
203   Value *Amt = II.getArgOperand(1);
204   auto *VT = cast<FixedVectorType>(Vec->getType());
205   Type *SVT = VT->getElementType();
206   Type *AmtVT = Amt->getType();
207   unsigned VWidth = VT->getNumElements();
208   unsigned BitWidth = SVT->getPrimitiveSizeInBits();
209 
210   // If the shift amount is guaranteed to be in-range we can replace it with a
211   // generic shift. If it's guaranteed to be out of range, logical shifts combine
212   // to zero and arithmetic shifts are clamped to (BitWidth - 1).
213   if (IsImm) {
214     assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
215     KnownBits KnownAmtBits =
216         llvm::computeKnownBits(Amt, II.getDataLayout());
217     if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
218       Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
219       Amt = Builder.CreateVectorSplat(VWidth, Amt);
220       return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
221                                         : Builder.CreateLShr(Vec, Amt))
222                            : Builder.CreateAShr(Vec, Amt));
223     }
224     if (KnownAmtBits.getMinValue().uge(BitWidth)) {
225       if (LogicalShift)
226         return ConstantAggregateZero::get(VT);
227       Amt = ConstantInt::get(SVT, BitWidth - 1);
228       return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
229     }
230   } else {
231     // Ensure the first element has an in-range value and the rest of the
232     // elements in the bottom 64 bits are zero.
233     assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
234            cast<VectorType>(AmtVT)->getElementType() == SVT &&
235            "Unexpected shift-by-scalar type");
236     unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
237     APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
238     APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
239     KnownBits KnownLowerBits = llvm::computeKnownBits(
240         Amt, DemandedLower, II.getDataLayout());
241     KnownBits KnownUpperBits = llvm::computeKnownBits(
242         Amt, DemandedUpper, II.getDataLayout());
243     if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
244         (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
245       SmallVector<int, 16> ZeroSplat(VWidth, 0);
246       Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
247       return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
248                                         : Builder.CreateLShr(Vec, Amt))
249                            : Builder.CreateAShr(Vec, Amt));
250     }
251   }
252 
253   // Simplify if count is constant vector.
254   auto *CDV = dyn_cast<ConstantDataVector>(Amt);
255   if (!CDV)
256     return nullptr;
257 
258   // SSE2/AVX2 use the entire lower 64 bits of the 128-bit vector
259   // operand to compute the shift amount.
260   assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
261          cast<VectorType>(AmtVT)->getElementType() == SVT &&
262          "Unexpected shift-by-scalar type");
263 
264   // Concatenate the sub-elements to create the 64-bit value.
265   APInt Count(64, 0);
266   for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
267     unsigned SubEltIdx = (NumSubElts - 1) - i;
268     auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
269     Count <<= BitWidth;
270     Count |= SubElt->getValue().zextOrTrunc(64);
271   }
272 
273   // If shift-by-zero then just return the original value.
274   if (Count.isZero())
275     return Vec;
276 
277   // Handle cases when Shift >= BitWidth.
278   if (Count.uge(BitWidth)) {
279     // If LogicalShift - just return zero.
280     if (LogicalShift)
281       return ConstantAggregateZero::get(VT);
282 
283     // If ArithmeticShift - clamp Shift to (BitWidth - 1).
284     Count = APInt(64, BitWidth - 1);
285   }
286 
287   // Get a constant vector of the same type as the first operand.
288   auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
289   auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);
290 
291   if (ShiftLeft)
292     return Builder.CreateShl(Vec, ShiftVec);
293 
294   if (LogicalShift)
295     return Builder.CreateLShr(Vec, ShiftVec);
296 
297   return Builder.CreateAShr(Vec, ShiftVec);
298 }
299 
300 // Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
301 // Unlike the generic IR shifts, the intrinsics have defined behaviour for out
302 // of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
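//
// For example (a sketch, names illustrative), with a known in-range amount:
//   %r = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %v,
//                                              <4 x i32> <i32 1, i32 2, i32 3, i32 4>)
// becomes
//   %r = lshr <4 x i32> %v, <i32 1, i32 2, i32 3, i32 4>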
303 static Value *simplifyX86varShift(const IntrinsicInst &II,
304                                   InstCombiner::BuilderTy &Builder) {
305   bool LogicalShift = false;
306   bool ShiftLeft = false;
307 
308   switch (II.getIntrinsicID()) {
309   default:
310     llvm_unreachable("Unexpected intrinsic!");
311   case Intrinsic::x86_avx2_psrav_d:
312   case Intrinsic::x86_avx2_psrav_d_256:
313   case Intrinsic::x86_avx512_psrav_q_128:
314   case Intrinsic::x86_avx512_psrav_q_256:
315   case Intrinsic::x86_avx512_psrav_d_512:
316   case Intrinsic::x86_avx512_psrav_q_512:
317   case Intrinsic::x86_avx512_psrav_w_128:
318   case Intrinsic::x86_avx512_psrav_w_256:
319   case Intrinsic::x86_avx512_psrav_w_512:
320     LogicalShift = false;
321     ShiftLeft = false;
322     break;
323   case Intrinsic::x86_avx2_psrlv_d:
324   case Intrinsic::x86_avx2_psrlv_d_256:
325   case Intrinsic::x86_avx2_psrlv_q:
326   case Intrinsic::x86_avx2_psrlv_q_256:
327   case Intrinsic::x86_avx512_psrlv_d_512:
328   case Intrinsic::x86_avx512_psrlv_q_512:
329   case Intrinsic::x86_avx512_psrlv_w_128:
330   case Intrinsic::x86_avx512_psrlv_w_256:
331   case Intrinsic::x86_avx512_psrlv_w_512:
332     LogicalShift = true;
333     ShiftLeft = false;
334     break;
335   case Intrinsic::x86_avx2_psllv_d:
336   case Intrinsic::x86_avx2_psllv_d_256:
337   case Intrinsic::x86_avx2_psllv_q:
338   case Intrinsic::x86_avx2_psllv_q_256:
339   case Intrinsic::x86_avx512_psllv_d_512:
340   case Intrinsic::x86_avx512_psllv_q_512:
341   case Intrinsic::x86_avx512_psllv_w_128:
342   case Intrinsic::x86_avx512_psllv_w_256:
343   case Intrinsic::x86_avx512_psllv_w_512:
344     LogicalShift = true;
345     ShiftLeft = true;
346     break;
347   }
348   assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
349 
350   Value *Vec = II.getArgOperand(0);
351   Value *Amt = II.getArgOperand(1);
352   auto *VT = cast<FixedVectorType>(II.getType());
353   Type *SVT = VT->getElementType();
354   int NumElts = VT->getNumElements();
355   int BitWidth = SVT->getIntegerBitWidth();
356 
357   // If the shift amount is guaranteed to be in-range we can replace it with a
358   // generic shift.
359   KnownBits KnownAmt =
360       llvm::computeKnownBits(Amt, II.getDataLayout());
361   if (KnownAmt.getMaxValue().ult(BitWidth)) {
362     return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
363                                       : Builder.CreateLShr(Vec, Amt))
364                          : Builder.CreateAShr(Vec, Amt));
365   }
366 
367   // Simplify if all shift amounts are constant/undef.
368   auto *CShift = dyn_cast<Constant>(Amt);
369   if (!CShift)
370     return nullptr;
371 
372   // Collect each element's shift amount.
373   // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
374   bool AnyOutOfRange = false;
375   SmallVector<int, 8> ShiftAmts;
376   for (int I = 0; I < NumElts; ++I) {
377     auto *CElt = CShift->getAggregateElement(I);
378     if (isa_and_nonnull<UndefValue>(CElt)) {
379       ShiftAmts.push_back(-1);
380       continue;
381     }
382 
383     auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
384     if (!COp)
385       return nullptr;
386 
387     // Handle out of range shifts.
388     // If LogicalShift - set to BitWidth (special case).
389     // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
390     APInt ShiftVal = COp->getValue();
391     if (ShiftVal.uge(BitWidth)) {
392       AnyOutOfRange = LogicalShift;
393       ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
394       continue;
395     }
396 
397     ShiftAmts.push_back((int)ShiftVal.getZExtValue());
398   }
399 
400   // If all elements out of range or UNDEF, return vector of zeros/undefs.
401   // ArithmeticShift should only hit this if they are all UNDEF.
402   auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
403   if (llvm::all_of(ShiftAmts, OutOfRange)) {
404     SmallVector<Constant *, 8> ConstantVec;
405     for (int Idx : ShiftAmts) {
406       if (Idx < 0) {
407         ConstantVec.push_back(UndefValue::get(SVT));
408       } else {
409         assert(LogicalShift && "Logical shift expected");
410         ConstantVec.push_back(ConstantInt::getNullValue(SVT));
411       }
412     }
413     return ConstantVector::get(ConstantVec);
414   }
415 
416   // We can't handle only some out of range values with generic logical shifts.
417   if (AnyOutOfRange)
418     return nullptr;
419 
420   // Build the shift amount constant vector.
421   SmallVector<Constant *, 8> ShiftVecAmts;
422   for (int Idx : ShiftAmts) {
423     if (Idx < 0)
424       ShiftVecAmts.push_back(UndefValue::get(SVT));
425     else
426       ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
427   }
428   auto ShiftVec = ConstantVector::get(ShiftVecAmts);
429 
430   if (ShiftLeft)
431     return Builder.CreateShl(Vec, ShiftVec);
432 
433   if (LogicalShift)
434     return Builder.CreateLShr(Vec, ShiftVec);
435 
436   return Builder.CreateAShr(Vec, ShiftVec);
437 }
438 
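// Attempt to expand PACKSS/PACKUS intrinsics with constant operands into a
// saturating clamp, a per-128-bit-lane interleaving shuffle and a truncation,
// all of which then constant fold. As a worked example (illustrative),
// packssdw clamps an i32 source value of 70000 to the signed i16 maximum
// 32767 before truncating it.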
439 static Value *simplifyX86pack(IntrinsicInst &II,
440                               InstCombiner::BuilderTy &Builder, bool IsSigned) {
441   Value *Arg0 = II.getArgOperand(0);
442   Value *Arg1 = II.getArgOperand(1);
443   Type *ResTy = II.getType();
444 
445   // Fast path: if both inputs are undef, the result is undef.
446   if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
447     return UndefValue::get(ResTy);
448 
449   auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
450   unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
451   unsigned NumSrcElts = ArgTy->getNumElements();
452   assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
453          "Unexpected packing types");
454 
455   unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
456   unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
457   unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
458   assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
459          "Unexpected packing types");
460 
461   // Constant folding - only expand further if both inputs are constant.
462   if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
463     return nullptr;
464 
465   // Clamp Values - signed/unsigned both use signed clamp values, but they
466   // differ on the min/max values.
467   APInt MinValue, MaxValue;
468   if (IsSigned) {
469     // PACKSS: Truncate signed value with signed saturation.
470     // Source values less than dst minint are saturated to minint.
471     // Source values greater than dst maxint are saturated to maxint.
472     MinValue =
473         APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
474     MaxValue =
475         APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
476   } else {
477     // PACKUS: Truncate signed value with unsigned saturation.
478     // Source values less than zero are saturated to zero.
479     // Source values greater than dst maxuint are saturated to maxuint.
480     MinValue = APInt::getZero(SrcScalarSizeInBits);
481     MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
482   }
483 
484   auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
485   auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
486   Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
487   Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
488   Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
489   Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
490 
491   // Shuffle clamped args together at the lane level.
492   SmallVector<int, 32> PackMask;
493   for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
494     for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
495       PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
496     for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
497       PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
498   }
499   auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);
500 
501   // Truncate to dst size.
502   return Builder.CreateTrunc(Shuffle, ResTy);
503 }
504 
505 static Value *simplifyX86movmsk(const IntrinsicInst &II,
506                                 InstCombiner::BuilderTy &Builder) {
507   Value *Arg = II.getArgOperand(0);
508   Type *ResTy = II.getType();
509 
510   // movmsk(undef) -> zero as we must ensure the upper bits are zero.
511   if (isa<UndefValue>(Arg))
512     return Constant::getNullValue(ResTy);
513 
514   auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
515   // We can't easily peek through x86_mmx types.
516   if (!ArgTy)
517     return nullptr;
518 
519   // Expand MOVMSK to compare/bitcast/zext:
520   // e.g. PMOVMSKB(v16i8 x):
521   // %cmp = icmp slt <16 x i8> %x, zeroinitializer
522   // %int = bitcast <16 x i1> %cmp to i16
523   // %res = zext i16 %int to i32
524   unsigned NumElts = ArgTy->getNumElements();
525   Type *IntegerTy = Builder.getIntNTy(NumElts);
526 
527   Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
528   Res = Builder.CreateIsNeg(Res);
529   Res = Builder.CreateBitCast(Res, IntegerTy);
530   Res = Builder.CreateZExtOrTrunc(Res, ResTy);
531   return Res;
532 }
533 
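// Fold the X86 addcarry intrinsics to @llvm.uadd.with.overflow when the
// carry-in is known to be zero. For example (a sketch, names illustrative):
//   %r = call { i8, i32 } @llvm.x86.addcarry.32(i8 0, i32 %a, i32 %b)
// becomes a @llvm.uadd.with.overflow.i32 call whose result and overflow bit
// are repackaged into the { i8, i32 } struct the x86 intrinsic returns.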
534 static Value *simplifyX86addcarry(const IntrinsicInst &II,
535                                   InstCombiner::BuilderTy &Builder) {
536   Value *CarryIn = II.getArgOperand(0);
537   Value *Op1 = II.getArgOperand(1);
538   Value *Op2 = II.getArgOperand(2);
539   Type *RetTy = II.getType();
540   Type *OpTy = Op1->getType();
541   assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
542          RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
543          "Unexpected types for x86 addcarry");
544 
545   // If carry-in is zero, this is just an unsigned add with overflow.
546   if (match(CarryIn, PatternMatch::m_ZeroInt())) {
547     Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
548                                           {Op1, Op2});
549     // The types have to be adjusted to match the x86 call types.
550     Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
551     Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
552                                        Builder.getInt8Ty());
553     Value *Res = PoisonValue::get(RetTy);
554     Res = Builder.CreateInsertValue(Res, UAddOV, 0);
555     return Builder.CreateInsertValue(Res, UAddResult, 1);
556   }
557 
558   return nullptr;
559 }
560 
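// Attempt to convert the AVX512 vpternlog intrinsics with a constant control
// immediate into plain logic ops. For example (illustrative), Imm = 0x96
// encodes A ^ B ^ C and Imm = 0x80 encodes A & B & C; the table below
// enumerates all 256 truth tables.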
561 static Value *simplifyTernarylogic(const IntrinsicInst &II,
562                                    InstCombiner::BuilderTy &Builder) {
563 
564   auto *ArgImm = dyn_cast<ConstantInt>(II.getArgOperand(3));
565   if (!ArgImm || ArgImm->getValue().uge(256))
566     return nullptr;
567 
568   Value *ArgA = II.getArgOperand(0);
569   Value *ArgB = II.getArgOperand(1);
570   Value *ArgC = II.getArgOperand(2);
571 
572   Type *Ty = II.getType();
573 
574   auto Or = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
575     return {Builder.CreateOr(Lhs.first, Rhs.first), Lhs.second | Rhs.second};
576   };
577   auto Xor = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
578     return {Builder.CreateXor(Lhs.first, Rhs.first), Lhs.second ^ Rhs.second};
579   };
580   auto And = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
581     return {Builder.CreateAnd(Lhs.first, Rhs.first), Lhs.second & Rhs.second};
582   };
583   auto Not = [&](auto V) -> std::pair<Value *, uint8_t> {
584     return {Builder.CreateNot(V.first), ~V.second};
585   };
586   auto Nor = [&](auto Lhs, auto Rhs) { return Not(Or(Lhs, Rhs)); };
587   auto Xnor = [&](auto Lhs, auto Rhs) { return Not(Xor(Lhs, Rhs)); };
588   auto Nand = [&](auto Lhs, auto Rhs) { return Not(And(Lhs, Rhs)); };
589 
590   bool AIsConst = match(ArgA, PatternMatch::m_ImmConstant());
591   bool BIsConst = match(ArgB, PatternMatch::m_ImmConstant());
592   bool CIsConst = match(ArgC, PatternMatch::m_ImmConstant());
593 
594   bool ABIsConst = AIsConst && BIsConst;
595   bool ACIsConst = AIsConst && CIsConst;
596   bool BCIsConst = BIsConst && CIsConst;
597   bool ABCIsConst = AIsConst && BIsConst && CIsConst;
598 
599   // Use for verification. It's a big table. It's difficult to go from Imm ->
600   // logic ops, but easy to verify that a set of logic ops is correct. We track
601   // the logic ops through the second value in the pair. At the end it should
602   // equal Imm.
603   std::pair<Value *, uint8_t> A = {ArgA, 0xf0};
604   std::pair<Value *, uint8_t> B = {ArgB, 0xcc};
605   std::pair<Value *, uint8_t> C = {ArgC, 0xaa};
606   std::pair<Value *, uint8_t> Res = {nullptr, 0};
607 
608   // Currently we only handle cases that convert directly to another instruction
609   // or cases where all the ops are constant.  This is because we don't properly
610   // handle creating ternary ops in the backend, so splitting them here may
611   // cause regressions. As the backend improves, uncomment more cases.
612 
613   uint8_t Imm = ArgImm->getValue().getZExtValue();
614   switch (Imm) {
615   case 0x0:
616     Res = {Constant::getNullValue(Ty), 0};
617     break;
618   case 0x1:
619     if (ABCIsConst)
620       Res = Nor(Or(A, B), C);
621     break;
622   case 0x2:
623     if (ABCIsConst)
624       Res = And(Nor(A, B), C);
625     break;
626   case 0x3:
627     if (ABIsConst)
628       Res = Nor(A, B);
629     break;
630   case 0x4:
631     if (ABCIsConst)
632       Res = And(Nor(A, C), B);
633     break;
634   case 0x5:
635     if (ACIsConst)
636       Res = Nor(A, C);
637     break;
638   case 0x6:
639     if (ABCIsConst)
640       Res = Nor(A, Xnor(B, C));
641     break;
642   case 0x7:
643     if (ABCIsConst)
644       Res = Nor(A, And(B, C));
645     break;
646   case 0x8:
647     if (ABCIsConst)
648       Res = Nor(A, Nand(B, C));
649     break;
650   case 0x9:
651     if (ABCIsConst)
652       Res = Nor(A, Xor(B, C));
653     break;
654   case 0xa:
655     if (ACIsConst)
656       Res = Nor(A, Not(C));
657     break;
658   case 0xb:
659     if (ABCIsConst)
660       Res = Nor(A, Nor(C, Not(B)));
661     break;
662   case 0xc:
663     if (ABIsConst)
664       Res = Nor(A, Not(B));
665     break;
666   case 0xd:
667     if (ABCIsConst)
668       Res = Nor(A, Nor(B, Not(C)));
669     break;
670   case 0xe:
671     if (ABCIsConst)
672       Res = Nor(A, Nor(B, C));
673     break;
674   case 0xf:
675     Res = Not(A);
676     break;
677   case 0x10:
678     if (ABCIsConst)
679       Res = And(A, Nor(B, C));
680     break;
681   case 0x11:
682     if (BCIsConst)
683       Res = Nor(B, C);
684     break;
685   case 0x12:
686     if (ABCIsConst)
687       Res = Nor(Xnor(A, C), B);
688     break;
689   case 0x13:
690     if (ABCIsConst)
691       Res = Nor(And(A, C), B);
692     break;
693   case 0x14:
694     if (ABCIsConst)
695       Res = Nor(Xnor(A, B), C);
696     break;
697   case 0x15:
698     if (ABCIsConst)
699       Res = Nor(And(A, B), C);
700     break;
701   case 0x16:
702     if (ABCIsConst)
703       Res = Xor(Xor(A, B), And(Nand(A, B), C));
704     break;
705   case 0x17:
706     if (ABCIsConst)
707       Res = Xor(Or(A, B), Or(Xnor(A, B), C));
708     break;
709   case 0x18:
710     if (ABCIsConst)
711       Res = Nor(Xnor(A, B), Xnor(A, C));
712     break;
713   case 0x19:
714     if (ABCIsConst)
715       Res = And(Nand(A, B), Xnor(B, C));
716     break;
717   case 0x1a:
718     if (ABCIsConst)
719       Res = Xor(A, Or(And(A, B), C));
720     break;
721   case 0x1b:
722     if (ABCIsConst)
723       Res = Xor(A, Or(Xnor(A, B), C));
724     break;
725   case 0x1c:
726     if (ABCIsConst)
727       Res = Xor(A, Or(And(A, C), B));
728     break;
729   case 0x1d:
730     if (ABCIsConst)
731       Res = Xor(A, Or(Xnor(A, C), B));
732     break;
733   case 0x1e:
734     if (ABCIsConst)
735       Res = Xor(A, Or(B, C));
736     break;
737   case 0x1f:
738     if (ABCIsConst)
739       Res = Nand(A, Or(B, C));
740     break;
741   case 0x20:
742     if (ABCIsConst)
743       Res = Nor(Nand(A, C), B);
744     break;
745   case 0x21:
746     if (ABCIsConst)
747       Res = Nor(Xor(A, C), B);
748     break;
749   case 0x22:
750     if (BCIsConst)
751       Res = Nor(B, Not(C));
752     break;
753   case 0x23:
754     if (ABCIsConst)
755       Res = Nor(B, Nor(C, Not(A)));
756     break;
757   case 0x24:
758     if (ABCIsConst)
759       Res = Nor(Xnor(A, B), Xor(A, C));
760     break;
761   case 0x25:
762     if (ABCIsConst)
763       Res = Xor(A, Nand(Nand(A, B), C));
764     break;
765   case 0x26:
766     if (ABCIsConst)
767       Res = And(Nand(A, B), Xor(B, C));
768     break;
769   case 0x27:
770     if (ABCIsConst)
771       Res = Xor(Or(Xnor(A, B), C), B);
772     break;
773   case 0x28:
774     if (ABCIsConst)
775       Res = And(Xor(A, B), C);
776     break;
777   case 0x29:
778     if (ABCIsConst)
779       Res = Xor(Xor(A, B), Nor(And(A, B), C));
780     break;
781   case 0x2a:
782     if (ABCIsConst)
783       Res = And(Nand(A, B), C);
784     break;
785   case 0x2b:
786     if (ABCIsConst)
787       Res = Xor(Or(Xnor(A, B), Xor(A, C)), A);
788     break;
789   case 0x2c:
790     if (ABCIsConst)
791       Res = Nor(Xnor(A, B), Nor(B, C));
792     break;
793   case 0x2d:
794     if (ABCIsConst)
795       Res = Xor(A, Or(B, Not(C)));
796     break;
797   case 0x2e:
798     if (ABCIsConst)
799       Res = Xor(A, Or(Xor(A, C), B));
800     break;
801   case 0x2f:
802     if (ABCIsConst)
803       Res = Nand(A, Or(B, Not(C)));
804     break;
805   case 0x30:
806     if (ABIsConst)
807       Res = Nor(B, Not(A));
808     break;
809   case 0x31:
810     if (ABCIsConst)
811       Res = Nor(Nor(A, Not(C)), B);
812     break;
813   case 0x32:
814     if (ABCIsConst)
815       Res = Nor(Nor(A, C), B);
816     break;
817   case 0x33:
818     Res = Not(B);
819     break;
820   case 0x34:
821     if (ABCIsConst)
822       Res = And(Xor(A, B), Nand(B, C));
823     break;
824   case 0x35:
825     if (ABCIsConst)
826       Res = Xor(B, Or(A, Xnor(B, C)));
827     break;
828   case 0x36:
829     if (ABCIsConst)
830       Res = Xor(Or(A, C), B);
831     break;
832   case 0x37:
833     if (ABCIsConst)
834       Res = Nand(Or(A, C), B);
835     break;
836   case 0x38:
837     if (ABCIsConst)
838       Res = Nor(Xnor(A, B), Nor(A, C));
839     break;
840   case 0x39:
841     if (ABCIsConst)
842       Res = Xor(Or(A, Not(C)), B);
843     break;
844   case 0x3a:
845     if (ABCIsConst)
846       Res = Xor(B, Or(A, Xor(B, C)));
847     break;
848   case 0x3b:
849     if (ABCIsConst)
850       Res = Nand(Or(A, Not(C)), B);
851     break;
852   case 0x3c:
853     Res = Xor(A, B);
854     break;
855   case 0x3d:
856     if (ABCIsConst)
857       Res = Xor(A, Or(Nor(A, C), B));
858     break;
859   case 0x3e:
860     if (ABCIsConst)
861       Res = Xor(A, Or(Nor(A, Not(C)), B));
862     break;
863   case 0x3f:
864     if (ABIsConst)
865       Res = Nand(A, B);
866     break;
867   case 0x40:
868     if (ABCIsConst)
869       Res = Nor(Nand(A, B), C);
870     break;
871   case 0x41:
872     if (ABCIsConst)
873       Res = Nor(Xor(A, B), C);
874     break;
875   case 0x42:
876     if (ABCIsConst)
877       Res = Nor(Xor(A, B), Xnor(A, C));
878     break;
879   case 0x43:
880     if (ABCIsConst)
881       Res = Xor(A, Nand(Nand(A, C), B));
882     break;
883   case 0x44:
884     if (BCIsConst)
885       Res = Nor(C, Not(B));
886     break;
887   case 0x45:
888     if (ABCIsConst)
889       Res = Nor(Nor(B, Not(A)), C);
890     break;
891   case 0x46:
892     if (ABCIsConst)
893       Res = Xor(Or(And(A, C), B), C);
894     break;
895   case 0x47:
896     if (ABCIsConst)
897       Res = Xor(Or(Xnor(A, C), B), C);
898     break;
899   case 0x48:
900     if (ABCIsConst)
901       Res = And(Xor(A, C), B);
902     break;
903   case 0x49:
904     if (ABCIsConst)
905       Res = Xor(Or(Xnor(A, B), And(A, C)), C);
906     break;
907   case 0x4a:
908     if (ABCIsConst)
909       Res = Nor(Xnor(A, C), Nor(B, C));
910     break;
911   case 0x4b:
912     if (ABCIsConst)
913       Res = Xor(A, Or(C, Not(B)));
914     break;
915   case 0x4c:
916     if (ABCIsConst)
917       Res = And(Nand(A, C), B);
918     break;
919   case 0x4d:
920     if (ABCIsConst)
921       Res = Xor(Or(Xor(A, B), Xnor(A, C)), A);
922     break;
923   case 0x4e:
924     if (ABCIsConst)
925       Res = Xor(A, Or(Xor(A, B), C));
926     break;
927   case 0x4f:
928     if (ABCIsConst)
929       Res = Nand(A, Nand(B, Not(C)));
930     break;
931   case 0x50:
932     if (ACIsConst)
933       Res = Nor(C, Not(A));
934     break;
935   case 0x51:
936     if (ABCIsConst)
937       Res = Nor(Nor(A, Not(B)), C);
938     break;
939   case 0x52:
940     if (ABCIsConst)
941       Res = And(Xor(A, C), Nand(B, C));
942     break;
943   case 0x53:
944     if (ABCIsConst)
945       Res = Xor(Or(Xnor(B, C), A), C);
946     break;
947   case 0x54:
948     if (ABCIsConst)
949       Res = Nor(Nor(A, B), C);
950     break;
951   case 0x55:
952     Res = Not(C);
953     break;
954   case 0x56:
955     if (ABCIsConst)
956       Res = Xor(Or(A, B), C);
957     break;
958   case 0x57:
959     if (ABCIsConst)
960       Res = Nand(Or(A, B), C);
961     break;
962   case 0x58:
963     if (ABCIsConst)
964       Res = Nor(Nor(A, B), Xnor(A, C));
965     break;
966   case 0x59:
967     if (ABCIsConst)
968       Res = Xor(Or(A, Not(B)), C);
969     break;
970   case 0x5a:
971     Res = Xor(A, C);
972     break;
973   case 0x5b:
974     if (ABCIsConst)
975       Res = Xor(A, Or(Nor(A, B), C));
976     break;
977   case 0x5c:
978     if (ABCIsConst)
979       Res = Xor(Or(Xor(B, C), A), C);
980     break;
981   case 0x5d:
982     if (ABCIsConst)
983       Res = Nand(Or(A, Not(B)), C);
984     break;
985   case 0x5e:
986     if (ABCIsConst)
987       Res = Xor(A, Or(Nor(A, Not(B)), C));
988     break;
989   case 0x5f:
990     if (ACIsConst)
991       Res = Nand(A, C);
992     break;
993   case 0x60:
994     if (ABCIsConst)
995       Res = And(A, Xor(B, C));
996     break;
997   case 0x61:
998     if (ABCIsConst)
999       Res = Xor(Or(Xnor(A, B), And(B, C)), C);
1000     break;
1001   case 0x62:
1002     if (ABCIsConst)
1003       Res = Nor(Nor(A, C), Xnor(B, C));
1004     break;
1005   case 0x63:
1006     if (ABCIsConst)
1007       Res = Xor(B, Or(C, Not(A)));
1008     break;
1009   case 0x64:
1010     if (ABCIsConst)
1011       Res = Nor(Nor(A, B), Xnor(B, C));
1012     break;
1013   case 0x65:
1014     if (ABCIsConst)
1015       Res = Xor(Or(B, Not(A)), C);
1016     break;
1017   case 0x66:
1018     Res = Xor(B, C);
1019     break;
1020   case 0x67:
1021     if (ABCIsConst)
1022       Res = Or(Nor(A, B), Xor(B, C));
1023     break;
1024   case 0x68:
1025     if (ABCIsConst)
1026       Res = Xor(Xor(A, B), Nor(Nor(A, B), C));
1027     break;
1028   case 0x69:
1029     if (ABCIsConst)
1030       Res = Xor(Xnor(A, B), C);
1031     break;
1032   case 0x6a:
1033     if (ABCIsConst)
1034       Res = Xor(And(A, B), C);
1035     break;
1036   case 0x6b:
1037     if (ABCIsConst)
1038       Res = Or(Nor(A, B), Xor(Xnor(A, B), C));
1039     break;
1040   case 0x6c:
1041     if (ABCIsConst)
1042       Res = Xor(And(A, C), B);
1043     break;
1044   case 0x6d:
1045     if (ABCIsConst)
1046       Res = Xor(Or(Xnor(A, B), Nor(A, C)), C);
1047     break;
1048   case 0x6e:
1049     if (ABCIsConst)
1050       Res = Or(Nor(A, Not(B)), Xor(B, C));
1051     break;
1052   case 0x6f:
1053     if (ABCIsConst)
1054       Res = Nand(A, Xnor(B, C));
1055     break;
1056   case 0x70:
1057     if (ABCIsConst)
1058       Res = And(A, Nand(B, C));
1059     break;
1060   case 0x71:
1061     if (ABCIsConst)
1062       Res = Xor(Nor(Xor(A, B), Xor(A, C)), A);
1063     break;
1064   case 0x72:
1065     if (ABCIsConst)
1066       Res = Xor(Or(Xor(A, B), C), B);
1067     break;
1068   case 0x73:
1069     if (ABCIsConst)
1070       Res = Nand(Nand(A, Not(C)), B);
1071     break;
1072   case 0x74:
1073     if (ABCIsConst)
1074       Res = Xor(Or(Xor(A, C), B), C);
1075     break;
1076   case 0x75:
1077     if (ABCIsConst)
1078       Res = Nand(Nand(A, Not(B)), C);
1079     break;
1080   case 0x76:
1081     if (ABCIsConst)
1082       Res = Xor(B, Or(Nor(B, Not(A)), C));
1083     break;
1084   case 0x77:
1085     if (BCIsConst)
1086       Res = Nand(B, C);
1087     break;
1088   case 0x78:
1089     if (ABCIsConst)
1090       Res = Xor(A, And(B, C));
1091     break;
1092   case 0x79:
1093     if (ABCIsConst)
1094       Res = Xor(Or(Xnor(A, B), Nor(B, C)), C);
1095     break;
1096   case 0x7a:
1097     if (ABCIsConst)
1098       Res = Or(Xor(A, C), Nor(B, Not(A)));
1099     break;
1100   case 0x7b:
1101     if (ABCIsConst)
1102       Res = Nand(Xnor(A, C), B);
1103     break;
1104   case 0x7c:
1105     if (ABCIsConst)
1106       Res = Or(Xor(A, B), Nor(C, Not(A)));
1107     break;
1108   case 0x7d:
1109     if (ABCIsConst)
1110       Res = Nand(Xnor(A, B), C);
1111     break;
1112   case 0x7e:
1113     if (ABCIsConst)
1114       Res = Or(Xor(A, B), Xor(A, C));
1115     break;
1116   case 0x7f:
1117     if (ABCIsConst)
1118       Res = Nand(And(A, B), C);
1119     break;
1120   case 0x80:
1121     if (ABCIsConst)
1122       Res = And(And(A, B), C);
1123     break;
1124   case 0x81:
1125     if (ABCIsConst)
1126       Res = Nor(Xor(A, B), Xor(A, C));
1127     break;
1128   case 0x82:
1129     if (ABCIsConst)
1130       Res = And(Xnor(A, B), C);
1131     break;
1132   case 0x83:
1133     if (ABCIsConst)
1134       Res = Nor(Xor(A, B), Nor(C, Not(A)));
1135     break;
1136   case 0x84:
1137     if (ABCIsConst)
1138       Res = And(Xnor(A, C), B);
1139     break;
1140   case 0x85:
1141     if (ABCIsConst)
1142       Res = Nor(Xor(A, C), Nor(B, Not(A)));
1143     break;
1144   case 0x86:
1145     if (ABCIsConst)
1146       Res = Xor(Nor(Xnor(A, B), Nor(B, C)), C);
1147     break;
1148   case 0x87:
1149     if (ABCIsConst)
1150       Res = Xor(A, Nand(B, C));
1151     break;
1152   case 0x88:
1153     Res = And(B, C);
1154     break;
1155   case 0x89:
1156     if (ABCIsConst)
1157       Res = Xor(B, Nor(Nor(B, Not(A)), C));
1158     break;
1159   case 0x8a:
1160     if (ABCIsConst)
1161       Res = And(Nand(A, Not(B)), C);
1162     break;
1163   case 0x8b:
1164     if (ABCIsConst)
1165       Res = Xor(Nor(Xor(A, C), B), C);
1166     break;
1167   case 0x8c:
1168     if (ABCIsConst)
1169       Res = And(Nand(A, Not(C)), B);
1170     break;
1171   case 0x8d:
1172     if (ABCIsConst)
1173       Res = Xor(Nor(Xor(A, B), C), B);
1174     break;
1175   case 0x8e:
1176     if (ABCIsConst)
1177       Res = Xor(Or(Xor(A, B), Xor(A, C)), A);
1178     break;
1179   case 0x8f:
1180     if (ABCIsConst)
1181       Res = Nand(A, Nand(B, C));
1182     break;
1183   case 0x90:
1184     if (ABCIsConst)
1185       Res = And(A, Xnor(B, C));
1186     break;
1187   case 0x91:
1188     if (ABCIsConst)
1189       Res = Nor(Nor(A, Not(B)), Xor(B, C));
1190     break;
1191   case 0x92:
1192     if (ABCIsConst)
1193       Res = Xor(Nor(Xnor(A, B), Nor(A, C)), C);
1194     break;
1195   case 0x93:
1196     if (ABCIsConst)
1197       Res = Xor(Nand(A, C), B);
1198     break;
1199   case 0x94:
1200     if (ABCIsConst)
1201       Res = Nor(Nor(A, B), Xor(Xnor(A, B), C));
1202     break;
1203   case 0x95:
1204     if (ABCIsConst)
1205       Res = Xor(Nand(A, B), C);
1206     break;
1207   case 0x96:
1208     if (ABCIsConst)
1209       Res = Xor(Xor(A, B), C);
1210     break;
1211   case 0x97:
1212     if (ABCIsConst)
1213       Res = Xor(Xor(A, B), Or(Nor(A, B), C));
1214     break;
1215   case 0x98:
1216     if (ABCIsConst)
1217       Res = Nor(Nor(A, B), Xor(B, C));
1218     break;
1219   case 0x99:
1220     if (BCIsConst)
1221       Res = Xnor(B, C);
1222     break;
1223   case 0x9a:
1224     if (ABCIsConst)
1225       Res = Xor(Nor(B, Not(A)), C);
1226     break;
1227   case 0x9b:
1228     if (ABCIsConst)
1229       Res = Or(Nor(A, B), Xnor(B, C));
1230     break;
1231   case 0x9c:
1232     if (ABCIsConst)
1233       Res = Xor(B, Nor(C, Not(A)));
1234     break;
1235   case 0x9d:
1236     if (ABCIsConst)
1237       Res = Or(Nor(A, C), Xnor(B, C));
1238     break;
1239   case 0x9e:
1240     if (ABCIsConst)
1241       Res = Xor(And(Xor(A, B), Nand(B, C)), C);
1242     break;
1243   case 0x9f:
1244     if (ABCIsConst)
1245       Res = Nand(A, Xor(B, C));
1246     break;
1247   case 0xa0:
1248     Res = And(A, C);
1249     break;
1250   case 0xa1:
1251     if (ABCIsConst)
1252       Res = Xor(A, Nor(Nor(A, Not(B)), C));
1253     break;
1254   case 0xa2:
1255     if (ABCIsConst)
1256       Res = And(Or(A, Not(B)), C);
1257     break;
1258   case 0xa3:
1259     if (ABCIsConst)
1260       Res = Xor(Nor(Xor(B, C), A), C);
1261     break;
1262   case 0xa4:
1263     if (ABCIsConst)
1264       Res = Xor(A, Nor(Nor(A, B), C));
1265     break;
1266   case 0xa5:
1267     if (ACIsConst)
1268       Res = Xnor(A, C);
1269     break;
1270   case 0xa6:
1271     if (ABCIsConst)
1272       Res = Xor(Nor(A, Not(B)), C);
1273     break;
1274   case 0xa7:
1275     if (ABCIsConst)
1276       Res = Or(Nor(A, B), Xnor(A, C));
1277     break;
1278   case 0xa8:
1279     if (ABCIsConst)
1280       Res = And(Or(A, B), C);
1281     break;
1282   case 0xa9:
1283     if (ABCIsConst)
1284       Res = Xor(Nor(A, B), C);
1285     break;
1286   case 0xaa:
1287     Res = C;
1288     break;
1289   case 0xab:
1290     if (ABCIsConst)
1291       Res = Or(Nor(A, B), C);
1292     break;
1293   case 0xac:
1294     if (ABCIsConst)
1295       Res = Xor(Nor(Xnor(B, C), A), C);
1296     break;
1297   case 0xad:
1298     if (ABCIsConst)
1299       Res = Or(Xnor(A, C), And(B, C));
1300     break;
1301   case 0xae:
1302     if (ABCIsConst)
1303       Res = Or(Nor(A, Not(B)), C);
1304     break;
1305   case 0xaf:
1306     if (ACIsConst)
1307       Res = Or(C, Not(A));
1308     break;
1309   case 0xb0:
1310     if (ABCIsConst)
1311       Res = And(A, Nand(B, Not(C)));
1312     break;
1313   case 0xb1:
1314     if (ABCIsConst)
1315       Res = Xor(A, Nor(Xor(A, B), C));
1316     break;
1317   case 0xb2:
1318     if (ABCIsConst)
1319       Res = Xor(Nor(Xor(A, B), Xnor(A, C)), A);
1320     break;
1321   case 0xb3:
1322     if (ABCIsConst)
1323       Res = Nand(Nand(A, C), B);
1324     break;
1325   case 0xb4:
1326     if (ABCIsConst)
1327       Res = Xor(A, Nor(C, Not(B)));
1328     break;
1329   case 0xb5:
1330     if (ABCIsConst)
1331       Res = Or(Xnor(A, C), Nor(B, C));
1332     break;
1333   case 0xb6:
1334     if (ABCIsConst)
1335       Res = Xor(And(Xor(A, B), Nand(A, C)), C);
1336     break;
1337   case 0xb7:
1338     if (ABCIsConst)
1339       Res = Nand(Xor(A, C), B);
1340     break;
1341   case 0xb8:
1342     if (ABCIsConst)
1343       Res = Xor(Nor(Xnor(A, C), B), C);
1344     break;
1345   case 0xb9:
1346     if (ABCIsConst)
1347       Res = Xor(Nor(And(A, C), B), C);
1348     break;
1349   case 0xba:
1350     if (ABCIsConst)
1351       Res = Or(Nor(B, Not(A)), C);
1352     break;
1353   case 0xbb:
1354     if (BCIsConst)
1355       Res = Or(C, Not(B));
1356     break;
1357   case 0xbc:
1358     if (ABCIsConst)
1359       Res = Xor(A, And(Nand(A, C), B));
1360     break;
1361   case 0xbd:
1362     if (ABCIsConst)
1363       Res = Or(Xor(A, B), Xnor(A, C));
1364     break;
1365   case 0xbe:
1366     if (ABCIsConst)
1367       Res = Or(Xor(A, B), C);
1368     break;
1369   case 0xbf:
1370     if (ABCIsConst)
1371       Res = Or(Nand(A, B), C);
1372     break;
1373   case 0xc0:
1374     Res = And(A, B);
1375     break;
1376   case 0xc1:
1377     if (ABCIsConst)
1378       Res = Xor(A, Nor(Nor(A, Not(C)), B));
1379     break;
1380   case 0xc2:
1381     if (ABCIsConst)
1382       Res = Xor(A, Nor(Nor(A, C), B));
1383     break;
1384   case 0xc3:
1385     if (ABIsConst)
1386       Res = Xnor(A, B);
1387     break;
1388   case 0xc4:
1389     if (ABCIsConst)
1390       Res = And(Or(A, Not(C)), B);
1391     break;
1392   case 0xc5:
1393     if (ABCIsConst)
1394       Res = Xor(B, Nor(A, Xor(B, C)));
1395     break;
1396   case 0xc6:
1397     if (ABCIsConst)
1398       Res = Xor(Nor(A, Not(C)), B);
1399     break;
1400   case 0xc7:
1401     if (ABCIsConst)
1402       Res = Or(Xnor(A, B), Nor(A, C));
1403     break;
1404   case 0xc8:
1405     if (ABCIsConst)
1406       Res = And(Or(A, C), B);
1407     break;
1408   case 0xc9:
1409     if (ABCIsConst)
1410       Res = Xor(Nor(A, C), B);
1411     break;
1412   case 0xca:
1413     if (ABCIsConst)
1414       Res = Xor(B, Nor(A, Xnor(B, C)));
1415     break;
1416   case 0xcb:
1417     if (ABCIsConst)
1418       Res = Or(Xnor(A, B), And(B, C));
1419     break;
1420   case 0xcc:
1421     Res = B;
1422     break;
1423   case 0xcd:
1424     if (ABCIsConst)
1425       Res = Or(Nor(A, C), B);
1426     break;
1427   case 0xce:
1428     if (ABCIsConst)
1429       Res = Or(Nor(A, Not(C)), B);
1430     break;
1431   case 0xcf:
1432     if (ABIsConst)
1433       Res = Or(B, Not(A));
1434     break;
1435   case 0xd0:
1436     if (ABCIsConst)
1437       Res = And(A, Or(B, Not(C)));
1438     break;
1439   case 0xd1:
1440     if (ABCIsConst)
1441       Res = Xor(A, Nor(Xor(A, C), B));
1442     break;
1443   case 0xd2:
1444     if (ABCIsConst)
1445       Res = Xor(A, Nor(B, Not(C)));
1446     break;
1447   case 0xd3:
1448     if (ABCIsConst)
1449       Res = Or(Xnor(A, B), Nor(B, C));
1450     break;
1451   case 0xd4:
1452     if (ABCIsConst)
1453       Res = Xor(Nor(Xnor(A, B), Xor(A, C)), A);
1454     break;
1455   case 0xd5:
1456     if (ABCIsConst)
1457       Res = Nand(Nand(A, B), C);
1458     break;
1459   case 0xd6:
1460     if (ABCIsConst)
1461       Res = Xor(Xor(A, B), Or(And(A, B), C));
1462     break;
1463   case 0xd7:
1464     if (ABCIsConst)
1465       Res = Nand(Xor(A, B), C);
1466     break;
1467   case 0xd8:
1468     if (ABCIsConst)
1469       Res = Xor(Nor(Xnor(A, B), C), B);
1470     break;
1471   case 0xd9:
1472     if (ABCIsConst)
1473       Res = Or(And(A, B), Xnor(B, C));
1474     break;
1475   case 0xda:
1476     if (ABCIsConst)
1477       Res = Xor(A, And(Nand(A, B), C));
1478     break;
1479   case 0xdb:
1480     if (ABCIsConst)
1481       Res = Or(Xnor(A, B), Xor(A, C));
1482     break;
1483   case 0xdc:
1484     if (ABCIsConst)
1485       Res = Or(B, Nor(C, Not(A)));
1486     break;
1487   case 0xdd:
1488     if (BCIsConst)
1489       Res = Or(B, Not(C));
1490     break;
1491   case 0xde:
1492     if (ABCIsConst)
1493       Res = Or(Xor(A, C), B);
1494     break;
1495   case 0xdf:
1496     if (ABCIsConst)
1497       Res = Or(Nand(A, C), B);
1498     break;
1499   case 0xe0:
1500     if (ABCIsConst)
1501       Res = And(A, Or(B, C));
1502     break;
1503   case 0xe1:
1504     if (ABCIsConst)
1505       Res = Xor(A, Nor(B, C));
1506     break;
1507   case 0xe2:
1508     if (ABCIsConst)
1509       Res = Xor(A, Nor(Xnor(A, C), B));
1510     break;
1511   case 0xe3:
1512     if (ABCIsConst)
1513       Res = Xor(A, Nor(And(A, C), B));
1514     break;
1515   case 0xe4:
1516     if (ABCIsConst)
1517       Res = Xor(A, Nor(Xnor(A, B), C));
1518     break;
1519   case 0xe5:
1520     if (ABCIsConst)
1521       Res = Xor(A, Nor(And(A, B), C));
1522     break;
1523   case 0xe6:
1524     if (ABCIsConst)
1525       Res = Or(And(A, B), Xor(B, C));
1526     break;
1527   case 0xe7:
1528     if (ABCIsConst)
1529       Res = Or(Xnor(A, B), Xnor(A, C));
1530     break;
1531   case 0xe8:
1532     if (ABCIsConst)
1533       Res = Xor(Or(A, B), Nor(Xnor(A, B), C));
1534     break;
1535   case 0xe9:
1536     if (ABCIsConst)
1537       Res = Xor(Xor(A, B), Nand(Nand(A, B), C));
1538     break;
1539   case 0xea:
1540     if (ABCIsConst)
1541       Res = Or(And(A, B), C);
1542     break;
1543   case 0xeb:
1544     if (ABCIsConst)
1545       Res = Or(Xnor(A, B), C);
1546     break;
1547   case 0xec:
1548     if (ABCIsConst)
1549       Res = Or(And(A, C), B);
1550     break;
1551   case 0xed:
1552     if (ABCIsConst)
1553       Res = Or(Xnor(A, C), B);
1554     break;
1555   case 0xee:
1556     Res = Or(B, C);
1557     break;
1558   case 0xef:
1559     if (ABCIsConst)
1560       Res = Nand(A, Nor(B, C));
1561     break;
1562   case 0xf0:
1563     Res = A;
1564     break;
1565   case 0xf1:
1566     if (ABCIsConst)
1567       Res = Or(A, Nor(B, C));
1568     break;
1569   case 0xf2:
1570     if (ABCIsConst)
1571       Res = Or(A, Nor(B, Not(C)));
1572     break;
1573   case 0xf3:
1574     if (ABIsConst)
1575       Res = Or(A, Not(B));
1576     break;
1577   case 0xf4:
1578     if (ABCIsConst)
1579       Res = Or(A, Nor(C, Not(B)));
1580     break;
1581   case 0xf5:
1582     if (ACIsConst)
1583       Res = Or(A, Not(C));
1584     break;
1585   case 0xf6:
1586     if (ABCIsConst)
1587       Res = Or(A, Xor(B, C));
1588     break;
1589   case 0xf7:
1590     if (ABCIsConst)
1591       Res = Or(A, Nand(B, C));
1592     break;
1593   case 0xf8:
1594     if (ABCIsConst)
1595       Res = Or(A, And(B, C));
1596     break;
1597   case 0xf9:
1598     if (ABCIsConst)
1599       Res = Or(A, Xnor(B, C));
1600     break;
1601   case 0xfa:
1602     Res = Or(A, C);
1603     break;
1604   case 0xfb:
1605     if (ABCIsConst)
1606       Res = Nand(Nor(A, C), B);
1607     break;
1608   case 0xfc:
1609     Res = Or(A, B);
1610     break;
1611   case 0xfd:
1612     if (ABCIsConst)
1613       Res = Nand(Nor(A, B), C);
1614     break;
1615   case 0xfe:
1616     if (ABCIsConst)
1617       Res = Or(Or(A, B), C);
1618     break;
1619   case 0xff:
1620     Res = {Constant::getAllOnesValue(Ty), 0xff};
1621     break;
1622   }
1623 
1624   assert((Res.first == nullptr || Res.second == Imm) &&
1625          "Simplification of ternary logic does not verify!");
1626   return Res.first;
1627 }
1628 
1629 static Value *simplifyX86insertps(const IntrinsicInst &II,
1630                                   InstCombiner::BuilderTy &Builder) {
1631   auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
1632   if (!CInt)
1633     return nullptr;
1634 
1635   auto *VecTy = cast<FixedVectorType>(II.getType());
1636   assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
1637 
1638   // The immediate permute control byte looks like this:
1639   //    [3:0] - zero mask for each 32-bit lane
1640   //    [5:4] - select one 32-bit destination lane
1641   //    [7:6] - select one 32-bit source lane
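  //
  // For example (illustrative), Imm = 0x10 (no zeroing, destination lane 1,
  // source lane 0) becomes:
  //   shufflevector <4 x float> %op0, <4 x float> %op1,
  //                 <4 x i32> <i32 0, i32 4, i32 2, i32 3>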
1642 
1643   uint8_t Imm = CInt->getZExtValue();
1644   uint8_t ZMask = Imm & 0xf;
1645   uint8_t DestLane = (Imm >> 4) & 0x3;
1646   uint8_t SourceLane = (Imm >> 6) & 0x3;
1647 
1648   ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
1649 
1650   // If all zero mask bits are set, this was just a weird way to
1651   // generate a zero vector.
1652   if (ZMask == 0xf)
1653     return ZeroVector;
1654 
1655   // Initialize by passing all of the first source bits through.
1656   int ShuffleMask[4] = {0, 1, 2, 3};
1657 
1658   // We may replace the second operand with the zero vector.
1659   Value *V1 = II.getArgOperand(1);
1660 
1661   if (ZMask) {
1662     // If the zero mask is being used with a single input or the zero mask
1663     // overrides the destination lane, this is a shuffle with the zero vector.
1664     if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
1665         (ZMask & (1 << DestLane))) {
1666       V1 = ZeroVector;
1667       // We may still move 32-bits of the first source vector from one lane
1668       // to another.
1669       ShuffleMask[DestLane] = SourceLane;
1670       // The zero mask may override the previous insert operation.
1671       for (unsigned i = 0; i < 4; ++i)
1672         if ((ZMask >> i) & 0x1)
1673           ShuffleMask[i] = i + 4;
1674     } else {
1675       // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
1676       return nullptr;
1677     }
1678   } else {
1679     // Replace the selected destination lane with the selected source lane.
1680     ShuffleMask[DestLane] = SourceLane + 4;
1681   }
1682 
1683   return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
1684 }
1685 
1686 /// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
1687 /// or conversion to a shuffle vector.
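/// For example (a sketch), extrqi(%x, length 8, index 8) extracts byte 1 of
/// %x: the fold bitcasts to <16 x i8>, shuffles that byte into lane 0,
/// zero-fills the remaining low 64 bits, and leaves the upper 64 bits
/// undefined.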
1688 static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
1689                                ConstantInt *CILength, ConstantInt *CIIndex,
1690                                InstCombiner::BuilderTy &Builder) {
1691   auto LowConstantHighUndef = [&](uint64_t Val) {
1692     Type *IntTy64 = Type::getInt64Ty(II.getContext());
1693     Constant *Args[] = {ConstantInt::get(IntTy64, Val),
1694                         UndefValue::get(IntTy64)};
1695     return ConstantVector::get(Args);
1696   };
1697 
1698   // See if we're dealing with constant values.
1699   auto *C0 = dyn_cast<Constant>(Op0);
1700   auto *CI0 =
1701       C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
1702          : nullptr;
1703 
1704   // Attempt to constant fold.
1705   if (CILength && CIIndex) {
1706     // From AMD documentation: "The bit index and field length are each six
1707     // bits in length other bits of the field are ignored."
1708     APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
1709     APInt APLength = CILength->getValue().zextOrTrunc(6);
1710 
1711     unsigned Index = APIndex.getZExtValue();
1712 
1713     // From AMD documentation: "a value of zero in the field length is
1714     // defined as length of 64".
1715     unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
1716 
1717     // From AMD documentation: "If the sum of the bit index + length field
1718     // is greater than 64, the results are undefined".
1719     unsigned End = Index + Length;
1720 
1721     // Note that both field index and field length are 8-bit quantities.
1722     // Since variables 'Index' and 'Length' are unsigned values
1723     // obtained from zero-extending field index and field length
1724     // respectively, their sum should never wrap around.
1725     if (End > 64)
1726       return UndefValue::get(II.getType());
1727 
1728     // If we are extracting whole bytes, we can convert this to a shuffle.
1729     // Lowering can recognize EXTRQI shuffle masks.
1730     if ((Length % 8) == 0 && (Index % 8) == 0) {
1731       // Convert bit indices to byte indices.
1732       Length /= 8;
1733       Index /= 8;
1734 
1735       Type *IntTy8 = Type::getInt8Ty(II.getContext());
1736       auto *ShufTy = FixedVectorType::get(IntTy8, 16);
1737 
1738       SmallVector<int, 16> ShuffleMask;
1739       for (int i = 0; i != (int)Length; ++i)
1740         ShuffleMask.push_back(i + Index);
1741       for (int i = Length; i != 8; ++i)
1742         ShuffleMask.push_back(i + 16);
1743       for (int i = 8; i != 16; ++i)
1744         ShuffleMask.push_back(-1);
1745 
1746       Value *SV = Builder.CreateShuffleVector(
1747           Builder.CreateBitCast(Op0, ShufTy),
1748           ConstantAggregateZero::get(ShufTy), ShuffleMask);
1749       return Builder.CreateBitCast(SV, II.getType());
1750     }
1751 
1752     // Constant Fold - shift Index'th bit to lowest position and mask off
1753     // Length bits.
1754     if (CI0) {
1755       APInt Elt = CI0->getValue();
1756       Elt.lshrInPlace(Index);
1757       Elt = Elt.zextOrTrunc(Length);
1758       return LowConstantHighUndef(Elt.getZExtValue());
1759     }
1760 
1761     // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
1762     if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
1763       Value *Args[] = {Op0, CILength, CIIndex};
1764       Module *M = II.getModule();
1765       Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
1766       return Builder.CreateCall(F, Args);
1767     }
1768   }
1769 
1770   // Constant Fold - extraction from zero is always {zero, undef}.
1771   if (CI0 && CI0->isZero())
1772     return LowConstantHighUndef(0);
1773 
1774   return nullptr;
1775 }
1776 
1777 /// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
1778 /// folding or conversion to a shuffle vector.
1779 static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
1780                                  APInt APLength, APInt APIndex,
1781                                  InstCombiner::BuilderTy &Builder) {
1782   // From AMD documentation: "The bit index and field length are each six bits
1783   // in length; other bits of the field are ignored."
1784   APIndex = APIndex.zextOrTrunc(6);
1785   APLength = APLength.zextOrTrunc(6);
1786 
1787   // Attempt to constant fold.
1788   unsigned Index = APIndex.getZExtValue();
1789 
1790   // From AMD documentation: "a value of zero in the field length is
1791   // defined as length of 64".
1792   unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
1793 
1794   // From AMD documentation: "If the sum of the bit index + length field
1795   // is greater than 64, the results are undefined".
1796   unsigned End = Index + Length;
1797 
1798   // Note that both field index and field length are 8-bit quantities.
1799   // Since variables 'Index' and 'Length' are unsigned values
1800   // obtained from zero-extending field index and field length
1801   // respectively, their sum should never wrap around.
1802   if (End > 64)
1803     return UndefValue::get(II.getType());
1804 
1805   // If we are inserting whole bytes, we can convert this to a shuffle.
1806   // Lowering can recognize INSERTQI shuffle masks.
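       // E.g. with Index == 8 and Length == 16, result byte 0 comes from Op0,
       // bytes 1..2 come from bytes 0..1 of Op1, bytes 3..7 come from Op0, and
       // the upper 8 bytes are left undefined.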
1807   if ((Length % 8) == 0 && (Index % 8) == 0) {
1808     // Convert bit indices to byte indices.
1809     Length /= 8;
1810     Index /= 8;
1811 
1812     Type *IntTy8 = Type::getInt8Ty(II.getContext());
1813     auto *ShufTy = FixedVectorType::get(IntTy8, 16);
1814 
1815     SmallVector<int, 16> ShuffleMask;
1816     for (int i = 0; i != (int)Index; ++i)
1817       ShuffleMask.push_back(i);
1818     for (int i = 0; i != (int)Length; ++i)
1819       ShuffleMask.push_back(i + 16);
1820     for (int i = Index + Length; i != 8; ++i)
1821       ShuffleMask.push_back(i);
1822     for (int i = 8; i != 16; ++i)
1823       ShuffleMask.push_back(-1);
1824 
1825     Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
1826                                             Builder.CreateBitCast(Op1, ShufTy),
1827                                             ShuffleMask);
1828     return Builder.CreateBitCast(SV, II.getType());
1829   }
1830 
1831   // See if we're dealing with constant values.
1832   auto *C0 = dyn_cast<Constant>(Op0);
1833   auto *C1 = dyn_cast<Constant>(Op1);
1834   auto *CI00 =
1835       C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
1836          : nullptr;
1837   auto *CI10 =
1838       C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
1839          : nullptr;
1840 
1841   // Constant Fold - insert bottom Length bits starting at the Index'th bit.
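       // E.g. with Index == 16 and Length == 8 the low result element is
       // (V00 & ~0x00ff0000) | ((V10 & 0xff) << 16).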
1842   if (CI00 && CI10) {
1843     APInt V00 = CI00->getValue();
1844     APInt V10 = CI10->getValue();
1845     APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
1846     V00 = V00 & ~Mask;
1847     V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
1848     APInt Val = V00 | V10;
1849     Type *IntTy64 = Type::getInt64Ty(II.getContext());
1850     Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
1851                         UndefValue::get(IntTy64)};
1852     return ConstantVector::get(Args);
1853   }
1854 
1855   // If we were an INSERTQ call, we'll save demanded elements if we convert to
1856   // INSERTQI.
1857   if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
1858     Type *IntTy8 = Type::getInt8Ty(II.getContext());
1859     Constant *CILength = ConstantInt::get(IntTy8, Length, false);
1860     Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
1861 
1862     Value *Args[] = {Op0, Op1, CILength, CIIndex};
1863     Module *M = II.getModule();
1864     Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
1865     return Builder.CreateCall(F, Args);
1866   }
1867 
1868   return nullptr;
1869 }
1870 
1871 /// Attempt to convert pshufb* to shufflevector if the mask is constant.
1872 static Value *simplifyX86pshufb(const IntrinsicInst &II,
1873                                 InstCombiner::BuilderTy &Builder) {
1874   auto *V = dyn_cast<Constant>(II.getArgOperand(1));
1875   if (!V)
1876     return nullptr;
1877 
1878   auto *VecTy = cast<FixedVectorType>(II.getType());
1879   unsigned NumElts = VecTy->getNumElements();
1880   assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
1881          "Unexpected number of elements in shuffle mask!");
1882 
1883   // Construct a shuffle mask from constant integers or UNDEFs.
1884   int Indexes[64];
1885 
1886   // Each byte in the shuffle control mask forms an index to permute the
1887   // corresponding byte in the destination operand.
1888   for (unsigned I = 0; I < NumElts; ++I) {
1889     Constant *COp = V->getAggregateElement(I);
1890     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
1891       return nullptr;
1892 
1893     if (isa<UndefValue>(COp)) {
1894       Indexes[I] = -1;
1895       continue;
1896     }
1897 
1898     int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
1899 
1900     // If the most significant bit (bit[7]) of each byte of the shuffle
1901     // control mask is set, then zero is written in the result byte.
1902     // The zero vector is in the right-hand side of the resulting
1903     // shufflevector.
1904 
1905     // Each index is the least significant 4 bits of the respective shuffle
1906     // control byte, offset to stay within that byte's own 128-bit lane.
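         // E.g. for the 256-bit form, a control byte of 0x83 at position 20
         // yields index 32 + 16 = 48 (an element of the zero vector), while
         // 0x03 at the same position yields index 3 + 16 = 19.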
1907     Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
1908     Indexes[I] = Index;
1909   }
1910 
1911   auto V1 = II.getArgOperand(0);
1912   auto V2 = Constant::getNullValue(VecTy);
1913   return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts));
1914 }
1915 
1916 /// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
1917 static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
1918                                     InstCombiner::BuilderTy &Builder) {
1919   auto *V = dyn_cast<Constant>(II.getArgOperand(1));
1920   if (!V)
1921     return nullptr;
1922 
1923   auto *VecTy = cast<FixedVectorType>(II.getType());
1924   unsigned NumElts = VecTy->getNumElements();
1925   bool IsPD = VecTy->getScalarType()->isDoubleTy();
1926   unsigned NumLaneElts = IsPD ? 2 : 4;
1927   assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
1928 
1929   // Construct a shuffle mask from constant integers or UNDEFs.
1930   int Indexes[16];
1931 
1932   // The intrinsics only read one or two bits, clear the rest.
1933   for (unsigned I = 0; I < NumElts; ++I) {
1934     Constant *COp = V->getAggregateElement(I);
1935     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
1936       return nullptr;
1937 
1938     if (isa<UndefValue>(COp)) {
1939       Indexes[I] = -1;
1940       continue;
1941     }
1942 
1943     APInt Index = cast<ConstantInt>(COp)->getValue();
1944     Index = Index.zextOrTrunc(32).getLoBits(2);
1945 
1946     // The PD variants use bit 1 to select the per-lane element index, so
1947     // shift down to convert to a generic shuffle mask index.
1948     if (IsPD)
1949       Index.lshrInPlace(1);
1950 
1951     // The _256 variants are a bit trickier since the mask bits always index
1952     // into the corresponding 128-bit half. In order to convert to a generic
1953     // shuffle, we have to make that explicit.
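         // E.g. for vpermilvar.ps.256, a control value of 1 in element 6
         // selects source element (6 / 4) * 4 + 1 = 5.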
1954     Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
1955 
1956     Indexes[I] = Index.getZExtValue();
1957   }
1958 
1959   auto V1 = II.getArgOperand(0);
1960   return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, NumElts));
1961 }
1962 
1963 /// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
1964 static Value *simplifyX86vpermv(const IntrinsicInst &II,
1965                                 InstCombiner::BuilderTy &Builder) {
1966   auto *V = dyn_cast<Constant>(II.getArgOperand(1));
1967   if (!V)
1968     return nullptr;
1969 
1970   auto *VecTy = cast<FixedVectorType>(II.getType());
1971   unsigned Size = VecTy->getNumElements();
1972   assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
1973          "Unexpected shuffle mask size");
1974 
1975   // Construct a shuffle mask from constant integers or UNDEFs.
1976   int Indexes[64];
1977 
1978   for (unsigned I = 0; I < Size; ++I) {
1979     Constant *COp = V->getAggregateElement(I);
1980     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
1981       return nullptr;
1982 
1983     if (isa<UndefValue>(COp)) {
1984       Indexes[I] = -1;
1985       continue;
1986     }
1987 
1988     uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
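         // Each control element selects a source element modulo the vector
         // width, so only the low bits are used.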
1989     Index &= Size - 1;
1990     Indexes[I] = Index;
1991   }
1992 
1993   auto V1 = II.getArgOperand(0);
1994   return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, Size));
1995 }
1996 
1997 std::optional<Instruction *>
1998 X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
1999   auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
2000                                              unsigned DemandedWidth) {
2001     APInt UndefElts(Width, 0);
2002     APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
2003     return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
2004   };
2005 
2006   Intrinsic::ID IID = II.getIntrinsicID();
2007   switch (IID) {
2008   case Intrinsic::x86_bmi_bextr_32:
2009   case Intrinsic::x86_bmi_bextr_64:
2010   case Intrinsic::x86_tbm_bextri_u32:
2011   case Intrinsic::x86_tbm_bextri_u64:
2012     // If the RHS is a constant we can try some simplifications.
2013     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2014       uint64_t Shift = C->getZExtValue();
2015       uint64_t Length = (Shift >> 8) & 0xff;
2016       Shift &= 0xff;
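           // E.g. a control value of 0x0805 encodes Shift = 5 and Length = 8,
           // so the extraction is (x >> 5) & 0xff.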
2017       unsigned BitWidth = II.getType()->getIntegerBitWidth();
2018       // If the length is 0 or the shift is out of range, replace with zero.
2019       if (Length == 0 || Shift >= BitWidth) {
2020         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2021       }
2022       // If the LHS is also a constant, we can completely constant fold this.
2023       if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2024         uint64_t Result = InC->getZExtValue() >> Shift;
2025         if (Length > BitWidth)
2026           Length = BitWidth;
2027         Result &= maskTrailingOnes<uint64_t>(Length);
2028         return IC.replaceInstUsesWith(II,
2029                                       ConstantInt::get(II.getType(), Result));
2030       }
2031       // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
2032       // are only masking bits that a shift already cleared?
2033     }
2034     break;
2035 
2036   case Intrinsic::x86_bmi_bzhi_32:
2037   case Intrinsic::x86_bmi_bzhi_64:
2038     // If the RHS is a constant we can try some simplifications.
2039     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2040       uint64_t Index = C->getZExtValue() & 0xff;
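           // E.g. bzhi(x, 8) zeroes all but the low 8 bits, i.e. x & 0xff.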
2041       unsigned BitWidth = II.getType()->getIntegerBitWidth();
2042       if (Index >= BitWidth) {
2043         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2044       }
2045       if (Index == 0) {
2046         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2047       }
2048       // If the LHS is also a constant, we can completely constant fold this.
2049       if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2050         uint64_t Result = InC->getZExtValue();
2051         Result &= maskTrailingOnes<uint64_t>(Index);
2052         return IC.replaceInstUsesWith(II,
2053                                       ConstantInt::get(II.getType(), Result));
2054       }
2055       // TODO should we convert this to an AND if the RHS is constant?
2056     }
2057     break;
2058   case Intrinsic::x86_bmi_pext_32:
2059   case Intrinsic::x86_bmi_pext_64:
2060     if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2061       if (MaskC->isNullValue()) {
2062         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2063       }
2064       if (MaskC->isAllOnesValue()) {
2065         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2066       }
2067 
2068       unsigned MaskIdx, MaskLen;
2069       if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2070         // Any single contiguous sequence of 1s anywhere in the mask simply
2071         // describes a subset of the input bits shifted to the appropriate
2072         // position. Replace with the straightforward IR.
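             // E.g. pext(x, 0x00f0) becomes (x & 0x00f0) >> 4.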
2073         Value *Input = II.getArgOperand(0);
2074         Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
2075         Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2076         Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
2077         return IC.replaceInstUsesWith(II, Shifted);
2078       }
2079 
2080       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2081         uint64_t Src = SrcC->getZExtValue();
2082         uint64_t Mask = MaskC->getZExtValue();
2083         uint64_t Result = 0;
2084         uint64_t BitToSet = 1;
2085 
2086         while (Mask) {
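             // Gather the src bits selected by the mask into the low bits,
             // e.g. pext(0b10110, 0b11010) == 0b101.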
2087           // Isolate lowest set bit.
2088           uint64_t BitToTest = Mask & -Mask;
2089           if (BitToTest & Src)
2090             Result |= BitToSet;
2091 
2092           BitToSet <<= 1;
2093           // Clear lowest set bit.
2094           Mask &= Mask - 1;
2095         }
2096 
2097         return IC.replaceInstUsesWith(II,
2098                                       ConstantInt::get(II.getType(), Result));
2099       }
2100     }
2101     break;
2102   case Intrinsic::x86_bmi_pdep_32:
2103   case Intrinsic::x86_bmi_pdep_64:
2104     if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2105       if (MaskC->isNullValue()) {
2106         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2107       }
2108       if (MaskC->isAllOnesValue()) {
2109         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2110       }
2111 
2112       unsigned MaskIdx, MaskLen;
2113       if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2114         // Any single contiguous sequence of 1s anywhere in the mask simply
2115         // describes a subset of the input bits shifted to the appropriate
2116         // position. Replace with the straightforward IR.
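             // E.g. pdep(x, 0x00f0) becomes (x << 4) & 0x00f0.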
2117         Value *Input = II.getArgOperand(0);
2118         Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2119         Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
2120         Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
2121         return IC.replaceInstUsesWith(II, Masked);
2122       }
2123 
2124       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2125         uint64_t Src = SrcC->getZExtValue();
2126         uint64_t Mask = MaskC->getZExtValue();
2127         uint64_t Result = 0;
2128         uint64_t BitToTest = 1;
2129 
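             // Scatter the low src bits to the positions selected by the mask,
             // e.g. pdep(0b101, 0b11010) == 0b10010.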
2130         while (Mask) {
2131           // Isolate lowest set bit.
2132           uint64_t BitToSet = Mask & -Mask;
2133           if (BitToTest & Src)
2134             Result |= BitToSet;
2135 
2136           BitToTest <<= 1;
2137           // Clear lowest set bit.
2138           Mask &= Mask - 1;
2139         }
2140 
2141         return IC.replaceInstUsesWith(II,
2142                                       ConstantInt::get(II.getType(), Result));
2143       }
2144     }
2145     break;
2146 
2147   case Intrinsic::x86_sse_cvtss2si:
2148   case Intrinsic::x86_sse_cvtss2si64:
2149   case Intrinsic::x86_sse_cvttss2si:
2150   case Intrinsic::x86_sse_cvttss2si64:
2151   case Intrinsic::x86_sse2_cvtsd2si:
2152   case Intrinsic::x86_sse2_cvtsd2si64:
2153   case Intrinsic::x86_sse2_cvttsd2si:
2154   case Intrinsic::x86_sse2_cvttsd2si64:
2155   case Intrinsic::x86_avx512_vcvtss2si32:
2156   case Intrinsic::x86_avx512_vcvtss2si64:
2157   case Intrinsic::x86_avx512_vcvtss2usi32:
2158   case Intrinsic::x86_avx512_vcvtss2usi64:
2159   case Intrinsic::x86_avx512_vcvtsd2si32:
2160   case Intrinsic::x86_avx512_vcvtsd2si64:
2161   case Intrinsic::x86_avx512_vcvtsd2usi32:
2162   case Intrinsic::x86_avx512_vcvtsd2usi64:
2163   case Intrinsic::x86_avx512_cvttss2si:
2164   case Intrinsic::x86_avx512_cvttss2si64:
2165   case Intrinsic::x86_avx512_cvttss2usi:
2166   case Intrinsic::x86_avx512_cvttss2usi64:
2167   case Intrinsic::x86_avx512_cvttsd2si:
2168   case Intrinsic::x86_avx512_cvttsd2si64:
2169   case Intrinsic::x86_avx512_cvttsd2usi:
2170   case Intrinsic::x86_avx512_cvttsd2usi64: {
2171     // These intrinsics only demand the 0th element of their input vectors. If
2172     // we can simplify the input based on that, do so now.
2173     Value *Arg = II.getArgOperand(0);
2174     unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
2175     if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
2176       return IC.replaceOperand(II, 0, V);
2177     }
2178     break;
2179   }
2180 
2181   case Intrinsic::x86_mmx_pmovmskb:
2182   case Intrinsic::x86_sse_movmsk_ps:
2183   case Intrinsic::x86_sse2_movmsk_pd:
2184   case Intrinsic::x86_sse2_pmovmskb_128:
2185   case Intrinsic::x86_avx_movmsk_pd_256:
2186   case Intrinsic::x86_avx_movmsk_ps_256:
2187   case Intrinsic::x86_avx2_pmovmskb:
2188     if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
2189       return IC.replaceInstUsesWith(II, V);
2190     }
2191     break;
2192 
2193   case Intrinsic::x86_sse_comieq_ss:
2194   case Intrinsic::x86_sse_comige_ss:
2195   case Intrinsic::x86_sse_comigt_ss:
2196   case Intrinsic::x86_sse_comile_ss:
2197   case Intrinsic::x86_sse_comilt_ss:
2198   case Intrinsic::x86_sse_comineq_ss:
2199   case Intrinsic::x86_sse_ucomieq_ss:
2200   case Intrinsic::x86_sse_ucomige_ss:
2201   case Intrinsic::x86_sse_ucomigt_ss:
2202   case Intrinsic::x86_sse_ucomile_ss:
2203   case Intrinsic::x86_sse_ucomilt_ss:
2204   case Intrinsic::x86_sse_ucomineq_ss:
2205   case Intrinsic::x86_sse2_comieq_sd:
2206   case Intrinsic::x86_sse2_comige_sd:
2207   case Intrinsic::x86_sse2_comigt_sd:
2208   case Intrinsic::x86_sse2_comile_sd:
2209   case Intrinsic::x86_sse2_comilt_sd:
2210   case Intrinsic::x86_sse2_comineq_sd:
2211   case Intrinsic::x86_sse2_ucomieq_sd:
2212   case Intrinsic::x86_sse2_ucomige_sd:
2213   case Intrinsic::x86_sse2_ucomigt_sd:
2214   case Intrinsic::x86_sse2_ucomile_sd:
2215   case Intrinsic::x86_sse2_ucomilt_sd:
2216   case Intrinsic::x86_sse2_ucomineq_sd:
2217   case Intrinsic::x86_avx512_vcomi_ss:
2218   case Intrinsic::x86_avx512_vcomi_sd:
2219   case Intrinsic::x86_avx512_mask_cmp_ss:
2220   case Intrinsic::x86_avx512_mask_cmp_sd: {
2221     // These intrinsics only demand the 0th element of their input vectors. If
2222     // we can simplify the input based on that, do so now.
2223     bool MadeChange = false;
2224     Value *Arg0 = II.getArgOperand(0);
2225     Value *Arg1 = II.getArgOperand(1);
2226     unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
2227     if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
2228       IC.replaceOperand(II, 0, V);
2229       MadeChange = true;
2230     }
2231     if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
2232       IC.replaceOperand(II, 1, V);
2233       MadeChange = true;
2234     }
2235     if (MadeChange) {
2236       return &II;
2237     }
2238     break;
2239   }
2240 
2241   case Intrinsic::x86_avx512_add_ps_512:
2242   case Intrinsic::x86_avx512_div_ps_512:
2243   case Intrinsic::x86_avx512_mul_ps_512:
2244   case Intrinsic::x86_avx512_sub_ps_512:
2245   case Intrinsic::x86_avx512_add_pd_512:
2246   case Intrinsic::x86_avx512_div_pd_512:
2247   case Intrinsic::x86_avx512_mul_pd_512:
2248   case Intrinsic::x86_avx512_sub_pd_512:
2249     // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
2250     // IR operations.
2251     if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
2252       if (R->getValue() == 4) {
2253         Value *Arg0 = II.getArgOperand(0);
2254         Value *Arg1 = II.getArgOperand(1);
2255 
2256         Value *V;
2257         switch (IID) {
2258         default:
2259           llvm_unreachable("Case stmts out of sync!");
2260         case Intrinsic::x86_avx512_add_ps_512:
2261         case Intrinsic::x86_avx512_add_pd_512:
2262           V = IC.Builder.CreateFAdd(Arg0, Arg1);
2263           break;
2264         case Intrinsic::x86_avx512_sub_ps_512:
2265         case Intrinsic::x86_avx512_sub_pd_512:
2266           V = IC.Builder.CreateFSub(Arg0, Arg1);
2267           break;
2268         case Intrinsic::x86_avx512_mul_ps_512:
2269         case Intrinsic::x86_avx512_mul_pd_512:
2270           V = IC.Builder.CreateFMul(Arg0, Arg1);
2271           break;
2272         case Intrinsic::x86_avx512_div_ps_512:
2273         case Intrinsic::x86_avx512_div_pd_512:
2274           V = IC.Builder.CreateFDiv(Arg0, Arg1);
2275           break;
2276         }
2277 
2278         return IC.replaceInstUsesWith(II, V);
2279       }
2280     }
2281     break;
2282 
2283   case Intrinsic::x86_avx512_mask_add_ss_round:
2284   case Intrinsic::x86_avx512_mask_div_ss_round:
2285   case Intrinsic::x86_avx512_mask_mul_ss_round:
2286   case Intrinsic::x86_avx512_mask_sub_ss_round:
2287   case Intrinsic::x86_avx512_mask_add_sd_round:
2288   case Intrinsic::x86_avx512_mask_div_sd_round:
2289   case Intrinsic::x86_avx512_mask_mul_sd_round:
2290   case Intrinsic::x86_avx512_mask_sub_sd_round:
2291     // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
2292     // IR operations.
2293     if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
2294       if (R->getValue() == 4) {
2295         // Extract the element as scalars.
2296         Value *Arg0 = II.getArgOperand(0);
2297         Value *Arg1 = II.getArgOperand(1);
2298         Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
2299         Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
2300 
2301         Value *V;
2302         switch (IID) {
2303         default:
2304           llvm_unreachable("Case stmts out of sync!");
2305         case Intrinsic::x86_avx512_mask_add_ss_round:
2306         case Intrinsic::x86_avx512_mask_add_sd_round:
2307           V = IC.Builder.CreateFAdd(LHS, RHS);
2308           break;
2309         case Intrinsic::x86_avx512_mask_sub_ss_round:
2310         case Intrinsic::x86_avx512_mask_sub_sd_round:
2311           V = IC.Builder.CreateFSub(LHS, RHS);
2312           break;
2313         case Intrinsic::x86_avx512_mask_mul_ss_round:
2314         case Intrinsic::x86_avx512_mask_mul_sd_round:
2315           V = IC.Builder.CreateFMul(LHS, RHS);
2316           break;
2317         case Intrinsic::x86_avx512_mask_div_ss_round:
2318         case Intrinsic::x86_avx512_mask_div_sd_round:
2319           V = IC.Builder.CreateFDiv(LHS, RHS);
2320           break;
2321         }
2322 
2323         // Handle the masking aspect of the intrinsic.
2324         Value *Mask = II.getArgOperand(3);
2325         auto *C = dyn_cast<ConstantInt>(Mask);
2326         // We don't need a select if we know the mask bit is a 1.
2327         if (!C || !C->getValue()[0]) {
2328           // Cast the mask to an i1 vector and then extract the lowest element.
2329           auto *MaskTy = FixedVectorType::get(
2330               IC.Builder.getInt1Ty(),
2331               cast<IntegerType>(Mask->getType())->getBitWidth());
2332           Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
2333           Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
2334           // Extract the lowest element from the passthru operand.
2335           Value *Passthru =
2336               IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
2337           V = IC.Builder.CreateSelect(Mask, V, Passthru);
2338         }
2339 
2340         // Insert the result back into the original argument 0.
2341         V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
2342 
2343         return IC.replaceInstUsesWith(II, V);
2344       }
2345     }
2346     break;
2347 
2348   // Constant fold ashr( <A x Bi>, Ci ).
2349   // Constant fold lshr( <A x Bi>, Ci ).
2350   // Constant fold shl( <A x Bi>, Ci ).
2351   case Intrinsic::x86_sse2_psrai_d:
2352   case Intrinsic::x86_sse2_psrai_w:
2353   case Intrinsic::x86_avx2_psrai_d:
2354   case Intrinsic::x86_avx2_psrai_w:
2355   case Intrinsic::x86_avx512_psrai_q_128:
2356   case Intrinsic::x86_avx512_psrai_q_256:
2357   case Intrinsic::x86_avx512_psrai_d_512:
2358   case Intrinsic::x86_avx512_psrai_q_512:
2359   case Intrinsic::x86_avx512_psrai_w_512:
2360   case Intrinsic::x86_sse2_psrli_d:
2361   case Intrinsic::x86_sse2_psrli_q:
2362   case Intrinsic::x86_sse2_psrli_w:
2363   case Intrinsic::x86_avx2_psrli_d:
2364   case Intrinsic::x86_avx2_psrli_q:
2365   case Intrinsic::x86_avx2_psrli_w:
2366   case Intrinsic::x86_avx512_psrli_d_512:
2367   case Intrinsic::x86_avx512_psrli_q_512:
2368   case Intrinsic::x86_avx512_psrli_w_512:
2369   case Intrinsic::x86_sse2_pslli_d:
2370   case Intrinsic::x86_sse2_pslli_q:
2371   case Intrinsic::x86_sse2_pslli_w:
2372   case Intrinsic::x86_avx2_pslli_d:
2373   case Intrinsic::x86_avx2_pslli_q:
2374   case Intrinsic::x86_avx2_pslli_w:
2375   case Intrinsic::x86_avx512_pslli_d_512:
2376   case Intrinsic::x86_avx512_pslli_q_512:
2377   case Intrinsic::x86_avx512_pslli_w_512:
2378     if (Value *V = simplifyX86immShift(II, IC.Builder)) {
2379       return IC.replaceInstUsesWith(II, V);
2380     }
2381     break;
2382 
2383   case Intrinsic::x86_sse2_psra_d:
2384   case Intrinsic::x86_sse2_psra_w:
2385   case Intrinsic::x86_avx2_psra_d:
2386   case Intrinsic::x86_avx2_psra_w:
2387   case Intrinsic::x86_avx512_psra_q_128:
2388   case Intrinsic::x86_avx512_psra_q_256:
2389   case Intrinsic::x86_avx512_psra_d_512:
2390   case Intrinsic::x86_avx512_psra_q_512:
2391   case Intrinsic::x86_avx512_psra_w_512:
2392   case Intrinsic::x86_sse2_psrl_d:
2393   case Intrinsic::x86_sse2_psrl_q:
2394   case Intrinsic::x86_sse2_psrl_w:
2395   case Intrinsic::x86_avx2_psrl_d:
2396   case Intrinsic::x86_avx2_psrl_q:
2397   case Intrinsic::x86_avx2_psrl_w:
2398   case Intrinsic::x86_avx512_psrl_d_512:
2399   case Intrinsic::x86_avx512_psrl_q_512:
2400   case Intrinsic::x86_avx512_psrl_w_512:
2401   case Intrinsic::x86_sse2_psll_d:
2402   case Intrinsic::x86_sse2_psll_q:
2403   case Intrinsic::x86_sse2_psll_w:
2404   case Intrinsic::x86_avx2_psll_d:
2405   case Intrinsic::x86_avx2_psll_q:
2406   case Intrinsic::x86_avx2_psll_w:
2407   case Intrinsic::x86_avx512_psll_d_512:
2408   case Intrinsic::x86_avx512_psll_q_512:
2409   case Intrinsic::x86_avx512_psll_w_512: {
2410     if (Value *V = simplifyX86immShift(II, IC.Builder)) {
2411       return IC.replaceInstUsesWith(II, V);
2412     }
2413 
2414     // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
2415     // operand to compute the shift amount.
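         // E.g. for a <8 x i16> shift only the low four elements of the
         // shift-amount vector are demanded.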
2416     Value *Arg1 = II.getArgOperand(1);
2417     assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
2418            "Unexpected packed shift size");
2419     unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();
2420 
2421     if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
2422       return IC.replaceOperand(II, 1, V);
2423     }
2424     break;
2425   }
2426 
2427   case Intrinsic::x86_avx2_psllv_d:
2428   case Intrinsic::x86_avx2_psllv_d_256:
2429   case Intrinsic::x86_avx2_psllv_q:
2430   case Intrinsic::x86_avx2_psllv_q_256:
2431   case Intrinsic::x86_avx512_psllv_d_512:
2432   case Intrinsic::x86_avx512_psllv_q_512:
2433   case Intrinsic::x86_avx512_psllv_w_128:
2434   case Intrinsic::x86_avx512_psllv_w_256:
2435   case Intrinsic::x86_avx512_psllv_w_512:
2436   case Intrinsic::x86_avx2_psrav_d:
2437   case Intrinsic::x86_avx2_psrav_d_256:
2438   case Intrinsic::x86_avx512_psrav_q_128:
2439   case Intrinsic::x86_avx512_psrav_q_256:
2440   case Intrinsic::x86_avx512_psrav_d_512:
2441   case Intrinsic::x86_avx512_psrav_q_512:
2442   case Intrinsic::x86_avx512_psrav_w_128:
2443   case Intrinsic::x86_avx512_psrav_w_256:
2444   case Intrinsic::x86_avx512_psrav_w_512:
2445   case Intrinsic::x86_avx2_psrlv_d:
2446   case Intrinsic::x86_avx2_psrlv_d_256:
2447   case Intrinsic::x86_avx2_psrlv_q:
2448   case Intrinsic::x86_avx2_psrlv_q_256:
2449   case Intrinsic::x86_avx512_psrlv_d_512:
2450   case Intrinsic::x86_avx512_psrlv_q_512:
2451   case Intrinsic::x86_avx512_psrlv_w_128:
2452   case Intrinsic::x86_avx512_psrlv_w_256:
2453   case Intrinsic::x86_avx512_psrlv_w_512:
2454     if (Value *V = simplifyX86varShift(II, IC.Builder)) {
2455       return IC.replaceInstUsesWith(II, V);
2456     }
2457     break;
2458 
2459   case Intrinsic::x86_sse2_packssdw_128:
2460   case Intrinsic::x86_sse2_packsswb_128:
2461   case Intrinsic::x86_avx2_packssdw:
2462   case Intrinsic::x86_avx2_packsswb:
2463   case Intrinsic::x86_avx512_packssdw_512:
2464   case Intrinsic::x86_avx512_packsswb_512:
2465     if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
2466       return IC.replaceInstUsesWith(II, V);
2467     }
2468     break;
2469 
2470   case Intrinsic::x86_sse2_packuswb_128:
2471   case Intrinsic::x86_sse41_packusdw:
2472   case Intrinsic::x86_avx2_packusdw:
2473   case Intrinsic::x86_avx2_packuswb:
2474   case Intrinsic::x86_avx512_packusdw_512:
2475   case Intrinsic::x86_avx512_packuswb_512:
2476     if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
2477       return IC.replaceInstUsesWith(II, V);
2478     }
2479     break;
2480 
2481   case Intrinsic::x86_pclmulqdq:
2482   case Intrinsic::x86_pclmulqdq_256:
2483   case Intrinsic::x86_pclmulqdq_512: {
2484     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
2485       unsigned Imm = C->getZExtValue();
2486 
2487       bool MadeChange = false;
2488       Value *Arg0 = II.getArgOperand(0);
2489       Value *Arg1 = II.getArgOperand(1);
2490       unsigned VWidth =
2491           cast<FixedVectorType>(Arg0->getType())->getNumElements();
2492 
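           // Imm bit 0 selects the even (0) or odd (1) i64 element of each
           // 128-bit lane of Arg0, and bit 4 does the same for Arg1; demand
           // only those elements.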
2493       APInt UndefElts1(VWidth, 0);
2494       APInt DemandedElts1 =
2495           APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
2496       if (Value *V =
2497               IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
2498         IC.replaceOperand(II, 0, V);
2499         MadeChange = true;
2500       }
2501 
2502       APInt UndefElts2(VWidth, 0);
2503       APInt DemandedElts2 =
2504           APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
2505       if (Value *V =
2506               IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
2507         IC.replaceOperand(II, 1, V);
2508         MadeChange = true;
2509       }
2510 
2511       // If either input's demanded elements are undef, the result is zero.
2512       if (DemandedElts1.isSubsetOf(UndefElts1) ||
2513           DemandedElts2.isSubsetOf(UndefElts2)) {
2514         return IC.replaceInstUsesWith(II,
2515                                       ConstantAggregateZero::get(II.getType()));
2516       }
2517 
2518       if (MadeChange) {
2519         return &II;
2520       }
2521     }
2522     break;
2523   }
2524 
2525   case Intrinsic::x86_sse41_insertps:
2526     if (Value *V = simplifyX86insertps(II, IC.Builder)) {
2527       return IC.replaceInstUsesWith(II, V);
2528     }
2529     break;
2530 
2531   case Intrinsic::x86_sse4a_extrq: {
2532     Value *Op0 = II.getArgOperand(0);
2533     Value *Op1 = II.getArgOperand(1);
2534     unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
2535     unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
2536     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2537            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
2538            VWidth1 == 16 && "Unexpected operand sizes");
2539 
2540     // See if we're dealing with constant values.
2541     auto *C1 = dyn_cast<Constant>(Op1);
2542     auto *CILength =
2543         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
2544            : nullptr;
2545     auto *CIIndex =
2546         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
2547            : nullptr;
2548 
2549     // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
2550     if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
2551       return IC.replaceInstUsesWith(II, V);
2552     }
2553 
2554     // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
2555     // operand and the lowest 16-bits of the second.
2556     bool MadeChange = false;
2557     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
2558       IC.replaceOperand(II, 0, V);
2559       MadeChange = true;
2560     }
2561     if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
2562       IC.replaceOperand(II, 1, V);
2563       MadeChange = true;
2564     }
2565     if (MadeChange) {
2566       return &II;
2567     }
2568     break;
2569   }
2570 
2571   case Intrinsic::x86_sse4a_extrqi: {
2572     // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
2573     // bits of the lower 64-bits. The upper 64-bits are undefined.
2574     Value *Op0 = II.getArgOperand(0);
2575     unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
2576     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
2577            "Unexpected operand size");
2578 
2579     // See if we're dealing with constant values.
2580     auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
2581     auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
2582 
2583     // Attempt to simplify to a constant or shuffle vector.
2584     if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
2585       return IC.replaceInstUsesWith(II, V);
2586     }
2587 
2588     // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
2589     // operand.
2590     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
2591       return IC.replaceOperand(II, 0, V);
2592     }
2593     break;
2594   }
2595 
2596   case Intrinsic::x86_sse4a_insertq: {
2597     Value *Op0 = II.getArgOperand(0);
2598     Value *Op1 = II.getArgOperand(1);
2599     unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
2600     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2601            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
2602            cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
2603            "Unexpected operand size");
2604 
2605     // See if we're dealing with constant values.
2606     auto *C1 = dyn_cast<Constant>(Op1);
2607     auto *CI11 =
2608         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
2609            : nullptr;
2610 
2611     // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
2612     if (CI11) {
2613       const APInt &V11 = CI11->getValue();
2614       APInt Len = V11.zextOrTrunc(6);
2615       APInt Idx = V11.lshr(8).zextOrTrunc(6);
2616       if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
2617         return IC.replaceInstUsesWith(II, V);
2618       }
2619     }
2620 
2621     // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
2622     // operand.
2623     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
2624       return IC.replaceOperand(II, 0, V);
2625     }
2626     break;
2627   }
2628 
2629   case Intrinsic::x86_sse4a_insertqi: {
2630     // INSERTQI: Extract lowest Length bits from lower half of second source and
2631     // insert over first source starting at Index bit. The upper 64-bits are
2632     // undefined.
2633     Value *Op0 = II.getArgOperand(0);
2634     Value *Op1 = II.getArgOperand(1);
2635     unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
2636     unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
2637     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2638            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
2639            VWidth1 == 2 && "Unexpected operand sizes");
2640 
2641     // See if we're dealing with constant values.
2642     auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
2643     auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
2644 
2645     // Attempt to simplify to a constant or shuffle vector.
2646     if (CILength && CIIndex) {
2647       APInt Len = CILength->getValue().zextOrTrunc(6);
2648       APInt Idx = CIIndex->getValue().zextOrTrunc(6);
2649       if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
2650         return IC.replaceInstUsesWith(II, V);
2651       }
2652     }
2653 
2654     // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
2655     // operands.
2656     bool MadeChange = false;
2657     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
2658       IC.replaceOperand(II, 0, V);
2659       MadeChange = true;
2660     }
2661     if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
2662       IC.replaceOperand(II, 1, V);
2663       MadeChange = true;
2664     }
2665     if (MadeChange) {
2666       return &II;
2667     }
2668     break;
2669   }
2670 
2671   case Intrinsic::x86_sse41_pblendvb:
2672   case Intrinsic::x86_sse41_blendvps:
2673   case Intrinsic::x86_sse41_blendvpd:
2674   case Intrinsic::x86_avx_blendv_ps_256:
2675   case Intrinsic::x86_avx_blendv_pd_256:
2676   case Intrinsic::x86_avx2_pblendvb: {
2677     // fold (blend A, A, Mask) -> A
2678     Value *Op0 = II.getArgOperand(0);
2679     Value *Op1 = II.getArgOperand(1);
2680     Value *Mask = II.getArgOperand(2);
2681     if (Op0 == Op1) {
2682       return IC.replaceInstUsesWith(II, Op0);
2683     }
2684 
2685     // Zero Mask - select 1st argument.
2686     if (isa<ConstantAggregateZero>(Mask)) {
2687       return IC.replaceInstUsesWith(II, Op0);
2688     }
2689 
2690     // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
2691     if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
2692       Constant *NewSelector =
2693           getNegativeIsTrueBoolVec(ConstantMask, IC.getDataLayout());
2694       return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
2695     }
2696 
2697     // Convert to a vector select if we can bypass casts and find a boolean
2698     // vector condition value.
2699     Value *BoolVec;
2700     Mask = InstCombiner::peekThroughBitcast(Mask);
2701     if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
2702         BoolVec->getType()->isVectorTy() &&
2703         BoolVec->getType()->getScalarSizeInBits() == 1) {
2704       auto *MaskTy = cast<FixedVectorType>(Mask->getType());
2705       auto *OpTy = cast<FixedVectorType>(II.getType());
2706       assert(MaskTy->getPrimitiveSizeInBits() ==
2707                  OpTy->getPrimitiveSizeInBits() &&
2708              "Not expecting mask and operands with different sizes");
2709       unsigned NumMaskElts = MaskTy->getNumElements();
2710       unsigned NumOperandElts = OpTy->getNumElements();
2711 
2712       if (NumMaskElts == NumOperandElts) {
2713         return SelectInst::Create(BoolVec, Op1, Op0);
2714       }
2715 
2716       // If the mask has fewer elements than the operands, each mask bit
2717       // maps to multiple elements of the operands. Bitcast back and forth.
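           // E.g. a <2 x i1> condition recovered from a sign-extended
           // <2 x i64> mask selects between <4 x float> operands after they
           // are bitcast to <2 x i64>; the select result is bitcast back.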
2718       if (NumMaskElts < NumOperandElts) {
2719         Value *CastOp0 = IC.Builder.CreateBitCast(Op0, MaskTy);
2720         Value *CastOp1 = IC.Builder.CreateBitCast(Op1, MaskTy);
2721         Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
2722         return new BitCastInst(Sel, II.getType());
2723       }
2724     }
2725 
2726     break;
2727   }
2728 
2729   case Intrinsic::x86_ssse3_pshuf_b_128:
2730   case Intrinsic::x86_avx2_pshuf_b:
2731   case Intrinsic::x86_avx512_pshuf_b_512:
2732     if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
2733       return IC.replaceInstUsesWith(II, V);
2734     }
2735     break;
2736 
2737   case Intrinsic::x86_avx_vpermilvar_ps:
2738   case Intrinsic::x86_avx_vpermilvar_ps_256:
2739   case Intrinsic::x86_avx512_vpermilvar_ps_512:
2740   case Intrinsic::x86_avx_vpermilvar_pd:
2741   case Intrinsic::x86_avx_vpermilvar_pd_256:
2742   case Intrinsic::x86_avx512_vpermilvar_pd_512:
2743     if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
2744       return IC.replaceInstUsesWith(II, V);
2745     }
2746     break;
2747 
2748   case Intrinsic::x86_avx2_permd:
2749   case Intrinsic::x86_avx2_permps:
2750   case Intrinsic::x86_avx512_permvar_df_256:
2751   case Intrinsic::x86_avx512_permvar_df_512:
2752   case Intrinsic::x86_avx512_permvar_di_256:
2753   case Intrinsic::x86_avx512_permvar_di_512:
2754   case Intrinsic::x86_avx512_permvar_hi_128:
2755   case Intrinsic::x86_avx512_permvar_hi_256:
2756   case Intrinsic::x86_avx512_permvar_hi_512:
2757   case Intrinsic::x86_avx512_permvar_qi_128:
2758   case Intrinsic::x86_avx512_permvar_qi_256:
2759   case Intrinsic::x86_avx512_permvar_qi_512:
2760   case Intrinsic::x86_avx512_permvar_sf_512:
2761   case Intrinsic::x86_avx512_permvar_si_512:
2762     if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
2763       return IC.replaceInstUsesWith(II, V);
2764     }
2765     break;
2766 
2767   case Intrinsic::x86_avx_maskload_ps:
2768   case Intrinsic::x86_avx_maskload_pd:
2769   case Intrinsic::x86_avx_maskload_ps_256:
2770   case Intrinsic::x86_avx_maskload_pd_256:
2771   case Intrinsic::x86_avx2_maskload_d:
2772   case Intrinsic::x86_avx2_maskload_q:
2773   case Intrinsic::x86_avx2_maskload_d_256:
2774   case Intrinsic::x86_avx2_maskload_q_256:
2775     if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
2776       return I;
2777     }
2778     break;
2779 
2780   case Intrinsic::x86_sse2_maskmov_dqu:
2781   case Intrinsic::x86_avx_maskstore_ps:
2782   case Intrinsic::x86_avx_maskstore_pd:
2783   case Intrinsic::x86_avx_maskstore_ps_256:
2784   case Intrinsic::x86_avx_maskstore_pd_256:
2785   case Intrinsic::x86_avx2_maskstore_d:
2786   case Intrinsic::x86_avx2_maskstore_q:
2787   case Intrinsic::x86_avx2_maskstore_d_256:
2788   case Intrinsic::x86_avx2_maskstore_q_256:
2789     if (simplifyX86MaskedStore(II, IC)) {
2790       return nullptr;
2791     }
2792     break;
2793 
2794   case Intrinsic::x86_addcarry_32:
2795   case Intrinsic::x86_addcarry_64:
2796     if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
2797       return IC.replaceInstUsesWith(II, V);
2798     }
2799     break;
2800 
2801   case Intrinsic::x86_avx512_pternlog_d_128:
2802   case Intrinsic::x86_avx512_pternlog_d_256:
2803   case Intrinsic::x86_avx512_pternlog_d_512:
2804   case Intrinsic::x86_avx512_pternlog_q_128:
2805   case Intrinsic::x86_avx512_pternlog_q_256:
2806   case Intrinsic::x86_avx512_pternlog_q_512:
2807     if (Value *V = simplifyTernarylogic(II, IC.Builder)) {
2808       return IC.replaceInstUsesWith(II, V);
2809     }
2810     break;
2811   default:
2812     break;
2813   }
2814   return std::nullopt;
2815 }
2816 
2817 std::optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
2818     InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
2819     bool &KnownBitsComputed) const {
2820   switch (II.getIntrinsicID()) {
2821   default:
2822     break;
2823   case Intrinsic::x86_mmx_pmovmskb:
2824   case Intrinsic::x86_sse_movmsk_ps:
2825   case Intrinsic::x86_sse2_movmsk_pd:
2826   case Intrinsic::x86_sse2_pmovmskb_128:
2827   case Intrinsic::x86_avx_movmsk_ps_256:
2828   case Intrinsic::x86_avx_movmsk_pd_256:
2829   case Intrinsic::x86_avx2_pmovmskb: {
2830     // MOVMSK copies the vector elements' sign bits to the low bits
2831     // and zeros the high bits.
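         // E.g. for movmsk.ps (<4 x float>), bit i of the result is the sign
         // bit of element i and bits 4..31 are zero.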
2832     unsigned ArgWidth;
2833     if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
2834       ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
2835     } else {
2836       auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
2837       ArgWidth = ArgType->getNumElements();
2838     }
2839 
2840     // If we don't need any of the low bits then return zero;
2841     // we know that DemandedMask is non-zero already.
2842     APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
2843     Type *VTy = II.getType();
2844     if (DemandedElts.isZero()) {
2845       return ConstantInt::getNullValue(VTy);
2846     }
2847 
2848     // We know that the upper bits are set to zero.
2849     Known.Zero.setBitsFrom(ArgWidth);
2850     KnownBitsComputed = true;
2851     break;
2852   }
2853   }
2854   return std::nullopt;
2855 }
2856 
2857 std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
2858     InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
2859     APInt &UndefElts2, APInt &UndefElts3,
2860     std::function<void(Instruction *, unsigned, APInt, APInt &)>
2861         simplifyAndSetOp) const {
2862   unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
2863   switch (II.getIntrinsicID()) {
2864   default:
2865     break;
2866   case Intrinsic::x86_xop_vfrcz_ss:
2867   case Intrinsic::x86_xop_vfrcz_sd:
2868     // The instructions for these intrinsics are specified to zero the upper
2869     // bits, not pass them through like other scalar intrinsics. So we
2870     // shouldn't just use Arg0 if DemandedElts[0] is clear like we do for
2871     // other intrinsics. Instead we should return a zero vector.
2872     if (!DemandedElts[0]) {
2873       IC.addToWorklist(&II);
2874       return ConstantAggregateZero::get(II.getType());
2875     }
2876 
2877     // Only the lower element is used.
2878     DemandedElts = 1;
2879     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2880 
2881     // Only the lower element is undefined. The high elements are zero.
2882     UndefElts = UndefElts[0];
2883     break;
2884 
2885   // Unary scalar-as-vector operations that work column-wise.
2886   case Intrinsic::x86_sse_rcp_ss:
2887   case Intrinsic::x86_sse_rsqrt_ss:
2888     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2889 
2890     // If lowest element of a scalar op isn't used then use Arg0.
2891     if (!DemandedElts[0]) {
2892       IC.addToWorklist(&II);
2893       return II.getArgOperand(0);
2894     }
2895     // TODO: If only the low elt is demanded, lower SQRT to FSQRT (with
2896     // rounding/exception checks).
2897     break;
2898 
2899   // Binary scalar-as-vector operations that work column-wise. The high
2900   // elements come from operand 0. The low element is a function of both
2901   // operands.
2902   case Intrinsic::x86_sse_min_ss:
2903   case Intrinsic::x86_sse_max_ss:
2904   case Intrinsic::x86_sse_cmp_ss:
2905   case Intrinsic::x86_sse2_min_sd:
2906   case Intrinsic::x86_sse2_max_sd:
2907   case Intrinsic::x86_sse2_cmp_sd: {
2908     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2909 
2910     // If lowest element of a scalar op isn't used then use Arg0.
2911     if (!DemandedElts[0]) {
2912       IC.addToWorklist(&II);
2913       return II.getArgOperand(0);
2914     }
2915 
2916     // Only lower element is used for operand 1.
2917     DemandedElts = 1;
2918     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
2919 
2920     // Lower element is undefined if both lower elements are undefined.
2921     // Consider things like undef&0.  The result is known zero, not undef.
2922     if (!UndefElts2[0])
2923       UndefElts.clearBit(0);
2924 
2925     break;
2926   }
2927 
2928   // Binary scalar-as-vector operations that work column-wise. The high
2929   // elements come from operand 0 and the low element comes from operand 1.
2930   case Intrinsic::x86_sse41_round_ss:
2931   case Intrinsic::x86_sse41_round_sd: {
2932     // Don't use the low element of operand 0.
2933     APInt DemandedElts2 = DemandedElts;
2934     DemandedElts2.clearBit(0);
2935     simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
2936 
2937     // If lowest element of a scalar op isn't used then use Arg0.
2938     if (!DemandedElts[0]) {
2939       IC.addToWorklist(&II);
2940       return II.getArgOperand(0);
2941     }
2942 
2943     // Only lower element is used for operand 1.
2944     DemandedElts = 1;
2945     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
2946 
2947     // Take the high undef elements from operand 0 and take the lower element
2948     // from operand 1.
2949     UndefElts.clearBit(0);
2950     UndefElts |= UndefElts2[0];
2951     break;
2952   }
2953 
2954   // Three input scalar-as-vector operations that work column-wise. The high
2955   // elements come from operand 0 and the low element is a function of all
2956   // three inputs.
2957   case Intrinsic::x86_avx512_mask_add_ss_round:
2958   case Intrinsic::x86_avx512_mask_div_ss_round:
2959   case Intrinsic::x86_avx512_mask_mul_ss_round:
2960   case Intrinsic::x86_avx512_mask_sub_ss_round:
2961   case Intrinsic::x86_avx512_mask_max_ss_round:
2962   case Intrinsic::x86_avx512_mask_min_ss_round:
2963   case Intrinsic::x86_avx512_mask_add_sd_round:
2964   case Intrinsic::x86_avx512_mask_div_sd_round:
2965   case Intrinsic::x86_avx512_mask_mul_sd_round:
2966   case Intrinsic::x86_avx512_mask_sub_sd_round:
2967   case Intrinsic::x86_avx512_mask_max_sd_round:
2968   case Intrinsic::x86_avx512_mask_min_sd_round:
2969     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2970 
2971     // If lowest element of a scalar op isn't used then use Arg0.
2972     if (!DemandedElts[0]) {
2973       IC.addToWorklist(&II);
2974       return II.getArgOperand(0);
2975     }
2976 
2977     // Only lower element is used for operand 1 and 2.
2978     DemandedElts = 1;
2979     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
2980     simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);
2981 
2982     // Lower element is undefined if all three lower elements are undefined.
2983     // Consider things like undef&0.  The result is known zero, not undef.
2984     if (!UndefElts2[0] || !UndefElts3[0])
2985       UndefElts.clearBit(0);
2986     break;
2987 
2988   // TODO: Add fmaddsub support?
2989   case Intrinsic::x86_sse3_addsub_pd:
2990   case Intrinsic::x86_sse3_addsub_ps:
2991   case Intrinsic::x86_avx_addsub_pd_256:
2992   case Intrinsic::x86_avx_addsub_ps_256: {
2993     // If none of the even or none of the odd lanes are required, turn this
2994     // into a generic FP math instruction.
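         // E.g. addsub subtracts in the even lanes and adds in the odd lanes,
         // so if only the odd lanes are demanded the whole op becomes an fadd.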
2995     APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
2996     APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
2997     bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
2998     bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
2999     if (IsSubOnly || IsAddOnly) {
3000       assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
3001       IRBuilderBase::InsertPointGuard Guard(IC.Builder);
3002       IC.Builder.SetInsertPoint(&II);
3003       Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
3004       return IC.Builder.CreateBinOp(
3005           IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
3006     }
3007 
3008     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3009     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3010     UndefElts &= UndefElts2;
3011     break;
3012   }
3013 
3014   // General per-element vector operations.
3015   case Intrinsic::x86_avx2_psllv_d:
3016   case Intrinsic::x86_avx2_psllv_d_256:
3017   case Intrinsic::x86_avx2_psllv_q:
3018   case Intrinsic::x86_avx2_psllv_q_256:
3019   case Intrinsic::x86_avx2_psrlv_d:
3020   case Intrinsic::x86_avx2_psrlv_d_256:
3021   case Intrinsic::x86_avx2_psrlv_q:
3022   case Intrinsic::x86_avx2_psrlv_q_256:
3023   case Intrinsic::x86_avx2_psrav_d:
3024   case Intrinsic::x86_avx2_psrav_d_256: {
3025     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3026     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3027     UndefElts &= UndefElts2;
3028     break;
3029   }
3030 
3031   case Intrinsic::x86_sse2_packssdw_128:
3032   case Intrinsic::x86_sse2_packsswb_128:
3033   case Intrinsic::x86_sse2_packuswb_128:
3034   case Intrinsic::x86_sse41_packusdw:
3035   case Intrinsic::x86_avx2_packssdw:
3036   case Intrinsic::x86_avx2_packsswb:
3037   case Intrinsic::x86_avx2_packusdw:
3038   case Intrinsic::x86_avx2_packuswb:
3039   case Intrinsic::x86_avx512_packssdw_512:
3040   case Intrinsic::x86_avx512_packsswb_512:
3041   case Intrinsic::x86_avx512_packusdw_512:
3042   case Intrinsic::x86_avx512_packuswb_512: {
3043     auto *Ty0 = II.getArgOperand(0)->getType();
3044     unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
3045     assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
3046 
3047     unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
3048     unsigned VWidthPerLane = VWidth / NumLanes;
3049     unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
3050 
3051     // Per lane, pack the elements of the first input and then the second.
3052     // e.g.
3053     // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
3054     // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
3055     for (int OpNum = 0; OpNum != 2; ++OpNum) {
3056       APInt OpDemandedElts(InnerVWidth, 0);
3057       for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3058         unsigned LaneIdx = Lane * VWidthPerLane;
3059         for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
3060           unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
3061           if (DemandedElts[Idx])
3062             OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
3063         }
3064       }
3065 
3066       // Demand elements from the operand.
3067       APInt OpUndefElts(InnerVWidth, 0);
3068       simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);
3069 
3070       // Pack the operand's UNDEF elements, one lane at a time.
3071       OpUndefElts = OpUndefElts.zext(VWidth);
3072       for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3073         APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
3074         LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
3075         LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
3076         UndefElts |= LaneElts;
3077       }
3078     }
3079     break;
3080   }
3081 
3082   case Intrinsic::x86_sse2_pmadd_wd:
3083   case Intrinsic::x86_avx2_pmadd_wd:
3084   case Intrinsic::x86_avx512_pmaddw_d_512:
3085   case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
3086   case Intrinsic::x86_avx2_pmadd_ub_sw:
3087   case Intrinsic::x86_avx512_pmaddubs_w_512: {
3088     // PMADD - demand both src elements that map to each dst element.
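         // E.g. dst element 3 of pmaddwd is computed from word elements 6 and
         // 7 of both sources.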
3089     auto *ArgTy = II.getArgOperand(0)->getType();
3090     unsigned InnerVWidth = cast<FixedVectorType>(ArgTy)->getNumElements();
3091     assert((VWidth * 2) == InnerVWidth && "Unexpected input size");
3092     APInt OpDemandedElts = APIntOps::ScaleBitMask(DemandedElts, InnerVWidth);
3093     APInt Op0UndefElts(InnerVWidth, 0);
3094     APInt Op1UndefElts(InnerVWidth, 0);
3095     simplifyAndSetOp(&II, 0, OpDemandedElts, Op0UndefElts);
3096     simplifyAndSetOp(&II, 1, OpDemandedElts, Op1UndefElts);
3097     break;
3098   }
3099 
3100   // PSHUFB
3101   case Intrinsic::x86_ssse3_pshuf_b_128:
3102   case Intrinsic::x86_avx2_pshuf_b:
3103   case Intrinsic::x86_avx512_pshuf_b_512:
3104   // PERMILVAR
3105   case Intrinsic::x86_avx_vpermilvar_ps:
3106   case Intrinsic::x86_avx_vpermilvar_ps_256:
3107   case Intrinsic::x86_avx512_vpermilvar_ps_512:
3108   case Intrinsic::x86_avx_vpermilvar_pd:
3109   case Intrinsic::x86_avx_vpermilvar_pd_256:
3110   case Intrinsic::x86_avx512_vpermilvar_pd_512:
3111   // PERMV
3112   case Intrinsic::x86_avx2_permd:
3113   case Intrinsic::x86_avx2_permps: {
3114     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
3115     break;
3116   }
3117 
3118   // SSE4A instructions leave the upper 64-bits of the 128-bit result
3119   // in an undefined state.
3120   case Intrinsic::x86_sse4a_extrq:
3121   case Intrinsic::x86_sse4a_extrqi:
3122   case Intrinsic::x86_sse4a_insertq:
3123   case Intrinsic::x86_sse4a_insertqi:
3124     UndefElts.setHighBits(VWidth / 2);
3125     break;
3126   }
3127   return std::nullopt;
3128 }
3129