1 //===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the X86-specific hooks for the InstCombine pass.
10 /// It uses the target's detailed knowledge of the X86 intrinsics to fold
11 /// and simplify them into simpler, target-independent IR where possible,
12 /// while letting the generic InstCombine transforms handle the rest.
13 ///
14 //===----------------------------------------------------------------------===//
15 
16 #include "X86TargetTransformInfo.h"
17 #include "llvm/IR/IntrinsicInst.h"
18 #include "llvm/IR/IntrinsicsX86.h"
19 #include "llvm/Support/KnownBits.h"
20 #include "llvm/Transforms/InstCombine/InstCombiner.h"
21 #include <optional>
22 
23 using namespace llvm;
24 
25 #define DEBUG_TYPE "x86tti"
26 
27 /// Return a constant boolean vector that has true elements in all positions
28 /// where the input constant data vector has an element with the sign bit set.
29 static Constant *getNegativeIsTrueBoolVec(Constant *V, const DataLayout &DL) {
30   VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
31   V = ConstantExpr::getBitCast(V, IntTy);
32   V = ConstantFoldCompareInstOperands(CmpInst::ICMP_SGT,
33                                       Constant::getNullValue(IntTy), V, DL);
34   assert(V && "Vector must be foldable");
35   return V;
36 }
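// For illustration (example values assumed), the fold above gives e.g.:
//   getNegativeIsTrueBoolVec(<4 x i32> <i32 -1, i32 0, i32 -8, i32 3>)
//     -> <4 x i1> <i1 true, i1 false, i1 true, i1 false>
// i.e. a lane is true exactly when its sign bit is set.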
37 
38 /// Convert the x86 XMM integer vector mask to a vector of bools based on
39 /// each element's most significant bit (the sign bit).
40 static Value *getBoolVecFromMask(Value *Mask, const DataLayout &DL) {
41   // Fold Constant Mask.
42   if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
43     return getNegativeIsTrueBoolVec(ConstantMask, DL);
44 
45   // Mask was extended from a boolean vector.
46   Value *ExtMask;
47   if (PatternMatch::match(
48           Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
49       ExtMask->getType()->isIntOrIntVectorTy(1))
50     return ExtMask;
51 
52   return nullptr;
53 }
54 
55 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
56 // XMM register mask efficiently, we could transform all x86 masked intrinsics
57 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
58 static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
59   Value *Ptr = II.getOperand(0);
60   Value *Mask = II.getOperand(1);
61   Constant *ZeroVec = Constant::getNullValue(II.getType());
62 
63   // Zero Mask - masked load instruction creates a zero vector.
64   if (isa<ConstantAggregateZero>(Mask))
65     return IC.replaceInstUsesWith(II, ZeroVec);
66 
67   // The mask is constant or extended from a bool vector. Convert this x86
68   // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
69   if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
70     // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
71     // the LLVM intrinsic definition for the pointer argument.
72     unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
73     PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
74     Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
75 
76     // The pass-through vector for an x86 masked load is a zero vector.
77     CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
78         II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);
79     return IC.replaceInstUsesWith(II, NewMaskedLoad);
80   }
81 
82   return nullptr;
83 }
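// A sketch of the conversion above (intrinsic mangling abbreviated, and the
// mask assumed to be a sign-extended bool vector %b):
//   %r = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr %p, <4 x i32> %m)
//        ; with %m = sext <4 x i1> %b to <4 x i32>
//   -> %r = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %p, i32 1,
//                                                     <4 x i1> %b,
//                                                     <4 x i32> zeroinitializer)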
84 
85 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
86 // XMM register mask efficiently, we could transform all x86 masked intrinsics
87 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
88 static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
89   Value *Ptr = II.getOperand(0);
90   Value *Mask = II.getOperand(1);
91   Value *Vec = II.getOperand(2);
92 
93   // Zero Mask - this masked store instruction does nothing.
94   if (isa<ConstantAggregateZero>(Mask)) {
95     IC.eraseInstFromFunction(II);
96     return true;
97   }
98 
99   // The SSE2 version is too weird (e.g., unaligned but non-temporal) to do
100   // anything else at this level.
101   if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
102     return false;
103 
104   // The mask is constant or extended from a bool vector. Convert this x86
105   // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
106   if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
107     unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
108     PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
109     Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
110 
111     IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);
112 
113     // 'Replace uses' doesn't work for stores. Erase the original masked store.
114     IC.eraseInstFromFunction(II);
115     return true;
116   }
117 
118   return false;
119 }
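// Similarly (a sketch, mangling abbreviated), a masked store whose mask is a
// sign-extended bool vector %b becomes the generic masked store:
//   call void @llvm.x86.avx2.maskstore.d(ptr %p, <4 x i32> %m, <4 x i32> %v)
//   -> call void @llvm.masked.store.v4i32.p0(<4 x i32> %v, ptr %p, i32 1,
//                                            <4 x i1> %b)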
120 
121 static Value *simplifyX86immShift(const IntrinsicInst &II,
122                                   InstCombiner::BuilderTy &Builder) {
123   bool LogicalShift = false;
124   bool ShiftLeft = false;
125   bool IsImm = false;
126 
127   switch (II.getIntrinsicID()) {
128   default:
129     llvm_unreachable("Unexpected intrinsic!");
130   case Intrinsic::x86_sse2_psrai_d:
131   case Intrinsic::x86_sse2_psrai_w:
132   case Intrinsic::x86_avx2_psrai_d:
133   case Intrinsic::x86_avx2_psrai_w:
134   case Intrinsic::x86_avx512_psrai_q_128:
135   case Intrinsic::x86_avx512_psrai_q_256:
136   case Intrinsic::x86_avx512_psrai_d_512:
137   case Intrinsic::x86_avx512_psrai_q_512:
138   case Intrinsic::x86_avx512_psrai_w_512:
139     IsImm = true;
140     [[fallthrough]];
141   case Intrinsic::x86_sse2_psra_d:
142   case Intrinsic::x86_sse2_psra_w:
143   case Intrinsic::x86_avx2_psra_d:
144   case Intrinsic::x86_avx2_psra_w:
145   case Intrinsic::x86_avx512_psra_q_128:
146   case Intrinsic::x86_avx512_psra_q_256:
147   case Intrinsic::x86_avx512_psra_d_512:
148   case Intrinsic::x86_avx512_psra_q_512:
149   case Intrinsic::x86_avx512_psra_w_512:
150     LogicalShift = false;
151     ShiftLeft = false;
152     break;
153   case Intrinsic::x86_sse2_psrli_d:
154   case Intrinsic::x86_sse2_psrli_q:
155   case Intrinsic::x86_sse2_psrli_w:
156   case Intrinsic::x86_avx2_psrli_d:
157   case Intrinsic::x86_avx2_psrli_q:
158   case Intrinsic::x86_avx2_psrli_w:
159   case Intrinsic::x86_avx512_psrli_d_512:
160   case Intrinsic::x86_avx512_psrli_q_512:
161   case Intrinsic::x86_avx512_psrli_w_512:
162     IsImm = true;
163     [[fallthrough]];
164   case Intrinsic::x86_sse2_psrl_d:
165   case Intrinsic::x86_sse2_psrl_q:
166   case Intrinsic::x86_sse2_psrl_w:
167   case Intrinsic::x86_avx2_psrl_d:
168   case Intrinsic::x86_avx2_psrl_q:
169   case Intrinsic::x86_avx2_psrl_w:
170   case Intrinsic::x86_avx512_psrl_d_512:
171   case Intrinsic::x86_avx512_psrl_q_512:
172   case Intrinsic::x86_avx512_psrl_w_512:
173     LogicalShift = true;
174     ShiftLeft = false;
175     break;
176   case Intrinsic::x86_sse2_pslli_d:
177   case Intrinsic::x86_sse2_pslli_q:
178   case Intrinsic::x86_sse2_pslli_w:
179   case Intrinsic::x86_avx2_pslli_d:
180   case Intrinsic::x86_avx2_pslli_q:
181   case Intrinsic::x86_avx2_pslli_w:
182   case Intrinsic::x86_avx512_pslli_d_512:
183   case Intrinsic::x86_avx512_pslli_q_512:
184   case Intrinsic::x86_avx512_pslli_w_512:
185     IsImm = true;
186     [[fallthrough]];
187   case Intrinsic::x86_sse2_psll_d:
188   case Intrinsic::x86_sse2_psll_q:
189   case Intrinsic::x86_sse2_psll_w:
190   case Intrinsic::x86_avx2_psll_d:
191   case Intrinsic::x86_avx2_psll_q:
192   case Intrinsic::x86_avx2_psll_w:
193   case Intrinsic::x86_avx512_psll_d_512:
194   case Intrinsic::x86_avx512_psll_q_512:
195   case Intrinsic::x86_avx512_psll_w_512:
196     LogicalShift = true;
197     ShiftLeft = true;
198     break;
199   }
200   assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
201 
202   Value *Vec = II.getArgOperand(0);
203   Value *Amt = II.getArgOperand(1);
204   auto *VT = cast<FixedVectorType>(Vec->getType());
205   Type *SVT = VT->getElementType();
206   Type *AmtVT = Amt->getType();
207   unsigned VWidth = VT->getNumElements();
208   unsigned BitWidth = SVT->getPrimitiveSizeInBits();
209 
210   // If the shift amount is guaranteed to be in-range we can replace it with a
211   // generic shift. If it's guaranteed to be out of range, logical shifts combine
212   // to zero and arithmetic shifts are clamped to (BitWidth - 1).
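  // For example (assuming <4 x i32> operands), the cases below yield:
  //   psrai.d(%x, 3)  -> ashr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
  //   psrli.d(%x, 32) -> zeroinitializer (out-of-range logical shift)
  //   psrai.d(%x, 40) -> ashr by 31 in every lane (arithmetic shift, clamped)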
213   if (IsImm) {
214     assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
215     KnownBits KnownAmtBits =
216         llvm::computeKnownBits(Amt, II.getDataLayout());
217     if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
218       Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
219       Amt = Builder.CreateVectorSplat(VWidth, Amt);
220       return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
221                                         : Builder.CreateLShr(Vec, Amt))
222                            : Builder.CreateAShr(Vec, Amt));
223     }
224     if (KnownAmtBits.getMinValue().uge(BitWidth)) {
225       if (LogicalShift)
226         return ConstantAggregateZero::get(VT);
227       Amt = ConstantInt::get(SVT, BitWidth - 1);
228       return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
229     }
230   } else {
231     // Ensure the first element has an in-range value and the rest of the
232     // elements in the bottom 64 bits are zero.
233     assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
234            cast<VectorType>(AmtVT)->getElementType() == SVT &&
235            "Unexpected shift-by-scalar type");
236     unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
237     APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
238     APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
239     KnownBits KnownLowerBits = llvm::computeKnownBits(
240         Amt, DemandedLower, II.getDataLayout());
241     KnownBits KnownUpperBits = llvm::computeKnownBits(
242         Amt, DemandedUpper, II.getDataLayout());
243     if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
244         (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
245       SmallVector<int, 16> ZeroSplat(VWidth, 0);
246       Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
247       return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
248                                         : Builder.CreateLShr(Vec, Amt))
249                            : Builder.CreateAShr(Vec, Amt));
250     }
251   }
252 
253   // Simplify if count is constant vector.
254   auto *CDV = dyn_cast<ConstantDataVector>(Amt);
255   if (!CDV)
256     return nullptr;
257 
258   // The SSE2/AVX2 forms use all of the first 64 bits of the 128-bit vector
259   // operand to compute the shift amount.
260   assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
261          cast<VectorType>(AmtVT)->getElementType() == SVT &&
262          "Unexpected shift-by-scalar type");
263 
264   // Concatenate the sub-elements to create the 64-bit value.
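  // E.g. (constant operand assumed) for a <8 x i16> shift amount of
  //   <i16 3, i16 0, i16 0, i16 0, i16 9, i16 9, i16 9, i16 9>
  // only the low four i16 elements are concatenated, so Count == 3.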
265   APInt Count(64, 0);
266   for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
267     unsigned SubEltIdx = (NumSubElts - 1) - i;
268     auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
269     Count <<= BitWidth;
270     Count |= SubElt->getValue().zextOrTrunc(64);
271   }
272 
273   // If shift-by-zero then just return the original value.
274   if (Count.isZero())
275     return Vec;
276 
277   // Handle cases when Shift >= BitWidth.
278   if (Count.uge(BitWidth)) {
279     // If LogicalShift - just return zero.
280     if (LogicalShift)
281       return ConstantAggregateZero::get(VT);
282 
283     // If ArithmeticShift - clamp Shift to (BitWidth - 1).
284     Count = APInt(64, BitWidth - 1);
285   }
286 
287   // Get a constant vector of the same type as the first operand.
288   auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
289   auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);
290 
291   if (ShiftLeft)
292     return Builder.CreateShl(Vec, ShiftVec);
293 
294   if (LogicalShift)
295     return Builder.CreateLShr(Vec, ShiftVec);
296 
297   return Builder.CreateAShr(Vec, ShiftVec);
298 }
299 
300 // Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
301 // Unlike the generic IR shifts, the intrinsics have defined behaviour for out
302 // of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
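// For example (assuming <4 x i32> operands), the intrinsic semantics are:
//   psrlv.d(%x, <i32 1, i32 2, i32 3, i32 32>) - lane 3 is fully shifted out: 0
//   psrav.d(%x, <i32 1, i32 2, i32 3, i32 32>) - lane 3 splats the sign bit
// whereas a generic lshr/ashr by 32 would make that lane poison.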
303 static Value *simplifyX86varShift(const IntrinsicInst &II,
304                                   InstCombiner::BuilderTy &Builder) {
305   bool LogicalShift = false;
306   bool ShiftLeft = false;
307 
308   switch (II.getIntrinsicID()) {
309   default:
310     llvm_unreachable("Unexpected intrinsic!");
311   case Intrinsic::x86_avx2_psrav_d:
312   case Intrinsic::x86_avx2_psrav_d_256:
313   case Intrinsic::x86_avx512_psrav_q_128:
314   case Intrinsic::x86_avx512_psrav_q_256:
315   case Intrinsic::x86_avx512_psrav_d_512:
316   case Intrinsic::x86_avx512_psrav_q_512:
317   case Intrinsic::x86_avx512_psrav_w_128:
318   case Intrinsic::x86_avx512_psrav_w_256:
319   case Intrinsic::x86_avx512_psrav_w_512:
320     LogicalShift = false;
321     ShiftLeft = false;
322     break;
323   case Intrinsic::x86_avx2_psrlv_d:
324   case Intrinsic::x86_avx2_psrlv_d_256:
325   case Intrinsic::x86_avx2_psrlv_q:
326   case Intrinsic::x86_avx2_psrlv_q_256:
327   case Intrinsic::x86_avx512_psrlv_d_512:
328   case Intrinsic::x86_avx512_psrlv_q_512:
329   case Intrinsic::x86_avx512_psrlv_w_128:
330   case Intrinsic::x86_avx512_psrlv_w_256:
331   case Intrinsic::x86_avx512_psrlv_w_512:
332     LogicalShift = true;
333     ShiftLeft = false;
334     break;
335   case Intrinsic::x86_avx2_psllv_d:
336   case Intrinsic::x86_avx2_psllv_d_256:
337   case Intrinsic::x86_avx2_psllv_q:
338   case Intrinsic::x86_avx2_psllv_q_256:
339   case Intrinsic::x86_avx512_psllv_d_512:
340   case Intrinsic::x86_avx512_psllv_q_512:
341   case Intrinsic::x86_avx512_psllv_w_128:
342   case Intrinsic::x86_avx512_psllv_w_256:
343   case Intrinsic::x86_avx512_psllv_w_512:
344     LogicalShift = true;
345     ShiftLeft = true;
346     break;
347   }
348   assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
349 
350   Value *Vec = II.getArgOperand(0);
351   Value *Amt = II.getArgOperand(1);
352   auto *VT = cast<FixedVectorType>(II.getType());
353   Type *SVT = VT->getElementType();
354   int NumElts = VT->getNumElements();
355   int BitWidth = SVT->getIntegerBitWidth();
356 
357   // If the shift amount is guaranteed to be in-range we can replace it with a
358   // generic shift.
359   KnownBits KnownAmt =
360       llvm::computeKnownBits(Amt, II.getDataLayout());
361   if (KnownAmt.getMaxValue().ult(BitWidth)) {
362     return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
363                                       : Builder.CreateLShr(Vec, Amt))
364                          : Builder.CreateAShr(Vec, Amt));
365   }
366 
367   // Simplify if all shift amounts are constant/undef.
368   auto *CShift = dyn_cast<Constant>(Amt);
369   if (!CShift)
370     return nullptr;
371 
372   // Collect each element's shift amount.
373   // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
374   bool AnyOutOfRange = false;
375   SmallVector<int, 8> ShiftAmts;
376   for (int I = 0; I < NumElts; ++I) {
377     auto *CElt = CShift->getAggregateElement(I);
378     if (isa_and_nonnull<UndefValue>(CElt)) {
379       ShiftAmts.push_back(-1);
380       continue;
381     }
382 
383     auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
384     if (!COp)
385       return nullptr;
386 
387     // Handle out of range shifts.
388     // If LogicalShift - set to BitWidth (special case).
389     // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
390     APInt ShiftVal = COp->getValue();
391     if (ShiftVal.uge(BitWidth)) {
392       AnyOutOfRange = LogicalShift;
393       ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
394       continue;
395     }
396 
397     ShiftAmts.push_back((int)ShiftVal.getZExtValue());
398   }
399 
400   // If all elements out of range or UNDEF, return vector of zeros/undefs.
401   // ArithmeticShift should only hit this if they are all UNDEF.
402   auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
403   if (llvm::all_of(ShiftAmts, OutOfRange)) {
404     SmallVector<Constant *, 8> ConstantVec;
405     for (int Idx : ShiftAmts) {
406       if (Idx < 0) {
407         ConstantVec.push_back(UndefValue::get(SVT));
408       } else {
409         assert(LogicalShift && "Logical shift expected");
410         ConstantVec.push_back(ConstantInt::getNullValue(SVT));
411       }
412     }
413     return ConstantVector::get(ConstantVec);
414   }
415 
416   // Generic logical shifts can't handle a mix of in-range and out-of-range values.
417   if (AnyOutOfRange)
418     return nullptr;
419 
420   // Build the shift amount constant vector.
421   SmallVector<Constant *, 8> ShiftVecAmts;
422   for (int Idx : ShiftAmts) {
423     if (Idx < 0)
424       ShiftVecAmts.push_back(UndefValue::get(SVT));
425     else
426       ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
427   }
428   auto ShiftVec = ConstantVector::get(ShiftVecAmts);
429 
430   if (ShiftLeft)
431     return Builder.CreateShl(Vec, ShiftVec);
432 
433   if (LogicalShift)
434     return Builder.CreateLShr(Vec, ShiftVec);
435 
436   return Builder.CreateAShr(Vec, ShiftVec);
437 }
438 
439 static Value *simplifyX86pack(IntrinsicInst &II,
440                               InstCombiner::BuilderTy &Builder, bool IsSigned) {
441   Value *Arg0 = II.getArgOperand(0);
442   Value *Arg1 = II.getArgOperand(1);
443   Type *ResTy = II.getType();
444 
445   // Fast path: both inputs undef -> the result is undef.
446   if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
447     return UndefValue::get(ResTy);
448 
449   auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
450   unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
451   unsigned NumSrcElts = ArgTy->getNumElements();
452   assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
453          "Unexpected packing types");
454 
455   unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
456   unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
457   unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
458   assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
459          "Unexpected packing types");
460 
461   // Constant folding.
462   if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
463     return nullptr;
464 
465   // Clamp Values - signed/unsigned both use signed clamp values, but they
466   // differ on the min/max values.
467   APInt MinValue, MaxValue;
468   if (IsSigned) {
469     // PACKSS: Truncate signed value with signed saturation.
470     // Source values less than dst minint are saturated to minint.
471     // Source values greater than dst maxint are saturated to maxint.
472     MinValue =
473         APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
474     MaxValue =
475         APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
476   } else {
477     // PACKUS: Truncate signed value with unsigned saturation.
478     // Source values less than zero are saturated to zero.
479     // Source values greater than dst maxuint are saturated to maxuint.
480     MinValue = APInt::getZero(SrcScalarSizeInBits);
481     MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
482   }
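  // E.g. (values assumed) for an i16 -> i8 pack:
  //   PACKSS: MinValue = 0xFF80 (-128 sext to i16), MaxValue = 0x007F (127)
  //   PACKUS: MinValue = 0x0000 (0),                MaxValue = 0x00FF (255)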
483 
484   auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
485   auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
486   Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
487   Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
488   Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
489   Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
490 
491   // Shuffle clamped args together at the lane level.
492   SmallVector<int, 32> PackMask;
493   for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
494     for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
495       PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
496     for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
497       PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
498   }
499   auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);
500 
501   // Truncate to dst size.
502   return Builder.CreateTrunc(Shuffle, ResTy);
503 }
504 
505 static Value *simplifyX86pmadd(IntrinsicInst &II,
506                                InstCombiner::BuilderTy &Builder,
507                                bool IsPMADDWD) {
508   Value *Arg0 = II.getArgOperand(0);
509   Value *Arg1 = II.getArgOperand(1);
510   auto *ResTy = cast<FixedVectorType>(II.getType());
511   [[maybe_unused]] auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
512 
513   unsigned NumDstElts = ResTy->getNumElements();
514   assert(ArgTy->getNumElements() == (2 * NumDstElts) &&
515          ResTy->getScalarSizeInBits() == (2 * ArgTy->getScalarSizeInBits()) &&
516          "Unexpected PMADD types");
517 
518   // Multiply by undef -> zero (NOT undef!) as other arg could still be zero.
519   if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))
520     return ConstantAggregateZero::get(ResTy);
521 
522   // Multiply by zero.
523   if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1))
524     return ConstantAggregateZero::get(ResTy);
525 
526   // Constant folding.
527   if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
528     return nullptr;
529 
530   // Split Lo/Hi elements pairs, extend and add together.
531   // PMADDWD(X,Y) =
532   // add(mul(sext(lhs[0]),sext(rhs[0])),mul(sext(lhs[1]),sext(rhs[1])))
533   // PMADDUBSW(X,Y) =
534   // sadd_sat(mul(zext(lhs[0]),sext(rhs[0])),mul(zext(lhs[1]),sext(rhs[1])))
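  // A small worked example (operand values assumed) for PMADDWD on <8 x i16>:
  //   result[0] = sext(x[0]) * sext(y[0]) + sext(x[1]) * sext(y[1])
  //   e.g. x = <1, 2, ...>, y = <3, 4, ...>  ->  result[0] = 1*3 + 2*4 = 11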
535   SmallVector<int> LoMask, HiMask;
536   for (unsigned I = 0; I != NumDstElts; ++I) {
537     LoMask.push_back(2 * I + 0);
538     HiMask.push_back(2 * I + 1);
539   }
540 
541   auto *LHSLo = Builder.CreateShuffleVector(Arg0, LoMask);
542   auto *LHSHi = Builder.CreateShuffleVector(Arg0, HiMask);
543   auto *RHSLo = Builder.CreateShuffleVector(Arg1, LoMask);
544   auto *RHSHi = Builder.CreateShuffleVector(Arg1, HiMask);
545 
546   auto LHSCast =
547       IsPMADDWD ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt;
548   LHSLo = Builder.CreateCast(LHSCast, LHSLo, ResTy);
549   LHSHi = Builder.CreateCast(LHSCast, LHSHi, ResTy);
550   RHSLo = Builder.CreateCast(Instruction::CastOps::SExt, RHSLo, ResTy);
551   RHSHi = Builder.CreateCast(Instruction::CastOps::SExt, RHSHi, ResTy);
552   Value *Lo = Builder.CreateMul(LHSLo, RHSLo);
553   Value *Hi = Builder.CreateMul(LHSHi, RHSHi);
554   return IsPMADDWD
555              ? Builder.CreateAdd(Lo, Hi)
556              : Builder.CreateIntrinsic(ResTy, Intrinsic::sadd_sat, {Lo, Hi});
557 }
558 
559 static Value *simplifyX86movmsk(const IntrinsicInst &II,
560                                 InstCombiner::BuilderTy &Builder) {
561   Value *Arg = II.getArgOperand(0);
562   Type *ResTy = II.getType();
563 
564   // movmsk(undef) -> zero as we must ensure the upper bits are zero.
565   if (isa<UndefValue>(Arg))
566     return Constant::getNullValue(ResTy);
567 
568   auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
569   // We can't easily peek through x86_mmx types.
570   if (!ArgTy)
571     return nullptr;
572 
573   // Expand MOVMSK to compare/bitcast/zext:
574   // e.g. PMOVMSKB(v16i8 x):
575   // %cmp = icmp slt <16 x i8> %x, zeroinitializer
576   // %int = bitcast <16 x i1> %cmp to i16
577   // %res = zext i16 %int to i32
578   unsigned NumElts = ArgTy->getNumElements();
579   Type *IntegerTy = Builder.getIntNTy(NumElts);
580 
581   Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
582   Res = Builder.CreateIsNeg(Res);
583   Res = Builder.CreateBitCast(Res, IntegerTy);
584   Res = Builder.CreateZExtOrTrunc(Res, ResTy);
585   return Res;
586 }
587 
588 static Value *simplifyX86addcarry(const IntrinsicInst &II,
589                                   InstCombiner::BuilderTy &Builder) {
590   Value *CarryIn = II.getArgOperand(0);
591   Value *Op1 = II.getArgOperand(1);
592   Value *Op2 = II.getArgOperand(2);
593   Type *RetTy = II.getType();
594   Type *OpTy = Op1->getType();
595   assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
596          RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
597          "Unexpected types for x86 addcarry");
598 
599   // If carry-in is zero, this is just an unsigned add with overflow.
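  // E.g. (a sketch, assuming the 64-bit variant) with a zero carry-in:
  //   @llvm.x86.addcarry.64(i8 0, i64 %a, i64 %b)
  //   -> { i8 zext(overflow), i64 sum } built from @llvm.uadd.with.overflow.i64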
600   if (match(CarryIn, PatternMatch::m_ZeroInt())) {
601     Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
602                                           {Op1, Op2});
603     // The types have to be adjusted to match the x86 call types.
604     Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
605     Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
606                                        Builder.getInt8Ty());
607     Value *Res = PoisonValue::get(RetTy);
608     Res = Builder.CreateInsertValue(Res, UAddOV, 0);
609     return Builder.CreateInsertValue(Res, UAddResult, 1);
610   }
611 
612   return nullptr;
613 }
614 
615 static Value *simplifyTernarylogic(const IntrinsicInst &II,
616                                    InstCombiner::BuilderTy &Builder) {
617 
618   auto *ArgImm = dyn_cast<ConstantInt>(II.getArgOperand(3));
619   if (!ArgImm || ArgImm->getValue().uge(256))
620     return nullptr;
621 
622   Value *ArgA = II.getArgOperand(0);
623   Value *ArgB = II.getArgOperand(1);
624   Value *ArgC = II.getArgOperand(2);
625 
626   Type *Ty = II.getType();
627 
628   auto Or = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
629     return {Builder.CreateOr(Lhs.first, Rhs.first), Lhs.second | Rhs.second};
630   };
631   auto Xor = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
632     return {Builder.CreateXor(Lhs.first, Rhs.first), Lhs.second ^ Rhs.second};
633   };
634   auto And = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
635     return {Builder.CreateAnd(Lhs.first, Rhs.first), Lhs.second & Rhs.second};
636   };
637   auto Not = [&](auto V) -> std::pair<Value *, uint8_t> {
638     return {Builder.CreateNot(V.first), ~V.second};
639   };
640   auto Nor = [&](auto Lhs, auto Rhs) { return Not(Or(Lhs, Rhs)); };
641   auto Xnor = [&](auto Lhs, auto Rhs) { return Not(Xor(Lhs, Rhs)); };
642   auto Nand = [&](auto Lhs, auto Rhs) { return Not(And(Lhs, Rhs)); };
643 
644   bool AIsConst = match(ArgA, PatternMatch::m_ImmConstant());
645   bool BIsConst = match(ArgB, PatternMatch::m_ImmConstant());
646   bool CIsConst = match(ArgC, PatternMatch::m_ImmConstant());
647 
648   bool ABIsConst = AIsConst && BIsConst;
649   bool ACIsConst = AIsConst && CIsConst;
650   bool BCIsConst = BIsConst && CIsConst;
651   bool ABCIsConst = AIsConst && BIsConst && CIsConst;
652 
653   // Used for verification. It's a big table. It's difficult to go from Imm ->
654   // logic ops, but easy to verify that a set of logic ops is correct. We track
655   // the logic ops through the second value in the pair. At the end it should
656   // equal Imm.
657   std::pair<Value *, uint8_t> A = {ArgA, 0xf0};
658   std::pair<Value *, uint8_t> B = {ArgB, 0xcc};
659   std::pair<Value *, uint8_t> C = {ArgC, 0xaa};
660   std::pair<Value *, uint8_t> Res = {nullptr, 0};
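  // E.g. imm 0x80 is handled below as And(And(A, B), C), and indeed
  // 0xf0 & 0xcc & 0xaa == 0x80, so the tracked byte matches the immediate.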
661 
662   // Currently we only handle cases that convert directly to another instruction
663   // or cases where all the ops are constant.  This is because we don't properly
664   // handle creating ternary ops in the backend, so splitting them here may
665   // cause regressions. As the backend improves, uncomment more cases.
666 
667   uint8_t Imm = ArgImm->getValue().getZExtValue();
668   switch (Imm) {
669   case 0x0:
670     Res = {Constant::getNullValue(Ty), 0};
671     break;
672   case 0x1:
673     if (ABCIsConst)
674       Res = Nor(Or(A, B), C);
675     break;
676   case 0x2:
677     if (ABCIsConst)
678       Res = And(Nor(A, B), C);
679     break;
680   case 0x3:
681     if (ABIsConst)
682       Res = Nor(A, B);
683     break;
684   case 0x4:
685     if (ABCIsConst)
686       Res = And(Nor(A, C), B);
687     break;
688   case 0x5:
689     if (ACIsConst)
690       Res = Nor(A, C);
691     break;
692   case 0x6:
693     if (ABCIsConst)
694       Res = Nor(A, Xnor(B, C));
695     break;
696   case 0x7:
697     if (ABCIsConst)
698       Res = Nor(A, And(B, C));
699     break;
700   case 0x8:
701     if (ABCIsConst)
702       Res = Nor(A, Nand(B, C));
703     break;
704   case 0x9:
705     if (ABCIsConst)
706       Res = Nor(A, Xor(B, C));
707     break;
708   case 0xa:
709     if (ACIsConst)
710       Res = Nor(A, Not(C));
711     break;
712   case 0xb:
713     if (ABCIsConst)
714       Res = Nor(A, Nor(C, Not(B)));
715     break;
716   case 0xc:
717     if (ABIsConst)
718       Res = Nor(A, Not(B));
719     break;
720   case 0xd:
721     if (ABCIsConst)
722       Res = Nor(A, Nor(B, Not(C)));
723     break;
724   case 0xe:
725     if (ABCIsConst)
726       Res = Nor(A, Nor(B, C));
727     break;
728   case 0xf:
729     Res = Not(A);
730     break;
731   case 0x10:
732     if (ABCIsConst)
733       Res = And(A, Nor(B, C));
734     break;
735   case 0x11:
736     if (BCIsConst)
737       Res = Nor(B, C);
738     break;
739   case 0x12:
740     if (ABCIsConst)
741       Res = Nor(Xnor(A, C), B);
742     break;
743   case 0x13:
744     if (ABCIsConst)
745       Res = Nor(And(A, C), B);
746     break;
747   case 0x14:
748     if (ABCIsConst)
749       Res = Nor(Xnor(A, B), C);
750     break;
751   case 0x15:
752     if (ABCIsConst)
753       Res = Nor(And(A, B), C);
754     break;
755   case 0x16:
756     if (ABCIsConst)
757       Res = Xor(Xor(A, B), And(Nand(A, B), C));
758     break;
759   case 0x17:
760     if (ABCIsConst)
761       Res = Xor(Or(A, B), Or(Xnor(A, B), C));
762     break;
763   case 0x18:
764     if (ABCIsConst)
765       Res = Nor(Xnor(A, B), Xnor(A, C));
766     break;
767   case 0x19:
768     if (ABCIsConst)
769       Res = And(Nand(A, B), Xnor(B, C));
770     break;
771   case 0x1a:
772     if (ABCIsConst)
773       Res = Xor(A, Or(And(A, B), C));
774     break;
775   case 0x1b:
776     if (ABCIsConst)
777       Res = Xor(A, Or(Xnor(A, B), C));
778     break;
779   case 0x1c:
780     if (ABCIsConst)
781       Res = Xor(A, Or(And(A, C), B));
782     break;
783   case 0x1d:
784     if (ABCIsConst)
785       Res = Xor(A, Or(Xnor(A, C), B));
786     break;
787   case 0x1e:
788     if (ABCIsConst)
789       Res = Xor(A, Or(B, C));
790     break;
791   case 0x1f:
792     if (ABCIsConst)
793       Res = Nand(A, Or(B, C));
794     break;
795   case 0x20:
796     if (ABCIsConst)
797       Res = Nor(Nand(A, C), B);
798     break;
799   case 0x21:
800     if (ABCIsConst)
801       Res = Nor(Xor(A, C), B);
802     break;
803   case 0x22:
804     if (BCIsConst)
805       Res = Nor(B, Not(C));
806     break;
807   case 0x23:
808     if (ABCIsConst)
809       Res = Nor(B, Nor(C, Not(A)));
810     break;
811   case 0x24:
812     if (ABCIsConst)
813       Res = Nor(Xnor(A, B), Xor(A, C));
814     break;
815   case 0x25:
816     if (ABCIsConst)
817       Res = Xor(A, Nand(Nand(A, B), C));
818     break;
819   case 0x26:
820     if (ABCIsConst)
821       Res = And(Nand(A, B), Xor(B, C));
822     break;
823   case 0x27:
824     if (ABCIsConst)
825       Res = Xor(Or(Xnor(A, B), C), B);
826     break;
827   case 0x28:
828     if (ABCIsConst)
829       Res = And(Xor(A, B), C);
830     break;
831   case 0x29:
832     if (ABCIsConst)
833       Res = Xor(Xor(A, B), Nor(And(A, B), C));
834     break;
835   case 0x2a:
836     if (ABCIsConst)
837       Res = And(Nand(A, B), C);
838     break;
839   case 0x2b:
840     if (ABCIsConst)
841       Res = Xor(Or(Xnor(A, B), Xor(A, C)), A);
842     break;
843   case 0x2c:
844     if (ABCIsConst)
845       Res = Nor(Xnor(A, B), Nor(B, C));
846     break;
847   case 0x2d:
848     if (ABCIsConst)
849       Res = Xor(A, Or(B, Not(C)));
850     break;
851   case 0x2e:
852     if (ABCIsConst)
853       Res = Xor(A, Or(Xor(A, C), B));
854     break;
855   case 0x2f:
856     if (ABCIsConst)
857       Res = Nand(A, Or(B, Not(C)));
858     break;
859   case 0x30:
860     if (ABIsConst)
861       Res = Nor(B, Not(A));
862     break;
863   case 0x31:
864     if (ABCIsConst)
865       Res = Nor(Nor(A, Not(C)), B);
866     break;
867   case 0x32:
868     if (ABCIsConst)
869       Res = Nor(Nor(A, C), B);
870     break;
871   case 0x33:
872     Res = Not(B);
873     break;
874   case 0x34:
875     if (ABCIsConst)
876       Res = And(Xor(A, B), Nand(B, C));
877     break;
878   case 0x35:
879     if (ABCIsConst)
880       Res = Xor(B, Or(A, Xnor(B, C)));
881     break;
882   case 0x36:
883     if (ABCIsConst)
884       Res = Xor(Or(A, C), B);
885     break;
886   case 0x37:
887     if (ABCIsConst)
888       Res = Nand(Or(A, C), B);
889     break;
890   case 0x38:
891     if (ABCIsConst)
892       Res = Nor(Xnor(A, B), Nor(A, C));
893     break;
894   case 0x39:
895     if (ABCIsConst)
896       Res = Xor(Or(A, Not(C)), B);
897     break;
898   case 0x3a:
899     if (ABCIsConst)
900       Res = Xor(B, Or(A, Xor(B, C)));
901     break;
902   case 0x3b:
903     if (ABCIsConst)
904       Res = Nand(Or(A, Not(C)), B);
905     break;
906   case 0x3c:
907     Res = Xor(A, B);
908     break;
909   case 0x3d:
910     if (ABCIsConst)
911       Res = Xor(A, Or(Nor(A, C), B));
912     break;
913   case 0x3e:
914     if (ABCIsConst)
915       Res = Xor(A, Or(Nor(A, Not(C)), B));
916     break;
917   case 0x3f:
918     if (ABIsConst)
919       Res = Nand(A, B);
920     break;
921   case 0x40:
922     if (ABCIsConst)
923       Res = Nor(Nand(A, B), C);
924     break;
925   case 0x41:
926     if (ABCIsConst)
927       Res = Nor(Xor(A, B), C);
928     break;
929   case 0x42:
930     if (ABCIsConst)
931       Res = Nor(Xor(A, B), Xnor(A, C));
932     break;
933   case 0x43:
934     if (ABCIsConst)
935       Res = Xor(A, Nand(Nand(A, C), B));
936     break;
937   case 0x44:
938     if (BCIsConst)
939       Res = Nor(C, Not(B));
940     break;
941   case 0x45:
942     if (ABCIsConst)
943       Res = Nor(Nor(B, Not(A)), C);
944     break;
945   case 0x46:
946     if (ABCIsConst)
947       Res = Xor(Or(And(A, C), B), C);
948     break;
949   case 0x47:
950     if (ABCIsConst)
951       Res = Xor(Or(Xnor(A, C), B), C);
952     break;
953   case 0x48:
954     if (ABCIsConst)
955       Res = And(Xor(A, C), B);
956     break;
957   case 0x49:
958     if (ABCIsConst)
959       Res = Xor(Or(Xnor(A, B), And(A, C)), C);
960     break;
961   case 0x4a:
962     if (ABCIsConst)
963       Res = Nor(Xnor(A, C), Nor(B, C));
964     break;
965   case 0x4b:
966     if (ABCIsConst)
967       Res = Xor(A, Or(C, Not(B)));
968     break;
969   case 0x4c:
970     if (ABCIsConst)
971       Res = And(Nand(A, C), B);
972     break;
973   case 0x4d:
974     if (ABCIsConst)
975       Res = Xor(Or(Xor(A, B), Xnor(A, C)), A);
976     break;
977   case 0x4e:
978     if (ABCIsConst)
979       Res = Xor(A, Or(Xor(A, B), C));
980     break;
981   case 0x4f:
982     if (ABCIsConst)
983       Res = Nand(A, Nand(B, Not(C)));
984     break;
985   case 0x50:
986     if (ACIsConst)
987       Res = Nor(C, Not(A));
988     break;
989   case 0x51:
990     if (ABCIsConst)
991       Res = Nor(Nor(A, Not(B)), C);
992     break;
993   case 0x52:
994     if (ABCIsConst)
995       Res = And(Xor(A, C), Nand(B, C));
996     break;
997   case 0x53:
998     if (ABCIsConst)
999       Res = Xor(Or(Xnor(B, C), A), C);
1000     break;
1001   case 0x54:
1002     if (ABCIsConst)
1003       Res = Nor(Nor(A, B), C);
1004     break;
1005   case 0x55:
1006     Res = Not(C);
1007     break;
1008   case 0x56:
1009     if (ABCIsConst)
1010       Res = Xor(Or(A, B), C);
1011     break;
1012   case 0x57:
1013     if (ABCIsConst)
1014       Res = Nand(Or(A, B), C);
1015     break;
1016   case 0x58:
1017     if (ABCIsConst)
1018       Res = Nor(Nor(A, B), Xnor(A, C));
1019     break;
1020   case 0x59:
1021     if (ABCIsConst)
1022       Res = Xor(Or(A, Not(B)), C);
1023     break;
1024   case 0x5a:
1025     Res = Xor(A, C);
1026     break;
1027   case 0x5b:
1028     if (ABCIsConst)
1029       Res = Xor(A, Or(Nor(A, B), C));
1030     break;
1031   case 0x5c:
1032     if (ABCIsConst)
1033       Res = Xor(Or(Xor(B, C), A), C);
1034     break;
1035   case 0x5d:
1036     if (ABCIsConst)
1037       Res = Nand(Or(A, Not(B)), C);
1038     break;
1039   case 0x5e:
1040     if (ABCIsConst)
1041       Res = Xor(A, Or(Nor(A, Not(B)), C));
1042     break;
1043   case 0x5f:
1044     if (ACIsConst)
1045       Res = Nand(A, C);
1046     break;
1047   case 0x60:
1048     if (ABCIsConst)
1049       Res = And(A, Xor(B, C));
1050     break;
1051   case 0x61:
1052     if (ABCIsConst)
1053       Res = Xor(Or(Xnor(A, B), And(B, C)), C);
1054     break;
1055   case 0x62:
1056     if (ABCIsConst)
1057       Res = Nor(Nor(A, C), Xnor(B, C));
1058     break;
1059   case 0x63:
1060     if (ABCIsConst)
1061       Res = Xor(B, Or(C, Not(A)));
1062     break;
1063   case 0x64:
1064     if (ABCIsConst)
1065       Res = Nor(Nor(A, B), Xnor(B, C));
1066     break;
1067   case 0x65:
1068     if (ABCIsConst)
1069       Res = Xor(Or(B, Not(A)), C);
1070     break;
1071   case 0x66:
1072     Res = Xor(B, C);
1073     break;
1074   case 0x67:
1075     if (ABCIsConst)
1076       Res = Or(Nor(A, B), Xor(B, C));
1077     break;
1078   case 0x68:
1079     if (ABCIsConst)
1080       Res = Xor(Xor(A, B), Nor(Nor(A, B), C));
1081     break;
1082   case 0x69:
1083     if (ABCIsConst)
1084       Res = Xor(Xnor(A, B), C);
1085     break;
1086   case 0x6a:
1087     if (ABCIsConst)
1088       Res = Xor(And(A, B), C);
1089     break;
1090   case 0x6b:
1091     if (ABCIsConst)
1092       Res = Or(Nor(A, B), Xor(Xnor(A, B), C));
1093     break;
1094   case 0x6c:
1095     if (ABCIsConst)
1096       Res = Xor(And(A, C), B);
1097     break;
1098   case 0x6d:
1099     if (ABCIsConst)
1100       Res = Xor(Or(Xnor(A, B), Nor(A, C)), C);
1101     break;
1102   case 0x6e:
1103     if (ABCIsConst)
1104       Res = Or(Nor(A, Not(B)), Xor(B, C));
1105     break;
1106   case 0x6f:
1107     if (ABCIsConst)
1108       Res = Nand(A, Xnor(B, C));
1109     break;
1110   case 0x70:
1111     if (ABCIsConst)
1112       Res = And(A, Nand(B, C));
1113     break;
1114   case 0x71:
1115     if (ABCIsConst)
1116       Res = Xor(Nor(Xor(A, B), Xor(A, C)), A);
1117     break;
1118   case 0x72:
1119     if (ABCIsConst)
1120       Res = Xor(Or(Xor(A, B), C), B);
1121     break;
1122   case 0x73:
1123     if (ABCIsConst)
1124       Res = Nand(Nand(A, Not(C)), B);
1125     break;
1126   case 0x74:
1127     if (ABCIsConst)
1128       Res = Xor(Or(Xor(A, C), B), C);
1129     break;
1130   case 0x75:
1131     if (ABCIsConst)
1132       Res = Nand(Nand(A, Not(B)), C);
1133     break;
1134   case 0x76:
1135     if (ABCIsConst)
1136       Res = Xor(B, Or(Nor(B, Not(A)), C));
1137     break;
1138   case 0x77:
1139     if (BCIsConst)
1140       Res = Nand(B, C);
1141     break;
1142   case 0x78:
1143     if (ABCIsConst)
1144       Res = Xor(A, And(B, C));
1145     break;
1146   case 0x79:
1147     if (ABCIsConst)
1148       Res = Xor(Or(Xnor(A, B), Nor(B, C)), C);
1149     break;
1150   case 0x7a:
1151     if (ABCIsConst)
1152       Res = Or(Xor(A, C), Nor(B, Not(A)));
1153     break;
1154   case 0x7b:
1155     if (ABCIsConst)
1156       Res = Nand(Xnor(A, C), B);
1157     break;
1158   case 0x7c:
1159     if (ABCIsConst)
1160       Res = Or(Xor(A, B), Nor(C, Not(A)));
1161     break;
1162   case 0x7d:
1163     if (ABCIsConst)
1164       Res = Nand(Xnor(A, B), C);
1165     break;
1166   case 0x7e:
1167     if (ABCIsConst)
1168       Res = Or(Xor(A, B), Xor(A, C));
1169     break;
1170   case 0x7f:
1171     if (ABCIsConst)
1172       Res = Nand(And(A, B), C);
1173     break;
1174   case 0x80:
1175     if (ABCIsConst)
1176       Res = And(And(A, B), C);
1177     break;
1178   case 0x81:
1179     if (ABCIsConst)
1180       Res = Nor(Xor(A, B), Xor(A, C));
1181     break;
1182   case 0x82:
1183     if (ABCIsConst)
1184       Res = And(Xnor(A, B), C);
1185     break;
1186   case 0x83:
1187     if (ABCIsConst)
1188       Res = Nor(Xor(A, B), Nor(C, Not(A)));
1189     break;
1190   case 0x84:
1191     if (ABCIsConst)
1192       Res = And(Xnor(A, C), B);
1193     break;
1194   case 0x85:
1195     if (ABCIsConst)
1196       Res = Nor(Xor(A, C), Nor(B, Not(A)));
1197     break;
1198   case 0x86:
1199     if (ABCIsConst)
1200       Res = Xor(Nor(Xnor(A, B), Nor(B, C)), C);
1201     break;
1202   case 0x87:
1203     if (ABCIsConst)
1204       Res = Xor(A, Nand(B, C));
1205     break;
1206   case 0x88:
1207     Res = And(B, C);
1208     break;
1209   case 0x89:
1210     if (ABCIsConst)
1211       Res = Xor(B, Nor(Nor(B, Not(A)), C));
1212     break;
1213   case 0x8a:
1214     if (ABCIsConst)
1215       Res = And(Nand(A, Not(B)), C);
1216     break;
1217   case 0x8b:
1218     if (ABCIsConst)
1219       Res = Xor(Nor(Xor(A, C), B), C);
1220     break;
1221   case 0x8c:
1222     if (ABCIsConst)
1223       Res = And(Nand(A, Not(C)), B);
1224     break;
1225   case 0x8d:
1226     if (ABCIsConst)
1227       Res = Xor(Nor(Xor(A, B), C), B);
1228     break;
1229   case 0x8e:
1230     if (ABCIsConst)
1231       Res = Xor(Or(Xor(A, B), Xor(A, C)), A);
1232     break;
1233   case 0x8f:
1234     if (ABCIsConst)
1235       Res = Nand(A, Nand(B, C));
1236     break;
1237   case 0x90:
1238     if (ABCIsConst)
1239       Res = And(A, Xnor(B, C));
1240     break;
1241   case 0x91:
1242     if (ABCIsConst)
1243       Res = Nor(Nor(A, Not(B)), Xor(B, C));
1244     break;
1245   case 0x92:
1246     if (ABCIsConst)
1247       Res = Xor(Nor(Xnor(A, B), Nor(A, C)), C);
1248     break;
1249   case 0x93:
1250     if (ABCIsConst)
1251       Res = Xor(Nand(A, C), B);
1252     break;
1253   case 0x94:
1254     if (ABCIsConst)
1255       Res = Nor(Nor(A, B), Xor(Xnor(A, B), C));
1256     break;
1257   case 0x95:
1258     if (ABCIsConst)
1259       Res = Xor(Nand(A, B), C);
1260     break;
1261   case 0x96:
1262     if (ABCIsConst)
1263       Res = Xor(Xor(A, B), C);
1264     break;
1265   case 0x97:
1266     if (ABCIsConst)
1267       Res = Xor(Xor(A, B), Or(Nor(A, B), C));
1268     break;
1269   case 0x98:
1270     if (ABCIsConst)
1271       Res = Nor(Nor(A, B), Xor(B, C));
1272     break;
1273   case 0x99:
1274     if (BCIsConst)
1275       Res = Xnor(B, C);
1276     break;
1277   case 0x9a:
1278     if (ABCIsConst)
1279       Res = Xor(Nor(B, Not(A)), C);
1280     break;
1281   case 0x9b:
1282     if (ABCIsConst)
1283       Res = Or(Nor(A, B), Xnor(B, C));
1284     break;
1285   case 0x9c:
1286     if (ABCIsConst)
1287       Res = Xor(B, Nor(C, Not(A)));
1288     break;
1289   case 0x9d:
1290     if (ABCIsConst)
1291       Res = Or(Nor(A, C), Xnor(B, C));
1292     break;
1293   case 0x9e:
1294     if (ABCIsConst)
1295       Res = Xor(And(Xor(A, B), Nand(B, C)), C);
1296     break;
1297   case 0x9f:
1298     if (ABCIsConst)
1299       Res = Nand(A, Xor(B, C));
1300     break;
1301   case 0xa0:
1302     Res = And(A, C);
1303     break;
1304   case 0xa1:
1305     if (ABCIsConst)
1306       Res = Xor(A, Nor(Nor(A, Not(B)), C));
1307     break;
1308   case 0xa2:
1309     if (ABCIsConst)
1310       Res = And(Or(A, Not(B)), C);
1311     break;
1312   case 0xa3:
1313     if (ABCIsConst)
1314       Res = Xor(Nor(Xor(B, C), A), C);
1315     break;
1316   case 0xa4:
1317     if (ABCIsConst)
1318       Res = Xor(A, Nor(Nor(A, B), C));
1319     break;
1320   case 0xa5:
1321     if (ACIsConst)
1322       Res = Xnor(A, C);
1323     break;
1324   case 0xa6:
1325     if (ABCIsConst)
1326       Res = Xor(Nor(A, Not(B)), C);
1327     break;
1328   case 0xa7:
1329     if (ABCIsConst)
1330       Res = Or(Nor(A, B), Xnor(A, C));
1331     break;
1332   case 0xa8:
1333     if (ABCIsConst)
1334       Res = And(Or(A, B), C);
1335     break;
1336   case 0xa9:
1337     if (ABCIsConst)
1338       Res = Xor(Nor(A, B), C);
1339     break;
1340   case 0xaa:
1341     Res = C;
1342     break;
1343   case 0xab:
1344     if (ABCIsConst)
1345       Res = Or(Nor(A, B), C);
1346     break;
1347   case 0xac:
1348     if (ABCIsConst)
1349       Res = Xor(Nor(Xnor(B, C), A), C);
1350     break;
1351   case 0xad:
1352     if (ABCIsConst)
1353       Res = Or(Xnor(A, C), And(B, C));
1354     break;
1355   case 0xae:
1356     if (ABCIsConst)
1357       Res = Or(Nor(A, Not(B)), C);
1358     break;
1359   case 0xaf:
1360     if (ACIsConst)
1361       Res = Or(C, Not(A));
1362     break;
1363   case 0xb0:
1364     if (ABCIsConst)
1365       Res = And(A, Nand(B, Not(C)));
1366     break;
1367   case 0xb1:
1368     if (ABCIsConst)
1369       Res = Xor(A, Nor(Xor(A, B), C));
1370     break;
1371   case 0xb2:
1372     if (ABCIsConst)
1373       Res = Xor(Nor(Xor(A, B), Xnor(A, C)), A);
1374     break;
1375   case 0xb3:
1376     if (ABCIsConst)
1377       Res = Nand(Nand(A, C), B);
1378     break;
1379   case 0xb4:
1380     if (ABCIsConst)
1381       Res = Xor(A, Nor(C, Not(B)));
1382     break;
1383   case 0xb5:
1384     if (ABCIsConst)
1385       Res = Or(Xnor(A, C), Nor(B, C));
1386     break;
1387   case 0xb6:
1388     if (ABCIsConst)
1389       Res = Xor(And(Xor(A, B), Nand(A, C)), C);
1390     break;
1391   case 0xb7:
1392     if (ABCIsConst)
1393       Res = Nand(Xor(A, C), B);
1394     break;
1395   case 0xb8:
1396     if (ABCIsConst)
1397       Res = Xor(Nor(Xnor(A, C), B), C);
1398     break;
1399   case 0xb9:
1400     if (ABCIsConst)
1401       Res = Xor(Nor(And(A, C), B), C);
1402     break;
1403   case 0xba:
1404     if (ABCIsConst)
1405       Res = Or(Nor(B, Not(A)), C);
1406     break;
1407   case 0xbb:
1408     if (BCIsConst)
1409       Res = Or(C, Not(B));
1410     break;
1411   case 0xbc:
1412     if (ABCIsConst)
1413       Res = Xor(A, And(Nand(A, C), B));
1414     break;
1415   case 0xbd:
1416     if (ABCIsConst)
1417       Res = Or(Xor(A, B), Xnor(A, C));
1418     break;
1419   case 0xbe:
1420     if (ABCIsConst)
1421       Res = Or(Xor(A, B), C);
1422     break;
1423   case 0xbf:
1424     if (ABCIsConst)
1425       Res = Or(Nand(A, B), C);
1426     break;
1427   case 0xc0:
1428     Res = And(A, B);
1429     break;
1430   case 0xc1:
1431     if (ABCIsConst)
1432       Res = Xor(A, Nor(Nor(A, Not(C)), B));
1433     break;
1434   case 0xc2:
1435     if (ABCIsConst)
1436       Res = Xor(A, Nor(Nor(A, C), B));
1437     break;
1438   case 0xc3:
1439     if (ABIsConst)
1440       Res = Xnor(A, B);
1441     break;
1442   case 0xc4:
1443     if (ABCIsConst)
1444       Res = And(Or(A, Not(C)), B);
1445     break;
1446   case 0xc5:
1447     if (ABCIsConst)
1448       Res = Xor(B, Nor(A, Xor(B, C)));
1449     break;
1450   case 0xc6:
1451     if (ABCIsConst)
1452       Res = Xor(Nor(A, Not(C)), B);
1453     break;
1454   case 0xc7:
1455     if (ABCIsConst)
1456       Res = Or(Xnor(A, B), Nor(A, C));
1457     break;
1458   case 0xc8:
1459     if (ABCIsConst)
1460       Res = And(Or(A, C), B);
1461     break;
1462   case 0xc9:
1463     if (ABCIsConst)
1464       Res = Xor(Nor(A, C), B);
1465     break;
1466   case 0xca:
1467     if (ABCIsConst)
1468       Res = Xor(B, Nor(A, Xnor(B, C)));
1469     break;
1470   case 0xcb:
1471     if (ABCIsConst)
1472       Res = Or(Xnor(A, B), And(B, C));
1473     break;
1474   case 0xcc:
1475     Res = B;
1476     break;
1477   case 0xcd:
1478     if (ABCIsConst)
1479       Res = Or(Nor(A, C), B);
1480     break;
1481   case 0xce:
1482     if (ABCIsConst)
1483       Res = Or(Nor(A, Not(C)), B);
1484     break;
1485   case 0xcf:
1486     if (ABIsConst)
1487       Res = Or(B, Not(A));
1488     break;
1489   case 0xd0:
1490     if (ABCIsConst)
1491       Res = And(A, Or(B, Not(C)));
1492     break;
1493   case 0xd1:
1494     if (ABCIsConst)
1495       Res = Xor(A, Nor(Xor(A, C), B));
1496     break;
1497   case 0xd2:
1498     if (ABCIsConst)
1499       Res = Xor(A, Nor(B, Not(C)));
1500     break;
1501   case 0xd3:
1502     if (ABCIsConst)
1503       Res = Or(Xnor(A, B), Nor(B, C));
1504     break;
1505   case 0xd4:
1506     if (ABCIsConst)
1507       Res = Xor(Nor(Xnor(A, B), Xor(A, C)), A);
1508     break;
1509   case 0xd5:
1510     if (ABCIsConst)
1511       Res = Nand(Nand(A, B), C);
1512     break;
1513   case 0xd6:
1514     if (ABCIsConst)
1515       Res = Xor(Xor(A, B), Or(And(A, B), C));
1516     break;
1517   case 0xd7:
1518     if (ABCIsConst)
1519       Res = Nand(Xor(A, B), C);
1520     break;
1521   case 0xd8:
1522     if (ABCIsConst)
1523       Res = Xor(Nor(Xnor(A, B), C), B);
1524     break;
1525   case 0xd9:
1526     if (ABCIsConst)
1527       Res = Or(And(A, B), Xnor(B, C));
1528     break;
1529   case 0xda:
1530     if (ABCIsConst)
1531       Res = Xor(A, And(Nand(A, B), C));
1532     break;
1533   case 0xdb:
1534     if (ABCIsConst)
1535       Res = Or(Xnor(A, B), Xor(A, C));
1536     break;
1537   case 0xdc:
1538     if (ABCIsConst)
1539       Res = Or(B, Nor(C, Not(A)));
1540     break;
1541   case 0xdd:
1542     if (BCIsConst)
1543       Res = Or(B, Not(C));
1544     break;
1545   case 0xde:
1546     if (ABCIsConst)
1547       Res = Or(Xor(A, C), B);
1548     break;
1549   case 0xdf:
1550     if (ABCIsConst)
1551       Res = Or(Nand(A, C), B);
1552     break;
1553   case 0xe0:
1554     if (ABCIsConst)
1555       Res = And(A, Or(B, C));
1556     break;
1557   case 0xe1:
1558     if (ABCIsConst)
1559       Res = Xor(A, Nor(B, C));
1560     break;
1561   case 0xe2:
1562     if (ABCIsConst)
1563       Res = Xor(A, Nor(Xnor(A, C), B));
1564     break;
1565   case 0xe3:
1566     if (ABCIsConst)
1567       Res = Xor(A, Nor(And(A, C), B));
1568     break;
1569   case 0xe4:
1570     if (ABCIsConst)
1571       Res = Xor(A, Nor(Xnor(A, B), C));
1572     break;
1573   case 0xe5:
1574     if (ABCIsConst)
1575       Res = Xor(A, Nor(And(A, B), C));
1576     break;
1577   case 0xe6:
1578     if (ABCIsConst)
1579       Res = Or(And(A, B), Xor(B, C));
1580     break;
1581   case 0xe7:
1582     if (ABCIsConst)
1583       Res = Or(Xnor(A, B), Xnor(A, C));
1584     break;
1585   case 0xe8:
1586     if (ABCIsConst)
1587       Res = Xor(Or(A, B), Nor(Xnor(A, B), C));
1588     break;
1589   case 0xe9:
1590     if (ABCIsConst)
1591       Res = Xor(Xor(A, B), Nand(Nand(A, B), C));
1592     break;
1593   case 0xea:
1594     if (ABCIsConst)
1595       Res = Or(And(A, B), C);
1596     break;
1597   case 0xeb:
1598     if (ABCIsConst)
1599       Res = Or(Xnor(A, B), C);
1600     break;
1601   case 0xec:
1602     if (ABCIsConst)
1603       Res = Or(And(A, C), B);
1604     break;
1605   case 0xed:
1606     if (ABCIsConst)
1607       Res = Or(Xnor(A, C), B);
1608     break;
1609   case 0xee:
1610     Res = Or(B, C);
1611     break;
1612   case 0xef:
1613     if (ABCIsConst)
1614       Res = Nand(A, Nor(B, C));
1615     break;
1616   case 0xf0:
1617     Res = A;
1618     break;
1619   case 0xf1:
1620     if (ABCIsConst)
1621       Res = Or(A, Nor(B, C));
1622     break;
1623   case 0xf2:
1624     if (ABCIsConst)
1625       Res = Or(A, Nor(B, Not(C)));
1626     break;
1627   case 0xf3:
1628     if (ABIsConst)
1629       Res = Or(A, Not(B));
1630     break;
1631   case 0xf4:
1632     if (ABCIsConst)
1633       Res = Or(A, Nor(C, Not(B)));
1634     break;
1635   case 0xf5:
1636     if (ACIsConst)
1637       Res = Or(A, Not(C));
1638     break;
1639   case 0xf6:
1640     if (ABCIsConst)
1641       Res = Or(A, Xor(B, C));
1642     break;
1643   case 0xf7:
1644     if (ABCIsConst)
1645       Res = Or(A, Nand(B, C));
1646     break;
1647   case 0xf8:
1648     if (ABCIsConst)
1649       Res = Or(A, And(B, C));
1650     break;
1651   case 0xf9:
1652     if (ABCIsConst)
1653       Res = Or(A, Xnor(B, C));
1654     break;
1655   case 0xfa:
1656     Res = Or(A, C);
1657     break;
1658   case 0xfb:
1659     if (ABCIsConst)
1660       Res = Nand(Nor(A, C), B);
1661     break;
1662   case 0xfc:
1663     Res = Or(A, B);
1664     break;
1665   case 0xfd:
1666     if (ABCIsConst)
1667       Res = Nand(Nor(A, B), C);
1668     break;
1669   case 0xfe:
1670     if (ABCIsConst)
1671       Res = Or(Or(A, B), C);
1672     break;
1673   case 0xff:
1674     Res = {Constant::getAllOnesValue(Ty), 0xff};
1675     break;
1676   }
1677 
1678   assert((Res.first == nullptr || Res.second == Imm) &&
1679          "Simplification of ternary logic does not verify!");
1680   return Res.first;
1681 }
1682 
1683 static Value *simplifyX86insertps(const IntrinsicInst &II,
1684                                   InstCombiner::BuilderTy &Builder) {
1685   auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
1686   if (!CInt)
1687     return nullptr;
1688 
1689   auto *VecTy = cast<FixedVectorType>(II.getType());
1690   assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
1691 
1692   // The immediate permute control byte looks like this:
1693   //    [3:0] - zero mask for each 32-bit lane
1694   //    [5:4] - select one 32-bit destination lane
1695   //    [7:6] - select one 32-bit source lane
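  // For example (immediate value assumed): insertps(%x, %y, 0x60) decodes to
  // SourceLane = 1, DestLane = 2, ZMask = 0, and becomes a shufflevector of
  // %x and %y with mask <0, 1, 5, 3>.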
1696 
1697   uint8_t Imm = CInt->getZExtValue();
1698   uint8_t ZMask = Imm & 0xf;
1699   uint8_t DestLane = (Imm >> 4) & 0x3;
1700   uint8_t SourceLane = (Imm >> 6) & 0x3;
1701 
1702   ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
1703 
1704   // If all zero mask bits are set, this was just a weird way to
1705   // generate a zero vector.
1706   if (ZMask == 0xf)
1707     return ZeroVector;
1708 
1709   // Initialize by passing all of the first source bits through.
1710   int ShuffleMask[4] = {0, 1, 2, 3};
1711 
1712   // We may replace the second operand with the zero vector.
1713   Value *V1 = II.getArgOperand(1);
1714 
1715   if (ZMask) {
1716     // If the zero mask is being used with a single input or the zero mask
1717     // overrides the destination lane, this is a shuffle with the zero vector.
1718     if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
1719         (ZMask & (1 << DestLane))) {
1720       V1 = ZeroVector;
1721       // We may still move 32-bits of the first source vector from one lane
1722       // to another.
1723       ShuffleMask[DestLane] = SourceLane;
1724       // The zero mask may override the previous insert operation.
1725       for (unsigned i = 0; i < 4; ++i)
1726         if ((ZMask >> i) & 0x1)
1727           ShuffleMask[i] = i + 4;
1728     } else {
1729       // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
1730       return nullptr;
1731     }
1732   } else {
1733     // Replace the selected destination lane with the selected source lane.
1734     ShuffleMask[DestLane] = SourceLane + 4;
1735   }
1736 
1737   return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
1738 }
1739 
1740 /// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
1741 /// or conversion to a shuffle vector.
1742 static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
1743                                ConstantInt *CILength, ConstantInt *CIIndex,
1744                                InstCombiner::BuilderTy &Builder) {
1745   auto LowConstantHighUndef = [&](uint64_t Val) {
1746     Type *IntTy64 = Type::getInt64Ty(II.getContext());
1747     Constant *Args[] = {ConstantInt::get(IntTy64, Val),
1748                         UndefValue::get(IntTy64)};
1749     return ConstantVector::get(Args);
1750   };
1751 
1752   // See if we're dealing with constant values.
1753   auto *C0 = dyn_cast<Constant>(Op0);
1754   auto *CI0 =
1755       C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
1756          : nullptr;
1757 
1758   // Attempt to constant fold.
1759   if (CILength && CIIndex) {
1760     // From AMD documentation: "The bit index and field length are each six
1761     // bits in length; other bits of the field are ignored."
1762     APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
1763     APInt APLength = CILength->getValue().zextOrTrunc(6);
1764 
1765     unsigned Index = APIndex.getZExtValue();
1766 
1767     // From AMD documentation: "a value of zero in the field length is
1768     // defined as length of 64".
1769     unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
1770 
1771     // From AMD documentation: "If the sum of the bit index + length field
1772     // is greater than 64, the results are undefined".
1773     unsigned End = Index + Length;
1774 
1775     // Note that both field index and field length are 8-bit quantities.
1776     // Since variables 'Index' and 'Length' are unsigned values
1777     // obtained from zero-extending field index and field length
1778     // respectively, their sum should never wrap around.
1779     if (End > 64)
1780       return UndefValue::get(II.getType());
1781 
1782     // If we are extracting whole bytes, we can convert this to a shuffle.
1783     // Lowering can recognize EXTRQI shuffle masks.
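         // Illustrative example with hypothetical constants: Length = 16 and
         // Index = 32 extract bits [47:32], i.e. 2 bytes starting at byte 4,
         // so the byte shuffle mask built below is
         // {4, 5, 18, 19, 20, 21, 22, 23, -1, ...}: source bytes 4-5, then
         // zero bytes, with the upper 64 bits left undefined.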
1784     if ((Length % 8) == 0 && (Index % 8) == 0) {
1785       // Convert bit indices to byte indices.
1786       Length /= 8;
1787       Index /= 8;
1788 
1789       Type *IntTy8 = Type::getInt8Ty(II.getContext());
1790       auto *ShufTy = FixedVectorType::get(IntTy8, 16);
1791 
1792       SmallVector<int, 16> ShuffleMask;
1793       for (int i = 0; i != (int)Length; ++i)
1794         ShuffleMask.push_back(i + Index);
1795       for (int i = Length; i != 8; ++i)
1796         ShuffleMask.push_back(i + 16);
1797       for (int i = 8; i != 16; ++i)
1798         ShuffleMask.push_back(-1);
1799 
1800       Value *SV = Builder.CreateShuffleVector(
1801           Builder.CreateBitCast(Op0, ShufTy),
1802           ConstantAggregateZero::get(ShufTy), ShuffleMask);
1803       return Builder.CreateBitCast(SV, II.getType());
1804     }
1805 
1806     // Constant Fold - shift Index'th bit to lowest position and mask off
1807     // Length bits.
1808     if (CI0) {
1809       APInt Elt = CI0->getValue();
1810       Elt.lshrInPlace(Index);
1811       Elt = Elt.zextOrTrunc(Length);
1812       return LowConstantHighUndef(Elt.getZExtValue());
1813     }
1814 
1815     // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
1816     if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
1817       Value *Args[] = {Op0, CILength, CIIndex};
1818       Module *M = II.getModule();
1819       Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
1820       return Builder.CreateCall(F, Args);
1821     }
1822   }
1823 
1824   // Constant Fold - extraction from zero is always {zero, undef}.
1825   if (CI0 && CI0->isZero())
1826     return LowConstantHighUndef(0);
1827 
1828   return nullptr;
1829 }
1830 
1831 /// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
1832 /// folding or conversion to a shuffle vector.
1833 static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
1834                                  APInt APLength, APInt APIndex,
1835                                  InstCombiner::BuilderTy &Builder) {
1836   // From AMD documentation: "The bit index and field length are each six bits
1837   // in length other bits of the field are ignored."
1838   // in length; other bits of the field are ignored."
1839   APLength = APLength.zextOrTrunc(6);
1840 
1841   // Attempt to constant fold.
1842   unsigned Index = APIndex.getZExtValue();
1843 
1844   // From AMD documentation: "a value of zero in the field length is
1845   // defined as length of 64".
1846   unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
1847 
1848   // From AMD documentation: "If the sum of the bit index + length field
1849   // is greater than 64, the results are undefined".
1850   unsigned End = Index + Length;
1851 
1852   // Note that both field index and field length are 8-bit quantities.
1853   // Since variables 'Index' and 'Length' are unsigned values
1854   // obtained from zero-extending field index and field length
1855   // respectively, their sum should never wrap around.
1856   if (End > 64)
1857     return UndefValue::get(II.getType());
1858 
1859   // If we are inserting whole bytes, we can convert this to a shuffle.
1860   // Lowering can recognize INSERTQI shuffle masks.
1861   if ((Length % 8) == 0 && (Index % 8) == 0) {
1862     // Convert bit indices to byte indices.
1863     Length /= 8;
1864     Index /= 8;
1865 
1866     Type *IntTy8 = Type::getInt8Ty(II.getContext());
1867     auto *ShufTy = FixedVectorType::get(IntTy8, 16);
1868 
1869     SmallVector<int, 16> ShuffleMask;
1870     for (int i = 0; i != (int)Index; ++i)
1871       ShuffleMask.push_back(i);
1872     for (int i = 0; i != (int)Length; ++i)
1873       ShuffleMask.push_back(i + 16);
1874     for (int i = Index + Length; i != 8; ++i)
1875       ShuffleMask.push_back(i);
1876     for (int i = 8; i != 16; ++i)
1877       ShuffleMask.push_back(-1);
1878 
1879     Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
1880                                             Builder.CreateBitCast(Op1, ShufTy),
1881                                             ShuffleMask);
1882     return Builder.CreateBitCast(SV, II.getType());
1883   }
1884 
1885   // See if we're dealing with constant values.
1886   auto *C0 = dyn_cast<Constant>(Op0);
1887   auto *C1 = dyn_cast<Constant>(Op1);
1888   auto *CI00 =
1889       C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
1890          : nullptr;
1891   auto *CI10 =
1892       C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
1893          : nullptr;
1894 
1895   // Constant Fold - insert bottom Length bits starting at the Index'th bit.
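       // Illustrative example with hypothetical values: for Index = 8 and
       // Length = 16, Mask = 0xFFFF00, so bits [23:8] of the first operand's
       // low element are replaced by the low 16 bits of the second operand's
       // low element.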
1896   if (CI00 && CI10) {
1897     APInt V00 = CI00->getValue();
1898     APInt V10 = CI10->getValue();
1899     APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
1900     V00 = V00 & ~Mask;
1901     V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
1902     APInt Val = V00 | V10;
1903     Type *IntTy64 = Type::getInt64Ty(II.getContext());
1904     Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
1905                         UndefValue::get(IntTy64)};
1906     return ConstantVector::get(Args);
1907   }
1908 
1909   // If we were an INSERTQ call, we'll save demanded elements if we convert to
1910   // INSERTQI.
1911   if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
1912     Type *IntTy8 = Type::getInt8Ty(II.getContext());
1913     Constant *CILength = ConstantInt::get(IntTy8, Length, false);
1914     Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
1915 
1916     Value *Args[] = {Op0, Op1, CILength, CIIndex};
1917     Module *M = II.getModule();
1918     Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
1919     return Builder.CreateCall(F, Args);
1920   }
1921 
1922   return nullptr;
1923 }
1924 
1925 /// Attempt to convert pshufb* to shufflevector if the mask is constant.
1926 static Value *simplifyX86pshufb(const IntrinsicInst &II,
1927                                 InstCombiner::BuilderTy &Builder) {
1928   auto *V = dyn_cast<Constant>(II.getArgOperand(1));
1929   if (!V)
1930     return nullptr;
1931 
1932   auto *VecTy = cast<FixedVectorType>(II.getType());
1933   unsigned NumElts = VecTy->getNumElements();
1934   assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
1935          "Unexpected number of elements in shuffle mask!");
1936 
1937   // Construct a shuffle mask from constant integers or UNDEFs.
1938   int Indexes[64];
1939 
1940   // Each byte in the shuffle control mask forms an index to permute the
1941   // corresponding byte in the destination operand.
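       // For instance (hypothetical constant mask), control bytes 0x00, 0x81
       // and 0x0F map result bytes 0-2 to source element 0, zero (bit 7 set
       // selects the zero vector below) and source element 15 respectively;
       // the (I & 0xF0) term keeps 256/512-bit indices within their own
       // 128-bit lane.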
1942   for (unsigned I = 0; I < NumElts; ++I) {
1943     Constant *COp = V->getAggregateElement(I);
1944     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
1945       return nullptr;
1946 
1947     if (isa<UndefValue>(COp)) {
1948       Indexes[I] = -1;
1949       continue;
1950     }
1951 
1952     int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
1953 
1954     // If the most significant bit (bit[7]) of each byte of the shuffle
1955     // control mask is set, then zero is written in the result byte.
1956     // The zero vector is in the right-hand side of the resulting
1957     // shufflevector.
1958 
1959     // The value of each index for the high 128-bit lane is the least
1960     // significant 4 bits of the respective shuffle control byte.
1961     Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
1962     Indexes[I] = Index;
1963   }
1964 
1965   auto V1 = II.getArgOperand(0);
1966   auto V2 = Constant::getNullValue(VecTy);
1967   return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts));
1968 }
1969 
1970 /// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
1971 static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
1972                                     InstCombiner::BuilderTy &Builder) {
1973   auto *V = dyn_cast<Constant>(II.getArgOperand(1));
1974   if (!V)
1975     return nullptr;
1976 
1977   auto *VecTy = cast<FixedVectorType>(II.getType());
1978   unsigned NumElts = VecTy->getNumElements();
1979   bool IsPD = VecTy->getScalarType()->isDoubleTy();
1980   unsigned NumLaneElts = IsPD ? 2 : 4;
1981   assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
1982 
1983   // Construct a shuffle mask from constant integers or UNDEFs.
1984   int Indexes[16];
1985 
1986   // The intrinsics only read one or two bits, clear the rest.
1987   for (unsigned I = 0; I < NumElts; ++I) {
1988     Constant *COp = V->getAggregateElement(I);
1989     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
1990       return nullptr;
1991 
1992     if (isa<UndefValue>(COp)) {
1993       Indexes[I] = -1;
1994       continue;
1995     }
1996 
1997     APInt Index = cast<ConstantInt>(COp)->getValue();
1998     Index = Index.zextOrTrunc(32).getLoBits(2);
1999 
2000     // The PD variants use bit 1 to select the per-lane element index, so
2001     // shift down to convert to a generic shuffle mask index.
2002     if (IsPD)
2003       Index.lshrInPlace(1);
2004 
2005     // The _256 variants are a bit trickier since the mask bits always index
2006     // into the corresponding 128-bit half. In order to convert to a generic
2007     // shuffle, we have to make that explicit.
2008     Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
2009 
2010     Indexes[I] = Index.getZExtValue();
2011   }
2012 
2013   auto V1 = II.getArgOperand(0);
2014   return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, NumElts));
2015 }
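
     // As an illustration of the transform above (hypothetical mask):
     // vpermilvar.pd.256 with a constant mask of <2, 0, 2, 0> becomes a
     // shufflevector with mask {1, 0, 3, 2}, swapping the two doubles within
     // each 128-bit lane.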
2016 
2017 /// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
2018 static Value *simplifyX86vpermv(const IntrinsicInst &II,
2019                                 InstCombiner::BuilderTy &Builder) {
2020   auto *V = dyn_cast<Constant>(II.getArgOperand(1));
2021   if (!V)
2022     return nullptr;
2023 
2024   auto *VecTy = cast<FixedVectorType>(II.getType());
2025   unsigned Size = VecTy->getNumElements();
2026   assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
2027          "Unexpected shuffle mask size");
2028 
2029   // Construct a shuffle mask from constant integers or UNDEFs.
2030   int Indexes[64];
2031 
2032   for (unsigned I = 0; I < Size; ++I) {
2033     Constant *COp = V->getAggregateElement(I);
2034     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2035       return nullptr;
2036 
2037     if (isa<UndefValue>(COp)) {
2038       Indexes[I] = -1;
2039       continue;
2040     }
2041 
2042     uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
2043     Index &= Size - 1;
2044     Indexes[I] = Index;
2045   }
2046 
2047   auto V1 = II.getArgOperand(0);
2048   return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, Size));
2049 }
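
     // For example (hypothetical mask), vpermd with a constant mask of
     // <7, 6, 5, 4, 3, 2, 1, 0> becomes a shufflevector that reverses the
     // eight elements; selector values are taken modulo the element count.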
2050 
2051 /// Attempt to convert vpermi2/vpermt2 to shufflevector if the mask is constant.
2052 static Value *simplifyX86vpermv3(const IntrinsicInst &II,
2053                                  InstCombiner::BuilderTy &Builder) {
2054   auto *V = dyn_cast<Constant>(II.getArgOperand(1));
2055   if (!V)
2056     return nullptr;
2057 
2058   auto *VecTy = cast<FixedVectorType>(II.getType());
2059   unsigned Size = VecTy->getNumElements();
2060   assert((Size == 2 || Size == 4 || Size == 8 || Size == 16 || Size == 32 ||
2061           Size == 64) &&
2062          "Unexpected shuffle mask size");
2063 
2064   // Construct a shuffle mask from constant integers or UNDEFs.
2065   int Indexes[64];
2066 
2067   for (unsigned I = 0; I < Size; ++I) {
2068     Constant *COp = V->getAggregateElement(I);
2069     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2070       return nullptr;
2071 
2072     if (isa<UndefValue>(COp)) {
2073       Indexes[I] = -1;
2074       continue;
2075     }
2076 
2077     uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
2078     Index &= (2 * Size) - 1;
2079     Indexes[I] = Index;
2080   }
2081 
2082   auto V1 = II.getArgOperand(0);
2083   auto V2 = II.getArgOperand(2);
2084   return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, Size));
2085 }
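
     // As an illustration (hypothetical selector), with 4 elements per vector
     // the selector is masked to 3 bits: values 0-3 pick from the first data
     // operand and 4-7 from the third operand, so a selector element of 5
     // selects element 1 of the second data vector (argument 2).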
2086 
2087 std::optional<Instruction *>
2088 X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
2089   auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
2090                                              unsigned DemandedWidth) {
2091     APInt UndefElts(Width, 0);
2092     APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
2093     return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
2094   };
2095 
2096   Intrinsic::ID IID = II.getIntrinsicID();
2097   switch (IID) {
2098   case Intrinsic::x86_bmi_bextr_32:
2099   case Intrinsic::x86_bmi_bextr_64:
2100   case Intrinsic::x86_tbm_bextri_u32:
2101   case Intrinsic::x86_tbm_bextri_u64:
2102     // If the RHS is a constant we can try some simplifications.
2103     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2104       uint64_t Shift = C->getZExtValue();
2105       uint64_t Length = (Shift >> 8) & 0xff;
2106       Shift &= 0xff;
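           // For a hypothetical control value of 0x0804: Shift = 4 and
           // Length = 8, i.e. extract bits [11:4] of the first operand.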
2107       unsigned BitWidth = II.getType()->getIntegerBitWidth();
2108       // If the length is 0 or the shift is out of range, replace with zero.
2109       if (Length == 0 || Shift >= BitWidth) {
2110         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2111       }
2112       // If the LHS is also a constant, we can completely constant fold this.
2113       if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2114         uint64_t Result = InC->getZExtValue() >> Shift;
2115         if (Length > BitWidth)
2116           Length = BitWidth;
2117         Result &= maskTrailingOnes<uint64_t>(Length);
2118         return IC.replaceInstUsesWith(II,
2119                                       ConstantInt::get(II.getType(), Result));
2120       }
2121       // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
2122       // are only masking bits that a shift already cleared?
2123     }
2124     break;
2125 
2126   case Intrinsic::x86_bmi_bzhi_32:
2127   case Intrinsic::x86_bmi_bzhi_64:
2128     // If the RHS is a constant we can try some simplifications.
2129     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2130       uint64_t Index = C->getZExtValue() & 0xff;
2131       unsigned BitWidth = II.getType()->getIntegerBitWidth();
2132       if (Index >= BitWidth) {
2133         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2134       }
2135       if (Index == 0) {
2136         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2137       }
2138       // If the LHS is also a constant, we can completely constant fold this.
2139       if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2140         uint64_t Result = InC->getZExtValue();
2141         Result &= maskTrailingOnes<uint64_t>(Index);
2142         return IC.replaceInstUsesWith(II,
2143                                       ConstantInt::get(II.getType(), Result));
2144       }
2145       // TODO should we convert this to an AND if the RHS is constant?
2146     }
2147     break;
2148   case Intrinsic::x86_bmi_pext_32:
2149   case Intrinsic::x86_bmi_pext_64:
2150     if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2151       if (MaskC->isNullValue()) {
2152         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2153       }
2154       if (MaskC->isAllOnesValue()) {
2155         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2156       }
2157 
2158       unsigned MaskIdx, MaskLen;
2159       if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2160         // Any single contiguous sequence of 1s anywhere in the mask simply
2161         // describes a subset of the input bits shifted to the appropriate
2162         // position. Replace with the straightforward IR.
2163         Value *Input = II.getArgOperand(0);
2164         Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
2165         Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2166         Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
2167         return IC.replaceInstUsesWith(II, Shifted);
2168       }
2169 
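           // If the source is constant too, evaluate PEXT directly: the loop
           // below gathers the source bits selected by the mask into
           // contiguous low bits. Hypothetical example: Src = 0b11010110 with
           // Mask = 0b00111100 yields 0b0101.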
2170       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2171         uint64_t Src = SrcC->getZExtValue();
2172         uint64_t Mask = MaskC->getZExtValue();
2173         uint64_t Result = 0;
2174         uint64_t BitToSet = 1;
2175 
2176         while (Mask) {
2177           // Isolate lowest set bit.
2178           uint64_t BitToTest = Mask & -Mask;
2179           if (BitToTest & Src)
2180             Result |= BitToSet;
2181 
2182           BitToSet <<= 1;
2183           // Clear lowest set bit.
2184           Mask &= Mask - 1;
2185         }
2186 
2187         return IC.replaceInstUsesWith(II,
2188                                       ConstantInt::get(II.getType(), Result));
2189       }
2190     }
2191     break;
2192   case Intrinsic::x86_bmi_pdep_32:
2193   case Intrinsic::x86_bmi_pdep_64:
2194     if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2195       if (MaskC->isNullValue()) {
2196         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2197       }
2198       if (MaskC->isAllOnesValue()) {
2199         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2200       }
2201 
2202       unsigned MaskIdx, MaskLen;
2203       if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2204         // Any single contiguous sequence of 1s anywhere in the mask simply
2205         // describes a subset of the input bits shifted to the appropriate
2206         // position. Replace with the straightforward IR.
2207         Value *Input = II.getArgOperand(0);
2208         Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2209         Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
2210         Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
2211         return IC.replaceInstUsesWith(II, Masked);
2212       }
2213 
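           // If the source is constant too, evaluate PDEP directly: the loop
           // below scatters the low bits of the source to the mask's set bit
           // positions. Hypothetical example: Src = 0b0101 with
           // Mask = 0b00111100 yields 0b00010100.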
2214       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2215         uint64_t Src = SrcC->getZExtValue();
2216         uint64_t Mask = MaskC->getZExtValue();
2217         uint64_t Result = 0;
2218         uint64_t BitToTest = 1;
2219 
2220         while (Mask) {
2221           // Isolate lowest set bit.
2222           uint64_t BitToSet = Mask & -Mask;
2223           if (BitToTest & Src)
2224             Result |= BitToSet;
2225 
2226           BitToTest <<= 1;
2227           // Clear lowest set bit.
2228           Mask &= Mask - 1;
2229         }
2230 
2231         return IC.replaceInstUsesWith(II,
2232                                       ConstantInt::get(II.getType(), Result));
2233       }
2234     }
2235     break;
2236 
2237   case Intrinsic::x86_sse_cvtss2si:
2238   case Intrinsic::x86_sse_cvtss2si64:
2239   case Intrinsic::x86_sse_cvttss2si:
2240   case Intrinsic::x86_sse_cvttss2si64:
2241   case Intrinsic::x86_sse2_cvtsd2si:
2242   case Intrinsic::x86_sse2_cvtsd2si64:
2243   case Intrinsic::x86_sse2_cvttsd2si:
2244   case Intrinsic::x86_sse2_cvttsd2si64:
2245   case Intrinsic::x86_avx512_vcvtss2si32:
2246   case Intrinsic::x86_avx512_vcvtss2si64:
2247   case Intrinsic::x86_avx512_vcvtss2usi32:
2248   case Intrinsic::x86_avx512_vcvtss2usi64:
2249   case Intrinsic::x86_avx512_vcvtsd2si32:
2250   case Intrinsic::x86_avx512_vcvtsd2si64:
2251   case Intrinsic::x86_avx512_vcvtsd2usi32:
2252   case Intrinsic::x86_avx512_vcvtsd2usi64:
2253   case Intrinsic::x86_avx512_cvttss2si:
2254   case Intrinsic::x86_avx512_cvttss2si64:
2255   case Intrinsic::x86_avx512_cvttss2usi:
2256   case Intrinsic::x86_avx512_cvttss2usi64:
2257   case Intrinsic::x86_avx512_cvttsd2si:
2258   case Intrinsic::x86_avx512_cvttsd2si64:
2259   case Intrinsic::x86_avx512_cvttsd2usi:
2260   case Intrinsic::x86_avx512_cvttsd2usi64: {
2261     // These intrinsics only demand the 0th element of their input vectors. If
2262     // we can simplify the input based on that, do so now.
2263     Value *Arg = II.getArgOperand(0);
2264     unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
2265     if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
2266       return IC.replaceOperand(II, 0, V);
2267     }
2268     break;
2269   }
2270 
2271   case Intrinsic::x86_mmx_pmovmskb:
2272   case Intrinsic::x86_sse_movmsk_ps:
2273   case Intrinsic::x86_sse2_movmsk_pd:
2274   case Intrinsic::x86_sse2_pmovmskb_128:
2275   case Intrinsic::x86_avx_movmsk_pd_256:
2276   case Intrinsic::x86_avx_movmsk_ps_256:
2277   case Intrinsic::x86_avx2_pmovmskb:
2278     if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
2279       return IC.replaceInstUsesWith(II, V);
2280     }
2281     break;
2282 
2283   case Intrinsic::x86_sse_comieq_ss:
2284   case Intrinsic::x86_sse_comige_ss:
2285   case Intrinsic::x86_sse_comigt_ss:
2286   case Intrinsic::x86_sse_comile_ss:
2287   case Intrinsic::x86_sse_comilt_ss:
2288   case Intrinsic::x86_sse_comineq_ss:
2289   case Intrinsic::x86_sse_ucomieq_ss:
2290   case Intrinsic::x86_sse_ucomige_ss:
2291   case Intrinsic::x86_sse_ucomigt_ss:
2292   case Intrinsic::x86_sse_ucomile_ss:
2293   case Intrinsic::x86_sse_ucomilt_ss:
2294   case Intrinsic::x86_sse_ucomineq_ss:
2295   case Intrinsic::x86_sse2_comieq_sd:
2296   case Intrinsic::x86_sse2_comige_sd:
2297   case Intrinsic::x86_sse2_comigt_sd:
2298   case Intrinsic::x86_sse2_comile_sd:
2299   case Intrinsic::x86_sse2_comilt_sd:
2300   case Intrinsic::x86_sse2_comineq_sd:
2301   case Intrinsic::x86_sse2_ucomieq_sd:
2302   case Intrinsic::x86_sse2_ucomige_sd:
2303   case Intrinsic::x86_sse2_ucomigt_sd:
2304   case Intrinsic::x86_sse2_ucomile_sd:
2305   case Intrinsic::x86_sse2_ucomilt_sd:
2306   case Intrinsic::x86_sse2_ucomineq_sd:
2307   case Intrinsic::x86_avx512_vcomi_ss:
2308   case Intrinsic::x86_avx512_vcomi_sd:
2309   case Intrinsic::x86_avx512_mask_cmp_ss:
2310   case Intrinsic::x86_avx512_mask_cmp_sd: {
2311     // These intrinsics only demand the 0th element of their input vectors. If
2312     // we can simplify the input based on that, do so now.
2313     bool MadeChange = false;
2314     Value *Arg0 = II.getArgOperand(0);
2315     Value *Arg1 = II.getArgOperand(1);
2316     unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
2317     if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
2318       IC.replaceOperand(II, 0, V);
2319       MadeChange = true;
2320     }
2321     if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
2322       IC.replaceOperand(II, 1, V);
2323       MadeChange = true;
2324     }
2325     if (MadeChange) {
2326       return &II;
2327     }
2328     break;
2329   }
2330 
2331   case Intrinsic::x86_avx512_add_ps_512:
2332   case Intrinsic::x86_avx512_div_ps_512:
2333   case Intrinsic::x86_avx512_mul_ps_512:
2334   case Intrinsic::x86_avx512_sub_ps_512:
2335   case Intrinsic::x86_avx512_add_pd_512:
2336   case Intrinsic::x86_avx512_div_pd_512:
2337   case Intrinsic::x86_avx512_mul_pd_512:
2338   case Intrinsic::x86_avx512_sub_pd_512:
2339     // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
2340     // IR operations.
2341     if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
2342       if (R->getValue() == 4) {
2343         Value *Arg0 = II.getArgOperand(0);
2344         Value *Arg1 = II.getArgOperand(1);
2345 
2346         Value *V;
2347         switch (IID) {
2348         default:
2349           llvm_unreachable("Case stmts out of sync!");
2350         case Intrinsic::x86_avx512_add_ps_512:
2351         case Intrinsic::x86_avx512_add_pd_512:
2352           V = IC.Builder.CreateFAdd(Arg0, Arg1);
2353           break;
2354         case Intrinsic::x86_avx512_sub_ps_512:
2355         case Intrinsic::x86_avx512_sub_pd_512:
2356           V = IC.Builder.CreateFSub(Arg0, Arg1);
2357           break;
2358         case Intrinsic::x86_avx512_mul_ps_512:
2359         case Intrinsic::x86_avx512_mul_pd_512:
2360           V = IC.Builder.CreateFMul(Arg0, Arg1);
2361           break;
2362         case Intrinsic::x86_avx512_div_ps_512:
2363         case Intrinsic::x86_avx512_div_pd_512:
2364           V = IC.Builder.CreateFDiv(Arg0, Arg1);
2365           break;
2366         }
2367 
2368         return IC.replaceInstUsesWith(II, V);
2369       }
2370     }
2371     break;
2372 
2373   case Intrinsic::x86_avx512_mask_add_ss_round:
2374   case Intrinsic::x86_avx512_mask_div_ss_round:
2375   case Intrinsic::x86_avx512_mask_mul_ss_round:
2376   case Intrinsic::x86_avx512_mask_sub_ss_round:
2377   case Intrinsic::x86_avx512_mask_add_sd_round:
2378   case Intrinsic::x86_avx512_mask_div_sd_round:
2379   case Intrinsic::x86_avx512_mask_mul_sd_round:
2380   case Intrinsic::x86_avx512_mask_sub_sd_round:
2381     // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
2382     // IR operations.
2383     if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
2384       if (R->getValue() == 4) {
2385         // Extract the element as scalars.
2386         Value *Arg0 = II.getArgOperand(0);
2387         Value *Arg1 = II.getArgOperand(1);
2388         Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
2389         Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
2390 
2391         Value *V;
2392         switch (IID) {
2393         default:
2394           llvm_unreachable("Case stmts out of sync!");
2395         case Intrinsic::x86_avx512_mask_add_ss_round:
2396         case Intrinsic::x86_avx512_mask_add_sd_round:
2397           V = IC.Builder.CreateFAdd(LHS, RHS);
2398           break;
2399         case Intrinsic::x86_avx512_mask_sub_ss_round:
2400         case Intrinsic::x86_avx512_mask_sub_sd_round:
2401           V = IC.Builder.CreateFSub(LHS, RHS);
2402           break;
2403         case Intrinsic::x86_avx512_mask_mul_ss_round:
2404         case Intrinsic::x86_avx512_mask_mul_sd_round:
2405           V = IC.Builder.CreateFMul(LHS, RHS);
2406           break;
2407         case Intrinsic::x86_avx512_mask_div_ss_round:
2408         case Intrinsic::x86_avx512_mask_div_sd_round:
2409           V = IC.Builder.CreateFDiv(LHS, RHS);
2410           break;
2411         }
2412 
2413         // Handle the masking aspect of the intrinsic.
2414         Value *Mask = II.getArgOperand(3);
2415         auto *C = dyn_cast<ConstantInt>(Mask);
2416         // We don't need a select if we know the mask bit is a 1.
2417         if (!C || !C->getValue()[0]) {
2418           // Cast the mask to an i1 vector and then extract the lowest element.
2419           auto *MaskTy = FixedVectorType::get(
2420               IC.Builder.getInt1Ty(),
2421               cast<IntegerType>(Mask->getType())->getBitWidth());
2422           Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
2423           Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
2424           // Extract the lowest element from the passthru operand.
2425           Value *Passthru =
2426               IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
2427           V = IC.Builder.CreateSelect(Mask, V, Passthru);
2428         }
2429 
2430         // Insert the result back into the original argument 0.
2431         V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
2432 
2433         return IC.replaceInstUsesWith(II, V);
2434       }
2435     }
2436     break;
2437 
2438   // Constant fold ashr( <A x Bi>, Ci ).
2439   // Constant fold lshr( <A x Bi>, Ci ).
2440   // Constant fold shl( <A x Bi>, Ci ).
2441   case Intrinsic::x86_sse2_psrai_d:
2442   case Intrinsic::x86_sse2_psrai_w:
2443   case Intrinsic::x86_avx2_psrai_d:
2444   case Intrinsic::x86_avx2_psrai_w:
2445   case Intrinsic::x86_avx512_psrai_q_128:
2446   case Intrinsic::x86_avx512_psrai_q_256:
2447   case Intrinsic::x86_avx512_psrai_d_512:
2448   case Intrinsic::x86_avx512_psrai_q_512:
2449   case Intrinsic::x86_avx512_psrai_w_512:
2450   case Intrinsic::x86_sse2_psrli_d:
2451   case Intrinsic::x86_sse2_psrli_q:
2452   case Intrinsic::x86_sse2_psrli_w:
2453   case Intrinsic::x86_avx2_psrli_d:
2454   case Intrinsic::x86_avx2_psrli_q:
2455   case Intrinsic::x86_avx2_psrli_w:
2456   case Intrinsic::x86_avx512_psrli_d_512:
2457   case Intrinsic::x86_avx512_psrli_q_512:
2458   case Intrinsic::x86_avx512_psrli_w_512:
2459   case Intrinsic::x86_sse2_pslli_d:
2460   case Intrinsic::x86_sse2_pslli_q:
2461   case Intrinsic::x86_sse2_pslli_w:
2462   case Intrinsic::x86_avx2_pslli_d:
2463   case Intrinsic::x86_avx2_pslli_q:
2464   case Intrinsic::x86_avx2_pslli_w:
2465   case Intrinsic::x86_avx512_pslli_d_512:
2466   case Intrinsic::x86_avx512_pslli_q_512:
2467   case Intrinsic::x86_avx512_pslli_w_512:
2468     if (Value *V = simplifyX86immShift(II, IC.Builder)) {
2469       return IC.replaceInstUsesWith(II, V);
2470     }
2471     break;
2472 
2473   case Intrinsic::x86_sse2_psra_d:
2474   case Intrinsic::x86_sse2_psra_w:
2475   case Intrinsic::x86_avx2_psra_d:
2476   case Intrinsic::x86_avx2_psra_w:
2477   case Intrinsic::x86_avx512_psra_q_128:
2478   case Intrinsic::x86_avx512_psra_q_256:
2479   case Intrinsic::x86_avx512_psra_d_512:
2480   case Intrinsic::x86_avx512_psra_q_512:
2481   case Intrinsic::x86_avx512_psra_w_512:
2482   case Intrinsic::x86_sse2_psrl_d:
2483   case Intrinsic::x86_sse2_psrl_q:
2484   case Intrinsic::x86_sse2_psrl_w:
2485   case Intrinsic::x86_avx2_psrl_d:
2486   case Intrinsic::x86_avx2_psrl_q:
2487   case Intrinsic::x86_avx2_psrl_w:
2488   case Intrinsic::x86_avx512_psrl_d_512:
2489   case Intrinsic::x86_avx512_psrl_q_512:
2490   case Intrinsic::x86_avx512_psrl_w_512:
2491   case Intrinsic::x86_sse2_psll_d:
2492   case Intrinsic::x86_sse2_psll_q:
2493   case Intrinsic::x86_sse2_psll_w:
2494   case Intrinsic::x86_avx2_psll_d:
2495   case Intrinsic::x86_avx2_psll_q:
2496   case Intrinsic::x86_avx2_psll_w:
2497   case Intrinsic::x86_avx512_psll_d_512:
2498   case Intrinsic::x86_avx512_psll_q_512:
2499   case Intrinsic::x86_avx512_psll_w_512: {
2500     if (Value *V = simplifyX86immShift(II, IC.Builder)) {
2501       return IC.replaceInstUsesWith(II, V);
2502     }
2503 
2504     // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
2505     // operand to compute the shift amount.
2506     Value *Arg1 = II.getArgOperand(1);
2507     assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
2508            "Unexpected packed shift size");
2509     unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();
2510 
2511     if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
2512       return IC.replaceOperand(II, 1, V);
2513     }
2514     break;
2515   }
2516 
2517   case Intrinsic::x86_avx2_psllv_d:
2518   case Intrinsic::x86_avx2_psllv_d_256:
2519   case Intrinsic::x86_avx2_psllv_q:
2520   case Intrinsic::x86_avx2_psllv_q_256:
2521   case Intrinsic::x86_avx512_psllv_d_512:
2522   case Intrinsic::x86_avx512_psllv_q_512:
2523   case Intrinsic::x86_avx512_psllv_w_128:
2524   case Intrinsic::x86_avx512_psllv_w_256:
2525   case Intrinsic::x86_avx512_psllv_w_512:
2526   case Intrinsic::x86_avx2_psrav_d:
2527   case Intrinsic::x86_avx2_psrav_d_256:
2528   case Intrinsic::x86_avx512_psrav_q_128:
2529   case Intrinsic::x86_avx512_psrav_q_256:
2530   case Intrinsic::x86_avx512_psrav_d_512:
2531   case Intrinsic::x86_avx512_psrav_q_512:
2532   case Intrinsic::x86_avx512_psrav_w_128:
2533   case Intrinsic::x86_avx512_psrav_w_256:
2534   case Intrinsic::x86_avx512_psrav_w_512:
2535   case Intrinsic::x86_avx2_psrlv_d:
2536   case Intrinsic::x86_avx2_psrlv_d_256:
2537   case Intrinsic::x86_avx2_psrlv_q:
2538   case Intrinsic::x86_avx2_psrlv_q_256:
2539   case Intrinsic::x86_avx512_psrlv_d_512:
2540   case Intrinsic::x86_avx512_psrlv_q_512:
2541   case Intrinsic::x86_avx512_psrlv_w_128:
2542   case Intrinsic::x86_avx512_psrlv_w_256:
2543   case Intrinsic::x86_avx512_psrlv_w_512:
2544     if (Value *V = simplifyX86varShift(II, IC.Builder)) {
2545       return IC.replaceInstUsesWith(II, V);
2546     }
2547     break;
2548 
2549   case Intrinsic::x86_sse2_packssdw_128:
2550   case Intrinsic::x86_sse2_packsswb_128:
2551   case Intrinsic::x86_avx2_packssdw:
2552   case Intrinsic::x86_avx2_packsswb:
2553   case Intrinsic::x86_avx512_packssdw_512:
2554   case Intrinsic::x86_avx512_packsswb_512:
2555     if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
2556       return IC.replaceInstUsesWith(II, V);
2557     }
2558     break;
2559 
2560   case Intrinsic::x86_sse2_packuswb_128:
2561   case Intrinsic::x86_sse41_packusdw:
2562   case Intrinsic::x86_avx2_packusdw:
2563   case Intrinsic::x86_avx2_packuswb:
2564   case Intrinsic::x86_avx512_packusdw_512:
2565   case Intrinsic::x86_avx512_packuswb_512:
2566     if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
2567       return IC.replaceInstUsesWith(II, V);
2568     }
2569     break;
2570 
2571   case Intrinsic::x86_sse2_pmadd_wd:
2572   case Intrinsic::x86_avx2_pmadd_wd:
2573   case Intrinsic::x86_avx512_pmaddw_d_512:
2574     if (Value *V = simplifyX86pmadd(II, IC.Builder, true)) {
2575       return IC.replaceInstUsesWith(II, V);
2576     }
2577     break;
2578 
2579   case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
2580   case Intrinsic::x86_avx2_pmadd_ub_sw:
2581   case Intrinsic::x86_avx512_pmaddubs_w_512:
2582     if (Value *V = simplifyX86pmadd(II, IC.Builder, false)) {
2583       return IC.replaceInstUsesWith(II, V);
2584     }
2585     break;
2586 
2587   case Intrinsic::x86_pclmulqdq:
2588   case Intrinsic::x86_pclmulqdq_256:
2589   case Intrinsic::x86_pclmulqdq_512: {
2590     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
2591       unsigned Imm = C->getZExtValue();
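           // Immediate bit 0 selects which 64-bit half of each Arg0 pair is
           // multiplied and bit 4 selects the half of Arg1; e.g. a
           // hypothetical Imm of 0x11 uses the high halves, so only the odd
           // elements of both inputs are demanded.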
2592 
2593       bool MadeChange = false;
2594       Value *Arg0 = II.getArgOperand(0);
2595       Value *Arg1 = II.getArgOperand(1);
2596       unsigned VWidth =
2597           cast<FixedVectorType>(Arg0->getType())->getNumElements();
2598 
2599       APInt UndefElts1(VWidth, 0);
2600       APInt DemandedElts1 =
2601           APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
2602       if (Value *V =
2603               IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
2604         IC.replaceOperand(II, 0, V);
2605         MadeChange = true;
2606       }
2607 
2608       APInt UndefElts2(VWidth, 0);
2609       APInt DemandedElts2 =
2610           APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
2611       if (Value *V =
2612               IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
2613         IC.replaceOperand(II, 1, V);
2614         MadeChange = true;
2615       }
2616 
2617       // If either input's demanded elements are all undef, the result is zero.
2618       if (DemandedElts1.isSubsetOf(UndefElts1) ||
2619           DemandedElts2.isSubsetOf(UndefElts2)) {
2620         return IC.replaceInstUsesWith(II,
2621                                       ConstantAggregateZero::get(II.getType()));
2622       }
2623 
2624       if (MadeChange) {
2625         return &II;
2626       }
2627     }
2628     break;
2629   }
2630 
2631   case Intrinsic::x86_sse41_insertps:
2632     if (Value *V = simplifyX86insertps(II, IC.Builder)) {
2633       return IC.replaceInstUsesWith(II, V);
2634     }
2635     break;
2636 
2637   case Intrinsic::x86_sse4a_extrq: {
2638     Value *Op0 = II.getArgOperand(0);
2639     Value *Op1 = II.getArgOperand(1);
2640     unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
2641     unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
2642     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2643            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
2644            VWidth1 == 16 && "Unexpected operand sizes");
2645 
2646     // See if we're dealing with constant values.
2647     auto *C1 = dyn_cast<Constant>(Op1);
2648     auto *CILength =
2649         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
2650            : nullptr;
2651     auto *CIIndex =
2652         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
2653            : nullptr;
2654 
2655     // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
2656     if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
2657       return IC.replaceInstUsesWith(II, V);
2658     }
2659 
2660     // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
2661     // operand and the lowest 16-bits of the second.
2662     bool MadeChange = false;
2663     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
2664       IC.replaceOperand(II, 0, V);
2665       MadeChange = true;
2666     }
2667     if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
2668       IC.replaceOperand(II, 1, V);
2669       MadeChange = true;
2670     }
2671     if (MadeChange) {
2672       return &II;
2673     }
2674     break;
2675   }
2676 
2677   case Intrinsic::x86_sse4a_extrqi: {
2678     // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
2679     // bits of the lower 64-bits. The upper 64-bits are undefined.
2680     Value *Op0 = II.getArgOperand(0);
2681     unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
2682     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
2683            "Unexpected operand size");
2684 
2685     // See if we're dealing with constant values.
2686     auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
2687     auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
2688 
2689     // Attempt to simplify to a constant or shuffle vector.
2690     if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
2691       return IC.replaceInstUsesWith(II, V);
2692     }
2693 
2694     // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
2695     // operand.
2696     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
2697       return IC.replaceOperand(II, 0, V);
2698     }
2699     break;
2700   }
2701 
2702   case Intrinsic::x86_sse4a_insertq: {
2703     Value *Op0 = II.getArgOperand(0);
2704     Value *Op1 = II.getArgOperand(1);
2705     unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
2706     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2707            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
2708            cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
2709            "Unexpected operand size");
2710 
2711     // See if we're dealing with constant values.
2712     auto *C1 = dyn_cast<Constant>(Op1);
2713     auto *CI11 =
2714         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
2715            : nullptr;
2716 
2717     // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
2718     if (CI11) {
2719       const APInt &V11 = CI11->getValue();
2720       APInt Len = V11.zextOrTrunc(6);
2721       APInt Idx = V11.lshr(8).zextOrTrunc(6);
2722       if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
2723         return IC.replaceInstUsesWith(II, V);
2724       }
2725     }
2726 
2727     // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
2728     // operand.
2729     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
2730       return IC.replaceOperand(II, 0, V);
2731     }
2732     break;
2733   }
2734 
2735   case Intrinsic::x86_sse4a_insertqi: {
2736     // INSERTQI: Extract lowest Length bits from lower half of second source and
2737     // insert over first source starting at Index bit. The upper 64-bits are
2738     // undefined.
2739     Value *Op0 = II.getArgOperand(0);
2740     Value *Op1 = II.getArgOperand(1);
2741     unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
2742     unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
2743     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2744            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
2745            VWidth1 == 2 && "Unexpected operand sizes");
2746 
2747     // See if we're dealing with constant values.
2748     auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
2749     auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
2750 
2751     // Attempt to simplify to a constant or shuffle vector.
2752     if (CILength && CIIndex) {
2753       APInt Len = CILength->getValue().zextOrTrunc(6);
2754       APInt Idx = CIIndex->getValue().zextOrTrunc(6);
2755       if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
2756         return IC.replaceInstUsesWith(II, V);
2757       }
2758     }
2759 
2760     // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
2761     // operands.
2762     bool MadeChange = false;
2763     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
2764       IC.replaceOperand(II, 0, V);
2765       MadeChange = true;
2766     }
2767     if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
2768       IC.replaceOperand(II, 1, V);
2769       MadeChange = true;
2770     }
2771     if (MadeChange) {
2772       return &II;
2773     }
2774     break;
2775   }
2776 
2777   case Intrinsic::x86_sse41_pblendvb:
2778   case Intrinsic::x86_sse41_blendvps:
2779   case Intrinsic::x86_sse41_blendvpd:
2780   case Intrinsic::x86_avx_blendv_ps_256:
2781   case Intrinsic::x86_avx_blendv_pd_256:
2782   case Intrinsic::x86_avx2_pblendvb: {
2783     // fold (blend A, A, Mask) -> A
2784     Value *Op0 = II.getArgOperand(0);
2785     Value *Op1 = II.getArgOperand(1);
2786     Value *Mask = II.getArgOperand(2);
2787     if (Op0 == Op1) {
2788       return IC.replaceInstUsesWith(II, Op0);
2789     }
2790 
2791     // Zero Mask - select 1st argument.
2792     if (isa<ConstantAggregateZero>(Mask)) {
2793       return IC.replaceInstUsesWith(II, Op0);
2794     }
2795 
2796     // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
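         // For example, a hypothetical <4 x i32> mask of <-1, 0, -1, 0>
         // becomes a select with condition <1, 0, 1, 0>, taking Op1 in
         // lanes 0 and 2 and Op0 elsewhere.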
2797     if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
2798       Constant *NewSelector =
2799           getNegativeIsTrueBoolVec(ConstantMask, IC.getDataLayout());
2800       return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
2801     }
2802 
2803     // Peek through a one-use shuffle - VectorCombine should have simplified
2804     // this for cases where we're splitting wider vectors to use blendv
2805     // intrinsics.
2806     Value *MaskSrc = nullptr;
2807     ArrayRef<int> ShuffleMask;
2808     if (match(Mask, PatternMatch::m_OneUse(PatternMatch::m_Shuffle(
2809                         PatternMatch::m_Value(MaskSrc), PatternMatch::m_Undef(),
2810                         PatternMatch::m_Mask(ShuffleMask))))) {
2811       // Bail if the shuffle is irregular or contains undefs.
2812       int NumElts = cast<FixedVectorType>(MaskSrc->getType())->getNumElements();
2813       if (NumElts < (int)ShuffleMask.size() || !isPowerOf2_32(NumElts) ||
2814           any_of(ShuffleMask,
2815                  [NumElts](int M) { return M < 0 || M >= NumElts; }))
2816         break;
2817       Mask = MaskSrc;
2818     }
2819 
2820     // Convert to a vector select if we can bypass casts and find a boolean
2821     // vector condition value.
2822     Value *BoolVec;
2823     Mask = InstCombiner::peekThroughBitcast(Mask);
2824     if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
2825         BoolVec->getType()->isVectorTy() &&
2826         BoolVec->getType()->getScalarSizeInBits() == 1) {
2827       auto *MaskTy = cast<FixedVectorType>(Mask->getType());
2828       auto *OpTy = cast<FixedVectorType>(II.getType());
2829       unsigned NumMaskElts = MaskTy->getNumElements();
2830       unsigned NumOperandElts = OpTy->getNumElements();
2831 
2832       // If we peeked through a shuffle, reapply the shuffle to the bool vector.
2833       if (MaskSrc) {
2834         unsigned NumMaskSrcElts =
2835             cast<FixedVectorType>(MaskSrc->getType())->getNumElements();
2836         NumMaskElts = (ShuffleMask.size() * NumMaskElts) / NumMaskSrcElts;
2837         // Multiple mask bits map to the same operand element - bail out.
2838         if (NumMaskElts > NumOperandElts)
2839           break;
2840         SmallVector<int> ScaledMask;
2841         if (!llvm::scaleShuffleMaskElts(NumMaskElts, ShuffleMask, ScaledMask))
2842           break;
2843         BoolVec = IC.Builder.CreateShuffleVector(BoolVec, ScaledMask);
2844         MaskTy = FixedVectorType::get(MaskTy->getElementType(), NumMaskElts);
2845       }
2846       assert(MaskTy->getPrimitiveSizeInBits() ==
2847                  OpTy->getPrimitiveSizeInBits() &&
2848              "Not expecting mask and operands with different sizes");
2849 
2850       if (NumMaskElts == NumOperandElts) {
2851         return SelectInst::Create(BoolVec, Op1, Op0);
2852       }
2853 
2854       // If the mask has fewer elements than the operands, each mask bit maps to
2855       // multiple elements of the operands. Bitcast back and forth.
2856       if (NumMaskElts < NumOperandElts) {
2857         Value *CastOp0 = IC.Builder.CreateBitCast(Op0, MaskTy);
2858         Value *CastOp1 = IC.Builder.CreateBitCast(Op1, MaskTy);
2859         Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
2860         return new BitCastInst(Sel, II.getType());
2861       }
2862     }
2863 
2864     break;
2865   }
2866 
2867   case Intrinsic::x86_ssse3_pshuf_b_128:
2868   case Intrinsic::x86_avx2_pshuf_b:
2869   case Intrinsic::x86_avx512_pshuf_b_512:
2870     if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
2871       return IC.replaceInstUsesWith(II, V);
2872     }
2873     break;
2874 
2875   case Intrinsic::x86_avx_vpermilvar_ps:
2876   case Intrinsic::x86_avx_vpermilvar_ps_256:
2877   case Intrinsic::x86_avx512_vpermilvar_ps_512:
2878   case Intrinsic::x86_avx_vpermilvar_pd:
2879   case Intrinsic::x86_avx_vpermilvar_pd_256:
2880   case Intrinsic::x86_avx512_vpermilvar_pd_512:
2881     if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
2882       return IC.replaceInstUsesWith(II, V);
2883     }
2884     break;
2885 
2886   case Intrinsic::x86_avx2_permd:
2887   case Intrinsic::x86_avx2_permps:
2888   case Intrinsic::x86_avx512_permvar_df_256:
2889   case Intrinsic::x86_avx512_permvar_df_512:
2890   case Intrinsic::x86_avx512_permvar_di_256:
2891   case Intrinsic::x86_avx512_permvar_di_512:
2892   case Intrinsic::x86_avx512_permvar_hi_128:
2893   case Intrinsic::x86_avx512_permvar_hi_256:
2894   case Intrinsic::x86_avx512_permvar_hi_512:
2895   case Intrinsic::x86_avx512_permvar_qi_128:
2896   case Intrinsic::x86_avx512_permvar_qi_256:
2897   case Intrinsic::x86_avx512_permvar_qi_512:
2898   case Intrinsic::x86_avx512_permvar_sf_512:
2899   case Intrinsic::x86_avx512_permvar_si_512:
2900     if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
2901       return IC.replaceInstUsesWith(II, V);
2902     }
2903     break;
2904 
2905   case Intrinsic::x86_avx512_vpermi2var_d_128:
2906   case Intrinsic::x86_avx512_vpermi2var_d_256:
2907   case Intrinsic::x86_avx512_vpermi2var_d_512:
2908   case Intrinsic::x86_avx512_vpermi2var_hi_128:
2909   case Intrinsic::x86_avx512_vpermi2var_hi_256:
2910   case Intrinsic::x86_avx512_vpermi2var_hi_512:
2911   case Intrinsic::x86_avx512_vpermi2var_pd_128:
2912   case Intrinsic::x86_avx512_vpermi2var_pd_256:
2913   case Intrinsic::x86_avx512_vpermi2var_pd_512:
2914   case Intrinsic::x86_avx512_vpermi2var_ps_128:
2915   case Intrinsic::x86_avx512_vpermi2var_ps_256:
2916   case Intrinsic::x86_avx512_vpermi2var_ps_512:
2917   case Intrinsic::x86_avx512_vpermi2var_q_128:
2918   case Intrinsic::x86_avx512_vpermi2var_q_256:
2919   case Intrinsic::x86_avx512_vpermi2var_q_512:
2920   case Intrinsic::x86_avx512_vpermi2var_qi_128:
2921   case Intrinsic::x86_avx512_vpermi2var_qi_256:
2922   case Intrinsic::x86_avx512_vpermi2var_qi_512:
2923     if (Value *V = simplifyX86vpermv3(II, IC.Builder)) {
2924       return IC.replaceInstUsesWith(II, V);
2925     }
2926     break;
2927 
2928   case Intrinsic::x86_avx_maskload_ps:
2929   case Intrinsic::x86_avx_maskload_pd:
2930   case Intrinsic::x86_avx_maskload_ps_256:
2931   case Intrinsic::x86_avx_maskload_pd_256:
2932   case Intrinsic::x86_avx2_maskload_d:
2933   case Intrinsic::x86_avx2_maskload_q:
2934   case Intrinsic::x86_avx2_maskload_d_256:
2935   case Intrinsic::x86_avx2_maskload_q_256:
2936     if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
2937       return I;
2938     }
2939     break;
2940 
2941   case Intrinsic::x86_sse2_maskmov_dqu:
2942   case Intrinsic::x86_avx_maskstore_ps:
2943   case Intrinsic::x86_avx_maskstore_pd:
2944   case Intrinsic::x86_avx_maskstore_ps_256:
2945   case Intrinsic::x86_avx_maskstore_pd_256:
2946   case Intrinsic::x86_avx2_maskstore_d:
2947   case Intrinsic::x86_avx2_maskstore_q:
2948   case Intrinsic::x86_avx2_maskstore_d_256:
2949   case Intrinsic::x86_avx2_maskstore_q_256:
2950     if (simplifyX86MaskedStore(II, IC)) {
2951       return nullptr;
2952     }
2953     break;
2954 
2955   case Intrinsic::x86_addcarry_32:
2956   case Intrinsic::x86_addcarry_64:
2957     if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
2958       return IC.replaceInstUsesWith(II, V);
2959     }
2960     break;
2961 
2962   case Intrinsic::x86_avx512_pternlog_d_128:
2963   case Intrinsic::x86_avx512_pternlog_d_256:
2964   case Intrinsic::x86_avx512_pternlog_d_512:
2965   case Intrinsic::x86_avx512_pternlog_q_128:
2966   case Intrinsic::x86_avx512_pternlog_q_256:
2967   case Intrinsic::x86_avx512_pternlog_q_512:
2968     if (Value *V = simplifyTernarylogic(II, IC.Builder)) {
2969       return IC.replaceInstUsesWith(II, V);
2970     }
2971     break;
2972   default:
2973     break;
2974   }
2975   return std::nullopt;
2976 }
2977 
2978 std::optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
2979     InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
2980     bool &KnownBitsComputed) const {
2981   switch (II.getIntrinsicID()) {
2982   default:
2983     break;
2984   case Intrinsic::x86_mmx_pmovmskb:
2985   case Intrinsic::x86_sse_movmsk_ps:
2986   case Intrinsic::x86_sse2_movmsk_pd:
2987   case Intrinsic::x86_sse2_pmovmskb_128:
2988   case Intrinsic::x86_avx_movmsk_ps_256:
2989   case Intrinsic::x86_avx_movmsk_pd_256:
2990   case Intrinsic::x86_avx2_pmovmskb: {
2991     // MOVMSK copies the vector elements' sign bits to the low bits
2992     // and zeros the high bits.
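         // For example, MOVMSKPS on a <4 x float> produces a 4-bit mask in an
         // i32, so only the low ArgWidth result bits can ever be nonzero; if
         // none of them are demanded the whole call folds to zero below.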
2993     unsigned ArgWidth;
2994     if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
2995       ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
2996     } else {
2997       auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
2998       ArgWidth = ArgType->getNumElements();
2999     }
3000 
3001     // If we don't need any of the low bits then return zero;
3002     // we know that DemandedMask is non-zero already.
3003     APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
3004     Type *VTy = II.getType();
3005     if (DemandedElts.isZero()) {
3006       return ConstantInt::getNullValue(VTy);
3007     }
3008 
3009     // We know that the upper bits are set to zero.
3010     Known.Zero.setBitsFrom(ArgWidth);
3011     KnownBitsComputed = true;
3012     break;
3013   }
3014   }
3015   return std::nullopt;
3016 }
3017 
3018 std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
3019     InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
3020     APInt &UndefElts2, APInt &UndefElts3,
3021     std::function<void(Instruction *, unsigned, APInt, APInt &)>
3022         simplifyAndSetOp) const {
3023   unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
3024   switch (II.getIntrinsicID()) {
3025   default:
3026     break;
3027   case Intrinsic::x86_xop_vfrcz_ss:
3028   case Intrinsic::x86_xop_vfrcz_sd:
3029     // The instructions for these intrinsics are specified to zero the upper
3030     // bits, not pass them through like other scalar intrinsics. So we
3031     // shouldn't just use Arg0 if DemandedElts[0] is clear like we do for
3032     // other intrinsics. Instead we should return a zero vector.
3033     if (!DemandedElts[0]) {
3034       IC.addToWorklist(&II);
3035       return ConstantAggregateZero::get(II.getType());
3036     }
3037 
3038     // Only the lower element is used.
3039     DemandedElts = 1;
3040     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3041 
3042     // Only the lower element is undefined. The high elements are zero.
3043     UndefElts = UndefElts[0];
3044     break;
3045 
3046   // Unary scalar-as-vector operations that work column-wise.
3047   case Intrinsic::x86_sse_rcp_ss:
3048   case Intrinsic::x86_sse_rsqrt_ss:
3049     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3050 
3051     // If lowest element of a scalar op isn't used then use Arg0.
3052     if (!DemandedElts[0]) {
3053       IC.addToWorklist(&II);
3054       return II.getArgOperand(0);
3055     }
3056     // TODO: If only the low element is demanded, lower SQRT to FSQRT (with
3057     // rounding/exception checks).
3058     break;
3059 
3060   // Binary scalar-as-vector operations that work column-wise. The high
3061   // elements come from operand 0. The low element is a function of both
3062   // operands.
3063   case Intrinsic::x86_sse_min_ss:
3064   case Intrinsic::x86_sse_max_ss:
3065   case Intrinsic::x86_sse_cmp_ss:
3066   case Intrinsic::x86_sse2_min_sd:
3067   case Intrinsic::x86_sse2_max_sd:
3068   case Intrinsic::x86_sse2_cmp_sd: {
3069     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3070 
3071     // If lowest element of a scalar op isn't used then use Arg0.
3072     if (!DemandedElts[0]) {
3073       IC.addToWorklist(&II);
3074       return II.getArgOperand(0);
3075     }
3076 
3077     // Only lower element is used for operand 1.
3078     DemandedElts = 1;
3079     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3080 
3081     // Lower element is undefined if both lower elements are undefined.
3082     // Consider things like undef&0.  The result is known zero, not undef.
3083     if (!UndefElts2[0])
3084       UndefElts.clearBit(0);
3085 
3086     break;
3087   }
3088 
3089   // Binary scalar-as-vector operations that work column-wise. The high
3090   // elements come from operand 0 and the low element comes from operand 1.
3091   case Intrinsic::x86_sse41_round_ss:
3092   case Intrinsic::x86_sse41_round_sd: {
3093     // Don't use the low element of operand 0.
3094     APInt DemandedElts2 = DemandedElts;
3095     DemandedElts2.clearBit(0);
3096     simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
3097 
3098     // If lowest element of a scalar op isn't used then use Arg0.
3099     if (!DemandedElts[0]) {
3100       IC.addToWorklist(&II);
3101       return II.getArgOperand(0);
3102     }
3103 
3104     // Only lower element is used for operand 1.
3105     DemandedElts = 1;
3106     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3107 
3108     // Take the high undef elements from operand 0 and take the lower element
3109     // from operand 1.
3110     UndefElts.clearBit(0);
3111     UndefElts |= UndefElts2[0];
3112     break;
3113   }
3114 
3115   // Three input scalar-as-vector operations that work column-wise. The high
3116   // elements come from operand 0 and the low element is a function of all
3117   // three inputs.
3118   case Intrinsic::x86_avx512_mask_add_ss_round:
3119   case Intrinsic::x86_avx512_mask_div_ss_round:
3120   case Intrinsic::x86_avx512_mask_mul_ss_round:
3121   case Intrinsic::x86_avx512_mask_sub_ss_round:
3122   case Intrinsic::x86_avx512_mask_max_ss_round:
3123   case Intrinsic::x86_avx512_mask_min_ss_round:
3124   case Intrinsic::x86_avx512_mask_add_sd_round:
3125   case Intrinsic::x86_avx512_mask_div_sd_round:
3126   case Intrinsic::x86_avx512_mask_mul_sd_round:
3127   case Intrinsic::x86_avx512_mask_sub_sd_round:
3128   case Intrinsic::x86_avx512_mask_max_sd_round:
3129   case Intrinsic::x86_avx512_mask_min_sd_round:
3130     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3131 
3132     // If the lowest element of a scalar op isn't used then use Arg0.
3133     if (!DemandedElts[0]) {
3134       IC.addToWorklist(&II);
3135       return II.getArgOperand(0);
3136     }
3137 
3138     // Only the lower elements of operands 1 and 2 are used.
3139     DemandedElts = 1;
3140     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3141     simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);
3142 
3143     // Lower element is undefined if all three lower elements are undefined.
3144     // Consider things like undef&0.  The result is known zero, not undef.
3145     if (!UndefElts2[0] || !UndefElts3[0])
3146       UndefElts.clearBit(0);
3147     break;
3148 
3149   // TODO: Add fmaddsub support?
3150   case Intrinsic::x86_sse3_addsub_pd:
3151   case Intrinsic::x86_sse3_addsub_ps:
3152   case Intrinsic::x86_avx_addsub_pd_256:
3153   case Intrinsic::x86_avx_addsub_ps_256: {
3154     // If none of the even or none of the odd lanes are required, turn this
3155     // into a generic FP math instruction.
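         // e.g. addsub_ps(A, B) computes { A[0]-B[0], A[1]+B[1], A[2]-B[2], A[3]+B[3] },
         // so demanding only the even lanes leaves an fsub and only the odd lanes an fadd.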
3156     APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
3157     APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
3158     bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
3159     bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
3160     if (IsSubOnly || IsAddOnly) {
3161       assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
3162       IRBuilderBase::InsertPointGuard Guard(IC.Builder);
3163       IC.Builder.SetInsertPoint(&II);
3164       Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
3165       return IC.Builder.CreateBinOp(
3166           IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
3167     }
3168 
3169     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3170     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3171     UndefElts &= UndefElts2;
3172     break;
3173   }
3174 
3175   // General per-element vector operations.
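       // e.g. psllv.d shifts each element of operand 0 by the corresponding element
       // of operand 1, so the result's demanded elements map 1:1 onto both operands.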
3176   case Intrinsic::x86_avx2_psllv_d:
3177   case Intrinsic::x86_avx2_psllv_d_256:
3178   case Intrinsic::x86_avx2_psllv_q:
3179   case Intrinsic::x86_avx2_psllv_q_256:
3180   case Intrinsic::x86_avx2_psrlv_d:
3181   case Intrinsic::x86_avx2_psrlv_d_256:
3182   case Intrinsic::x86_avx2_psrlv_q:
3183   case Intrinsic::x86_avx2_psrlv_q_256:
3184   case Intrinsic::x86_avx2_psrav_d:
3185   case Intrinsic::x86_avx2_psrav_d_256: {
3186     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3187     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3188     UndefElts &= UndefElts2;
3189     break;
3190   }
3191 
3192   case Intrinsic::x86_sse2_pmulh_w:
3193   case Intrinsic::x86_avx2_pmulh_w:
3194   case Intrinsic::x86_avx512_pmulh_w_512:
3195   case Intrinsic::x86_sse2_pmulhu_w:
3196   case Intrinsic::x86_avx2_pmulhu_w:
3197   case Intrinsic::x86_avx512_pmulhu_w_512:
3198   case Intrinsic::x86_ssse3_pmul_hr_sw_128:
3199   case Intrinsic::x86_avx2_pmul_hr_sw:
3200   case Intrinsic::x86_avx512_pmul_hr_sw_512: {
3201     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3202     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3203     // NOTE: mulh(undef,undef) != undef.
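         // (e.g. for pmulhu.w the high half of a 16x16 product can never be 0xFFFF,
         // so multiplying two undefs does not yield a fully arbitrary value).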
3204     break;
3205   }
3206 
3207   case Intrinsic::x86_sse2_packssdw_128:
3208   case Intrinsic::x86_sse2_packsswb_128:
3209   case Intrinsic::x86_sse2_packuswb_128:
3210   case Intrinsic::x86_sse41_packusdw:
3211   case Intrinsic::x86_avx2_packssdw:
3212   case Intrinsic::x86_avx2_packsswb:
3213   case Intrinsic::x86_avx2_packusdw:
3214   case Intrinsic::x86_avx2_packuswb:
3215   case Intrinsic::x86_avx512_packssdw_512:
3216   case Intrinsic::x86_avx512_packsswb_512:
3217   case Intrinsic::x86_avx512_packusdw_512:
3218   case Intrinsic::x86_avx512_packuswb_512: {
3219     auto *Ty0 = II.getArgOperand(0)->getType();
3220     unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
3221     assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
3222 
3223     unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
3224     unsigned VWidthPerLane = VWidth / NumLanes;
3225     unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
3226 
3227     // Per lane, pack the elements of the first input and then the second.
3228     // e.g.
3229     // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
3230     // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
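         // Result element LaneIdx + Elt + InnerVWidthPerLane * OpNum is produced
         // from element Lane * InnerVWidthPerLane + Elt of operand OpNum.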
3231     for (int OpNum = 0; OpNum != 2; ++OpNum) {
3232       APInt OpDemandedElts(InnerVWidth, 0);
3233       for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3234         unsigned LaneIdx = Lane * VWidthPerLane;
3235         for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
3236           unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
3237           if (DemandedElts[Idx])
3238             OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
3239         }
3240       }
3241 
3242       // Demand elements from the operand.
3243       APInt OpUndefElts(InnerVWidth, 0);
3244       simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);
3245 
3246       // Pack the operand's UNDEF elements, one lane at a time.
3247       OpUndefElts = OpUndefElts.zext(VWidth);
3248       for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3249         APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
3250         LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
3251         LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
3252         UndefElts |= LaneElts;
3253       }
3254     }
3255     break;
3256   }
3257 
3258   case Intrinsic::x86_sse2_pmadd_wd:
3259   case Intrinsic::x86_avx2_pmadd_wd:
3260   case Intrinsic::x86_avx512_pmaddw_d_512:
3261   case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
3262   case Intrinsic::x86_avx2_pmadd_ub_sw:
3263   case Intrinsic::x86_avx512_pmaddubs_w_512: {
3264     // PMADD - demand both src elements that map to each dst element.
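         // e.g. for pmadd.wd, result element I is computed from source elements 2*I
         // and 2*I+1 of each operand, so the demanded mask is widened accordingly.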
3265     auto *ArgTy = II.getArgOperand(0)->getType();
3266     unsigned InnerVWidth = cast<FixedVectorType>(ArgTy)->getNumElements();
3267     assert((VWidth * 2) == InnerVWidth && "Unexpected input size");
3268     APInt OpDemandedElts = APIntOps::ScaleBitMask(DemandedElts, InnerVWidth);
3269     APInt Op0UndefElts(InnerVWidth, 0);
3270     APInt Op1UndefElts(InnerVWidth, 0);
3271     simplifyAndSetOp(&II, 0, OpDemandedElts, Op0UndefElts);
3272     simplifyAndSetOp(&II, 1, OpDemandedElts, Op1UndefElts);
3273     // NOTE: madd(undef,undef) != undef.
3274     break;
3275   }
3276 
3277   // PSHUFB
3278   case Intrinsic::x86_ssse3_pshuf_b_128:
3279   case Intrinsic::x86_avx2_pshuf_b:
3280   case Intrinsic::x86_avx512_pshuf_b_512:
3281   // PERMILVAR
3282   case Intrinsic::x86_avx_vpermilvar_ps:
3283   case Intrinsic::x86_avx_vpermilvar_ps_256:
3284   case Intrinsic::x86_avx512_vpermilvar_ps_512:
3285   case Intrinsic::x86_avx_vpermilvar_pd:
3286   case Intrinsic::x86_avx_vpermilvar_pd_256:
3287   case Intrinsic::x86_avx512_vpermilvar_pd_512:
3288   // PERMV
3289   case Intrinsic::x86_avx2_permd:
3290   case Intrinsic::x86_avx2_permps: {
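         // Each result element is selected by the corresponding element of the
         // per-element shuffle control (operand 1), so only operand 1's demanded
         // elements follow the result; operand 0 may contribute any of its elements.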
3291     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
3292     break;
3293   }
3294 
3295   // SSE4A instructions leave the upper 64 bits of the 128-bit result
3296   // in an undefined state.
3297   case Intrinsic::x86_sse4a_extrq:
3298   case Intrinsic::x86_sse4a_extrqi:
3299   case Intrinsic::x86_sse4a_insertq:
3300   case Intrinsic::x86_sse4a_insertqi:
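         // Mark the upper half of the result as undef regardless of which elements
         // were demanded; only the low 64 bits are defined by these instructions.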
3301     UndefElts.setHighBits(VWidth / 2);
3302     break;
3303   }
3304   return std::nullopt;
3305 }
3306