xref: /llvm-project/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp (revision 416f1c465db62d829283f6902ef35e027e127aa7)
1 //===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements a TargetTransformInfo analysis pass specific to the
10 /// X86 target machine. It uses the target's detailed information to provide
11 /// more precise answers to certain TTI queries, while letting the target
12 /// independent and default TTI implementations handle the rest.
13 ///
14 //===----------------------------------------------------------------------===//
15 
16 #include "X86TargetTransformInfo.h"
17 #include "llvm/IR/IntrinsicInst.h"
18 #include "llvm/IR/IntrinsicsX86.h"
19 #include "llvm/Support/KnownBits.h"
20 #include "llvm/Transforms/InstCombine/InstCombiner.h"
21 #include <optional>
22 
23 using namespace llvm;
24 using namespace llvm::PatternMatch;
25 
26 #define DEBUG_TYPE "x86tti"
27 
28 /// Return a constant boolean vector that has true elements in all positions
29 /// where the input constant data vector has an element with the sign bit set.
30 static Constant *getNegativeIsTrueBoolVec(Constant *V, const DataLayout &DL) {
31   VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
32   V = ConstantExpr::getBitCast(V, IntTy);
33   V = ConstantFoldCompareInstOperands(CmpInst::ICMP_SGT,
34                                       Constant::getNullValue(IntTy), V, DL);
35   assert(V && "Vector must be foldable");
36   return V;
37 }
38 
39 /// Convert the x86 XMM integer vector mask to a vector of bools based on
40 /// each element's most significant bit (the sign bit).
41 static Value *getBoolVecFromMask(Value *Mask, const DataLayout &DL) {
42   // Fold Constant Mask.
43   if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
44     return getNegativeIsTrueBoolVec(ConstantMask, DL);
45 
46   // Mask was extended from a boolean vector.
47   Value *ExtMask;
48   if (match(Mask, m_SExt(m_Value(ExtMask))) &&
49       ExtMask->getType()->isIntOrIntVectorTy(1))
50     return ExtMask;
51 
52   return nullptr;
53 }
54 
55 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
56 // XMM register mask efficiently, we could transform all x86 masked intrinsics
57 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
58 static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
59   Value *Ptr = II.getOperand(0);
60   Value *Mask = II.getOperand(1);
61   Constant *ZeroVec = Constant::getNullValue(II.getType());
62 
63   // Zero Mask - masked load instruction creates a zero vector.
64   if (isa<ConstantAggregateZero>(Mask))
65     return IC.replaceInstUsesWith(II, ZeroVec);
66 
67   // The mask is constant or extended from a bool vector. Convert this x86
68   // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
69   if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
70     // The pass-through vector for an x86 masked load is a zero vector.
71     CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
72         II.getType(), Ptr, Align(1), BoolMask, ZeroVec);
73     return IC.replaceInstUsesWith(II, NewMaskedLoad);
74   }
75 
76   return nullptr;
77 }
78 
79 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
80 // XMM register mask efficiently, we could transform all x86 masked intrinsics
81 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
82 static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
83   Value *Ptr = II.getOperand(0);
84   Value *Mask = II.getOperand(1);
85   Value *Vec = II.getOperand(2);
86 
87   // Zero Mask - this masked store instruction does nothing.
88   if (isa<ConstantAggregateZero>(Mask)) {
89     IC.eraseInstFromFunction(II);
90     return true;
91   }
92 
93   // The SSE2 version is too weird (eg, unaligned but non-temporal) to do
94   // anything else at this level.
95   if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
96     return false;
97 
98   // The mask is constant or extended from a bool vector. Convert this x86
99   // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
100   if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
101     unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
102     PointerType *VecPtrTy = PointerType::get(Vec->getContext(), AddrSpace);
103     Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
104 
105     IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);
106 
107     // 'Replace uses' doesn't work for stores. Erase the original masked store.
108     IC.eraseInstFromFunction(II);
109     return true;
110   }
111 
112   return false;
113 }
114 
115 static Value *simplifyX86immShift(const IntrinsicInst &II,
116                                   InstCombiner::BuilderTy &Builder) {
117   bool LogicalShift = false;
118   bool ShiftLeft = false;
119   bool IsImm = false;
120 
121   switch (II.getIntrinsicID()) {
122   default:
123     llvm_unreachable("Unexpected intrinsic!");
124   case Intrinsic::x86_sse2_psrai_d:
125   case Intrinsic::x86_sse2_psrai_w:
126   case Intrinsic::x86_avx2_psrai_d:
127   case Intrinsic::x86_avx2_psrai_w:
128   case Intrinsic::x86_avx512_psrai_q_128:
129   case Intrinsic::x86_avx512_psrai_q_256:
130   case Intrinsic::x86_avx512_psrai_d_512:
131   case Intrinsic::x86_avx512_psrai_q_512:
132   case Intrinsic::x86_avx512_psrai_w_512:
133     IsImm = true;
134     [[fallthrough]];
135   case Intrinsic::x86_sse2_psra_d:
136   case Intrinsic::x86_sse2_psra_w:
137   case Intrinsic::x86_avx2_psra_d:
138   case Intrinsic::x86_avx2_psra_w:
139   case Intrinsic::x86_avx512_psra_q_128:
140   case Intrinsic::x86_avx512_psra_q_256:
141   case Intrinsic::x86_avx512_psra_d_512:
142   case Intrinsic::x86_avx512_psra_q_512:
143   case Intrinsic::x86_avx512_psra_w_512:
144     LogicalShift = false;
145     ShiftLeft = false;
146     break;
147   case Intrinsic::x86_sse2_psrli_d:
148   case Intrinsic::x86_sse2_psrli_q:
149   case Intrinsic::x86_sse2_psrli_w:
150   case Intrinsic::x86_avx2_psrli_d:
151   case Intrinsic::x86_avx2_psrli_q:
152   case Intrinsic::x86_avx2_psrli_w:
153   case Intrinsic::x86_avx512_psrli_d_512:
154   case Intrinsic::x86_avx512_psrli_q_512:
155   case Intrinsic::x86_avx512_psrli_w_512:
156     IsImm = true;
157     [[fallthrough]];
158   case Intrinsic::x86_sse2_psrl_d:
159   case Intrinsic::x86_sse2_psrl_q:
160   case Intrinsic::x86_sse2_psrl_w:
161   case Intrinsic::x86_avx2_psrl_d:
162   case Intrinsic::x86_avx2_psrl_q:
163   case Intrinsic::x86_avx2_psrl_w:
164   case Intrinsic::x86_avx512_psrl_d_512:
165   case Intrinsic::x86_avx512_psrl_q_512:
166   case Intrinsic::x86_avx512_psrl_w_512:
167     LogicalShift = true;
168     ShiftLeft = false;
169     break;
170   case Intrinsic::x86_sse2_pslli_d:
171   case Intrinsic::x86_sse2_pslli_q:
172   case Intrinsic::x86_sse2_pslli_w:
173   case Intrinsic::x86_avx2_pslli_d:
174   case Intrinsic::x86_avx2_pslli_q:
175   case Intrinsic::x86_avx2_pslli_w:
176   case Intrinsic::x86_avx512_pslli_d_512:
177   case Intrinsic::x86_avx512_pslli_q_512:
178   case Intrinsic::x86_avx512_pslli_w_512:
179     IsImm = true;
180     [[fallthrough]];
181   case Intrinsic::x86_sse2_psll_d:
182   case Intrinsic::x86_sse2_psll_q:
183   case Intrinsic::x86_sse2_psll_w:
184   case Intrinsic::x86_avx2_psll_d:
185   case Intrinsic::x86_avx2_psll_q:
186   case Intrinsic::x86_avx2_psll_w:
187   case Intrinsic::x86_avx512_psll_d_512:
188   case Intrinsic::x86_avx512_psll_q_512:
189   case Intrinsic::x86_avx512_psll_w_512:
190     LogicalShift = true;
191     ShiftLeft = true;
192     break;
193   }
194   assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
195 
196   Value *Vec = II.getArgOperand(0);
197   Value *Amt = II.getArgOperand(1);
198   auto *VT = cast<FixedVectorType>(Vec->getType());
199   Type *SVT = VT->getElementType();
200   Type *AmtVT = Amt->getType();
201   unsigned VWidth = VT->getNumElements();
202   unsigned BitWidth = SVT->getPrimitiveSizeInBits();
203 
204   // If the shift amount is guaranteed to be in-range we can replace it with a
205   // generic shift. If its guaranteed to be out of range, logical shifts combine
206   // to zero and arithmetic shifts are clamped to (BitWidth - 1).
207   if (IsImm) {
208     assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
209     KnownBits KnownAmtBits =
210         llvm::computeKnownBits(Amt, II.getDataLayout());
211     if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
212       Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
213       Amt = Builder.CreateVectorSplat(VWidth, Amt);
214       return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
215                                         : Builder.CreateLShr(Vec, Amt))
216                            : Builder.CreateAShr(Vec, Amt));
217     }
218     if (KnownAmtBits.getMinValue().uge(BitWidth)) {
219       if (LogicalShift)
220         return ConstantAggregateZero::get(VT);
221       Amt = ConstantInt::get(SVT, BitWidth - 1);
222       return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
223     }
224   } else {
225     // Ensure the first element has an in-range value and the rest of the
226     // elements in the bottom 64 bits are zero.
227     assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
228            cast<VectorType>(AmtVT)->getElementType() == SVT &&
229            "Unexpected shift-by-scalar type");
230     unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
231     APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
232     APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
233     KnownBits KnownLowerBits = llvm::computeKnownBits(
234         Amt, DemandedLower, II.getDataLayout());
235     KnownBits KnownUpperBits = llvm::computeKnownBits(
236         Amt, DemandedUpper, II.getDataLayout());
237     if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
238         (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
239       SmallVector<int, 16> ZeroSplat(VWidth, 0);
240       Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
241       return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
242                                         : Builder.CreateLShr(Vec, Amt))
243                            : Builder.CreateAShr(Vec, Amt));
244     }
245   }
246 
247   // Simplify if count is constant vector.
248   auto *CDV = dyn_cast<ConstantDataVector>(Amt);
249   if (!CDV)
250     return nullptr;
251 
252   // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
253   // operand to compute the shift amount.
254   assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
255          cast<VectorType>(AmtVT)->getElementType() == SVT &&
256          "Unexpected shift-by-scalar type");
257 
258   // Concatenate the sub-elements to create the 64-bit value.
259   APInt Count(64, 0);
260   for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
261     unsigned SubEltIdx = (NumSubElts - 1) - i;
262     auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
263     Count <<= BitWidth;
264     Count |= SubElt->getValue().zextOrTrunc(64);
265   }
266 
267   // If shift-by-zero then just return the original value.
268   if (Count.isZero())
269     return Vec;
270 
271   // Handle cases when Shift >= BitWidth.
272   if (Count.uge(BitWidth)) {
273     // If LogicalShift - just return zero.
274     if (LogicalShift)
275       return ConstantAggregateZero::get(VT);
276 
277     // If ArithmeticShift - clamp Shift to (BitWidth - 1).
278     Count = APInt(64, BitWidth - 1);
279   }
280 
281   // Get a constant vector of the same type as the first operand.
282   auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
283   auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);
284 
285   if (ShiftLeft)
286     return Builder.CreateShl(Vec, ShiftVec);
287 
288   if (LogicalShift)
289     return Builder.CreateLShr(Vec, ShiftVec);
290 
291   return Builder.CreateAShr(Vec, ShiftVec);
292 }
293 
294 // Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
295 // Unlike the generic IR shifts, the intrinsics have defined behaviour for out
296 // of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
297 static Value *simplifyX86varShift(const IntrinsicInst &II,
298                                   InstCombiner::BuilderTy &Builder) {
299   bool LogicalShift = false;
300   bool ShiftLeft = false;
301 
302   switch (II.getIntrinsicID()) {
303   default:
304     llvm_unreachable("Unexpected intrinsic!");
305   case Intrinsic::x86_avx2_psrav_d:
306   case Intrinsic::x86_avx2_psrav_d_256:
307   case Intrinsic::x86_avx512_psrav_q_128:
308   case Intrinsic::x86_avx512_psrav_q_256:
309   case Intrinsic::x86_avx512_psrav_d_512:
310   case Intrinsic::x86_avx512_psrav_q_512:
311   case Intrinsic::x86_avx512_psrav_w_128:
312   case Intrinsic::x86_avx512_psrav_w_256:
313   case Intrinsic::x86_avx512_psrav_w_512:
314     LogicalShift = false;
315     ShiftLeft = false;
316     break;
317   case Intrinsic::x86_avx2_psrlv_d:
318   case Intrinsic::x86_avx2_psrlv_d_256:
319   case Intrinsic::x86_avx2_psrlv_q:
320   case Intrinsic::x86_avx2_psrlv_q_256:
321   case Intrinsic::x86_avx512_psrlv_d_512:
322   case Intrinsic::x86_avx512_psrlv_q_512:
323   case Intrinsic::x86_avx512_psrlv_w_128:
324   case Intrinsic::x86_avx512_psrlv_w_256:
325   case Intrinsic::x86_avx512_psrlv_w_512:
326     LogicalShift = true;
327     ShiftLeft = false;
328     break;
329   case Intrinsic::x86_avx2_psllv_d:
330   case Intrinsic::x86_avx2_psllv_d_256:
331   case Intrinsic::x86_avx2_psllv_q:
332   case Intrinsic::x86_avx2_psllv_q_256:
333   case Intrinsic::x86_avx512_psllv_d_512:
334   case Intrinsic::x86_avx512_psllv_q_512:
335   case Intrinsic::x86_avx512_psllv_w_128:
336   case Intrinsic::x86_avx512_psllv_w_256:
337   case Intrinsic::x86_avx512_psllv_w_512:
338     LogicalShift = true;
339     ShiftLeft = true;
340     break;
341   }
342   assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
343 
344   Value *Vec = II.getArgOperand(0);
345   Value *Amt = II.getArgOperand(1);
346   auto *VT = cast<FixedVectorType>(II.getType());
347   Type *SVT = VT->getElementType();
348   int NumElts = VT->getNumElements();
349   int BitWidth = SVT->getIntegerBitWidth();
350 
351   // If the shift amount is guaranteed to be in-range we can replace it with a
352   // generic shift.
353   KnownBits KnownAmt =
354       llvm::computeKnownBits(Amt, II.getDataLayout());
355   if (KnownAmt.getMaxValue().ult(BitWidth)) {
356     return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
357                                       : Builder.CreateLShr(Vec, Amt))
358                          : Builder.CreateAShr(Vec, Amt));
359   }
360 
361   // Simplify if all shift amounts are constant/undef.
362   auto *CShift = dyn_cast<Constant>(Amt);
363   if (!CShift)
364     return nullptr;
365 
366   // Collect each element's shift amount.
367   // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
368   bool AnyOutOfRange = false;
369   SmallVector<int, 8> ShiftAmts;
370   for (int I = 0; I < NumElts; ++I) {
371     auto *CElt = CShift->getAggregateElement(I);
372     if (isa_and_nonnull<UndefValue>(CElt)) {
373       ShiftAmts.push_back(-1);
374       continue;
375     }
376 
377     auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
378     if (!COp)
379       return nullptr;
380 
381     // Handle out of range shifts.
382     // If LogicalShift - set to BitWidth (special case).
383     // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
384     APInt ShiftVal = COp->getValue();
385     if (ShiftVal.uge(BitWidth)) {
386       AnyOutOfRange = LogicalShift;
387       ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
388       continue;
389     }
390 
391     ShiftAmts.push_back((int)ShiftVal.getZExtValue());
392   }
393 
394   // If all elements out of range or UNDEF, return vector of zeros/undefs.
395   // ArithmeticShift should only hit this if they are all UNDEF.
396   auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
397   if (llvm::all_of(ShiftAmts, OutOfRange)) {
398     SmallVector<Constant *, 8> ConstantVec;
399     for (int Idx : ShiftAmts) {
400       if (Idx < 0) {
401         ConstantVec.push_back(UndefValue::get(SVT));
402       } else {
403         assert(LogicalShift && "Logical shift expected");
404         ConstantVec.push_back(ConstantInt::getNullValue(SVT));
405       }
406     }
407     return ConstantVector::get(ConstantVec);
408   }
409 
410   // We can't handle only some out of range values with generic logical shifts.
411   if (AnyOutOfRange)
412     return nullptr;
413 
414   // Build the shift amount constant vector.
415   SmallVector<Constant *, 8> ShiftVecAmts;
416   for (int Idx : ShiftAmts) {
417     if (Idx < 0)
418       ShiftVecAmts.push_back(UndefValue::get(SVT));
419     else
420       ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
421   }
422   auto ShiftVec = ConstantVector::get(ShiftVecAmts);
423 
424   if (ShiftLeft)
425     return Builder.CreateShl(Vec, ShiftVec);
426 
427   if (LogicalShift)
428     return Builder.CreateLShr(Vec, ShiftVec);
429 
430   return Builder.CreateAShr(Vec, ShiftVec);
431 }
432 
433 static Value *simplifyX86pack(IntrinsicInst &II,
434                               InstCombiner::BuilderTy &Builder, bool IsSigned) {
435   Value *Arg0 = II.getArgOperand(0);
436   Value *Arg1 = II.getArgOperand(1);
437   Type *ResTy = II.getType();
438 
439   // Fast all undef handling.
440   if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
441     return UndefValue::get(ResTy);
442 
443   auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
444   unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
445   unsigned NumSrcElts = ArgTy->getNumElements();
446   assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
447          "Unexpected packing types");
448 
449   unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
450   unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
451   unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
452   assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
453          "Unexpected packing types");
454 
455   // Constant folding.
456   if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
457     return nullptr;
458 
459   // Clamp Values - signed/unsigned both use signed clamp values, but they
460   // differ on the min/max values.
461   APInt MinValue, MaxValue;
462   if (IsSigned) {
463     // PACKSS: Truncate signed value with signed saturation.
464     // Source values less than dst minint are saturated to minint.
465     // Source values greater than dst maxint are saturated to maxint.
466     MinValue =
467         APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
468     MaxValue =
469         APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
470   } else {
471     // PACKUS: Truncate signed value with unsigned saturation.
472     // Source values less than zero are saturated to zero.
473     // Source values greater than dst maxuint are saturated to maxuint.
474     MinValue = APInt::getZero(SrcScalarSizeInBits);
475     MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
476   }
477 
478   auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
479   auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
480   Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
481   Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
482   Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
483   Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
484 
485   // Shuffle clamped args together at the lane level.
486   SmallVector<int, 32> PackMask;
487   for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
488     for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
489       PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
490     for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
491       PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
492   }
493   auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);
494 
495   // Truncate to dst size.
496   return Builder.CreateTrunc(Shuffle, ResTy);
497 }
498 
499 static Value *simplifyX86pmulh(IntrinsicInst &II,
500                                InstCombiner::BuilderTy &Builder, bool IsSigned,
501                                bool IsRounding) {
502   Value *Arg0 = II.getArgOperand(0);
503   Value *Arg1 = II.getArgOperand(1);
504   auto *ResTy = cast<FixedVectorType>(II.getType());
505   auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
506   assert(ArgTy == ResTy && ResTy->getScalarSizeInBits() == 16 &&
507          "Unexpected PMULH types");
508   assert((!IsRounding || IsSigned) && "PMULHRS instruction must be signed");
509 
510   // Multiply by undef -> zero (NOT undef!) as other arg could still be zero.
511   if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))
512     return ConstantAggregateZero::get(ResTy);
513 
514   // Multiply by zero.
515   if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1))
516     return ConstantAggregateZero::get(ResTy);
517 
518   // Multiply by one.
519   if (!IsRounding) {
520     if (match(Arg0, m_One()))
521       return IsSigned ? Builder.CreateAShr(Arg1, 15)
522                       : ConstantAggregateZero::get(ResTy);
523     if (match(Arg1, m_One()))
524       return IsSigned ? Builder.CreateAShr(Arg0, 15)
525                       : ConstantAggregateZero::get(ResTy);
526   }
527 
528   // Constant folding.
529   if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
530     return nullptr;
531 
532   // Extend to twice the width and multiply.
533   auto Cast =
534       IsSigned ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt;
535   auto *ExtTy = FixedVectorType::getExtendedElementVectorType(ArgTy);
536   Value *LHS = Builder.CreateCast(Cast, Arg0, ExtTy);
537   Value *RHS = Builder.CreateCast(Cast, Arg1, ExtTy);
538   Value *Mul = Builder.CreateMul(LHS, RHS);
539 
540   if (IsRounding) {
541     // PMULHRSW: truncate to vXi18 of the most significant bits, add one and
542     // extract bits[16:1].
543     auto *RndEltTy = IntegerType::get(ExtTy->getContext(), 18);
544     auto *RndTy = FixedVectorType::get(RndEltTy, ExtTy);
545     Mul = Builder.CreateLShr(Mul, 14);
546     Mul = Builder.CreateTrunc(Mul, RndTy);
547     Mul = Builder.CreateAdd(Mul, ConstantInt::get(RndTy, 1));
548     Mul = Builder.CreateLShr(Mul, 1);
549   } else {
550     // PMULH/PMULHU: extract the vXi16 most significant bits.
551     Mul = Builder.CreateLShr(Mul, 16);
552   }
553 
554   return Builder.CreateTrunc(Mul, ResTy);
555 }
556 
557 static Value *simplifyX86pmadd(IntrinsicInst &II,
558                                InstCombiner::BuilderTy &Builder,
559                                bool IsPMADDWD) {
560   Value *Arg0 = II.getArgOperand(0);
561   Value *Arg1 = II.getArgOperand(1);
562   auto *ResTy = cast<FixedVectorType>(II.getType());
563   [[maybe_unused]] auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
564 
565   unsigned NumDstElts = ResTy->getNumElements();
566   assert(ArgTy->getNumElements() == (2 * NumDstElts) &&
567          ResTy->getScalarSizeInBits() == (2 * ArgTy->getScalarSizeInBits()) &&
568          "Unexpected PMADD types");
569 
570   // Multiply by undef -> zero (NOT undef!) as other arg could still be zero.
571   if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))
572     return ConstantAggregateZero::get(ResTy);
573 
574   // Multiply by zero.
575   if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1))
576     return ConstantAggregateZero::get(ResTy);
577 
578   // Constant folding.
579   if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
580     return nullptr;
581 
582   // Split Lo/Hi elements pairs, extend and add together.
583   // PMADDWD(X,Y) =
584   // add(mul(sext(lhs[0]),sext(rhs[0])),mul(sext(lhs[1]),sext(rhs[1])))
585   // PMADDUBSW(X,Y) =
586   // sadd_sat(mul(zext(lhs[0]),sext(rhs[0])),mul(zext(lhs[1]),sext(rhs[1])))
587   SmallVector<int> LoMask, HiMask;
588   for (unsigned I = 0; I != NumDstElts; ++I) {
589     LoMask.push_back(2 * I + 0);
590     HiMask.push_back(2 * I + 1);
591   }
592 
593   auto *LHSLo = Builder.CreateShuffleVector(Arg0, LoMask);
594   auto *LHSHi = Builder.CreateShuffleVector(Arg0, HiMask);
595   auto *RHSLo = Builder.CreateShuffleVector(Arg1, LoMask);
596   auto *RHSHi = Builder.CreateShuffleVector(Arg1, HiMask);
597 
598   auto LHSCast =
599       IsPMADDWD ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt;
600   LHSLo = Builder.CreateCast(LHSCast, LHSLo, ResTy);
601   LHSHi = Builder.CreateCast(LHSCast, LHSHi, ResTy);
602   RHSLo = Builder.CreateCast(Instruction::CastOps::SExt, RHSLo, ResTy);
603   RHSHi = Builder.CreateCast(Instruction::CastOps::SExt, RHSHi, ResTy);
604   Value *Lo = Builder.CreateMul(LHSLo, RHSLo);
605   Value *Hi = Builder.CreateMul(LHSHi, RHSHi);
606   return IsPMADDWD
607              ? Builder.CreateAdd(Lo, Hi)
608              : Builder.CreateIntrinsic(ResTy, Intrinsic::sadd_sat, {Lo, Hi});
609 }
610 
611 static Value *simplifyX86movmsk(const IntrinsicInst &II,
612                                 InstCombiner::BuilderTy &Builder) {
613   Value *Arg = II.getArgOperand(0);
614   Type *ResTy = II.getType();
615 
616   // movmsk(undef) -> zero as we must ensure the upper bits are zero.
617   if (isa<UndefValue>(Arg))
618     return Constant::getNullValue(ResTy);
619 
620   // Preserve previous behavior and give up.
621   // TODO: treat as <8 x i8>.
622   if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb)
623     return nullptr;
624 
625   auto *ArgTy = cast<FixedVectorType>(Arg->getType());
626 
627   // Expand MOVMSK to compare/bitcast/zext:
628   // e.g. PMOVMSKB(v16i8 x):
629   // %cmp = icmp slt <16 x i8> %x, zeroinitializer
630   // %int = bitcast <16 x i1> %cmp to i16
631   // %res = zext i16 %int to i32
632   unsigned NumElts = ArgTy->getNumElements();
633   Type *IntegerTy = Builder.getIntNTy(NumElts);
634 
635   Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
636   Res = Builder.CreateIsNeg(Res);
637   Res = Builder.CreateBitCast(Res, IntegerTy);
638   Res = Builder.CreateZExtOrTrunc(Res, ResTy);
639   return Res;
640 }
641 
642 static Value *simplifyX86addcarry(const IntrinsicInst &II,
643                                   InstCombiner::BuilderTy &Builder) {
644   Value *CarryIn = II.getArgOperand(0);
645   Value *Op1 = II.getArgOperand(1);
646   Value *Op2 = II.getArgOperand(2);
647   Type *RetTy = II.getType();
648   Type *OpTy = Op1->getType();
649   assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
650          RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
651          "Unexpected types for x86 addcarry");
652 
653   // If carry-in is zero, this is just an unsigned add with overflow.
654   if (match(CarryIn, m_ZeroInt())) {
655     Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
656                                           {Op1, Op2});
657     // The types have to be adjusted to match the x86 call types.
658     Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
659     Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
660                                        Builder.getInt8Ty());
661     Value *Res = PoisonValue::get(RetTy);
662     Res = Builder.CreateInsertValue(Res, UAddOV, 0);
663     return Builder.CreateInsertValue(Res, UAddResult, 1);
664   }
665 
666   return nullptr;
667 }
668 
669 static Value *simplifyTernarylogic(const IntrinsicInst &II,
670                                    InstCombiner::BuilderTy &Builder) {
671 
672   auto *ArgImm = dyn_cast<ConstantInt>(II.getArgOperand(3));
673   if (!ArgImm || ArgImm->getValue().uge(256))
674     return nullptr;
675 
676   Value *ArgA = II.getArgOperand(0);
677   Value *ArgB = II.getArgOperand(1);
678   Value *ArgC = II.getArgOperand(2);
679 
680   Type *Ty = II.getType();
681 
682   auto Or = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
683     return {Builder.CreateOr(Lhs.first, Rhs.first), Lhs.second | Rhs.second};
684   };
685   auto Xor = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
686     return {Builder.CreateXor(Lhs.first, Rhs.first), Lhs.second ^ Rhs.second};
687   };
688   auto And = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
689     return {Builder.CreateAnd(Lhs.first, Rhs.first), Lhs.second & Rhs.second};
690   };
691   auto Not = [&](auto V) -> std::pair<Value *, uint8_t> {
692     return {Builder.CreateNot(V.first), ~V.second};
693   };
694   auto Nor = [&](auto Lhs, auto Rhs) { return Not(Or(Lhs, Rhs)); };
695   auto Xnor = [&](auto Lhs, auto Rhs) { return Not(Xor(Lhs, Rhs)); };
696   auto Nand = [&](auto Lhs, auto Rhs) { return Not(And(Lhs, Rhs)); };
697 
698   bool AIsConst = match(ArgA, m_ImmConstant());
699   bool BIsConst = match(ArgB, m_ImmConstant());
700   bool CIsConst = match(ArgC, m_ImmConstant());
701 
702   bool ABIsConst = AIsConst && BIsConst;
703   bool ACIsConst = AIsConst && CIsConst;
704   bool BCIsConst = BIsConst && CIsConst;
705   bool ABCIsConst = AIsConst && BIsConst && CIsConst;
706 
707   // Use for verification. Its a big table. Its difficult to go from Imm ->
708   // logic ops, but easy to verify that a set of logic ops is correct. We track
709   // the logic ops through the second value in the pair. At the end it should
710   // equal Imm.
711   std::pair<Value *, uint8_t> A = {ArgA, 0xf0};
712   std::pair<Value *, uint8_t> B = {ArgB, 0xcc};
713   std::pair<Value *, uint8_t> C = {ArgC, 0xaa};
714   std::pair<Value *, uint8_t> Res = {nullptr, 0};
715 
716   // Currently we only handle cases that convert directly to another instruction
717   // or cases where all the ops are constant.  This is because we don't properly
718   // handle creating ternary ops in the backend, so splitting them here may
719   // cause regressions. As the backend improves, uncomment more cases.
720 
721   uint8_t Imm = ArgImm->getValue().getZExtValue();
722   switch (Imm) {
723   case 0x0:
724     Res = {Constant::getNullValue(Ty), 0};
725     break;
726   case 0x1:
727     if (ABCIsConst)
728       Res = Nor(Or(A, B), C);
729     break;
730   case 0x2:
731     if (ABCIsConst)
732       Res = And(Nor(A, B), C);
733     break;
734   case 0x3:
735     if (ABIsConst)
736       Res = Nor(A, B);
737     break;
738   case 0x4:
739     if (ABCIsConst)
740       Res = And(Nor(A, C), B);
741     break;
742   case 0x5:
743     if (ACIsConst)
744       Res = Nor(A, C);
745     break;
746   case 0x6:
747     if (ABCIsConst)
748       Res = Nor(A, Xnor(B, C));
749     break;
750   case 0x7:
751     if (ABCIsConst)
752       Res = Nor(A, And(B, C));
753     break;
754   case 0x8:
755     if (ABCIsConst)
756       Res = Nor(A, Nand(B, C));
757     break;
758   case 0x9:
759     if (ABCIsConst)
760       Res = Nor(A, Xor(B, C));
761     break;
762   case 0xa:
763     if (ACIsConst)
764       Res = Nor(A, Not(C));
765     break;
766   case 0xb:
767     if (ABCIsConst)
768       Res = Nor(A, Nor(C, Not(B)));
769     break;
770   case 0xc:
771     if (ABIsConst)
772       Res = Nor(A, Not(B));
773     break;
774   case 0xd:
775     if (ABCIsConst)
776       Res = Nor(A, Nor(B, Not(C)));
777     break;
778   case 0xe:
779     if (ABCIsConst)
780       Res = Nor(A, Nor(B, C));
781     break;
782   case 0xf:
783     Res = Not(A);
784     break;
785   case 0x10:
786     if (ABCIsConst)
787       Res = And(A, Nor(B, C));
788     break;
789   case 0x11:
790     if (BCIsConst)
791       Res = Nor(B, C);
792     break;
793   case 0x12:
794     if (ABCIsConst)
795       Res = Nor(Xnor(A, C), B);
796     break;
797   case 0x13:
798     if (ABCIsConst)
799       Res = Nor(And(A, C), B);
800     break;
801   case 0x14:
802     if (ABCIsConst)
803       Res = Nor(Xnor(A, B), C);
804     break;
805   case 0x15:
806     if (ABCIsConst)
807       Res = Nor(And(A, B), C);
808     break;
809   case 0x16:
810     if (ABCIsConst)
811       Res = Xor(Xor(A, B), And(Nand(A, B), C));
812     break;
813   case 0x17:
814     if (ABCIsConst)
815       Res = Xor(Or(A, B), Or(Xnor(A, B), C));
816     break;
817   case 0x18:
818     if (ABCIsConst)
819       Res = Nor(Xnor(A, B), Xnor(A, C));
820     break;
821   case 0x19:
822     if (ABCIsConst)
823       Res = And(Nand(A, B), Xnor(B, C));
824     break;
825   case 0x1a:
826     if (ABCIsConst)
827       Res = Xor(A, Or(And(A, B), C));
828     break;
829   case 0x1b:
830     if (ABCIsConst)
831       Res = Xor(A, Or(Xnor(A, B), C));
832     break;
833   case 0x1c:
834     if (ABCIsConst)
835       Res = Xor(A, Or(And(A, C), B));
836     break;
837   case 0x1d:
838     if (ABCIsConst)
839       Res = Xor(A, Or(Xnor(A, C), B));
840     break;
841   case 0x1e:
842     if (ABCIsConst)
843       Res = Xor(A, Or(B, C));
844     break;
845   case 0x1f:
846     if (ABCIsConst)
847       Res = Nand(A, Or(B, C));
848     break;
849   case 0x20:
850     if (ABCIsConst)
851       Res = Nor(Nand(A, C), B);
852     break;
853   case 0x21:
854     if (ABCIsConst)
855       Res = Nor(Xor(A, C), B);
856     break;
857   case 0x22:
858     if (BCIsConst)
859       Res = Nor(B, Not(C));
860     break;
861   case 0x23:
862     if (ABCIsConst)
863       Res = Nor(B, Nor(C, Not(A)));
864     break;
865   case 0x24:
866     if (ABCIsConst)
867       Res = Nor(Xnor(A, B), Xor(A, C));
868     break;
869   case 0x25:
870     if (ABCIsConst)
871       Res = Xor(A, Nand(Nand(A, B), C));
872     break;
873   case 0x26:
874     if (ABCIsConst)
875       Res = And(Nand(A, B), Xor(B, C));
876     break;
877   case 0x27:
878     if (ABCIsConst)
879       Res = Xor(Or(Xnor(A, B), C), B);
880     break;
881   case 0x28:
882     if (ABCIsConst)
883       Res = And(Xor(A, B), C);
884     break;
885   case 0x29:
886     if (ABCIsConst)
887       Res = Xor(Xor(A, B), Nor(And(A, B), C));
888     break;
889   case 0x2a:
890     if (ABCIsConst)
891       Res = And(Nand(A, B), C);
892     break;
893   case 0x2b:
894     if (ABCIsConst)
895       Res = Xor(Or(Xnor(A, B), Xor(A, C)), A);
896     break;
897   case 0x2c:
898     if (ABCIsConst)
899       Res = Nor(Xnor(A, B), Nor(B, C));
900     break;
901   case 0x2d:
902     if (ABCIsConst)
903       Res = Xor(A, Or(B, Not(C)));
904     break;
905   case 0x2e:
906     if (ABCIsConst)
907       Res = Xor(A, Or(Xor(A, C), B));
908     break;
909   case 0x2f:
910     if (ABCIsConst)
911       Res = Nand(A, Or(B, Not(C)));
912     break;
913   case 0x30:
914     if (ABIsConst)
915       Res = Nor(B, Not(A));
916     break;
917   case 0x31:
918     if (ABCIsConst)
919       Res = Nor(Nor(A, Not(C)), B);
920     break;
921   case 0x32:
922     if (ABCIsConst)
923       Res = Nor(Nor(A, C), B);
924     break;
925   case 0x33:
926     Res = Not(B);
927     break;
928   case 0x34:
929     if (ABCIsConst)
930       Res = And(Xor(A, B), Nand(B, C));
931     break;
932   case 0x35:
933     if (ABCIsConst)
934       Res = Xor(B, Or(A, Xnor(B, C)));
935     break;
936   case 0x36:
937     if (ABCIsConst)
938       Res = Xor(Or(A, C), B);
939     break;
940   case 0x37:
941     if (ABCIsConst)
942       Res = Nand(Or(A, C), B);
943     break;
944   case 0x38:
945     if (ABCIsConst)
946       Res = Nor(Xnor(A, B), Nor(A, C));
947     break;
948   case 0x39:
949     if (ABCIsConst)
950       Res = Xor(Or(A, Not(C)), B);
951     break;
952   case 0x3a:
953     if (ABCIsConst)
954       Res = Xor(B, Or(A, Xor(B, C)));
955     break;
956   case 0x3b:
957     if (ABCIsConst)
958       Res = Nand(Or(A, Not(C)), B);
959     break;
960   case 0x3c:
961     Res = Xor(A, B);
962     break;
963   case 0x3d:
964     if (ABCIsConst)
965       Res = Xor(A, Or(Nor(A, C), B));
966     break;
967   case 0x3e:
968     if (ABCIsConst)
969       Res = Xor(A, Or(Nor(A, Not(C)), B));
970     break;
971   case 0x3f:
972     if (ABIsConst)
973       Res = Nand(A, B);
974     break;
975   case 0x40:
976     if (ABCIsConst)
977       Res = Nor(Nand(A, B), C);
978     break;
979   case 0x41:
980     if (ABCIsConst)
981       Res = Nor(Xor(A, B), C);
982     break;
983   case 0x42:
984     if (ABCIsConst)
985       Res = Nor(Xor(A, B), Xnor(A, C));
986     break;
987   case 0x43:
988     if (ABCIsConst)
989       Res = Xor(A, Nand(Nand(A, C), B));
990     break;
991   case 0x44:
992     if (BCIsConst)
993       Res = Nor(C, Not(B));
994     break;
995   case 0x45:
996     if (ABCIsConst)
997       Res = Nor(Nor(B, Not(A)), C);
998     break;
999   case 0x46:
1000     if (ABCIsConst)
1001       Res = Xor(Or(And(A, C), B), C);
1002     break;
1003   case 0x47:
1004     if (ABCIsConst)
1005       Res = Xor(Or(Xnor(A, C), B), C);
1006     break;
1007   case 0x48:
1008     if (ABCIsConst)
1009       Res = And(Xor(A, C), B);
1010     break;
1011   case 0x49:
1012     if (ABCIsConst)
1013       Res = Xor(Or(Xnor(A, B), And(A, C)), C);
1014     break;
1015   case 0x4a:
1016     if (ABCIsConst)
1017       Res = Nor(Xnor(A, C), Nor(B, C));
1018     break;
1019   case 0x4b:
1020     if (ABCIsConst)
1021       Res = Xor(A, Or(C, Not(B)));
1022     break;
1023   case 0x4c:
1024     if (ABCIsConst)
1025       Res = And(Nand(A, C), B);
1026     break;
1027   case 0x4d:
1028     if (ABCIsConst)
1029       Res = Xor(Or(Xor(A, B), Xnor(A, C)), A);
1030     break;
1031   case 0x4e:
1032     if (ABCIsConst)
1033       Res = Xor(A, Or(Xor(A, B), C));
1034     break;
1035   case 0x4f:
1036     if (ABCIsConst)
1037       Res = Nand(A, Nand(B, Not(C)));
1038     break;
1039   case 0x50:
1040     if (ACIsConst)
1041       Res = Nor(C, Not(A));
1042     break;
1043   case 0x51:
1044     if (ABCIsConst)
1045       Res = Nor(Nor(A, Not(B)), C);
1046     break;
1047   case 0x52:
1048     if (ABCIsConst)
1049       Res = And(Xor(A, C), Nand(B, C));
1050     break;
1051   case 0x53:
1052     if (ABCIsConst)
1053       Res = Xor(Or(Xnor(B, C), A), C);
1054     break;
1055   case 0x54:
1056     if (ABCIsConst)
1057       Res = Nor(Nor(A, B), C);
1058     break;
1059   case 0x55:
1060     Res = Not(C);
1061     break;
1062   case 0x56:
1063     if (ABCIsConst)
1064       Res = Xor(Or(A, B), C);
1065     break;
1066   case 0x57:
1067     if (ABCIsConst)
1068       Res = Nand(Or(A, B), C);
1069     break;
1070   case 0x58:
1071     if (ABCIsConst)
1072       Res = Nor(Nor(A, B), Xnor(A, C));
1073     break;
1074   case 0x59:
1075     if (ABCIsConst)
1076       Res = Xor(Or(A, Not(B)), C);
1077     break;
1078   case 0x5a:
1079     Res = Xor(A, C);
1080     break;
1081   case 0x5b:
1082     if (ABCIsConst)
1083       Res = Xor(A, Or(Nor(A, B), C));
1084     break;
1085   case 0x5c:
1086     if (ABCIsConst)
1087       Res = Xor(Or(Xor(B, C), A), C);
1088     break;
1089   case 0x5d:
1090     if (ABCIsConst)
1091       Res = Nand(Or(A, Not(B)), C);
1092     break;
1093   case 0x5e:
1094     if (ABCIsConst)
1095       Res = Xor(A, Or(Nor(A, Not(B)), C));
1096     break;
1097   case 0x5f:
1098     if (ACIsConst)
1099       Res = Nand(A, C);
1100     break;
1101   case 0x60:
1102     if (ABCIsConst)
1103       Res = And(A, Xor(B, C));
1104     break;
1105   case 0x61:
1106     if (ABCIsConst)
1107       Res = Xor(Or(Xnor(A, B), And(B, C)), C);
1108     break;
1109   case 0x62:
1110     if (ABCIsConst)
1111       Res = Nor(Nor(A, C), Xnor(B, C));
1112     break;
1113   case 0x63:
1114     if (ABCIsConst)
1115       Res = Xor(B, Or(C, Not(A)));
1116     break;
1117   case 0x64:
1118     if (ABCIsConst)
1119       Res = Nor(Nor(A, B), Xnor(B, C));
1120     break;
1121   case 0x65:
1122     if (ABCIsConst)
1123       Res = Xor(Or(B, Not(A)), C);
1124     break;
1125   case 0x66:
1126     Res = Xor(B, C);
1127     break;
1128   case 0x67:
1129     if (ABCIsConst)
1130       Res = Or(Nor(A, B), Xor(B, C));
1131     break;
1132   case 0x68:
1133     if (ABCIsConst)
1134       Res = Xor(Xor(A, B), Nor(Nor(A, B), C));
1135     break;
1136   case 0x69:
1137     if (ABCIsConst)
1138       Res = Xor(Xnor(A, B), C);
1139     break;
1140   case 0x6a:
1141     if (ABCIsConst)
1142       Res = Xor(And(A, B), C);
1143     break;
1144   case 0x6b:
1145     if (ABCIsConst)
1146       Res = Or(Nor(A, B), Xor(Xnor(A, B), C));
1147     break;
1148   case 0x6c:
1149     if (ABCIsConst)
1150       Res = Xor(And(A, C), B);
1151     break;
1152   case 0x6d:
1153     if (ABCIsConst)
1154       Res = Xor(Or(Xnor(A, B), Nor(A, C)), C);
1155     break;
1156   case 0x6e:
1157     if (ABCIsConst)
1158       Res = Or(Nor(A, Not(B)), Xor(B, C));
1159     break;
1160   case 0x6f:
1161     if (ABCIsConst)
1162       Res = Nand(A, Xnor(B, C));
1163     break;
1164   case 0x70:
1165     if (ABCIsConst)
1166       Res = And(A, Nand(B, C));
1167     break;
1168   case 0x71:
1169     if (ABCIsConst)
1170       Res = Xor(Nor(Xor(A, B), Xor(A, C)), A);
1171     break;
1172   case 0x72:
1173     if (ABCIsConst)
1174       Res = Xor(Or(Xor(A, B), C), B);
1175     break;
1176   case 0x73:
1177     if (ABCIsConst)
1178       Res = Nand(Nand(A, Not(C)), B);
1179     break;
1180   case 0x74:
1181     if (ABCIsConst)
1182       Res = Xor(Or(Xor(A, C), B), C);
1183     break;
1184   case 0x75:
1185     if (ABCIsConst)
1186       Res = Nand(Nand(A, Not(B)), C);
1187     break;
1188   case 0x76:
1189     if (ABCIsConst)
1190       Res = Xor(B, Or(Nor(B, Not(A)), C));
1191     break;
1192   case 0x77:
1193     if (BCIsConst)
1194       Res = Nand(B, C);
1195     break;
1196   case 0x78:
1197     if (ABCIsConst)
1198       Res = Xor(A, And(B, C));
1199     break;
1200   case 0x79:
1201     if (ABCIsConst)
1202       Res = Xor(Or(Xnor(A, B), Nor(B, C)), C);
1203     break;
1204   case 0x7a:
1205     if (ABCIsConst)
1206       Res = Or(Xor(A, C), Nor(B, Not(A)));
1207     break;
1208   case 0x7b:
1209     if (ABCIsConst)
1210       Res = Nand(Xnor(A, C), B);
1211     break;
1212   case 0x7c:
1213     if (ABCIsConst)
1214       Res = Or(Xor(A, B), Nor(C, Not(A)));
1215     break;
1216   case 0x7d:
1217     if (ABCIsConst)
1218       Res = Nand(Xnor(A, B), C);
1219     break;
1220   case 0x7e:
1221     if (ABCIsConst)
1222       Res = Or(Xor(A, B), Xor(A, C));
1223     break;
1224   case 0x7f:
1225     if (ABCIsConst)
1226       Res = Nand(And(A, B), C);
1227     break;
1228   case 0x80:
1229     if (ABCIsConst)
1230       Res = And(And(A, B), C);
1231     break;
1232   case 0x81:
1233     if (ABCIsConst)
1234       Res = Nor(Xor(A, B), Xor(A, C));
1235     break;
1236   case 0x82:
1237     if (ABCIsConst)
1238       Res = And(Xnor(A, B), C);
1239     break;
1240   case 0x83:
1241     if (ABCIsConst)
1242       Res = Nor(Xor(A, B), Nor(C, Not(A)));
1243     break;
1244   case 0x84:
1245     if (ABCIsConst)
1246       Res = And(Xnor(A, C), B);
1247     break;
1248   case 0x85:
1249     if (ABCIsConst)
1250       Res = Nor(Xor(A, C), Nor(B, Not(A)));
1251     break;
1252   case 0x86:
1253     if (ABCIsConst)
1254       Res = Xor(Nor(Xnor(A, B), Nor(B, C)), C);
1255     break;
1256   case 0x87:
1257     if (ABCIsConst)
1258       Res = Xor(A, Nand(B, C));
1259     break;
1260   case 0x88:
1261     Res = And(B, C);
1262     break;
1263   case 0x89:
1264     if (ABCIsConst)
1265       Res = Xor(B, Nor(Nor(B, Not(A)), C));
1266     break;
1267   case 0x8a:
1268     if (ABCIsConst)
1269       Res = And(Nand(A, Not(B)), C);
1270     break;
1271   case 0x8b:
1272     if (ABCIsConst)
1273       Res = Xor(Nor(Xor(A, C), B), C);
1274     break;
1275   case 0x8c:
1276     if (ABCIsConst)
1277       Res = And(Nand(A, Not(C)), B);
1278     break;
1279   case 0x8d:
1280     if (ABCIsConst)
1281       Res = Xor(Nor(Xor(A, B), C), B);
1282     break;
1283   case 0x8e:
1284     if (ABCIsConst)
1285       Res = Xor(Or(Xor(A, B), Xor(A, C)), A);
1286     break;
1287   case 0x8f:
1288     if (ABCIsConst)
1289       Res = Nand(A, Nand(B, C));
1290     break;
1291   case 0x90:
1292     if (ABCIsConst)
1293       Res = And(A, Xnor(B, C));
1294     break;
1295   case 0x91:
1296     if (ABCIsConst)
1297       Res = Nor(Nor(A, Not(B)), Xor(B, C));
1298     break;
1299   case 0x92:
1300     if (ABCIsConst)
1301       Res = Xor(Nor(Xnor(A, B), Nor(A, C)), C);
1302     break;
1303   case 0x93:
1304     if (ABCIsConst)
1305       Res = Xor(Nand(A, C), B);
1306     break;
1307   case 0x94:
1308     if (ABCIsConst)
1309       Res = Nor(Nor(A, B), Xor(Xnor(A, B), C));
1310     break;
1311   case 0x95:
1312     if (ABCIsConst)
1313       Res = Xor(Nand(A, B), C);
1314     break;
1315   case 0x96:
1316     if (ABCIsConst)
1317       Res = Xor(Xor(A, B), C);
1318     break;
1319   case 0x97:
1320     if (ABCIsConst)
1321       Res = Xor(Xor(A, B), Or(Nor(A, B), C));
1322     break;
1323   case 0x98:
1324     if (ABCIsConst)
1325       Res = Nor(Nor(A, B), Xor(B, C));
1326     break;
1327   case 0x99:
1328     if (BCIsConst)
1329       Res = Xnor(B, C);
1330     break;
1331   case 0x9a:
1332     if (ABCIsConst)
1333       Res = Xor(Nor(B, Not(A)), C);
1334     break;
1335   case 0x9b:
1336     if (ABCIsConst)
1337       Res = Or(Nor(A, B), Xnor(B, C));
1338     break;
1339   case 0x9c:
1340     if (ABCIsConst)
1341       Res = Xor(B, Nor(C, Not(A)));
1342     break;
1343   case 0x9d:
1344     if (ABCIsConst)
1345       Res = Or(Nor(A, C), Xnor(B, C));
1346     break;
1347   case 0x9e:
1348     if (ABCIsConst)
1349       Res = Xor(And(Xor(A, B), Nand(B, C)), C);
1350     break;
1351   case 0x9f:
1352     if (ABCIsConst)
1353       Res = Nand(A, Xor(B, C));
1354     break;
1355   case 0xa0:
1356     Res = And(A, C);
1357     break;
1358   case 0xa1:
1359     if (ABCIsConst)
1360       Res = Xor(A, Nor(Nor(A, Not(B)), C));
1361     break;
1362   case 0xa2:
1363     if (ABCIsConst)
1364       Res = And(Or(A, Not(B)), C);
1365     break;
1366   case 0xa3:
1367     if (ABCIsConst)
1368       Res = Xor(Nor(Xor(B, C), A), C);
1369     break;
1370   case 0xa4:
1371     if (ABCIsConst)
1372       Res = Xor(A, Nor(Nor(A, B), C));
1373     break;
1374   case 0xa5:
1375     if (ACIsConst)
1376       Res = Xnor(A, C);
1377     break;
1378   case 0xa6:
1379     if (ABCIsConst)
1380       Res = Xor(Nor(A, Not(B)), C);
1381     break;
1382   case 0xa7:
1383     if (ABCIsConst)
1384       Res = Or(Nor(A, B), Xnor(A, C));
1385     break;
1386   case 0xa8:
1387     if (ABCIsConst)
1388       Res = And(Or(A, B), C);
1389     break;
1390   case 0xa9:
1391     if (ABCIsConst)
1392       Res = Xor(Nor(A, B), C);
1393     break;
1394   case 0xaa:
1395     Res = C;
1396     break;
1397   case 0xab:
1398     if (ABCIsConst)
1399       Res = Or(Nor(A, B), C);
1400     break;
1401   case 0xac:
1402     if (ABCIsConst)
1403       Res = Xor(Nor(Xnor(B, C), A), C);
1404     break;
1405   case 0xad:
1406     if (ABCIsConst)
1407       Res = Or(Xnor(A, C), And(B, C));
1408     break;
1409   case 0xae:
1410     if (ABCIsConst)
1411       Res = Or(Nor(A, Not(B)), C);
1412     break;
1413   case 0xaf:
1414     if (ACIsConst)
1415       Res = Or(C, Not(A));
1416     break;
1417   case 0xb0:
1418     if (ABCIsConst)
1419       Res = And(A, Nand(B, Not(C)));
1420     break;
1421   case 0xb1:
1422     if (ABCIsConst)
1423       Res = Xor(A, Nor(Xor(A, B), C));
1424     break;
1425   case 0xb2:
1426     if (ABCIsConst)
1427       Res = Xor(Nor(Xor(A, B), Xnor(A, C)), A);
1428     break;
1429   case 0xb3:
1430     if (ABCIsConst)
1431       Res = Nand(Nand(A, C), B);
1432     break;
1433   case 0xb4:
1434     if (ABCIsConst)
1435       Res = Xor(A, Nor(C, Not(B)));
1436     break;
1437   case 0xb5:
1438     if (ABCIsConst)
1439       Res = Or(Xnor(A, C), Nor(B, C));
1440     break;
1441   case 0xb6:
1442     if (ABCIsConst)
1443       Res = Xor(And(Xor(A, B), Nand(A, C)), C);
1444     break;
1445   case 0xb7:
1446     if (ABCIsConst)
1447       Res = Nand(Xor(A, C), B);
1448     break;
1449   case 0xb8:
1450     if (ABCIsConst)
1451       Res = Xor(Nor(Xnor(A, C), B), C);
1452     break;
1453   case 0xb9:
1454     if (ABCIsConst)
1455       Res = Xor(Nor(And(A, C), B), C);
1456     break;
1457   case 0xba:
1458     if (ABCIsConst)
1459       Res = Or(Nor(B, Not(A)), C);
1460     break;
1461   case 0xbb:
1462     if (BCIsConst)
1463       Res = Or(C, Not(B));
1464     break;
1465   case 0xbc:
1466     if (ABCIsConst)
1467       Res = Xor(A, And(Nand(A, C), B));
1468     break;
1469   case 0xbd:
1470     if (ABCIsConst)
1471       Res = Or(Xor(A, B), Xnor(A, C));
1472     break;
1473   case 0xbe:
1474     if (ABCIsConst)
1475       Res = Or(Xor(A, B), C);
1476     break;
1477   case 0xbf:
1478     if (ABCIsConst)
1479       Res = Or(Nand(A, B), C);
1480     break;
1481   case 0xc0:
1482     Res = And(A, B);
1483     break;
1484   case 0xc1:
1485     if (ABCIsConst)
1486       Res = Xor(A, Nor(Nor(A, Not(C)), B));
1487     break;
1488   case 0xc2:
1489     if (ABCIsConst)
1490       Res = Xor(A, Nor(Nor(A, C), B));
1491     break;
1492   case 0xc3:
1493     if (ABIsConst)
1494       Res = Xnor(A, B);
1495     break;
1496   case 0xc4:
1497     if (ABCIsConst)
1498       Res = And(Or(A, Not(C)), B);
1499     break;
1500   case 0xc5:
1501     if (ABCIsConst)
1502       Res = Xor(B, Nor(A, Xor(B, C)));
1503     break;
1504   case 0xc6:
1505     if (ABCIsConst)
1506       Res = Xor(Nor(A, Not(C)), B);
1507     break;
1508   case 0xc7:
1509     if (ABCIsConst)
1510       Res = Or(Xnor(A, B), Nor(A, C));
1511     break;
1512   case 0xc8:
1513     if (ABCIsConst)
1514       Res = And(Or(A, C), B);
1515     break;
1516   case 0xc9:
1517     if (ABCIsConst)
1518       Res = Xor(Nor(A, C), B);
1519     break;
1520   case 0xca:
1521     if (ABCIsConst)
1522       Res = Xor(B, Nor(A, Xnor(B, C)));
1523     break;
1524   case 0xcb:
1525     if (ABCIsConst)
1526       Res = Or(Xnor(A, B), And(B, C));
1527     break;
1528   case 0xcc:
1529     Res = B;
1530     break;
1531   case 0xcd:
1532     if (ABCIsConst)
1533       Res = Or(Nor(A, C), B);
1534     break;
1535   case 0xce:
1536     if (ABCIsConst)
1537       Res = Or(Nor(A, Not(C)), B);
1538     break;
1539   case 0xcf:
1540     if (ABIsConst)
1541       Res = Or(B, Not(A));
1542     break;
1543   case 0xd0:
1544     if (ABCIsConst)
1545       Res = And(A, Or(B, Not(C)));
1546     break;
1547   case 0xd1:
1548     if (ABCIsConst)
1549       Res = Xor(A, Nor(Xor(A, C), B));
1550     break;
1551   case 0xd2:
1552     if (ABCIsConst)
1553       Res = Xor(A, Nor(B, Not(C)));
1554     break;
1555   case 0xd3:
1556     if (ABCIsConst)
1557       Res = Or(Xnor(A, B), Nor(B, C));
1558     break;
1559   case 0xd4:
1560     if (ABCIsConst)
1561       Res = Xor(Nor(Xnor(A, B), Xor(A, C)), A);
1562     break;
1563   case 0xd5:
1564     if (ABCIsConst)
1565       Res = Nand(Nand(A, B), C);
1566     break;
1567   case 0xd6:
1568     if (ABCIsConst)
1569       Res = Xor(Xor(A, B), Or(And(A, B), C));
1570     break;
1571   case 0xd7:
1572     if (ABCIsConst)
1573       Res = Nand(Xor(A, B), C);
1574     break;
1575   case 0xd8:
1576     if (ABCIsConst)
1577       Res = Xor(Nor(Xnor(A, B), C), B);
1578     break;
1579   case 0xd9:
1580     if (ABCIsConst)
1581       Res = Or(And(A, B), Xnor(B, C));
1582     break;
1583   case 0xda:
1584     if (ABCIsConst)
1585       Res = Xor(A, And(Nand(A, B), C));
1586     break;
1587   case 0xdb:
1588     if (ABCIsConst)
1589       Res = Or(Xnor(A, B), Xor(A, C));
1590     break;
1591   case 0xdc:
1592     if (ABCIsConst)
1593       Res = Or(B, Nor(C, Not(A)));
1594     break;
1595   case 0xdd:
1596     if (BCIsConst)
1597       Res = Or(B, Not(C));
1598     break;
1599   case 0xde:
1600     if (ABCIsConst)
1601       Res = Or(Xor(A, C), B);
1602     break;
1603   case 0xdf:
1604     if (ABCIsConst)
1605       Res = Or(Nand(A, C), B);
1606     break;
1607   case 0xe0:
1608     if (ABCIsConst)
1609       Res = And(A, Or(B, C));
1610     break;
1611   case 0xe1:
1612     if (ABCIsConst)
1613       Res = Xor(A, Nor(B, C));
1614     break;
1615   case 0xe2:
1616     if (ABCIsConst)
1617       Res = Xor(A, Nor(Xnor(A, C), B));
1618     break;
1619   case 0xe3:
1620     if (ABCIsConst)
1621       Res = Xor(A, Nor(And(A, C), B));
1622     break;
1623   case 0xe4:
1624     if (ABCIsConst)
1625       Res = Xor(A, Nor(Xnor(A, B), C));
1626     break;
1627   case 0xe5:
1628     if (ABCIsConst)
1629       Res = Xor(A, Nor(And(A, B), C));
1630     break;
1631   case 0xe6:
1632     if (ABCIsConst)
1633       Res = Or(And(A, B), Xor(B, C));
1634     break;
1635   case 0xe7:
1636     if (ABCIsConst)
1637       Res = Or(Xnor(A, B), Xnor(A, C));
1638     break;
1639   case 0xe8:
1640     if (ABCIsConst)
1641       Res = Xor(Or(A, B), Nor(Xnor(A, B), C));
1642     break;
1643   case 0xe9:
1644     if (ABCIsConst)
1645       Res = Xor(Xor(A, B), Nand(Nand(A, B), C));
1646     break;
1647   case 0xea:
1648     if (ABCIsConst)
1649       Res = Or(And(A, B), C);
1650     break;
1651   case 0xeb:
1652     if (ABCIsConst)
1653       Res = Or(Xnor(A, B), C);
1654     break;
1655   case 0xec:
1656     if (ABCIsConst)
1657       Res = Or(And(A, C), B);
1658     break;
1659   case 0xed:
1660     if (ABCIsConst)
1661       Res = Or(Xnor(A, C), B);
1662     break;
1663   case 0xee:
1664     Res = Or(B, C);
1665     break;
1666   case 0xef:
1667     if (ABCIsConst)
1668       Res = Nand(A, Nor(B, C));
1669     break;
1670   case 0xf0:
1671     Res = A;
1672     break;
1673   case 0xf1:
1674     if (ABCIsConst)
1675       Res = Or(A, Nor(B, C));
1676     break;
1677   case 0xf2:
1678     if (ABCIsConst)
1679       Res = Or(A, Nor(B, Not(C)));
1680     break;
1681   case 0xf3:
1682     if (ABIsConst)
1683       Res = Or(A, Not(B));
1684     break;
1685   case 0xf4:
1686     if (ABCIsConst)
1687       Res = Or(A, Nor(C, Not(B)));
1688     break;
1689   case 0xf5:
1690     if (ACIsConst)
1691       Res = Or(A, Not(C));
1692     break;
1693   case 0xf6:
1694     if (ABCIsConst)
1695       Res = Or(A, Xor(B, C));
1696     break;
1697   case 0xf7:
1698     if (ABCIsConst)
1699       Res = Or(A, Nand(B, C));
1700     break;
1701   case 0xf8:
1702     if (ABCIsConst)
1703       Res = Or(A, And(B, C));
1704     break;
1705   case 0xf9:
1706     if (ABCIsConst)
1707       Res = Or(A, Xnor(B, C));
1708     break;
1709   case 0xfa:
1710     Res = Or(A, C);
1711     break;
1712   case 0xfb:
1713     if (ABCIsConst)
1714       Res = Nand(Nor(A, C), B);
1715     break;
1716   case 0xfc:
1717     Res = Or(A, B);
1718     break;
1719   case 0xfd:
1720     if (ABCIsConst)
1721       Res = Nand(Nor(A, B), C);
1722     break;
1723   case 0xfe:
1724     if (ABCIsConst)
1725       Res = Or(Or(A, B), C);
1726     break;
1727   case 0xff:
1728     Res = {Constant::getAllOnesValue(Ty), 0xff};
1729     break;
1730   }
1731 
1732   assert((Res.first == nullptr || Res.second == Imm) &&
1733          "Simplification of ternary logic does not verify!");
1734   return Res.first;
1735 }
1736 
1737 static Value *simplifyX86insertps(const IntrinsicInst &II,
1738                                   InstCombiner::BuilderTy &Builder) {
1739   auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
1740   if (!CInt)
1741     return nullptr;
1742 
1743   auto *VecTy = cast<FixedVectorType>(II.getType());
1744   assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
1745 
1746   // The immediate permute control byte looks like this:
1747   //    [3:0] - zero mask for each 32-bit lane
1748   //    [5:4] - select one 32-bit destination lane
1749   //    [7:6] - select one 32-bit source lane
1750 
1751   uint8_t Imm = CInt->getZExtValue();
1752   uint8_t ZMask = Imm & 0xf;
1753   uint8_t DestLane = (Imm >> 4) & 0x3;
1754   uint8_t SourceLane = (Imm >> 6) & 0x3;
1755 
1756   ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
1757 
1758   // If all zero mask bits are set, this was just a weird way to
1759   // generate a zero vector.
1760   if (ZMask == 0xf)
1761     return ZeroVector;
1762 
1763   // Initialize by passing all of the first source bits through.
1764   int ShuffleMask[4] = {0, 1, 2, 3};
1765 
1766   // We may replace the second operand with the zero vector.
1767   Value *V1 = II.getArgOperand(1);
1768 
1769   if (ZMask) {
1770     // If the zero mask is being used with a single input or the zero mask
1771     // overrides the destination lane, this is a shuffle with the zero vector.
1772     if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
1773         (ZMask & (1 << DestLane))) {
1774       V1 = ZeroVector;
1775       // We may still move 32-bits of the first source vector from one lane
1776       // to another.
1777       ShuffleMask[DestLane] = SourceLane;
1778       // The zero mask may override the previous insert operation.
1779       for (unsigned i = 0; i < 4; ++i)
1780         if ((ZMask >> i) & 0x1)
1781           ShuffleMask[i] = i + 4;
1782     } else {
1783       // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
1784       return nullptr;
1785     }
1786   } else {
1787     // Replace the selected destination lane with the selected source lane.
1788     ShuffleMask[DestLane] = SourceLane + 4;
1789   }
1790 
1791   return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
1792 }
1793 
1794 /// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
1795 /// or conversion to a shuffle vector.
1796 static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
1797                                ConstantInt *CILength, ConstantInt *CIIndex,
1798                                InstCombiner::BuilderTy &Builder) {
1799   auto LowConstantHighUndef = [&](uint64_t Val) {
1800     Type *IntTy64 = Type::getInt64Ty(II.getContext());
1801     Constant *Args[] = {ConstantInt::get(IntTy64, Val),
1802                         UndefValue::get(IntTy64)};
1803     return ConstantVector::get(Args);
1804   };
1805 
1806   // See if we're dealing with constant values.
1807   auto *C0 = dyn_cast<Constant>(Op0);
1808   auto *CI0 =
1809       C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
1810          : nullptr;
1811 
1812   // Attempt to constant fold.
1813   if (CILength && CIIndex) {
1814     // From AMD documentation: "The bit index and field length are each six
1815     // bits in length other bits of the field are ignored."
1816     APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
1817     APInt APLength = CILength->getValue().zextOrTrunc(6);
1818 
1819     unsigned Index = APIndex.getZExtValue();
1820 
1821     // From AMD documentation: "a value of zero in the field length is
1822     // defined as length of 64".
1823     unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
1824 
1825     // From AMD documentation: "If the sum of the bit index + length field
1826     // is greater than 64, the results are undefined".
1827     unsigned End = Index + Length;
1828 
1829     // Note that both field index and field length are 8-bit quantities.
1830     // Since variables 'Index' and 'Length' are unsigned values
1831     // obtained from zero-extending field index and field length
1832     // respectively, their sum should never wrap around.
1833     if (End > 64)
1834       return UndefValue::get(II.getType());
1835 
1836     // If we are extracting whole bytes, we can convert this to a shuffle.
1837     // Lowering can recognize EXTRQI shuffle masks.
1838     if ((Length % 8) == 0 && (Index % 8) == 0) {
1839       // Convert bit indices to byte indices.
1840       Length /= 8;
1841       Index /= 8;
1842 
1843       Type *IntTy8 = Type::getInt8Ty(II.getContext());
1844       auto *ShufTy = FixedVectorType::get(IntTy8, 16);
1845 
1846       SmallVector<int, 16> ShuffleMask;
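           // The mask takes Length bytes of Op0 starting at byte Index, then
           // zero bytes from the second (all-zero) operand up to byte 8; the
           // upper 8 bytes are undef.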
1847       for (int i = 0; i != (int)Length; ++i)
1848         ShuffleMask.push_back(i + Index);
1849       for (int i = Length; i != 8; ++i)
1850         ShuffleMask.push_back(i + 16);
1851       for (int i = 8; i != 16; ++i)
1852         ShuffleMask.push_back(-1);
1853 
1854       Value *SV = Builder.CreateShuffleVector(
1855           Builder.CreateBitCast(Op0, ShufTy),
1856           ConstantAggregateZero::get(ShufTy), ShuffleMask);
1857       return Builder.CreateBitCast(SV, II.getType());
1858     }
1859 
1860     // Constant Fold - shift Index'th bit to lowest position and mask off
1861     // Length bits.
1862     if (CI0) {
1863       APInt Elt = CI0->getValue();
1864       Elt.lshrInPlace(Index);
1865       Elt = Elt.zextOrTrunc(Length);
1866       return LowConstantHighUndef(Elt.getZExtValue());
1867     }
1868 
1869     // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
1870     if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
1871       Value *Args[] = {Op0, CILength, CIIndex};
1872       return Builder.CreateIntrinsic(Intrinsic::x86_sse4a_extrqi, {}, Args);
1873     }
1874   }
1875 
1876   // Constant Fold - extraction from zero is always {zero, undef}.
1877   if (CI0 && CI0->isZero())
1878     return LowConstantHighUndef(0);
1879 
1880   return nullptr;
1881 }
1882 
1883 /// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
1884 /// folding or conversion to a shuffle vector.
1885 static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
1886                                  APInt APLength, APInt APIndex,
1887                                  InstCombiner::BuilderTy &Builder) {
1888   // From AMD documentation: "The bit index and field length are each six bits
1889   // in length other bits of the field are ignored."
1890   APIndex = APIndex.zextOrTrunc(6);
1891   APLength = APLength.zextOrTrunc(6);
1892 
1893   // Attempt to constant fold.
1894   unsigned Index = APIndex.getZExtValue();
1895 
1896   // From AMD documentation: "a value of zero in the field length is
1897   // defined as length of 64".
1898   unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
1899 
1900   // From AMD documentation: "If the sum of the bit index + length field
1901   // is greater than 64, the results are undefined".
1902   unsigned End = Index + Length;
1903 
1904   // Note that both field index and field length are 8-bit quantities.
1905   // Since variables 'Index' and 'Length' are unsigned values
1906   // obtained from zero-extending field index and field length
1907   // respectively, their sum should never wrap around.
1908   if (End > 64)
1909     return UndefValue::get(II.getType());
1910 
1911   // If we are inserting whole bytes, we can convert this to a shuffle.
1912   // Lowering can recognize INSERTQI shuffle masks.
1913   if ((Length % 8) == 0 && (Index % 8) == 0) {
1914     // Convert bit indices to byte indices.
1915     Length /= 8;
1916     Index /= 8;
1917 
1918     Type *IntTy8 = Type::getInt8Ty(II.getContext());
1919     auto *ShufTy = FixedVectorType::get(IntTy8, 16);
1920 
1921     SmallVector<int, 16> ShuffleMask;
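         // Keep the low Index bytes of Op0, splice in the low Length bytes of
         // Op1, then the remaining bytes of Op0's lower half; the upper 8 bytes
         // are undef.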
1922     for (int i = 0; i != (int)Index; ++i)
1923       ShuffleMask.push_back(i);
1924     for (int i = 0; i != (int)Length; ++i)
1925       ShuffleMask.push_back(i + 16);
1926     for (int i = Index + Length; i != 8; ++i)
1927       ShuffleMask.push_back(i);
1928     for (int i = 8; i != 16; ++i)
1929       ShuffleMask.push_back(-1);
1930 
1931     Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
1932                                             Builder.CreateBitCast(Op1, ShufTy),
1933                                             ShuffleMask);
1934     return Builder.CreateBitCast(SV, II.getType());
1935   }
1936 
1937   // See if we're dealing with constant values.
1938   auto *C0 = dyn_cast<Constant>(Op0);
1939   auto *C1 = dyn_cast<Constant>(Op1);
1940   auto *CI00 =
1941       C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
1942          : nullptr;
1943   auto *CI10 =
1944       C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
1945          : nullptr;
1946 
1947   // Constant Fold - insert bottom Length bits starting at the Index'th bit.
1948   if (CI00 && CI10) {
1949     APInt V00 = CI00->getValue();
1950     APInt V10 = CI10->getValue();
1951     APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
1952     V00 = V00 & ~Mask;
1953     V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
1954     APInt Val = V00 | V10;
1955     Type *IntTy64 = Type::getInt64Ty(II.getContext());
1956     Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
1957                         UndefValue::get(IntTy64)};
1958     return ConstantVector::get(Args);
1959   }
1960 
1961   // If we were an INSERTQ call, we'll save demanded elements if we convert to
1962   // INSERTQI.
1963   if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
1964     Type *IntTy8 = Type::getInt8Ty(II.getContext());
1965     Constant *CILength = ConstantInt::get(IntTy8, Length, false);
1966     Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
1967 
1968     Value *Args[] = {Op0, Op1, CILength, CIIndex};
1969     return Builder.CreateIntrinsic(Intrinsic::x86_sse4a_insertqi, {}, Args);
1970   }
1971 
1972   return nullptr;
1973 }
1974 
1975 /// Attempt to convert pshufb* to shufflevector if the mask is constant.
1976 static Value *simplifyX86pshufb(const IntrinsicInst &II,
1977                                 InstCombiner::BuilderTy &Builder) {
1978   auto *V = dyn_cast<Constant>(II.getArgOperand(1));
1979   if (!V)
1980     return nullptr;
1981 
1982   auto *VecTy = cast<FixedVectorType>(II.getType());
1983   unsigned NumElts = VecTy->getNumElements();
1984   assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
1985          "Unexpected number of elements in shuffle mask!");
1986 
1987   // Construct a shuffle mask from constant integers or UNDEFs.
1988   int Indexes[64];
1989 
1990   // Each byte in the shuffle control mask forms an index to permute the
1991   // corresponding byte in the destination operand.
1992   for (unsigned I = 0; I < NumElts; ++I) {
1993     Constant *COp = V->getAggregateElement(I);
1994     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
1995       return nullptr;
1996 
1997     if (isa<UndefValue>(COp)) {
1998       Indexes[I] = -1;
1999       continue;
2000     }
2001 
2002     int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
2003 
2004     // If the most significant bit (bit[7]) of each byte of the shuffle
2005     // control mask is set, then zero is written in the result byte.
2006     // The zero vector is in the right-hand side of the resulting
2007     // shufflevector.
2008 
2009     // The value of each index for the high 128-bit lane is the least
2010     // significant 4 bits of the respective shuffle control byte.
2011     Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
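         // For example, with 32 elements, control byte 0x83 at position 20 maps
         // to index 32 + 16 = 48 (a zero element), while 0x03 maps to 3 + 16 = 19.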
2012     Indexes[I] = Index;
2013   }
2014 
2015   auto V1 = II.getArgOperand(0);
2016   auto V2 = Constant::getNullValue(VecTy);
2017   return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts));
2018 }
2019 
2020 /// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
2021 static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
2022                                     InstCombiner::BuilderTy &Builder) {
2023   auto *V = dyn_cast<Constant>(II.getArgOperand(1));
2024   if (!V)
2025     return nullptr;
2026 
2027   auto *VecTy = cast<FixedVectorType>(II.getType());
2028   unsigned NumElts = VecTy->getNumElements();
2029   bool IsPD = VecTy->getScalarType()->isDoubleTy();
2030   unsigned NumLaneElts = IsPD ? 2 : 4;
2031   assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
2032 
2033   // Construct a shuffle mask from constant integers or UNDEFs.
2034   int Indexes[16];
2035 
2036   // The intrinsics only read one or two bits; clear the rest.
2037   for (unsigned I = 0; I < NumElts; ++I) {
2038     Constant *COp = V->getAggregateElement(I);
2039     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2040       return nullptr;
2041 
2042     if (isa<UndefValue>(COp)) {
2043       Indexes[I] = -1;
2044       continue;
2045     }
2046 
2047     APInt Index = cast<ConstantInt>(COp)->getValue();
2048     Index = Index.zextOrTrunc(32).getLoBits(2);
2049 
2050     // The PD variants use bit 1 to select the per-lane element index, so
2051     // shift down to convert to a generic shuffle mask index.
2052     if (IsPD)
2053       Index.lshrInPlace(1);
2054 
2055     // The _256 variants are a bit trickier since the mask bits always index
2056     // into the corresponding 128-bit half. In order to convert to a generic
2057     // shuffle, we have to make that explicit.
2058     Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
2059 
2060     Indexes[I] = Index.getZExtValue();
2061   }
2062 
2063   auto V1 = II.getArgOperand(0);
2064   return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, NumElts));
2065 }
2066 
2067 /// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
2068 static Value *simplifyX86vpermv(const IntrinsicInst &II,
2069                                 InstCombiner::BuilderTy &Builder) {
2070   auto *V = dyn_cast<Constant>(II.getArgOperand(1));
2071   if (!V)
2072     return nullptr;
2073 
2074   auto *VecTy = cast<FixedVectorType>(II.getType());
2075   unsigned Size = VecTy->getNumElements();
2076   assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
2077          "Unexpected shuffle mask size");
2078 
2079   // Construct a shuffle mask from constant integers or UNDEFs.
2080   int Indexes[64];
2081 
2082   for (unsigned I = 0; I < Size; ++I) {
2083     Constant *COp = V->getAggregateElement(I);
2084     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2085       return nullptr;
2086 
2087     if (isa<UndefValue>(COp)) {
2088       Indexes[I] = -1;
2089       continue;
2090     }
2091 
2092     uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
2093     Index &= Size - 1;
2094     Indexes[I] = Index;
2095   }
2096 
2097   auto V1 = II.getArgOperand(0);
2098   return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, Size));
2099 }
2100 
2101 /// Attempt to convert vpermi2/vpermt2 to shufflevector if the mask is constant.
2102 static Value *simplifyX86vpermv3(const IntrinsicInst &II,
2103                                  InstCombiner::BuilderTy &Builder) {
2104   auto *V = dyn_cast<Constant>(II.getArgOperand(1));
2105   if (!V)
2106     return nullptr;
2107 
2108   auto *VecTy = cast<FixedVectorType>(II.getType());
2109   unsigned Size = VecTy->getNumElements();
2110   assert((Size == 2 || Size == 4 || Size == 8 || Size == 16 || Size == 32 ||
2111           Size == 64) &&
2112          "Unexpected shuffle mask size");
2113 
2114   // Construct a shuffle mask from constant integers or UNDEFs.
2115   int Indexes[64];
2116 
2117   for (unsigned I = 0; I < Size; ++I) {
2118     Constant *COp = V->getAggregateElement(I);
2119     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2120       return nullptr;
2121 
2122     if (isa<UndefValue>(COp)) {
2123       Indexes[I] = -1;
2124       continue;
2125     }
2126 
2127     uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
2128     Index &= (2 * Size) - 1;
2129     Indexes[I] = Index;
2130   }
2131 
2132   auto V1 = II.getArgOperand(0);
2133   auto V2 = II.getArgOperand(2);
2134   return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, Size));
2135 }
2136 
2137 // Simplify VPERMV/VPERMV3 mask - only demand the active index bits.
2138 static bool simplifyX86VPERMMask(Instruction *II, bool IsBinary,
2139                                  InstCombiner &IC) {
2140   auto *VecTy = cast<FixedVectorType>(II->getType());
2141   unsigned EltSizeInBits = VecTy->getScalarSizeInBits();
2142   unsigned NumElts = VecTy->getNumElements();
2143   assert(isPowerOf2_32(NumElts) && isPowerOf2_32(EltSizeInBits) &&
2144          "Unexpected shuffle mask size");
2145 
2146   unsigned IdxSizeInBits = Log2_32(IsBinary ? (2 * NumElts) : NumElts);
2147   APInt DemandedMask = APInt::getLowBitsSet(EltSizeInBits, IdxSizeInBits);
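       // For example, a binary permute of 16 elements only needs Log2(32) = 5
       // index bits per mask element; the remaining bits are don't-care.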
2148 
2149   KnownBits KnownMask(EltSizeInBits);
2150   return IC.SimplifyDemandedBits(II, /*OpNo=*/1, DemandedMask, KnownMask);
2151 }
2152 
2153 std::optional<Instruction *>
2154 X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
2155   auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
2156                                              unsigned DemandedWidth) {
2157     APInt UndefElts(Width, 0);
2158     APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
2159     return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
2160   };
2161 
2162   Intrinsic::ID IID = II.getIntrinsicID();
2163   switch (IID) {
2164   case Intrinsic::x86_bmi_bextr_32:
2165   case Intrinsic::x86_bmi_bextr_64:
2166   case Intrinsic::x86_tbm_bextri_u32:
2167   case Intrinsic::x86_tbm_bextri_u64:
2168     // If the RHS is a constant we can try some simplifications.
2169     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2170       uint64_t Shift = C->getZExtValue();
2171       uint64_t Length = (Shift >> 8) & 0xff;
2172       Shift &= 0xff;
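           // For example, a control of 0x0804 encodes Shift = 4 and Length = 8:
           // extract 8 bits starting at bit 4.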
2173       unsigned BitWidth = II.getType()->getIntegerBitWidth();
2174       // If the length is 0 or the shift is out of range, replace with zero.
2175       if (Length == 0 || Shift >= BitWidth) {
2176         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2177       }
2178       // If the LHS is also a constant, we can completely constant fold this.
2179       if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2180         uint64_t Result = InC->getZExtValue() >> Shift;
2181         if (Length > BitWidth)
2182           Length = BitWidth;
2183         Result &= maskTrailingOnes<uint64_t>(Length);
2184         return IC.replaceInstUsesWith(II,
2185                                       ConstantInt::get(II.getType(), Result));
2186       }
2187       // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
2188       // are only masking bits that a shift already cleared?
2189     }
2190     break;
2191 
2192   case Intrinsic::x86_bmi_bzhi_32:
2193   case Intrinsic::x86_bmi_bzhi_64:
2194     // If the RHS is a constant we can try some simplifications.
2195     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2196       uint64_t Index = C->getZExtValue() & 0xff;
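           // For example, an index of 8 keeps only the low 8 bits (src & 0xff).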
2197       unsigned BitWidth = II.getType()->getIntegerBitWidth();
2198       if (Index >= BitWidth) {
2199         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2200       }
2201       if (Index == 0) {
2202         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2203       }
2204       // If the LHS is also a constant, we can completely constant fold this.
2205       if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2206         uint64_t Result = InC->getZExtValue();
2207         Result &= maskTrailingOnes<uint64_t>(Index);
2208         return IC.replaceInstUsesWith(II,
2209                                       ConstantInt::get(II.getType(), Result));
2210       }
2211       // TODO should we convert this to an AND if the RHS is constant?
2212     }
2213     break;
2214   case Intrinsic::x86_bmi_pext_32:
2215   case Intrinsic::x86_bmi_pext_64:
2216     if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2217       if (MaskC->isNullValue()) {
2218         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2219       }
2220       if (MaskC->isAllOnesValue()) {
2221         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2222       }
2223 
2224       unsigned MaskIdx, MaskLen;
2225       if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2226         // any single contingous sequence of 1s anywhere in the mask simply
2227         // Any single contiguous sequence of 1s anywhere in the mask simply
2228         // describes a subset of the input bits shifted to the appropriate
2229         // position. Replace with the straightforward IR.
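             // For example, pext(src, 0x00f0) becomes (src & 0x00f0) >> 4.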
2230         Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
2231         Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2232         Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
2233         return IC.replaceInstUsesWith(II, Shifted);
2234       }
2235 
2236       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2237         uint64_t Src = SrcC->getZExtValue();
2238         uint64_t Mask = MaskC->getZExtValue();
2239         uint64_t Result = 0;
2240         uint64_t BitToSet = 1;
2241 
2242         while (Mask) {
2243           // Isolate lowest set bit.
2244           uint64_t BitToTest = Mask & -Mask;
2245           if (BitToTest & Src)
2246             Result |= BitToSet;
2247 
2248           BitToSet <<= 1;
2249           // Clear lowest set bit.
2250           Mask &= Mask - 1;
2251         }
2252 
2253         return IC.replaceInstUsesWith(II,
2254                                       ConstantInt::get(II.getType(), Result));
2255       }
2256     }
2257     break;
2258   case Intrinsic::x86_bmi_pdep_32:
2259   case Intrinsic::x86_bmi_pdep_64:
2260     if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2261       if (MaskC->isNullValue()) {
2262         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2263       }
2264       if (MaskC->isAllOnesValue()) {
2265         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2266       }
2267 
2268       unsigned MaskIdx, MaskLen;
2269       if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2270         // Any single contiguous sequence of 1s anywhere in the mask simply
2271         // describes a subset of the input bits shifted to the appropriate
2272         // position. Replace with the straightforward IR.
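             // For example, pdep(src, 0x00f0) becomes (src << 4) & 0x00f0.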
2273         Value *Input = II.getArgOperand(0);
2274         Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2275         Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
2276         Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
2277         return IC.replaceInstUsesWith(II, Masked);
2278       }
2279 
2280       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2281         uint64_t Src = SrcC->getZExtValue();
2282         uint64_t Mask = MaskC->getZExtValue();
2283         uint64_t Result = 0;
2284         uint64_t BitToTest = 1;
2285 
2286         while (Mask) {
2287           // Isolate lowest set bit.
2288           uint64_t BitToSet = Mask & -Mask;
2289           if (BitToTest & Src)
2290             Result |= BitToSet;
2291 
2292           BitToTest <<= 1;
2293           // Clear lowest set bit.
2294           Mask &= Mask - 1;
2295         }
2296 
2297         return IC.replaceInstUsesWith(II,
2298                                       ConstantInt::get(II.getType(), Result));
2299       }
2300     }
2301     break;
2302 
2303   case Intrinsic::x86_sse_cvtss2si:
2304   case Intrinsic::x86_sse_cvtss2si64:
2305   case Intrinsic::x86_sse_cvttss2si:
2306   case Intrinsic::x86_sse_cvttss2si64:
2307   case Intrinsic::x86_sse2_cvtsd2si:
2308   case Intrinsic::x86_sse2_cvtsd2si64:
2309   case Intrinsic::x86_sse2_cvttsd2si:
2310   case Intrinsic::x86_sse2_cvttsd2si64:
2311   case Intrinsic::x86_avx512_vcvtss2si32:
2312   case Intrinsic::x86_avx512_vcvtss2si64:
2313   case Intrinsic::x86_avx512_vcvtss2usi32:
2314   case Intrinsic::x86_avx512_vcvtss2usi64:
2315   case Intrinsic::x86_avx512_vcvtsd2si32:
2316   case Intrinsic::x86_avx512_vcvtsd2si64:
2317   case Intrinsic::x86_avx512_vcvtsd2usi32:
2318   case Intrinsic::x86_avx512_vcvtsd2usi64:
2319   case Intrinsic::x86_avx512_cvttss2si:
2320   case Intrinsic::x86_avx512_cvttss2si64:
2321   case Intrinsic::x86_avx512_cvttss2usi:
2322   case Intrinsic::x86_avx512_cvttss2usi64:
2323   case Intrinsic::x86_avx512_cvttsd2si:
2324   case Intrinsic::x86_avx512_cvttsd2si64:
2325   case Intrinsic::x86_avx512_cvttsd2usi:
2326   case Intrinsic::x86_avx512_cvttsd2usi64: {
2327     // These intrinsics only demand the 0th element of their input vectors. If
2328     // we can simplify the input based on that, do so now.
2329     Value *Arg = II.getArgOperand(0);
2330     unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
2331     if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
2332       return IC.replaceOperand(II, 0, V);
2333     }
2334     break;
2335   }
2336 
2337   case Intrinsic::x86_mmx_pmovmskb:
2338   case Intrinsic::x86_sse_movmsk_ps:
2339   case Intrinsic::x86_sse2_movmsk_pd:
2340   case Intrinsic::x86_sse2_pmovmskb_128:
2341   case Intrinsic::x86_avx_movmsk_pd_256:
2342   case Intrinsic::x86_avx_movmsk_ps_256:
2343   case Intrinsic::x86_avx2_pmovmskb:
2344     if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
2345       return IC.replaceInstUsesWith(II, V);
2346     }
2347     break;
2348 
2349   case Intrinsic::x86_sse_comieq_ss:
2350   case Intrinsic::x86_sse_comige_ss:
2351   case Intrinsic::x86_sse_comigt_ss:
2352   case Intrinsic::x86_sse_comile_ss:
2353   case Intrinsic::x86_sse_comilt_ss:
2354   case Intrinsic::x86_sse_comineq_ss:
2355   case Intrinsic::x86_sse_ucomieq_ss:
2356   case Intrinsic::x86_sse_ucomige_ss:
2357   case Intrinsic::x86_sse_ucomigt_ss:
2358   case Intrinsic::x86_sse_ucomile_ss:
2359   case Intrinsic::x86_sse_ucomilt_ss:
2360   case Intrinsic::x86_sse_ucomineq_ss:
2361   case Intrinsic::x86_sse2_comieq_sd:
2362   case Intrinsic::x86_sse2_comige_sd:
2363   case Intrinsic::x86_sse2_comigt_sd:
2364   case Intrinsic::x86_sse2_comile_sd:
2365   case Intrinsic::x86_sse2_comilt_sd:
2366   case Intrinsic::x86_sse2_comineq_sd:
2367   case Intrinsic::x86_sse2_ucomieq_sd:
2368   case Intrinsic::x86_sse2_ucomige_sd:
2369   case Intrinsic::x86_sse2_ucomigt_sd:
2370   case Intrinsic::x86_sse2_ucomile_sd:
2371   case Intrinsic::x86_sse2_ucomilt_sd:
2372   case Intrinsic::x86_sse2_ucomineq_sd:
2373   case Intrinsic::x86_avx512_vcomi_ss:
2374   case Intrinsic::x86_avx512_vcomi_sd:
2375   case Intrinsic::x86_avx512_mask_cmp_ss:
2376   case Intrinsic::x86_avx512_mask_cmp_sd: {
2377     // These intrinsics only demand the 0th element of their input vectors. If
2378     // we can simplify the input based on that, do so now.
2379     bool MadeChange = false;
2380     Value *Arg0 = II.getArgOperand(0);
2381     Value *Arg1 = II.getArgOperand(1);
2382     unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
2383     if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
2384       IC.replaceOperand(II, 0, V);
2385       MadeChange = true;
2386     }
2387     if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
2388       IC.replaceOperand(II, 1, V);
2389       MadeChange = true;
2390     }
2391     if (MadeChange) {
2392       return &II;
2393     }
2394     break;
2395   }
2396 
2397   case Intrinsic::x86_avx512_add_ps_512:
2398   case Intrinsic::x86_avx512_div_ps_512:
2399   case Intrinsic::x86_avx512_mul_ps_512:
2400   case Intrinsic::x86_avx512_sub_ps_512:
2401   case Intrinsic::x86_avx512_add_pd_512:
2402   case Intrinsic::x86_avx512_div_pd_512:
2403   case Intrinsic::x86_avx512_mul_pd_512:
2404   case Intrinsic::x86_avx512_sub_pd_512:
2405     // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
2406     // IR operations.
2407     if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
2408       if (R->getValue() == 4) {
2409         Value *Arg0 = II.getArgOperand(0);
2410         Value *Arg1 = II.getArgOperand(1);
2411 
2412         Value *V;
2413         switch (IID) {
2414         default:
2415           llvm_unreachable("Case stmts out of sync!");
2416         case Intrinsic::x86_avx512_add_ps_512:
2417         case Intrinsic::x86_avx512_add_pd_512:
2418           V = IC.Builder.CreateFAdd(Arg0, Arg1);
2419           break;
2420         case Intrinsic::x86_avx512_sub_ps_512:
2421         case Intrinsic::x86_avx512_sub_pd_512:
2422           V = IC.Builder.CreateFSub(Arg0, Arg1);
2423           break;
2424         case Intrinsic::x86_avx512_mul_ps_512:
2425         case Intrinsic::x86_avx512_mul_pd_512:
2426           V = IC.Builder.CreateFMul(Arg0, Arg1);
2427           break;
2428         case Intrinsic::x86_avx512_div_ps_512:
2429         case Intrinsic::x86_avx512_div_pd_512:
2430           V = IC.Builder.CreateFDiv(Arg0, Arg1);
2431           break;
2432         }
2433 
2434         return IC.replaceInstUsesWith(II, V);
2435       }
2436     }
2437     break;
2438 
2439   case Intrinsic::x86_avx512_mask_add_ss_round:
2440   case Intrinsic::x86_avx512_mask_div_ss_round:
2441   case Intrinsic::x86_avx512_mask_mul_ss_round:
2442   case Intrinsic::x86_avx512_mask_sub_ss_round:
2443   case Intrinsic::x86_avx512_mask_add_sd_round:
2444   case Intrinsic::x86_avx512_mask_div_sd_round:
2445   case Intrinsic::x86_avx512_mask_mul_sd_round:
2446   case Intrinsic::x86_avx512_mask_sub_sd_round:
2447     // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
2448     // IR operations.
2449     if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
2450       if (R->getValue() == 4) {
2451         // Extract the element as scalars.
2452         Value *Arg0 = II.getArgOperand(0);
2453         Value *Arg1 = II.getArgOperand(1);
2454         Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
2455         Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
2456 
2457         Value *V;
2458         switch (IID) {
2459         default:
2460           llvm_unreachable("Case stmts out of sync!");
2461         case Intrinsic::x86_avx512_mask_add_ss_round:
2462         case Intrinsic::x86_avx512_mask_add_sd_round:
2463           V = IC.Builder.CreateFAdd(LHS, RHS);
2464           break;
2465         case Intrinsic::x86_avx512_mask_sub_ss_round:
2466         case Intrinsic::x86_avx512_mask_sub_sd_round:
2467           V = IC.Builder.CreateFSub(LHS, RHS);
2468           break;
2469         case Intrinsic::x86_avx512_mask_mul_ss_round:
2470         case Intrinsic::x86_avx512_mask_mul_sd_round:
2471           V = IC.Builder.CreateFMul(LHS, RHS);
2472           break;
2473         case Intrinsic::x86_avx512_mask_div_ss_round:
2474         case Intrinsic::x86_avx512_mask_div_sd_round:
2475           V = IC.Builder.CreateFDiv(LHS, RHS);
2476           break;
2477         }
2478 
2479         // Handle the masking aspect of the intrinsic.
2480         Value *Mask = II.getArgOperand(3);
2481         auto *C = dyn_cast<ConstantInt>(Mask);
2482         // We don't need a select if we know the mask bit is a 1.
2483         if (!C || !C->getValue()[0]) {
2484           // Cast the mask to an i1 vector and then extract the lowest element.
2485           auto *MaskTy = FixedVectorType::get(
2486               IC.Builder.getInt1Ty(),
2487               cast<IntegerType>(Mask->getType())->getBitWidth());
2488           Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
2489           Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
2490           // Extract the lowest element from the passthru operand.
2491           Value *Passthru =
2492               IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
2493           V = IC.Builder.CreateSelect(Mask, V, Passthru);
2494         }
2495 
2496         // Insert the result back into the original argument 0.
2497         V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
2498 
2499         return IC.replaceInstUsesWith(II, V);
2500       }
2501     }
2502     break;
2503 
2504   // Constant fold ashr( <A x Bi>, Ci ).
2505   // Constant fold lshr( <A x Bi>, Ci ).
2506   // Constant fold shl( <A x Bi>, Ci ).
2507   case Intrinsic::x86_sse2_psrai_d:
2508   case Intrinsic::x86_sse2_psrai_w:
2509   case Intrinsic::x86_avx2_psrai_d:
2510   case Intrinsic::x86_avx2_psrai_w:
2511   case Intrinsic::x86_avx512_psrai_q_128:
2512   case Intrinsic::x86_avx512_psrai_q_256:
2513   case Intrinsic::x86_avx512_psrai_d_512:
2514   case Intrinsic::x86_avx512_psrai_q_512:
2515   case Intrinsic::x86_avx512_psrai_w_512:
2516   case Intrinsic::x86_sse2_psrli_d:
2517   case Intrinsic::x86_sse2_psrli_q:
2518   case Intrinsic::x86_sse2_psrli_w:
2519   case Intrinsic::x86_avx2_psrli_d:
2520   case Intrinsic::x86_avx2_psrli_q:
2521   case Intrinsic::x86_avx2_psrli_w:
2522   case Intrinsic::x86_avx512_psrli_d_512:
2523   case Intrinsic::x86_avx512_psrli_q_512:
2524   case Intrinsic::x86_avx512_psrli_w_512:
2525   case Intrinsic::x86_sse2_pslli_d:
2526   case Intrinsic::x86_sse2_pslli_q:
2527   case Intrinsic::x86_sse2_pslli_w:
2528   case Intrinsic::x86_avx2_pslli_d:
2529   case Intrinsic::x86_avx2_pslli_q:
2530   case Intrinsic::x86_avx2_pslli_w:
2531   case Intrinsic::x86_avx512_pslli_d_512:
2532   case Intrinsic::x86_avx512_pslli_q_512:
2533   case Intrinsic::x86_avx512_pslli_w_512:
2534     if (Value *V = simplifyX86immShift(II, IC.Builder)) {
2535       return IC.replaceInstUsesWith(II, V);
2536     }
2537     break;
2538 
2539   case Intrinsic::x86_sse2_psra_d:
2540   case Intrinsic::x86_sse2_psra_w:
2541   case Intrinsic::x86_avx2_psra_d:
2542   case Intrinsic::x86_avx2_psra_w:
2543   case Intrinsic::x86_avx512_psra_q_128:
2544   case Intrinsic::x86_avx512_psra_q_256:
2545   case Intrinsic::x86_avx512_psra_d_512:
2546   case Intrinsic::x86_avx512_psra_q_512:
2547   case Intrinsic::x86_avx512_psra_w_512:
2548   case Intrinsic::x86_sse2_psrl_d:
2549   case Intrinsic::x86_sse2_psrl_q:
2550   case Intrinsic::x86_sse2_psrl_w:
2551   case Intrinsic::x86_avx2_psrl_d:
2552   case Intrinsic::x86_avx2_psrl_q:
2553   case Intrinsic::x86_avx2_psrl_w:
2554   case Intrinsic::x86_avx512_psrl_d_512:
2555   case Intrinsic::x86_avx512_psrl_q_512:
2556   case Intrinsic::x86_avx512_psrl_w_512:
2557   case Intrinsic::x86_sse2_psll_d:
2558   case Intrinsic::x86_sse2_psll_q:
2559   case Intrinsic::x86_sse2_psll_w:
2560   case Intrinsic::x86_avx2_psll_d:
2561   case Intrinsic::x86_avx2_psll_q:
2562   case Intrinsic::x86_avx2_psll_w:
2563   case Intrinsic::x86_avx512_psll_d_512:
2564   case Intrinsic::x86_avx512_psll_q_512:
2565   case Intrinsic::x86_avx512_psll_w_512: {
2566     if (Value *V = simplifyX86immShift(II, IC.Builder)) {
2567       return IC.replaceInstUsesWith(II, V);
2568     }
2569 
2570     // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
2571     // operand to compute the shift amount.
2572     Value *Arg1 = II.getArgOperand(1);
2573     assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
2574            "Unexpected packed shift size");
2575     unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();
2576 
2577     if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
2578       return IC.replaceOperand(II, 1, V);
2579     }
2580     break;
2581   }
2582 
2583   case Intrinsic::x86_avx2_psllv_d:
2584   case Intrinsic::x86_avx2_psllv_d_256:
2585   case Intrinsic::x86_avx2_psllv_q:
2586   case Intrinsic::x86_avx2_psllv_q_256:
2587   case Intrinsic::x86_avx512_psllv_d_512:
2588   case Intrinsic::x86_avx512_psllv_q_512:
2589   case Intrinsic::x86_avx512_psllv_w_128:
2590   case Intrinsic::x86_avx512_psllv_w_256:
2591   case Intrinsic::x86_avx512_psllv_w_512:
2592   case Intrinsic::x86_avx2_psrav_d:
2593   case Intrinsic::x86_avx2_psrav_d_256:
2594   case Intrinsic::x86_avx512_psrav_q_128:
2595   case Intrinsic::x86_avx512_psrav_q_256:
2596   case Intrinsic::x86_avx512_psrav_d_512:
2597   case Intrinsic::x86_avx512_psrav_q_512:
2598   case Intrinsic::x86_avx512_psrav_w_128:
2599   case Intrinsic::x86_avx512_psrav_w_256:
2600   case Intrinsic::x86_avx512_psrav_w_512:
2601   case Intrinsic::x86_avx2_psrlv_d:
2602   case Intrinsic::x86_avx2_psrlv_d_256:
2603   case Intrinsic::x86_avx2_psrlv_q:
2604   case Intrinsic::x86_avx2_psrlv_q_256:
2605   case Intrinsic::x86_avx512_psrlv_d_512:
2606   case Intrinsic::x86_avx512_psrlv_q_512:
2607   case Intrinsic::x86_avx512_psrlv_w_128:
2608   case Intrinsic::x86_avx512_psrlv_w_256:
2609   case Intrinsic::x86_avx512_psrlv_w_512:
2610     if (Value *V = simplifyX86varShift(II, IC.Builder)) {
2611       return IC.replaceInstUsesWith(II, V);
2612     }
2613     break;
2614 
2615   case Intrinsic::x86_sse2_packssdw_128:
2616   case Intrinsic::x86_sse2_packsswb_128:
2617   case Intrinsic::x86_avx2_packssdw:
2618   case Intrinsic::x86_avx2_packsswb:
2619   case Intrinsic::x86_avx512_packssdw_512:
2620   case Intrinsic::x86_avx512_packsswb_512:
2621     if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
2622       return IC.replaceInstUsesWith(II, V);
2623     }
2624     break;
2625 
2626   case Intrinsic::x86_sse2_packuswb_128:
2627   case Intrinsic::x86_sse41_packusdw:
2628   case Intrinsic::x86_avx2_packusdw:
2629   case Intrinsic::x86_avx2_packuswb:
2630   case Intrinsic::x86_avx512_packusdw_512:
2631   case Intrinsic::x86_avx512_packuswb_512:
2632     if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
2633       return IC.replaceInstUsesWith(II, V);
2634     }
2635     break;
2636 
2637   case Intrinsic::x86_sse2_pmulh_w:
2638   case Intrinsic::x86_avx2_pmulh_w:
2639   case Intrinsic::x86_avx512_pmulh_w_512:
2640     if (Value *V = simplifyX86pmulh(II, IC.Builder, true, false)) {
2641       return IC.replaceInstUsesWith(II, V);
2642     }
2643     break;
2644 
2645   case Intrinsic::x86_sse2_pmulhu_w:
2646   case Intrinsic::x86_avx2_pmulhu_w:
2647   case Intrinsic::x86_avx512_pmulhu_w_512:
2648     if (Value *V = simplifyX86pmulh(II, IC.Builder, false, false)) {
2649       return IC.replaceInstUsesWith(II, V);
2650     }
2651     break;
2652 
2653   case Intrinsic::x86_ssse3_pmul_hr_sw_128:
2654   case Intrinsic::x86_avx2_pmul_hr_sw:
2655   case Intrinsic::x86_avx512_pmul_hr_sw_512:
2656     if (Value *V = simplifyX86pmulh(II, IC.Builder, true, true)) {
2657       return IC.replaceInstUsesWith(II, V);
2658     }
2659     break;
2660 
2661   case Intrinsic::x86_sse2_pmadd_wd:
2662   case Intrinsic::x86_avx2_pmadd_wd:
2663   case Intrinsic::x86_avx512_pmaddw_d_512:
2664     if (Value *V = simplifyX86pmadd(II, IC.Builder, true)) {
2665       return IC.replaceInstUsesWith(II, V);
2666     }
2667     break;
2668 
2669   case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
2670   case Intrinsic::x86_avx2_pmadd_ub_sw:
2671   case Intrinsic::x86_avx512_pmaddubs_w_512:
2672     if (Value *V = simplifyX86pmadd(II, IC.Builder, false)) {
2673       return IC.replaceInstUsesWith(II, V);
2674     }
2675     break;
2676 
2677   case Intrinsic::x86_pclmulqdq:
2678   case Intrinsic::x86_pclmulqdq_256:
2679   case Intrinsic::x86_pclmulqdq_512: {
2680     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
2681       unsigned Imm = C->getZExtValue();
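           // Bit 0 of the immediate selects the low (0) or high (1) quadword of
           // each 128-bit lane of the first operand; bit 4 does the same for the
           // second operand, so only those elements are demanded below.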
2682 
2683       bool MadeChange = false;
2684       Value *Arg0 = II.getArgOperand(0);
2685       Value *Arg1 = II.getArgOperand(1);
2686       unsigned VWidth =
2687           cast<FixedVectorType>(Arg0->getType())->getNumElements();
2688 
2689       APInt UndefElts1(VWidth, 0);
2690       APInt DemandedElts1 =
2691           APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
2692       if (Value *V =
2693               IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
2694         IC.replaceOperand(II, 0, V);
2695         MadeChange = true;
2696       }
2697 
2698       APInt UndefElts2(VWidth, 0);
2699       APInt DemandedElts2 =
2700           APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
2701       if (Value *V =
2702               IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
2703         IC.replaceOperand(II, 1, V);
2704         MadeChange = true;
2705       }
2706 
2707       // If either input's demanded elements are undef, the result is zero.
2708       if (DemandedElts1.isSubsetOf(UndefElts1) ||
2709           DemandedElts2.isSubsetOf(UndefElts2)) {
2710         return IC.replaceInstUsesWith(II,
2711                                       ConstantAggregateZero::get(II.getType()));
2712       }
2713 
2714       if (MadeChange) {
2715         return &II;
2716       }
2717     }
2718     break;
2719   }
2720 
2721   case Intrinsic::x86_sse41_insertps:
2722     if (Value *V = simplifyX86insertps(II, IC.Builder)) {
2723       return IC.replaceInstUsesWith(II, V);
2724     }
2725     break;
2726 
2727   case Intrinsic::x86_sse4a_extrq: {
2728     Value *Op0 = II.getArgOperand(0);
2729     Value *Op1 = II.getArgOperand(1);
2730     unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
2731     unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
2732     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2733            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
2734            VWidth1 == 16 && "Unexpected operand sizes");
2735 
2736     // See if we're dealing with constant values.
2737     auto *C1 = dyn_cast<Constant>(Op1);
2738     auto *CILength =
2739         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
2740            : nullptr;
2741     auto *CIIndex =
2742         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
2743            : nullptr;
2744 
2745     // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
2746     if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
2747       return IC.replaceInstUsesWith(II, V);
2748     }
2749 
2750     // EXTRQ only uses the lowest 64 bits of the first 128-bit vector
2751     // operand and the lowest 16 bits of the second.
2752     bool MadeChange = false;
2753     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
2754       IC.replaceOperand(II, 0, V);
2755       MadeChange = true;
2756     }
2757     if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
2758       IC.replaceOperand(II, 1, V);
2759       MadeChange = true;
2760     }
2761     if (MadeChange) {
2762       return &II;
2763     }
2764     break;
2765   }
2766 
2767   case Intrinsic::x86_sse4a_extrqi: {
2768     // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
2769     // bits of the lower 64-bits. The upper 64-bits are undefined.
2770     Value *Op0 = II.getArgOperand(0);
2771     unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
2772     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
2773            "Unexpected operand size");
2774 
2775     // See if we're dealing with constant values.
2776     auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
2777     auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
2778 
2779     // Attempt to simplify to a constant or shuffle vector.
2780     if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
2781       return IC.replaceInstUsesWith(II, V);
2782     }
2783 
2784     // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
2785     // operand.
2786     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
2787       return IC.replaceOperand(II, 0, V);
2788     }
2789     break;
2790   }
2791 
2792   case Intrinsic::x86_sse4a_insertq: {
2793     Value *Op0 = II.getArgOperand(0);
2794     Value *Op1 = II.getArgOperand(1);
2795     unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
2796     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2797            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
2798            cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
2799            "Unexpected operand size");
2800 
2801     // See if we're dealing with constant values.
2802     auto *C1 = dyn_cast<Constant>(Op1);
2803     auto *CI11 =
2804         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
2805            : nullptr;
2806 
2807     // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
2808     if (CI11) {
2809       const APInt &V11 = CI11->getValue();
2810       APInt Len = V11.zextOrTrunc(6);
2811       APInt Idx = V11.lshr(8).zextOrTrunc(6);
2812       if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
2813         return IC.replaceInstUsesWith(II, V);
2814       }
2815     }
2816 
2817     // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
2818     // operand.
2819     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
2820       return IC.replaceOperand(II, 0, V);
2821     }
2822     break;
2823   }
2824 
2825   case Intrinsic::x86_sse4a_insertqi: {
2826     // INSERTQI: Extract lowest Length bits from lower half of second source and
2827     // insert over first source starting at Index bit. The upper 64-bits are
2828     // undefined.
2829     Value *Op0 = II.getArgOperand(0);
2830     Value *Op1 = II.getArgOperand(1);
2831     unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
2832     unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
2833     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2834            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
2835            VWidth1 == 2 && "Unexpected operand sizes");
2836 
2837     // See if we're dealing with constant values.
2838     auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
2839     auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
2840 
2841     // Attempt to simplify to a constant or shuffle vector.
2842     if (CILength && CIIndex) {
2843       APInt Len = CILength->getValue().zextOrTrunc(6);
2844       APInt Idx = CIIndex->getValue().zextOrTrunc(6);
2845       if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
2846         return IC.replaceInstUsesWith(II, V);
2847       }
2848     }
2849 
2850     // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
2851     // operands.
2852     bool MadeChange = false;
2853     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
2854       IC.replaceOperand(II, 0, V);
2855       MadeChange = true;
2856     }
2857     if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
2858       IC.replaceOperand(II, 1, V);
2859       MadeChange = true;
2860     }
2861     if (MadeChange) {
2862       return &II;
2863     }
2864     break;
2865   }
2866 
2867   case Intrinsic::x86_sse41_pblendvb:
2868   case Intrinsic::x86_sse41_blendvps:
2869   case Intrinsic::x86_sse41_blendvpd:
2870   case Intrinsic::x86_avx_blendv_ps_256:
2871   case Intrinsic::x86_avx_blendv_pd_256:
2872   case Intrinsic::x86_avx2_pblendvb: {
2873     // fold (blend A, A, Mask) -> A
2874     Value *Op0 = II.getArgOperand(0);
2875     Value *Op1 = II.getArgOperand(1);
2876     Value *Mask = II.getArgOperand(2);
2877     if (Op0 == Op1) {
2878       return IC.replaceInstUsesWith(II, Op0);
2879     }
2880 
2881     // Zero Mask - select 1st argument.
2882     if (isa<ConstantAggregateZero>(Mask)) {
2883       return IC.replaceInstUsesWith(II, Op0);
2884     }
2885 
2886     // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
2887     if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
2888       Constant *NewSelector =
2889           getNegativeIsTrueBoolVec(ConstantMask, IC.getDataLayout());
2890       return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
2891     }
2892 
2893     Mask = InstCombiner::peekThroughBitcast(Mask);
2894 
2895     // Peek through a one-use shuffle - VectorCombine should have simplified
2896     // this for cases where we're splitting wider vectors to use blendv
2897     // intrinsics.
2898     Value *MaskSrc = nullptr;
2899     ArrayRef<int> ShuffleMask;
2900     if (match(Mask, m_OneUse(m_Shuffle(m_Value(MaskSrc), m_Undef(),
2901                                        m_Mask(ShuffleMask))))) {
2902       // Bail if the shuffle was irregular or contains undefs.
2903       int NumElts = cast<FixedVectorType>(MaskSrc->getType())->getNumElements();
2904       if (NumElts < (int)ShuffleMask.size() || !isPowerOf2_32(NumElts) ||
2905           any_of(ShuffleMask,
2906                  [NumElts](int M) { return M < 0 || M >= NumElts; }))
2907         break;
2908       Mask = InstCombiner::peekThroughBitcast(MaskSrc);
2909     }
2910 
2911     // Convert to a vector select if we can bypass casts and find a boolean
2912     // vector condition value.
2913     Value *BoolVec;
2914     if (match(Mask, m_SExt(m_Value(BoolVec))) &&
2915         BoolVec->getType()->isVectorTy() &&
2916         BoolVec->getType()->getScalarSizeInBits() == 1) {
2917       auto *MaskTy = cast<FixedVectorType>(Mask->getType());
2918       auto *OpTy = cast<FixedVectorType>(II.getType());
2919       unsigned NumMaskElts = MaskTy->getNumElements();
2920       unsigned NumOperandElts = OpTy->getNumElements();
2921 
2922       // If we peeked through a shuffle, reapply the shuffle to the bool vector.
2923       if (MaskSrc) {
2924         unsigned NumMaskSrcElts =
2925             cast<FixedVectorType>(MaskSrc->getType())->getNumElements();
2926         NumMaskElts = (ShuffleMask.size() * NumMaskElts) / NumMaskSrcElts;
2927         // Multiple mask bits map to the same operand element - bail out.
2928         if (NumMaskElts > NumOperandElts)
2929           break;
2930         SmallVector<int> ScaledMask;
2931         if (!llvm::scaleShuffleMaskElts(NumMaskElts, ShuffleMask, ScaledMask))
2932           break;
2933         BoolVec = IC.Builder.CreateShuffleVector(BoolVec, ScaledMask);
2934         MaskTy = FixedVectorType::get(MaskTy->getElementType(), NumMaskElts);
2935       }
2936       assert(MaskTy->getPrimitiveSizeInBits() ==
2937                  OpTy->getPrimitiveSizeInBits() &&
2938              "Not expecting mask and operands with different sizes");
2939 
2940       if (NumMaskElts == NumOperandElts) {
2941         return SelectInst::Create(BoolVec, Op1, Op0);
2942       }
2943 
2944       // If the mask has fewer elements than the operands, each mask bit maps
2945       // to multiple elements of the operands. Bitcast back and forth.
2946       if (NumMaskElts < NumOperandElts) {
2947         Value *CastOp0 = IC.Builder.CreateBitCast(Op0, MaskTy);
2948         Value *CastOp1 = IC.Builder.CreateBitCast(Op1, MaskTy);
2949         Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
2950         return new BitCastInst(Sel, II.getType());
2951       }
2952     }
2953 
2954     break;
2955   }
2956 
2957   case Intrinsic::x86_ssse3_pshuf_b_128:
2958   case Intrinsic::x86_avx2_pshuf_b:
2959   case Intrinsic::x86_avx512_pshuf_b_512: {
2960     if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
2961       return IC.replaceInstUsesWith(II, V);
2962     }
2963 
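         // Only the low 4 index bits and the zeroing bit (bit 7) of each control
         // byte are used.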
2964     KnownBits KnownMask(8);
2965     if (IC.SimplifyDemandedBits(&II, 1, APInt(8, 0b10001111), KnownMask))
2966       return &II;
2967     break;
2968   }
2969 
2970   case Intrinsic::x86_avx_vpermilvar_ps:
2971   case Intrinsic::x86_avx_vpermilvar_ps_256:
2972   case Intrinsic::x86_avx512_vpermilvar_ps_512: {
2973     if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
2974       return IC.replaceInstUsesWith(II, V);
2975     }
2976 
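         // Only the low 2 bits of each 32-bit index are used.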
2977     KnownBits KnownMask(32);
2978     if (IC.SimplifyDemandedBits(&II, 1, APInt(32, 0b00011), KnownMask))
2979       return &II;
2980     break;
2981   }
2982 
2983   case Intrinsic::x86_avx_vpermilvar_pd:
2984   case Intrinsic::x86_avx_vpermilvar_pd_256:
2985   case Intrinsic::x86_avx512_vpermilvar_pd_512: {
2986     if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
2987       return IC.replaceInstUsesWith(II, V);
2988     }
2989 
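         // Only bit 1 of each 64-bit index is used.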
2990     KnownBits KnownMask(64);
2991     if (IC.SimplifyDemandedBits(&II, 1, APInt(64, 0b00010), KnownMask))
2992       return &II;
2993     break;
2994   }
2995 
2996   case Intrinsic::x86_avx2_permd:
2997   case Intrinsic::x86_avx2_permps:
2998   case Intrinsic::x86_avx512_permvar_df_256:
2999   case Intrinsic::x86_avx512_permvar_df_512:
3000   case Intrinsic::x86_avx512_permvar_di_256:
3001   case Intrinsic::x86_avx512_permvar_di_512:
3002   case Intrinsic::x86_avx512_permvar_hi_128:
3003   case Intrinsic::x86_avx512_permvar_hi_256:
3004   case Intrinsic::x86_avx512_permvar_hi_512:
3005   case Intrinsic::x86_avx512_permvar_qi_128:
3006   case Intrinsic::x86_avx512_permvar_qi_256:
3007   case Intrinsic::x86_avx512_permvar_qi_512:
3008   case Intrinsic::x86_avx512_permvar_sf_512:
3009   case Intrinsic::x86_avx512_permvar_si_512:
3010     if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
3011       return IC.replaceInstUsesWith(II, V);
3012     }
3013     if (simplifyX86VPERMMask(&II, /*IsBinary=*/false, IC))
3014       return &II;
3015     break;
3016 
3017   case Intrinsic::x86_avx512_vpermi2var_d_128:
3018   case Intrinsic::x86_avx512_vpermi2var_d_256:
3019   case Intrinsic::x86_avx512_vpermi2var_d_512:
3020   case Intrinsic::x86_avx512_vpermi2var_hi_128:
3021   case Intrinsic::x86_avx512_vpermi2var_hi_256:
3022   case Intrinsic::x86_avx512_vpermi2var_hi_512:
3023   case Intrinsic::x86_avx512_vpermi2var_pd_128:
3024   case Intrinsic::x86_avx512_vpermi2var_pd_256:
3025   case Intrinsic::x86_avx512_vpermi2var_pd_512:
3026   case Intrinsic::x86_avx512_vpermi2var_ps_128:
3027   case Intrinsic::x86_avx512_vpermi2var_ps_256:
3028   case Intrinsic::x86_avx512_vpermi2var_ps_512:
3029   case Intrinsic::x86_avx512_vpermi2var_q_128:
3030   case Intrinsic::x86_avx512_vpermi2var_q_256:
3031   case Intrinsic::x86_avx512_vpermi2var_q_512:
3032   case Intrinsic::x86_avx512_vpermi2var_qi_128:
3033   case Intrinsic::x86_avx512_vpermi2var_qi_256:
3034   case Intrinsic::x86_avx512_vpermi2var_qi_512:
3035     if (Value *V = simplifyX86vpermv3(II, IC.Builder)) {
3036       return IC.replaceInstUsesWith(II, V);
3037     }
3038     if (simplifyX86VPERMMask(&II, /*IsBinary=*/true, IC))
3039       return &II;
3040     break;
3041 
3042   case Intrinsic::x86_avx_maskload_ps:
3043   case Intrinsic::x86_avx_maskload_pd:
3044   case Intrinsic::x86_avx_maskload_ps_256:
3045   case Intrinsic::x86_avx_maskload_pd_256:
3046   case Intrinsic::x86_avx2_maskload_d:
3047   case Intrinsic::x86_avx2_maskload_q:
3048   case Intrinsic::x86_avx2_maskload_d_256:
3049   case Intrinsic::x86_avx2_maskload_q_256:
3050     if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
3051       return I;
3052     }
3053     break;
3054 
3055   case Intrinsic::x86_sse2_maskmov_dqu:
3056   case Intrinsic::x86_avx_maskstore_ps:
3057   case Intrinsic::x86_avx_maskstore_pd:
3058   case Intrinsic::x86_avx_maskstore_ps_256:
3059   case Intrinsic::x86_avx_maskstore_pd_256:
3060   case Intrinsic::x86_avx2_maskstore_d:
3061   case Intrinsic::x86_avx2_maskstore_q:
3062   case Intrinsic::x86_avx2_maskstore_d_256:
3063   case Intrinsic::x86_avx2_maskstore_q_256:
3064     if (simplifyX86MaskedStore(II, IC)) {
3065       return nullptr;
3066     }
3067     break;
3068 
3069   case Intrinsic::x86_addcarry_32:
3070   case Intrinsic::x86_addcarry_64:
3071     if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
3072       return IC.replaceInstUsesWith(II, V);
3073     }
3074     break;
3075 
3076   case Intrinsic::x86_avx512_pternlog_d_128:
3077   case Intrinsic::x86_avx512_pternlog_d_256:
3078   case Intrinsic::x86_avx512_pternlog_d_512:
3079   case Intrinsic::x86_avx512_pternlog_q_128:
3080   case Intrinsic::x86_avx512_pternlog_q_256:
3081   case Intrinsic::x86_avx512_pternlog_q_512:
3082     if (Value *V = simplifyTernarylogic(II, IC.Builder)) {
3083       return IC.replaceInstUsesWith(II, V);
3084     }
3085     break;
3086   default:
3087     break;
3088   }
3089   return std::nullopt;
3090 }
3091 
3092 std::optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
3093     InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
3094     bool &KnownBitsComputed) const {
3095   switch (II.getIntrinsicID()) {
3096   default:
3097     break;
3098   case Intrinsic::x86_mmx_pmovmskb:
3099   case Intrinsic::x86_sse_movmsk_ps:
3100   case Intrinsic::x86_sse2_movmsk_pd:
3101   case Intrinsic::x86_sse2_pmovmskb_128:
3102   case Intrinsic::x86_avx_movmsk_ps_256:
3103   case Intrinsic::x86_avx_movmsk_pd_256:
3104   case Intrinsic::x86_avx2_pmovmskb: {
3105     // MOVMSK copies the vector elements' sign bits to the low bits
3106     // and zeros the high bits.
3107     unsigned ArgWidth;
3108     if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
3109       ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
3110     } else {
3111       auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
3112       ArgWidth = ArgType->getNumElements();
3113     }
3114 
3115     // If we don't need any of the low bits then return zero;
3116     // we know that DemandedMask is non-zero already.
3117     APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
3118     Type *VTy = II.getType();
3119     if (DemandedElts.isZero()) {
3120       return ConstantInt::getNullValue(VTy);
3121     }
3122 
3123     // We know that the upper bits are set to zero.
3124     Known.Zero.setBitsFrom(ArgWidth);
3125     KnownBitsComputed = true;
3126     break;
3127   }
3128   }
3129   return std::nullopt;
3130 }
3131 
3132 std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
3133     InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
3134     APInt &UndefElts2, APInt &UndefElts3,
3135     std::function<void(Instruction *, unsigned, APInt, APInt &)>
3136         simplifyAndSetOp) const {
3137   unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
3138   switch (II.getIntrinsicID()) {
3139   default:
3140     break;
3141   case Intrinsic::x86_xop_vfrcz_ss:
3142   case Intrinsic::x86_xop_vfrcz_sd:
3143     // The instructions for these intrinsics are specified to zero the upper
3144     // bits rather than pass them through like other scalar intrinsics, so we
3145     // shouldn't just use Arg0 if DemandedElts[0] is clear as we do for other
3146     // intrinsics. Instead we should return a zero vector.
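         // e.g. vfrczss produces <frcz(X[0]), 0, 0, 0>, so when element 0 is
         // not demanded every demanded element is known to be zero.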
3147     if (!DemandedElts[0]) {
3148       IC.addToWorklist(&II);
3149       return ConstantAggregateZero::get(II.getType());
3150     }
3151 
3152     // Only the lower element is used.
3153     DemandedElts = 1;
3154     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3155 
3156     // Only the lower element is undefined. The high elements are zero.
3157     UndefElts = UndefElts[0];
3158     break;
3159 
3160   // Unary scalar-as-vector operations that work column-wise.
3161   case Intrinsic::x86_sse_rcp_ss:
3162   case Intrinsic::x86_sse_rsqrt_ss:
3163     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3164 
3165     // If lowest element of a scalar op isn't used then use Arg0.
3166     if (!DemandedElts[0]) {
3167       IC.addToWorklist(&II);
3168       return II.getArgOperand(0);
3169     }
3170     // TODO: If only the low element is used, lower SQRT to FSQRT (with
3171     // rounding/exception checks).
3172     break;
3173 
3174   // Binary scalar-as-vector operations that work column-wise. The high
3175   // elements come from operand 0. The low element is a function of both
3176   // operands.
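       // e.g. minss(A, B) = <minss(A[0],B[0]), A[1], A[2], A[3]>.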
3177   case Intrinsic::x86_sse_min_ss:
3178   case Intrinsic::x86_sse_max_ss:
3179   case Intrinsic::x86_sse_cmp_ss:
3180   case Intrinsic::x86_sse2_min_sd:
3181   case Intrinsic::x86_sse2_max_sd:
3182   case Intrinsic::x86_sse2_cmp_sd: {
3183     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3184 
3185     // If lowest element of a scalar op isn't used then use Arg0.
3186     if (!DemandedElts[0]) {
3187       IC.addToWorklist(&II);
3188       return II.getArgOperand(0);
3189     }
3190 
3191     // Only lower element is used for operand 1.
3192     DemandedElts = 1;
3193     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3194 
3195     // Lower element is undefined if both lower elements are undefined.
3196     // Consider things like undef&0.  The result is known zero, not undef.
3197     if (!UndefElts2[0])
3198       UndefElts.clearBit(0);
3199 
3200     break;
3201   }
3202 
3203   // Binary scalar-as-vector operations that work column-wise. The high
3204   // elements come from operand 0 and the low element comes from operand 1.
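       // e.g. roundss(A, B, imm) = <round(B[0], imm), A[1], A[2], A[3]>.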
3205   case Intrinsic::x86_sse41_round_ss:
3206   case Intrinsic::x86_sse41_round_sd: {
3207     // Don't use the low element of operand 0.
3208     APInt DemandedElts2 = DemandedElts;
3209     DemandedElts2.clearBit(0);
3210     simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
3211 
3212     // If lowest element of a scalar op isn't used then use Arg0.
3213     if (!DemandedElts[0]) {
3214       IC.addToWorklist(&II);
3215       return II.getArgOperand(0);
3216     }
3217 
3218     // Only lower element is used for operand 1.
3219     DemandedElts = 1;
3220     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3221 
3222     // Take the high undef elements from operand 0 and take the lower element
3223     // from operand 1.
3224     UndefElts.clearBit(0);
3225     UndefElts |= UndefElts2[0];
3226     break;
3227   }
3228 
3229   // Three input scalar-as-vector operations that work column-wise. The high
3230   // elements come from operand 0 and the low element is a function of all
3231   // three inputs.
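       // e.g. mask.add.ss.round(A, B, Src, K, R) =
       //      <K[0] ? A[0]+B[0] : Src[0], A[1], A[2], A[3]>.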
3232   case Intrinsic::x86_avx512_mask_add_ss_round:
3233   case Intrinsic::x86_avx512_mask_div_ss_round:
3234   case Intrinsic::x86_avx512_mask_mul_ss_round:
3235   case Intrinsic::x86_avx512_mask_sub_ss_round:
3236   case Intrinsic::x86_avx512_mask_max_ss_round:
3237   case Intrinsic::x86_avx512_mask_min_ss_round:
3238   case Intrinsic::x86_avx512_mask_add_sd_round:
3239   case Intrinsic::x86_avx512_mask_div_sd_round:
3240   case Intrinsic::x86_avx512_mask_mul_sd_round:
3241   case Intrinsic::x86_avx512_mask_sub_sd_round:
3242   case Intrinsic::x86_avx512_mask_max_sd_round:
3243   case Intrinsic::x86_avx512_mask_min_sd_round:
3244     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3245 
3246     // If lowest element of a scalar op isn't used then use Arg0.
3247     if (!DemandedElts[0]) {
3248       IC.addToWorklist(&II);
3249       return II.getArgOperand(0);
3250     }
3251 
3252     // Only lower element is used for operands 1 and 2.
3253     DemandedElts = 1;
3254     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3255     simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);
3256 
3257     // Lower element is undefined if all three lower elements are undefined.
3258     // Consider things like undef&0.  The result is known zero, not undef.
3259     if (!UndefElts2[0] || !UndefElts3[0])
3260       UndefElts.clearBit(0);
3261     break;
3262 
3263   // TODO: Add fmaddsub support?
3264   case Intrinsic::x86_sse3_addsub_pd:
3265   case Intrinsic::x86_sse3_addsub_ps:
3266   case Intrinsic::x86_avx_addsub_pd_256:
3267   case Intrinsic::x86_avx_addsub_ps_256: {
3268     // If none of the even or none of the odd lanes are required, turn this
3269     // into a generic FP math instruction.
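         // e.g. addsubps(A, B) = <A[0]-B[0], A[1]+B[1], A[2]-B[2], A[3]+B[3]>,
         // i.e. the even lanes subtract (SubMask) and the odd lanes add
         // (AddMask).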
3270     APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
3271     APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
3272     bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
3273     bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
3274     if (IsSubOnly || IsAddOnly) {
3275       assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
3276       IRBuilderBase::InsertPointGuard Guard(IC.Builder);
3277       IC.Builder.SetInsertPoint(&II);
3278       Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
3279       return IC.Builder.CreateBinOp(
3280           IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
3281     }
3282 
3283     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3284     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3285     UndefElts &= UndefElts2;
3286     break;
3287   }
3288 
3289   // General per-element vector operations.
3290   case Intrinsic::x86_avx2_psllv_d:
3291   case Intrinsic::x86_avx2_psllv_d_256:
3292   case Intrinsic::x86_avx2_psllv_q:
3293   case Intrinsic::x86_avx2_psllv_q_256:
3294   case Intrinsic::x86_avx2_psrlv_d:
3295   case Intrinsic::x86_avx2_psrlv_d_256:
3296   case Intrinsic::x86_avx2_psrlv_q:
3297   case Intrinsic::x86_avx2_psrlv_q_256:
3298   case Intrinsic::x86_avx2_psrav_d:
3299   case Intrinsic::x86_avx2_psrav_d_256: {
3300     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3301     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3302     UndefElts &= UndefElts2;
3303     break;
3304   }
3305 
3306   case Intrinsic::x86_sse2_pmulh_w:
3307   case Intrinsic::x86_avx2_pmulh_w:
3308   case Intrinsic::x86_avx512_pmulh_w_512:
3309   case Intrinsic::x86_sse2_pmulhu_w:
3310   case Intrinsic::x86_avx2_pmulhu_w:
3311   case Intrinsic::x86_avx512_pmulhu_w_512:
3312   case Intrinsic::x86_ssse3_pmul_hr_sw_128:
3313   case Intrinsic::x86_avx2_pmul_hr_sw:
3314   case Intrinsic::x86_avx512_pmul_hr_sw_512: {
3315     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3316     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3317     // NOTE: mulh(undef,undef) != undef.
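         // e.g. an unsigned pmulhu.w element can never be 0xFFFF (the largest
         // product 0xFFFF*0xFFFF = 0xFFFE0001 has a high half of 0xFFFE), so
         // even all-undef operands don't produce an undef result.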
3318     break;
3319   }
3320 
3321   case Intrinsic::x86_sse2_packssdw_128:
3322   case Intrinsic::x86_sse2_packsswb_128:
3323   case Intrinsic::x86_sse2_packuswb_128:
3324   case Intrinsic::x86_sse41_packusdw:
3325   case Intrinsic::x86_avx2_packssdw:
3326   case Intrinsic::x86_avx2_packsswb:
3327   case Intrinsic::x86_avx2_packusdw:
3328   case Intrinsic::x86_avx2_packuswb:
3329   case Intrinsic::x86_avx512_packssdw_512:
3330   case Intrinsic::x86_avx512_packsswb_512:
3331   case Intrinsic::x86_avx512_packusdw_512:
3332   case Intrinsic::x86_avx512_packuswb_512: {
3333     auto *Ty0 = II.getArgOperand(0)->getType();
3334     unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
3335     assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
3336 
3337     unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
3338     unsigned VWidthPerLane = VWidth / NumLanes;
3339     unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
3340 
3341     // Per lane, pack the elements of the first input and then the second.
3342     // e.g.
3343     // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
3344     // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
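         // Result element (LaneIdx + Elt + InnerVWidthPerLane * OpNum) comes
         // from operand OpNum element (Lane * InnerVWidthPerLane + Elt).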
3345     for (int OpNum = 0; OpNum != 2; ++OpNum) {
3346       APInt OpDemandedElts(InnerVWidth, 0);
3347       for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3348         unsigned LaneIdx = Lane * VWidthPerLane;
3349         for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
3350           unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
3351           if (DemandedElts[Idx])
3352             OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
3353         }
3354       }
3355 
3356       // Demand elements from the operand.
3357       APInt OpUndefElts(InnerVWidth, 0);
3358       simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);
3359 
3360       // Pack the operand's UNDEF elements, one lane at a time.
3361       OpUndefElts = OpUndefElts.zext(VWidth);
3362       for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3363         APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
3364         LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
3365         LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
3366         UndefElts |= LaneElts;
3367       }
3368     }
3369     break;
3370   }
3371 
3372   case Intrinsic::x86_sse2_pmadd_wd:
3373   case Intrinsic::x86_avx2_pmadd_wd:
3374   case Intrinsic::x86_avx512_pmaddw_d_512:
3375   case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
3376   case Intrinsic::x86_avx2_pmadd_ub_sw:
3377   case Intrinsic::x86_avx512_pmaddubs_w_512: {
3378     // PMADD - demand both src elements that map to each dst element.
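         // e.g. pmaddwd: dst[i] = a[2*i]*b[2*i] + a[2*i+1]*b[2*i+1], so result
         // element i demands source elements 2*i and 2*i+1 of both operands.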
3379     auto *ArgTy = II.getArgOperand(0)->getType();
3380     unsigned InnerVWidth = cast<FixedVectorType>(ArgTy)->getNumElements();
3381     assert((VWidth * 2) == InnerVWidth && "Unexpected input size");
3382     APInt OpDemandedElts = APIntOps::ScaleBitMask(DemandedElts, InnerVWidth);
3383     APInt Op0UndefElts(InnerVWidth, 0);
3384     APInt Op1UndefElts(InnerVWidth, 0);
3385     simplifyAndSetOp(&II, 0, OpDemandedElts, Op0UndefElts);
3386     simplifyAndSetOp(&II, 1, OpDemandedElts, Op1UndefElts);
3387     // NOTE: madd(undef,undef) != undef.
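         // (e.g. pmaddwd can't produce every possible 32-bit value from any
         // choice of operands, so the result isn't marked undef even when both
         // operands are.)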
3388     break;
3389   }
3390 
3391   // PSHUFB
3392   case Intrinsic::x86_ssse3_pshuf_b_128:
3393   case Intrinsic::x86_avx2_pshuf_b:
3394   case Intrinsic::x86_avx512_pshuf_b_512:
3395   // PERMILVAR
3396   case Intrinsic::x86_avx_vpermilvar_ps:
3397   case Intrinsic::x86_avx_vpermilvar_ps_256:
3398   case Intrinsic::x86_avx512_vpermilvar_ps_512:
3399   case Intrinsic::x86_avx_vpermilvar_pd:
3400   case Intrinsic::x86_avx_vpermilvar_pd_256:
3401   case Intrinsic::x86_avx512_vpermilvar_pd_512:
3402   // PERMV
3403   case Intrinsic::x86_avx2_permd:
3404   case Intrinsic::x86_avx2_permps: {
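         // Result element i depends only on element i of the control/index
         // vector (operand 1); any element of the data vector (operand 0) may
         // be selected, so only operand 1 mirrors the demanded result elements.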
3405     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
3406     break;
3407   }
3408 
3409   // SSE4A instructions leave the upper 64-bits of the 128-bit result
3410   // in an undefined state.
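       // e.g. for the <2 x i64> result of extrq/insertq, element 1 is undefined.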
3411   case Intrinsic::x86_sse4a_extrq:
3412   case Intrinsic::x86_sse4a_extrqi:
3413   case Intrinsic::x86_sse4a_insertq:
3414   case Intrinsic::x86_sse4a_insertqi:
3415     UndefElts.setHighBits(VWidth / 2);
3416     break;
3417   }
3418   return std::nullopt;
3419 }
3420