1 //===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the X86-specific hooks of the InstCombine pass. It
10 /// simplifies calls to X86 intrinsics, folding them into generic LLVM IR or
11 /// simpler intrinsic forms where the target's detailed knowledge allows it,
12 /// while letting the target-independent InstCombine logic handle the rest.
13 ///
14 //===----------------------------------------------------------------------===//
15 
16 #include "X86TargetTransformInfo.h"
17 #include "llvm/IR/IntrinsicInst.h"
18 #include "llvm/IR/IntrinsicsX86.h"
19 #include "llvm/Support/KnownBits.h"
20 #include "llvm/Transforms/InstCombine/InstCombiner.h"
21 #include <optional>
22 
23 using namespace llvm;
24 
25 #define DEBUG_TYPE "x86tti"
26 
27 /// Return a constant boolean vector that has true elements in all positions
28 /// where the input constant data vector has an element with the sign bit set.
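/// For example (illustrative constant, not taken from a specific test):
///   <4 x i32> <i32 -1, i32 1, i32 -8, i32 0>
/// becomes <4 x i1> <i1 true, i1 false, i1 true, i1 false>.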
29 static Constant *getNegativeIsTrueBoolVec(Constant *V, const DataLayout &DL) {
30   VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
31   V = ConstantExpr::getBitCast(V, IntTy);
32   V = ConstantFoldCompareInstOperands(CmpInst::ICMP_SGT,
33                                       Constant::getNullValue(IntTy), V, DL);
34   assert(V && "Vector must be foldable");
35   return V;
36 }
37 
38 /// Convert the x86 XMM integer vector mask to a vector of bools based on
39 /// each element's most significant bit (the sign bit).
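/// For example (illustrative): a mask built as 'sext <4 x i1> %b to <4 x i32>'
/// returns %b directly, while a constant data vector mask is folded with
/// getNegativeIsTrueBoolVec above.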
40 static Value *getBoolVecFromMask(Value *Mask, const DataLayout &DL) {
41   // Fold Constant Mask.
42   if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
43     return getNegativeIsTrueBoolVec(ConstantMask, DL);
44 
45   // Mask was extended from a boolean vector.
46   Value *ExtMask;
47   if (PatternMatch::match(
48           Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
49       ExtMask->getType()->isIntOrIntVectorTy(1))
50     return ExtMask;
51 
52   return nullptr;
53 }
54 
55 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
56 // XMM register mask efficiently, we could transform all x86 masked intrinsics
57 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
58 static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
59   Value *Ptr = II.getOperand(0);
60   Value *Mask = II.getOperand(1);
61   Constant *ZeroVec = Constant::getNullValue(II.getType());
62 
63   // Zero Mask - masked load instruction creates a zero vector.
64   if (isa<ConstantAggregateZero>(Mask))
65     return IC.replaceInstUsesWith(II, ZeroVec);
66 
67   // The mask is constant or extended from a bool vector. Convert this x86
68   // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
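  // Illustrative transform (example names/types, not a specific test):
  //   %r = call <4 x float> @llvm.x86.avx.maskload.ps(ptr %p, <4 x i32> %m)
  // where %m = sext <4 x i1> %b to <4 x i32>, becomes roughly:
  //   %r = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %p, i32 1,
  //                                                    <4 x i1> %b,
  //                                                    <4 x float> zeroinitializer)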
69   if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
70     // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
71     // the LLVM intrinsic definition for the pointer argument.
72     unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
73     PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
74     Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
75 
76     // The pass-through vector for an x86 masked load is a zero vector.
77     CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
78         II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);
79     return IC.replaceInstUsesWith(II, NewMaskedLoad);
80   }
81 
82   return nullptr;
83 }
84 
85 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
86 // XMM register mask efficiently, we could transform all x86 masked intrinsics
87 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
88 static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
89   Value *Ptr = II.getOperand(0);
90   Value *Mask = II.getOperand(1);
91   Value *Vec = II.getOperand(2);
92 
93   // Zero Mask - this masked store instruction does nothing.
94   if (isa<ConstantAggregateZero>(Mask)) {
95     IC.eraseInstFromFunction(II);
96     return true;
97   }
98 
99   // The SSE2 version is too weird (e.g., unaligned but non-temporal) to do
100   // anything else at this level.
101   if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
102     return false;
103 
104   // The mask is constant or extended from a bool vector. Convert this x86
105   // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
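  // Illustrative transform (example names/types, not a specific test):
  //   call void @llvm.x86.avx.maskstore.ps(ptr %p, <4 x i32> %m, <4 x float> %v)
  // where %m = sext <4 x i1> %b to <4 x i32>, becomes roughly:
  //   call void @llvm.masked.store.v4f32.p0(<4 x float> %v, ptr %p, i32 1, <4 x i1> %b)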
106   if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
107     unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
108     PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
109     Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
110 
111     IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);
112 
113     // 'Replace uses' doesn't work for stores. Erase the original masked store.
114     IC.eraseInstFromFunction(II);
115     return true;
116   }
117 
118   return false;
119 }
120 
121 static Value *simplifyX86immShift(const IntrinsicInst &II,
122                                   InstCombiner::BuilderTy &Builder) {
123   bool LogicalShift = false;
124   bool ShiftLeft = false;
125   bool IsImm = false;
126 
127   switch (II.getIntrinsicID()) {
128   default:
129     llvm_unreachable("Unexpected intrinsic!");
130   case Intrinsic::x86_sse2_psrai_d:
131   case Intrinsic::x86_sse2_psrai_w:
132   case Intrinsic::x86_avx2_psrai_d:
133   case Intrinsic::x86_avx2_psrai_w:
134   case Intrinsic::x86_avx512_psrai_q_128:
135   case Intrinsic::x86_avx512_psrai_q_256:
136   case Intrinsic::x86_avx512_psrai_d_512:
137   case Intrinsic::x86_avx512_psrai_q_512:
138   case Intrinsic::x86_avx512_psrai_w_512:
139     IsImm = true;
140     [[fallthrough]];
141   case Intrinsic::x86_sse2_psra_d:
142   case Intrinsic::x86_sse2_psra_w:
143   case Intrinsic::x86_avx2_psra_d:
144   case Intrinsic::x86_avx2_psra_w:
145   case Intrinsic::x86_avx512_psra_q_128:
146   case Intrinsic::x86_avx512_psra_q_256:
147   case Intrinsic::x86_avx512_psra_d_512:
148   case Intrinsic::x86_avx512_psra_q_512:
149   case Intrinsic::x86_avx512_psra_w_512:
150     LogicalShift = false;
151     ShiftLeft = false;
152     break;
153   case Intrinsic::x86_sse2_psrli_d:
154   case Intrinsic::x86_sse2_psrli_q:
155   case Intrinsic::x86_sse2_psrli_w:
156   case Intrinsic::x86_avx2_psrli_d:
157   case Intrinsic::x86_avx2_psrli_q:
158   case Intrinsic::x86_avx2_psrli_w:
159   case Intrinsic::x86_avx512_psrli_d_512:
160   case Intrinsic::x86_avx512_psrli_q_512:
161   case Intrinsic::x86_avx512_psrli_w_512:
162     IsImm = true;
163     [[fallthrough]];
164   case Intrinsic::x86_sse2_psrl_d:
165   case Intrinsic::x86_sse2_psrl_q:
166   case Intrinsic::x86_sse2_psrl_w:
167   case Intrinsic::x86_avx2_psrl_d:
168   case Intrinsic::x86_avx2_psrl_q:
169   case Intrinsic::x86_avx2_psrl_w:
170   case Intrinsic::x86_avx512_psrl_d_512:
171   case Intrinsic::x86_avx512_psrl_q_512:
172   case Intrinsic::x86_avx512_psrl_w_512:
173     LogicalShift = true;
174     ShiftLeft = false;
175     break;
176   case Intrinsic::x86_sse2_pslli_d:
177   case Intrinsic::x86_sse2_pslli_q:
178   case Intrinsic::x86_sse2_pslli_w:
179   case Intrinsic::x86_avx2_pslli_d:
180   case Intrinsic::x86_avx2_pslli_q:
181   case Intrinsic::x86_avx2_pslli_w:
182   case Intrinsic::x86_avx512_pslli_d_512:
183   case Intrinsic::x86_avx512_pslli_q_512:
184   case Intrinsic::x86_avx512_pslli_w_512:
185     IsImm = true;
186     [[fallthrough]];
187   case Intrinsic::x86_sse2_psll_d:
188   case Intrinsic::x86_sse2_psll_q:
189   case Intrinsic::x86_sse2_psll_w:
190   case Intrinsic::x86_avx2_psll_d:
191   case Intrinsic::x86_avx2_psll_q:
192   case Intrinsic::x86_avx2_psll_w:
193   case Intrinsic::x86_avx512_psll_d_512:
194   case Intrinsic::x86_avx512_psll_q_512:
195   case Intrinsic::x86_avx512_psll_w_512:
196     LogicalShift = true;
197     ShiftLeft = true;
198     break;
199   }
200   assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
201 
202   Value *Vec = II.getArgOperand(0);
203   Value *Amt = II.getArgOperand(1);
204   auto *VT = cast<FixedVectorType>(Vec->getType());
205   Type *SVT = VT->getElementType();
206   Type *AmtVT = Amt->getType();
207   unsigned VWidth = VT->getNumElements();
208   unsigned BitWidth = SVT->getPrimitiveSizeInBits();
209 
210   // If the shift amount is guaranteed to be in-range we can replace it with a
211   // generic shift. If it's guaranteed to be out of range, logical shifts combine
212   // to zero and arithmetic shifts are clamped to (BitWidth - 1).
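  // Illustrative example: @llvm.x86.sse2.psrli.d(<4 x i32> %v, i32 3) becomes
  // 'lshr <4 x i32> %v, <i32 3, i32 3, i32 3, i32 3>', while an amount known to
  // be >= 32 folds to zeroinitializer (logical) or an 'ashr' by 31 (arithmetic).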
213   if (IsImm) {
214     assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
215     KnownBits KnownAmtBits =
216         llvm::computeKnownBits(Amt, II.getDataLayout());
217     if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
218       Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
219       Amt = Builder.CreateVectorSplat(VWidth, Amt);
220       return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
221                                         : Builder.CreateLShr(Vec, Amt))
222                            : Builder.CreateAShr(Vec, Amt));
223     }
224     if (KnownAmtBits.getMinValue().uge(BitWidth)) {
225       if (LogicalShift)
226         return ConstantAggregateZero::get(VT);
227       Amt = ConstantInt::get(SVT, BitWidth - 1);
228       return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
229     }
230   } else {
231     // Ensure the first element has an in-range value and the rest of the
232     // elements in the bottom 64 bits are zero.
233     assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
234            cast<VectorType>(AmtVT)->getElementType() == SVT &&
235            "Unexpected shift-by-scalar type");
236     unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
237     APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
238     APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
239     KnownBits KnownLowerBits = llvm::computeKnownBits(
240         Amt, DemandedLower, II.getDataLayout());
241     KnownBits KnownUpperBits = llvm::computeKnownBits(
242         Amt, DemandedUpper, II.getDataLayout());
243     if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
244         (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
245       SmallVector<int, 16> ZeroSplat(VWidth, 0);
246       Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
247       return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
248                                         : Builder.CreateLShr(Vec, Amt))
249                            : Builder.CreateAShr(Vec, Amt));
250     }
251   }
252 
253   // Simplify if the count is a constant vector.
254   auto *CDV = dyn_cast<ConstantDataVector>(Amt);
255   if (!CDV)
256     return nullptr;
257 
258   // SSE2/AVX2 shift-by-scalar instructions use the entire low 64 bits of the
259   // 128-bit vector operand as the shift amount.
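  // Illustrative example: a v8i16 shift with a constant count vector of
  // <2, 0, 0, 0, ...> concatenates its low four i16 elements into the scalar
  // count 2, so the vector is shifted by a splat of 2.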
260   assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
261          cast<VectorType>(AmtVT)->getElementType() == SVT &&
262          "Unexpected shift-by-scalar type");
263 
264   // Concatenate the sub-elements to create the 64-bit value.
265   APInt Count(64, 0);
266   for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
267     unsigned SubEltIdx = (NumSubElts - 1) - i;
268     auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
269     Count <<= BitWidth;
270     Count |= SubElt->getValue().zextOrTrunc(64);
271   }
272 
273   // If shift-by-zero then just return the original value.
274   if (Count.isZero())
275     return Vec;
276 
277   // Handle cases when Shift >= BitWidth.
278   if (Count.uge(BitWidth)) {
279     // If LogicalShift - just return zero.
280     if (LogicalShift)
281       return ConstantAggregateZero::get(VT);
282 
283     // If ArithmeticShift - clamp Shift to (BitWidth - 1).
284     Count = APInt(64, BitWidth - 1);
285   }
286 
287   // Get a constant vector of the same type as the first operand.
288   auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
289   auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);
290 
291   if (ShiftLeft)
292     return Builder.CreateShl(Vec, ShiftVec);
293 
294   if (LogicalShift)
295     return Builder.CreateLShr(Vec, ShiftVec);
296 
297   return Builder.CreateAShr(Vec, ShiftVec);
298 }
299 
300 // Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
301 // Unlike the generic IR shifts, the intrinsics have defined behaviour for out
302 // of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
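// Illustrative example: @llvm.x86.avx2.psrlv.d(<4 x i32> %v,
// <4 x i32> <i32 1, i32 2, i32 3, i32 4>) has all amounts in range and becomes
// 'lshr <4 x i32> %v, <i32 1, i32 2, i32 3, i32 4>'.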
303 static Value *simplifyX86varShift(const IntrinsicInst &II,
304                                   InstCombiner::BuilderTy &Builder) {
305   bool LogicalShift = false;
306   bool ShiftLeft = false;
307 
308   switch (II.getIntrinsicID()) {
309   default:
310     llvm_unreachable("Unexpected intrinsic!");
311   case Intrinsic::x86_avx2_psrav_d:
312   case Intrinsic::x86_avx2_psrav_d_256:
313   case Intrinsic::x86_avx512_psrav_q_128:
314   case Intrinsic::x86_avx512_psrav_q_256:
315   case Intrinsic::x86_avx512_psrav_d_512:
316   case Intrinsic::x86_avx512_psrav_q_512:
317   case Intrinsic::x86_avx512_psrav_w_128:
318   case Intrinsic::x86_avx512_psrav_w_256:
319   case Intrinsic::x86_avx512_psrav_w_512:
320     LogicalShift = false;
321     ShiftLeft = false;
322     break;
323   case Intrinsic::x86_avx2_psrlv_d:
324   case Intrinsic::x86_avx2_psrlv_d_256:
325   case Intrinsic::x86_avx2_psrlv_q:
326   case Intrinsic::x86_avx2_psrlv_q_256:
327   case Intrinsic::x86_avx512_psrlv_d_512:
328   case Intrinsic::x86_avx512_psrlv_q_512:
329   case Intrinsic::x86_avx512_psrlv_w_128:
330   case Intrinsic::x86_avx512_psrlv_w_256:
331   case Intrinsic::x86_avx512_psrlv_w_512:
332     LogicalShift = true;
333     ShiftLeft = false;
334     break;
335   case Intrinsic::x86_avx2_psllv_d:
336   case Intrinsic::x86_avx2_psllv_d_256:
337   case Intrinsic::x86_avx2_psllv_q:
338   case Intrinsic::x86_avx2_psllv_q_256:
339   case Intrinsic::x86_avx512_psllv_d_512:
340   case Intrinsic::x86_avx512_psllv_q_512:
341   case Intrinsic::x86_avx512_psllv_w_128:
342   case Intrinsic::x86_avx512_psllv_w_256:
343   case Intrinsic::x86_avx512_psllv_w_512:
344     LogicalShift = true;
345     ShiftLeft = true;
346     break;
347   }
348   assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
349 
350   Value *Vec = II.getArgOperand(0);
351   Value *Amt = II.getArgOperand(1);
352   auto *VT = cast<FixedVectorType>(II.getType());
353   Type *SVT = VT->getElementType();
354   int NumElts = VT->getNumElements();
355   int BitWidth = SVT->getIntegerBitWidth();
356 
357   // If the shift amount is guaranteed to be in-range we can replace it with a
358   // generic shift.
359   KnownBits KnownAmt =
360       llvm::computeKnownBits(Amt, II.getDataLayout());
361   if (KnownAmt.getMaxValue().ult(BitWidth)) {
362     return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
363                                       : Builder.CreateLShr(Vec, Amt))
364                          : Builder.CreateAShr(Vec, Amt));
365   }
366 
367   // Simplify if all shift amounts are constant/undef.
368   auto *CShift = dyn_cast<Constant>(Amt);
369   if (!CShift)
370     return nullptr;
371 
372   // Collect each element's shift amount.
373   // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
374   bool AnyOutOfRange = false;
375   SmallVector<int, 8> ShiftAmts;
376   for (int I = 0; I < NumElts; ++I) {
377     auto *CElt = CShift->getAggregateElement(I);
378     if (isa_and_nonnull<UndefValue>(CElt)) {
379       ShiftAmts.push_back(-1);
380       continue;
381     }
382 
383     auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
384     if (!COp)
385       return nullptr;
386 
387     // Handle out of range shifts.
388     // If LogicalShift - set to BitWidth (special case).
389     // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
390     APInt ShiftVal = COp->getValue();
391     if (ShiftVal.uge(BitWidth)) {
392       AnyOutOfRange = LogicalShift;
393       ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
394       continue;
395     }
396 
397     ShiftAmts.push_back((int)ShiftVal.getZExtValue());
398   }
399 
400   // If all elements out of range or UNDEF, return vector of zeros/undefs.
401   // ArithmeticShift should only hit this if they are all UNDEF.
402   auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
403   if (llvm::all_of(ShiftAmts, OutOfRange)) {
404     SmallVector<Constant *, 8> ConstantVec;
405     for (int Idx : ShiftAmts) {
406       if (Idx < 0) {
407         ConstantVec.push_back(UndefValue::get(SVT));
408       } else {
409         assert(LogicalShift && "Logical shift expected");
410         ConstantVec.push_back(ConstantInt::getNullValue(SVT));
411       }
412     }
413     return ConstantVector::get(ConstantVec);
414   }
415 
416   // We can't use generic logical shifts when only some values are out of range.
417   if (AnyOutOfRange)
418     return nullptr;
419 
420   // Build the shift amount constant vector.
421   SmallVector<Constant *, 8> ShiftVecAmts;
422   for (int Idx : ShiftAmts) {
423     if (Idx < 0)
424       ShiftVecAmts.push_back(UndefValue::get(SVT));
425     else
426       ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
427   }
428   auto ShiftVec = ConstantVector::get(ShiftVecAmts);
429 
430   if (ShiftLeft)
431     return Builder.CreateShl(Vec, ShiftVec);
432 
433   if (LogicalShift)
434     return Builder.CreateLShr(Vec, ShiftVec);
435 
436   return Builder.CreateAShr(Vec, ShiftVec);
437 }
438 
439 static Value *simplifyX86pack(IntrinsicInst &II,
440                               InstCombiner::BuilderTy &Builder, bool IsSigned) {
441   Value *Arg0 = II.getArgOperand(0);
442   Value *Arg1 = II.getArgOperand(1);
443   Type *ResTy = II.getType();
444 
445   // Fast path: both inputs undef.
446   if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
447     return UndefValue::get(ResTy);
448 
449   auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
450   unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
451   unsigned NumSrcElts = ArgTy->getNumElements();
452   assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
453          "Unexpected packing types");
454 
455   unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
456   unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
457   unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
458   assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
459          "Unexpected packing types");
460 
461   // Constant folding.
462   if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
463     return nullptr;
464 
465   // Clamp Values - signed/unsigned both use signed clamp values, but they
466   // differ on the min/max values.
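  // For example (illustrative): PACKSSDW (i32 -> i16, signed) clamps each
  // source element to [-32768, 32767]; PACKUSDW (unsigned) clamps to [0, 65535].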
467   APInt MinValue, MaxValue;
468   if (IsSigned) {
469     // PACKSS: Truncate signed value with signed saturation.
470     // Source values less than dst minint are saturated to minint.
471     // Source values greater than dst maxint are saturated to maxint.
472     MinValue =
473         APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
474     MaxValue =
475         APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
476   } else {
477     // PACKUS: Truncate signed value with unsigned saturation.
478     // Source values less than zero are saturated to zero.
479     // Source values greater than dst maxuint are saturated to maxuint.
480     MinValue = APInt::getZero(SrcScalarSizeInBits);
481     MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
482   }
483 
484   auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
485   auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
486   Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
487   Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
488   Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
489   Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
490 
491   // Shuffle clamped args together at the lane level.
492   SmallVector<int, 32> PackMask;
493   for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
494     for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
495       PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
496     for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
497       PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
498   }
499   auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);
500 
501   // Truncate to dst size.
502   return Builder.CreateTrunc(Shuffle, ResTy);
503 }
504 
505 static Value *simplifyX86pmadd(IntrinsicInst &II,
506                                InstCombiner::BuilderTy &Builder,
507                                bool IsPMADDWD) {
508   Value *Arg0 = II.getArgOperand(0);
509   Value *Arg1 = II.getArgOperand(1);
510   auto *ResTy = cast<FixedVectorType>(II.getType());
511   [[maybe_unused]] auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
512 
513   unsigned NumDstElts = ResTy->getNumElements();
514   assert(ArgTy->getNumElements() == (2 * NumDstElts) &&
515          ResTy->getScalarSizeInBits() == (2 * ArgTy->getScalarSizeInBits()) &&
516          "Unexpected PMADD types");
517 
518   // Multiply by zero.
519   if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1))
520     return ConstantAggregateZero::get(ResTy);
521 
522   // Constant folding.
523   if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
524     return nullptr;
525 
526   // Split into Lo/Hi element pairs, extend, multiply and add them together.
527   // PMADDWD(X,Y) =
528   // add(mul(sext(lhs[0]),sext(rhs[0])),mul(sext(lhs[1]),sext(rhs[1])))
529   // PMADDUBSW(X,Y) =
530   // sadd_sat(mul(zext(lhs[0]),sext(rhs[0])),mul(zext(lhs[1]),sext(rhs[1])))
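  // Worked example (illustrative): for PMADDWD with constant inputs starting
  // lhs = <1, 2, ...> and rhs = <3, 4, ...>, result element 0 is
  // 1*3 + 2*4 = 11, computed in the widened i32 element type.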
531   SmallVector<int> LoMask, HiMask;
532   for (unsigned I = 0; I != NumDstElts; ++I) {
533     LoMask.push_back(2 * I + 0);
534     HiMask.push_back(2 * I + 1);
535   }
536 
537   auto *LHSLo = Builder.CreateShuffleVector(Arg0, LoMask);
538   auto *LHSHi = Builder.CreateShuffleVector(Arg0, HiMask);
539   auto *RHSLo = Builder.CreateShuffleVector(Arg1, LoMask);
540   auto *RHSHi = Builder.CreateShuffleVector(Arg1, HiMask);
541 
542   auto LHSCast =
543       IsPMADDWD ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt;
544   LHSLo = Builder.CreateCast(LHSCast, LHSLo, ResTy);
545   LHSHi = Builder.CreateCast(LHSCast, LHSHi, ResTy);
546   RHSLo = Builder.CreateCast(Instruction::CastOps::SExt, RHSLo, ResTy);
547   RHSHi = Builder.CreateCast(Instruction::CastOps::SExt, RHSHi, ResTy);
548   Value *Lo = Builder.CreateMul(LHSLo, RHSLo);
549   Value *Hi = Builder.CreateMul(LHSHi, RHSHi);
550   return IsPMADDWD
551              ? Builder.CreateAdd(Lo, Hi)
552              : Builder.CreateIntrinsic(ResTy, Intrinsic::sadd_sat, {Lo, Hi});
553 }
554 
555 static Value *simplifyX86movmsk(const IntrinsicInst &II,
556                                 InstCombiner::BuilderTy &Builder) {
557   Value *Arg = II.getArgOperand(0);
558   Type *ResTy = II.getType();
559 
560   // movmsk(undef) -> zero as we must ensure the upper bits are zero.
561   if (isa<UndefValue>(Arg))
562     return Constant::getNullValue(ResTy);
563 
564   auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
565   // We can't easily peek through x86_mmx types.
566   if (!ArgTy)
567     return nullptr;
568 
569   // Expand MOVMSK to compare/bitcast/zext:
570   // e.g. PMOVMSKB(v16i8 x):
571   // %cmp = icmp slt <16 x i8> %x, zeroinitializer
572   // %int = bitcast <16 x i1> %cmp to i16
573   // %res = zext i16 %int to i32
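  // Worked example (illustrative): MOVMSKPS of
  // <float -1.0, float 2.0, float -3.0, float 4.0> has the sign bit set in
  // lanes 0 and 2, so the expansion yields 0b0101 = 5.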
574   unsigned NumElts = ArgTy->getNumElements();
575   Type *IntegerTy = Builder.getIntNTy(NumElts);
576 
577   Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
578   Res = Builder.CreateIsNeg(Res);
579   Res = Builder.CreateBitCast(Res, IntegerTy);
580   Res = Builder.CreateZExtOrTrunc(Res, ResTy);
581   return Res;
582 }
583 
584 static Value *simplifyX86addcarry(const IntrinsicInst &II,
585                                   InstCombiner::BuilderTy &Builder) {
586   Value *CarryIn = II.getArgOperand(0);
587   Value *Op1 = II.getArgOperand(1);
588   Value *Op2 = II.getArgOperand(2);
589   Type *RetTy = II.getType();
590   Type *OpTy = Op1->getType();
591   assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
592          RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
593          "Unexpected types for x86 addcarry");
594 
595   // If carry-in is zero, this is just an unsigned add with overflow.
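  // Illustrative result (example names): @llvm.x86.addcarry.32(i8 0, i32 %a, i32 %b)
  // becomes @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) with the overflow bit
  // zero-extended to i8 and repacked as { carry-out, sum }.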
596   if (match(CarryIn, PatternMatch::m_ZeroInt())) {
597     Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
598                                           {Op1, Op2});
599     // The types have to be adjusted to match the x86 call types.
600     Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
601     Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
602                                        Builder.getInt8Ty());
603     Value *Res = PoisonValue::get(RetTy);
604     Res = Builder.CreateInsertValue(Res, UAddOV, 0);
605     return Builder.CreateInsertValue(Res, UAddResult, 1);
606   }
607 
608   return nullptr;
609 }
610 
611 static Value *simplifyTernarylogic(const IntrinsicInst &II,
612                                    InstCombiner::BuilderTy &Builder) {
613 
614   auto *ArgImm = dyn_cast<ConstantInt>(II.getArgOperand(3));
615   if (!ArgImm || ArgImm->getValue().uge(256))
616     return nullptr;
617 
618   Value *ArgA = II.getArgOperand(0);
619   Value *ArgB = II.getArgOperand(1);
620   Value *ArgC = II.getArgOperand(2);
621 
622   Type *Ty = II.getType();
623 
624   auto Or = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
625     return {Builder.CreateOr(Lhs.first, Rhs.first), Lhs.second | Rhs.second};
626   };
627   auto Xor = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
628     return {Builder.CreateXor(Lhs.first, Rhs.first), Lhs.second ^ Rhs.second};
629   };
630   auto And = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
631     return {Builder.CreateAnd(Lhs.first, Rhs.first), Lhs.second & Rhs.second};
632   };
633   auto Not = [&](auto V) -> std::pair<Value *, uint8_t> {
634     return {Builder.CreateNot(V.first), ~V.second};
635   };
636   auto Nor = [&](auto Lhs, auto Rhs) { return Not(Or(Lhs, Rhs)); };
637   auto Xnor = [&](auto Lhs, auto Rhs) { return Not(Xor(Lhs, Rhs)); };
638   auto Nand = [&](auto Lhs, auto Rhs) { return Not(And(Lhs, Rhs)); };
639 
640   bool AIsConst = match(ArgA, PatternMatch::m_ImmConstant());
641   bool BIsConst = match(ArgB, PatternMatch::m_ImmConstant());
642   bool CIsConst = match(ArgC, PatternMatch::m_ImmConstant());
643 
644   bool ABIsConst = AIsConst && BIsConst;
645   bool ACIsConst = AIsConst && CIsConst;
646   bool BCIsConst = BIsConst && CIsConst;
647   bool ABCIsConst = AIsConst && BIsConst && CIsConst;
648 
649   // Used for verification. It's a big table, and it's difficult to go from Imm
650   // to logic ops, but easy to verify that a set of logic ops is correct. We
651   // track the logic ops through the second value in each pair; at the end it
652   // should equal Imm.
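  // Worked example: for Imm = 0x88 the table below selects And(B, C), and the
  // tracked verification value is 0xcc & 0xaa = 0x88, which matches Imm.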
653   std::pair<Value *, uint8_t> A = {ArgA, 0xf0};
654   std::pair<Value *, uint8_t> B = {ArgB, 0xcc};
655   std::pair<Value *, uint8_t> C = {ArgC, 0xaa};
656   std::pair<Value *, uint8_t> Res = {nullptr, 0};
657 
658   // Currently we only handle cases that convert directly to another instruction
659   // or cases where all the ops are constant.  This is because we don't properly
660   // handle creating ternary ops in the backend, so splitting them here may
661   // cause regressions. As the backend improves, more cases can be enabled.
662 
663   uint8_t Imm = ArgImm->getValue().getZExtValue();
664   switch (Imm) {
665   case 0x0:
666     Res = {Constant::getNullValue(Ty), 0};
667     break;
668   case 0x1:
669     if (ABCIsConst)
670       Res = Nor(Or(A, B), C);
671     break;
672   case 0x2:
673     if (ABCIsConst)
674       Res = And(Nor(A, B), C);
675     break;
676   case 0x3:
677     if (ABIsConst)
678       Res = Nor(A, B);
679     break;
680   case 0x4:
681     if (ABCIsConst)
682       Res = And(Nor(A, C), B);
683     break;
684   case 0x5:
685     if (ACIsConst)
686       Res = Nor(A, C);
687     break;
688   case 0x6:
689     if (ABCIsConst)
690       Res = Nor(A, Xnor(B, C));
691     break;
692   case 0x7:
693     if (ABCIsConst)
694       Res = Nor(A, And(B, C));
695     break;
696   case 0x8:
697     if (ABCIsConst)
698       Res = Nor(A, Nand(B, C));
699     break;
700   case 0x9:
701     if (ABCIsConst)
702       Res = Nor(A, Xor(B, C));
703     break;
704   case 0xa:
705     if (ACIsConst)
706       Res = Nor(A, Not(C));
707     break;
708   case 0xb:
709     if (ABCIsConst)
710       Res = Nor(A, Nor(C, Not(B)));
711     break;
712   case 0xc:
713     if (ABIsConst)
714       Res = Nor(A, Not(B));
715     break;
716   case 0xd:
717     if (ABCIsConst)
718       Res = Nor(A, Nor(B, Not(C)));
719     break;
720   case 0xe:
721     if (ABCIsConst)
722       Res = Nor(A, Nor(B, C));
723     break;
724   case 0xf:
725     Res = Not(A);
726     break;
727   case 0x10:
728     if (ABCIsConst)
729       Res = And(A, Nor(B, C));
730     break;
731   case 0x11:
732     if (BCIsConst)
733       Res = Nor(B, C);
734     break;
735   case 0x12:
736     if (ABCIsConst)
737       Res = Nor(Xnor(A, C), B);
738     break;
739   case 0x13:
740     if (ABCIsConst)
741       Res = Nor(And(A, C), B);
742     break;
743   case 0x14:
744     if (ABCIsConst)
745       Res = Nor(Xnor(A, B), C);
746     break;
747   case 0x15:
748     if (ABCIsConst)
749       Res = Nor(And(A, B), C);
750     break;
751   case 0x16:
752     if (ABCIsConst)
753       Res = Xor(Xor(A, B), And(Nand(A, B), C));
754     break;
755   case 0x17:
756     if (ABCIsConst)
757       Res = Xor(Or(A, B), Or(Xnor(A, B), C));
758     break;
759   case 0x18:
760     if (ABCIsConst)
761       Res = Nor(Xnor(A, B), Xnor(A, C));
762     break;
763   case 0x19:
764     if (ABCIsConst)
765       Res = And(Nand(A, B), Xnor(B, C));
766     break;
767   case 0x1a:
768     if (ABCIsConst)
769       Res = Xor(A, Or(And(A, B), C));
770     break;
771   case 0x1b:
772     if (ABCIsConst)
773       Res = Xor(A, Or(Xnor(A, B), C));
774     break;
775   case 0x1c:
776     if (ABCIsConst)
777       Res = Xor(A, Or(And(A, C), B));
778     break;
779   case 0x1d:
780     if (ABCIsConst)
781       Res = Xor(A, Or(Xnor(A, C), B));
782     break;
783   case 0x1e:
784     if (ABCIsConst)
785       Res = Xor(A, Or(B, C));
786     break;
787   case 0x1f:
788     if (ABCIsConst)
789       Res = Nand(A, Or(B, C));
790     break;
791   case 0x20:
792     if (ABCIsConst)
793       Res = Nor(Nand(A, C), B);
794     break;
795   case 0x21:
796     if (ABCIsConst)
797       Res = Nor(Xor(A, C), B);
798     break;
799   case 0x22:
800     if (BCIsConst)
801       Res = Nor(B, Not(C));
802     break;
803   case 0x23:
804     if (ABCIsConst)
805       Res = Nor(B, Nor(C, Not(A)));
806     break;
807   case 0x24:
808     if (ABCIsConst)
809       Res = Nor(Xnor(A, B), Xor(A, C));
810     break;
811   case 0x25:
812     if (ABCIsConst)
813       Res = Xor(A, Nand(Nand(A, B), C));
814     break;
815   case 0x26:
816     if (ABCIsConst)
817       Res = And(Nand(A, B), Xor(B, C));
818     break;
819   case 0x27:
820     if (ABCIsConst)
821       Res = Xor(Or(Xnor(A, B), C), B);
822     break;
823   case 0x28:
824     if (ABCIsConst)
825       Res = And(Xor(A, B), C);
826     break;
827   case 0x29:
828     if (ABCIsConst)
829       Res = Xor(Xor(A, B), Nor(And(A, B), C));
830     break;
831   case 0x2a:
832     if (ABCIsConst)
833       Res = And(Nand(A, B), C);
834     break;
835   case 0x2b:
836     if (ABCIsConst)
837       Res = Xor(Or(Xnor(A, B), Xor(A, C)), A);
838     break;
839   case 0x2c:
840     if (ABCIsConst)
841       Res = Nor(Xnor(A, B), Nor(B, C));
842     break;
843   case 0x2d:
844     if (ABCIsConst)
845       Res = Xor(A, Or(B, Not(C)));
846     break;
847   case 0x2e:
848     if (ABCIsConst)
849       Res = Xor(A, Or(Xor(A, C), B));
850     break;
851   case 0x2f:
852     if (ABCIsConst)
853       Res = Nand(A, Or(B, Not(C)));
854     break;
855   case 0x30:
856     if (ABIsConst)
857       Res = Nor(B, Not(A));
858     break;
859   case 0x31:
860     if (ABCIsConst)
861       Res = Nor(Nor(A, Not(C)), B);
862     break;
863   case 0x32:
864     if (ABCIsConst)
865       Res = Nor(Nor(A, C), B);
866     break;
867   case 0x33:
868     Res = Not(B);
869     break;
870   case 0x34:
871     if (ABCIsConst)
872       Res = And(Xor(A, B), Nand(B, C));
873     break;
874   case 0x35:
875     if (ABCIsConst)
876       Res = Xor(B, Or(A, Xnor(B, C)));
877     break;
878   case 0x36:
879     if (ABCIsConst)
880       Res = Xor(Or(A, C), B);
881     break;
882   case 0x37:
883     if (ABCIsConst)
884       Res = Nand(Or(A, C), B);
885     break;
886   case 0x38:
887     if (ABCIsConst)
888       Res = Nor(Xnor(A, B), Nor(A, C));
889     break;
890   case 0x39:
891     if (ABCIsConst)
892       Res = Xor(Or(A, Not(C)), B);
893     break;
894   case 0x3a:
895     if (ABCIsConst)
896       Res = Xor(B, Or(A, Xor(B, C)));
897     break;
898   case 0x3b:
899     if (ABCIsConst)
900       Res = Nand(Or(A, Not(C)), B);
901     break;
902   case 0x3c:
903     Res = Xor(A, B);
904     break;
905   case 0x3d:
906     if (ABCIsConst)
907       Res = Xor(A, Or(Nor(A, C), B));
908     break;
909   case 0x3e:
910     if (ABCIsConst)
911       Res = Xor(A, Or(Nor(A, Not(C)), B));
912     break;
913   case 0x3f:
914     if (ABIsConst)
915       Res = Nand(A, B);
916     break;
917   case 0x40:
918     if (ABCIsConst)
919       Res = Nor(Nand(A, B), C);
920     break;
921   case 0x41:
922     if (ABCIsConst)
923       Res = Nor(Xor(A, B), C);
924     break;
925   case 0x42:
926     if (ABCIsConst)
927       Res = Nor(Xor(A, B), Xnor(A, C));
928     break;
929   case 0x43:
930     if (ABCIsConst)
931       Res = Xor(A, Nand(Nand(A, C), B));
932     break;
933   case 0x44:
934     if (BCIsConst)
935       Res = Nor(C, Not(B));
936     break;
937   case 0x45:
938     if (ABCIsConst)
939       Res = Nor(Nor(B, Not(A)), C);
940     break;
941   case 0x46:
942     if (ABCIsConst)
943       Res = Xor(Or(And(A, C), B), C);
944     break;
945   case 0x47:
946     if (ABCIsConst)
947       Res = Xor(Or(Xnor(A, C), B), C);
948     break;
949   case 0x48:
950     if (ABCIsConst)
951       Res = And(Xor(A, C), B);
952     break;
953   case 0x49:
954     if (ABCIsConst)
955       Res = Xor(Or(Xnor(A, B), And(A, C)), C);
956     break;
957   case 0x4a:
958     if (ABCIsConst)
959       Res = Nor(Xnor(A, C), Nor(B, C));
960     break;
961   case 0x4b:
962     if (ABCIsConst)
963       Res = Xor(A, Or(C, Not(B)));
964     break;
965   case 0x4c:
966     if (ABCIsConst)
967       Res = And(Nand(A, C), B);
968     break;
969   case 0x4d:
970     if (ABCIsConst)
971       Res = Xor(Or(Xor(A, B), Xnor(A, C)), A);
972     break;
973   case 0x4e:
974     if (ABCIsConst)
975       Res = Xor(A, Or(Xor(A, B), C));
976     break;
977   case 0x4f:
978     if (ABCIsConst)
979       Res = Nand(A, Nand(B, Not(C)));
980     break;
981   case 0x50:
982     if (ACIsConst)
983       Res = Nor(C, Not(A));
984     break;
985   case 0x51:
986     if (ABCIsConst)
987       Res = Nor(Nor(A, Not(B)), C);
988     break;
989   case 0x52:
990     if (ABCIsConst)
991       Res = And(Xor(A, C), Nand(B, C));
992     break;
993   case 0x53:
994     if (ABCIsConst)
995       Res = Xor(Or(Xnor(B, C), A), C);
996     break;
997   case 0x54:
998     if (ABCIsConst)
999       Res = Nor(Nor(A, B), C);
1000     break;
1001   case 0x55:
1002     Res = Not(C);
1003     break;
1004   case 0x56:
1005     if (ABCIsConst)
1006       Res = Xor(Or(A, B), C);
1007     break;
1008   case 0x57:
1009     if (ABCIsConst)
1010       Res = Nand(Or(A, B), C);
1011     break;
1012   case 0x58:
1013     if (ABCIsConst)
1014       Res = Nor(Nor(A, B), Xnor(A, C));
1015     break;
1016   case 0x59:
1017     if (ABCIsConst)
1018       Res = Xor(Or(A, Not(B)), C);
1019     break;
1020   case 0x5a:
1021     Res = Xor(A, C);
1022     break;
1023   case 0x5b:
1024     if (ABCIsConst)
1025       Res = Xor(A, Or(Nor(A, B), C));
1026     break;
1027   case 0x5c:
1028     if (ABCIsConst)
1029       Res = Xor(Or(Xor(B, C), A), C);
1030     break;
1031   case 0x5d:
1032     if (ABCIsConst)
1033       Res = Nand(Or(A, Not(B)), C);
1034     break;
1035   case 0x5e:
1036     if (ABCIsConst)
1037       Res = Xor(A, Or(Nor(A, Not(B)), C));
1038     break;
1039   case 0x5f:
1040     if (ACIsConst)
1041       Res = Nand(A, C);
1042     break;
1043   case 0x60:
1044     if (ABCIsConst)
1045       Res = And(A, Xor(B, C));
1046     break;
1047   case 0x61:
1048     if (ABCIsConst)
1049       Res = Xor(Or(Xnor(A, B), And(B, C)), C);
1050     break;
1051   case 0x62:
1052     if (ABCIsConst)
1053       Res = Nor(Nor(A, C), Xnor(B, C));
1054     break;
1055   case 0x63:
1056     if (ABCIsConst)
1057       Res = Xor(B, Or(C, Not(A)));
1058     break;
1059   case 0x64:
1060     if (ABCIsConst)
1061       Res = Nor(Nor(A, B), Xnor(B, C));
1062     break;
1063   case 0x65:
1064     if (ABCIsConst)
1065       Res = Xor(Or(B, Not(A)), C);
1066     break;
1067   case 0x66:
1068     Res = Xor(B, C);
1069     break;
1070   case 0x67:
1071     if (ABCIsConst)
1072       Res = Or(Nor(A, B), Xor(B, C));
1073     break;
1074   case 0x68:
1075     if (ABCIsConst)
1076       Res = Xor(Xor(A, B), Nor(Nor(A, B), C));
1077     break;
1078   case 0x69:
1079     if (ABCIsConst)
1080       Res = Xor(Xnor(A, B), C);
1081     break;
1082   case 0x6a:
1083     if (ABCIsConst)
1084       Res = Xor(And(A, B), C);
1085     break;
1086   case 0x6b:
1087     if (ABCIsConst)
1088       Res = Or(Nor(A, B), Xor(Xnor(A, B), C));
1089     break;
1090   case 0x6c:
1091     if (ABCIsConst)
1092       Res = Xor(And(A, C), B);
1093     break;
1094   case 0x6d:
1095     if (ABCIsConst)
1096       Res = Xor(Or(Xnor(A, B), Nor(A, C)), C);
1097     break;
1098   case 0x6e:
1099     if (ABCIsConst)
1100       Res = Or(Nor(A, Not(B)), Xor(B, C));
1101     break;
1102   case 0x6f:
1103     if (ABCIsConst)
1104       Res = Nand(A, Xnor(B, C));
1105     break;
1106   case 0x70:
1107     if (ABCIsConst)
1108       Res = And(A, Nand(B, C));
1109     break;
1110   case 0x71:
1111     if (ABCIsConst)
1112       Res = Xor(Nor(Xor(A, B), Xor(A, C)), A);
1113     break;
1114   case 0x72:
1115     if (ABCIsConst)
1116       Res = Xor(Or(Xor(A, B), C), B);
1117     break;
1118   case 0x73:
1119     if (ABCIsConst)
1120       Res = Nand(Nand(A, Not(C)), B);
1121     break;
1122   case 0x74:
1123     if (ABCIsConst)
1124       Res = Xor(Or(Xor(A, C), B), C);
1125     break;
1126   case 0x75:
1127     if (ABCIsConst)
1128       Res = Nand(Nand(A, Not(B)), C);
1129     break;
1130   case 0x76:
1131     if (ABCIsConst)
1132       Res = Xor(B, Or(Nor(B, Not(A)), C));
1133     break;
1134   case 0x77:
1135     if (BCIsConst)
1136       Res = Nand(B, C);
1137     break;
1138   case 0x78:
1139     if (ABCIsConst)
1140       Res = Xor(A, And(B, C));
1141     break;
1142   case 0x79:
1143     if (ABCIsConst)
1144       Res = Xor(Or(Xnor(A, B), Nor(B, C)), C);
1145     break;
1146   case 0x7a:
1147     if (ABCIsConst)
1148       Res = Or(Xor(A, C), Nor(B, Not(A)));
1149     break;
1150   case 0x7b:
1151     if (ABCIsConst)
1152       Res = Nand(Xnor(A, C), B);
1153     break;
1154   case 0x7c:
1155     if (ABCIsConst)
1156       Res = Or(Xor(A, B), Nor(C, Not(A)));
1157     break;
1158   case 0x7d:
1159     if (ABCIsConst)
1160       Res = Nand(Xnor(A, B), C);
1161     break;
1162   case 0x7e:
1163     if (ABCIsConst)
1164       Res = Or(Xor(A, B), Xor(A, C));
1165     break;
1166   case 0x7f:
1167     if (ABCIsConst)
1168       Res = Nand(And(A, B), C);
1169     break;
1170   case 0x80:
1171     if (ABCIsConst)
1172       Res = And(And(A, B), C);
1173     break;
1174   case 0x81:
1175     if (ABCIsConst)
1176       Res = Nor(Xor(A, B), Xor(A, C));
1177     break;
1178   case 0x82:
1179     if (ABCIsConst)
1180       Res = And(Xnor(A, B), C);
1181     break;
1182   case 0x83:
1183     if (ABCIsConst)
1184       Res = Nor(Xor(A, B), Nor(C, Not(A)));
1185     break;
1186   case 0x84:
1187     if (ABCIsConst)
1188       Res = And(Xnor(A, C), B);
1189     break;
1190   case 0x85:
1191     if (ABCIsConst)
1192       Res = Nor(Xor(A, C), Nor(B, Not(A)));
1193     break;
1194   case 0x86:
1195     if (ABCIsConst)
1196       Res = Xor(Nor(Xnor(A, B), Nor(B, C)), C);
1197     break;
1198   case 0x87:
1199     if (ABCIsConst)
1200       Res = Xor(A, Nand(B, C));
1201     break;
1202   case 0x88:
1203     Res = And(B, C);
1204     break;
1205   case 0x89:
1206     if (ABCIsConst)
1207       Res = Xor(B, Nor(Nor(B, Not(A)), C));
1208     break;
1209   case 0x8a:
1210     if (ABCIsConst)
1211       Res = And(Nand(A, Not(B)), C);
1212     break;
1213   case 0x8b:
1214     if (ABCIsConst)
1215       Res = Xor(Nor(Xor(A, C), B), C);
1216     break;
1217   case 0x8c:
1218     if (ABCIsConst)
1219       Res = And(Nand(A, Not(C)), B);
1220     break;
1221   case 0x8d:
1222     if (ABCIsConst)
1223       Res = Xor(Nor(Xor(A, B), C), B);
1224     break;
1225   case 0x8e:
1226     if (ABCIsConst)
1227       Res = Xor(Or(Xor(A, B), Xor(A, C)), A);
1228     break;
1229   case 0x8f:
1230     if (ABCIsConst)
1231       Res = Nand(A, Nand(B, C));
1232     break;
1233   case 0x90:
1234     if (ABCIsConst)
1235       Res = And(A, Xnor(B, C));
1236     break;
1237   case 0x91:
1238     if (ABCIsConst)
1239       Res = Nor(Nor(A, Not(B)), Xor(B, C));
1240     break;
1241   case 0x92:
1242     if (ABCIsConst)
1243       Res = Xor(Nor(Xnor(A, B), Nor(A, C)), C);
1244     break;
1245   case 0x93:
1246     if (ABCIsConst)
1247       Res = Xor(Nand(A, C), B);
1248     break;
1249   case 0x94:
1250     if (ABCIsConst)
1251       Res = Nor(Nor(A, B), Xor(Xnor(A, B), C));
1252     break;
1253   case 0x95:
1254     if (ABCIsConst)
1255       Res = Xor(Nand(A, B), C);
1256     break;
1257   case 0x96:
1258     if (ABCIsConst)
1259       Res = Xor(Xor(A, B), C);
1260     break;
1261   case 0x97:
1262     if (ABCIsConst)
1263       Res = Xor(Xor(A, B), Or(Nor(A, B), C));
1264     break;
1265   case 0x98:
1266     if (ABCIsConst)
1267       Res = Nor(Nor(A, B), Xor(B, C));
1268     break;
1269   case 0x99:
1270     if (BCIsConst)
1271       Res = Xnor(B, C);
1272     break;
1273   case 0x9a:
1274     if (ABCIsConst)
1275       Res = Xor(Nor(B, Not(A)), C);
1276     break;
1277   case 0x9b:
1278     if (ABCIsConst)
1279       Res = Or(Nor(A, B), Xnor(B, C));
1280     break;
1281   case 0x9c:
1282     if (ABCIsConst)
1283       Res = Xor(B, Nor(C, Not(A)));
1284     break;
1285   case 0x9d:
1286     if (ABCIsConst)
1287       Res = Or(Nor(A, C), Xnor(B, C));
1288     break;
1289   case 0x9e:
1290     if (ABCIsConst)
1291       Res = Xor(And(Xor(A, B), Nand(B, C)), C);
1292     break;
1293   case 0x9f:
1294     if (ABCIsConst)
1295       Res = Nand(A, Xor(B, C));
1296     break;
1297   case 0xa0:
1298     Res = And(A, C);
1299     break;
1300   case 0xa1:
1301     if (ABCIsConst)
1302       Res = Xor(A, Nor(Nor(A, Not(B)), C));
1303     break;
1304   case 0xa2:
1305     if (ABCIsConst)
1306       Res = And(Or(A, Not(B)), C);
1307     break;
1308   case 0xa3:
1309     if (ABCIsConst)
1310       Res = Xor(Nor(Xor(B, C), A), C);
1311     break;
1312   case 0xa4:
1313     if (ABCIsConst)
1314       Res = Xor(A, Nor(Nor(A, B), C));
1315     break;
1316   case 0xa5:
1317     if (ACIsConst)
1318       Res = Xnor(A, C);
1319     break;
1320   case 0xa6:
1321     if (ABCIsConst)
1322       Res = Xor(Nor(A, Not(B)), C);
1323     break;
1324   case 0xa7:
1325     if (ABCIsConst)
1326       Res = Or(Nor(A, B), Xnor(A, C));
1327     break;
1328   case 0xa8:
1329     if (ABCIsConst)
1330       Res = And(Or(A, B), C);
1331     break;
1332   case 0xa9:
1333     if (ABCIsConst)
1334       Res = Xor(Nor(A, B), C);
1335     break;
1336   case 0xaa:
1337     Res = C;
1338     break;
1339   case 0xab:
1340     if (ABCIsConst)
1341       Res = Or(Nor(A, B), C);
1342     break;
1343   case 0xac:
1344     if (ABCIsConst)
1345       Res = Xor(Nor(Xnor(B, C), A), C);
1346     break;
1347   case 0xad:
1348     if (ABCIsConst)
1349       Res = Or(Xnor(A, C), And(B, C));
1350     break;
1351   case 0xae:
1352     if (ABCIsConst)
1353       Res = Or(Nor(A, Not(B)), C);
1354     break;
1355   case 0xaf:
1356     if (ACIsConst)
1357       Res = Or(C, Not(A));
1358     break;
1359   case 0xb0:
1360     if (ABCIsConst)
1361       Res = And(A, Nand(B, Not(C)));
1362     break;
1363   case 0xb1:
1364     if (ABCIsConst)
1365       Res = Xor(A, Nor(Xor(A, B), C));
1366     break;
1367   case 0xb2:
1368     if (ABCIsConst)
1369       Res = Xor(Nor(Xor(A, B), Xnor(A, C)), A);
1370     break;
1371   case 0xb3:
1372     if (ABCIsConst)
1373       Res = Nand(Nand(A, C), B);
1374     break;
1375   case 0xb4:
1376     if (ABCIsConst)
1377       Res = Xor(A, Nor(C, Not(B)));
1378     break;
1379   case 0xb5:
1380     if (ABCIsConst)
1381       Res = Or(Xnor(A, C), Nor(B, C));
1382     break;
1383   case 0xb6:
1384     if (ABCIsConst)
1385       Res = Xor(And(Xor(A, B), Nand(A, C)), C);
1386     break;
1387   case 0xb7:
1388     if (ABCIsConst)
1389       Res = Nand(Xor(A, C), B);
1390     break;
1391   case 0xb8:
1392     if (ABCIsConst)
1393       Res = Xor(Nor(Xnor(A, C), B), C);
1394     break;
1395   case 0xb9:
1396     if (ABCIsConst)
1397       Res = Xor(Nor(And(A, C), B), C);
1398     break;
1399   case 0xba:
1400     if (ABCIsConst)
1401       Res = Or(Nor(B, Not(A)), C);
1402     break;
1403   case 0xbb:
1404     if (BCIsConst)
1405       Res = Or(C, Not(B));
1406     break;
1407   case 0xbc:
1408     if (ABCIsConst)
1409       Res = Xor(A, And(Nand(A, C), B));
1410     break;
1411   case 0xbd:
1412     if (ABCIsConst)
1413       Res = Or(Xor(A, B), Xnor(A, C));
1414     break;
1415   case 0xbe:
1416     if (ABCIsConst)
1417       Res = Or(Xor(A, B), C);
1418     break;
1419   case 0xbf:
1420     if (ABCIsConst)
1421       Res = Or(Nand(A, B), C);
1422     break;
1423   case 0xc0:
1424     Res = And(A, B);
1425     break;
1426   case 0xc1:
1427     if (ABCIsConst)
1428       Res = Xor(A, Nor(Nor(A, Not(C)), B));
1429     break;
1430   case 0xc2:
1431     if (ABCIsConst)
1432       Res = Xor(A, Nor(Nor(A, C), B));
1433     break;
1434   case 0xc3:
1435     if (ABIsConst)
1436       Res = Xnor(A, B);
1437     break;
1438   case 0xc4:
1439     if (ABCIsConst)
1440       Res = And(Or(A, Not(C)), B);
1441     break;
1442   case 0xc5:
1443     if (ABCIsConst)
1444       Res = Xor(B, Nor(A, Xor(B, C)));
1445     break;
1446   case 0xc6:
1447     if (ABCIsConst)
1448       Res = Xor(Nor(A, Not(C)), B);
1449     break;
1450   case 0xc7:
1451     if (ABCIsConst)
1452       Res = Or(Xnor(A, B), Nor(A, C));
1453     break;
1454   case 0xc8:
1455     if (ABCIsConst)
1456       Res = And(Or(A, C), B);
1457     break;
1458   case 0xc9:
1459     if (ABCIsConst)
1460       Res = Xor(Nor(A, C), B);
1461     break;
1462   case 0xca:
1463     if (ABCIsConst)
1464       Res = Xor(B, Nor(A, Xnor(B, C)));
1465     break;
1466   case 0xcb:
1467     if (ABCIsConst)
1468       Res = Or(Xnor(A, B), And(B, C));
1469     break;
1470   case 0xcc:
1471     Res = B;
1472     break;
1473   case 0xcd:
1474     if (ABCIsConst)
1475       Res = Or(Nor(A, C), B);
1476     break;
1477   case 0xce:
1478     if (ABCIsConst)
1479       Res = Or(Nor(A, Not(C)), B);
1480     break;
1481   case 0xcf:
1482     if (ABIsConst)
1483       Res = Or(B, Not(A));
1484     break;
1485   case 0xd0:
1486     if (ABCIsConst)
1487       Res = And(A, Or(B, Not(C)));
1488     break;
1489   case 0xd1:
1490     if (ABCIsConst)
1491       Res = Xor(A, Nor(Xor(A, C), B));
1492     break;
1493   case 0xd2:
1494     if (ABCIsConst)
1495       Res = Xor(A, Nor(B, Not(C)));
1496     break;
1497   case 0xd3:
1498     if (ABCIsConst)
1499       Res = Or(Xnor(A, B), Nor(B, C));
1500     break;
1501   case 0xd4:
1502     if (ABCIsConst)
1503       Res = Xor(Nor(Xnor(A, B), Xor(A, C)), A);
1504     break;
1505   case 0xd5:
1506     if (ABCIsConst)
1507       Res = Nand(Nand(A, B), C);
1508     break;
1509   case 0xd6:
1510     if (ABCIsConst)
1511       Res = Xor(Xor(A, B), Or(And(A, B), C));
1512     break;
1513   case 0xd7:
1514     if (ABCIsConst)
1515       Res = Nand(Xor(A, B), C);
1516     break;
1517   case 0xd8:
1518     if (ABCIsConst)
1519       Res = Xor(Nor(Xnor(A, B), C), B);
1520     break;
1521   case 0xd9:
1522     if (ABCIsConst)
1523       Res = Or(And(A, B), Xnor(B, C));
1524     break;
1525   case 0xda:
1526     if (ABCIsConst)
1527       Res = Xor(A, And(Nand(A, B), C));
1528     break;
1529   case 0xdb:
1530     if (ABCIsConst)
1531       Res = Or(Xnor(A, B), Xor(A, C));
1532     break;
1533   case 0xdc:
1534     if (ABCIsConst)
1535       Res = Or(B, Nor(C, Not(A)));
1536     break;
1537   case 0xdd:
1538     if (BCIsConst)
1539       Res = Or(B, Not(C));
1540     break;
1541   case 0xde:
1542     if (ABCIsConst)
1543       Res = Or(Xor(A, C), B);
1544     break;
1545   case 0xdf:
1546     if (ABCIsConst)
1547       Res = Or(Nand(A, C), B);
1548     break;
1549   case 0xe0:
1550     if (ABCIsConst)
1551       Res = And(A, Or(B, C));
1552     break;
1553   case 0xe1:
1554     if (ABCIsConst)
1555       Res = Xor(A, Nor(B, C));
1556     break;
1557   case 0xe2:
1558     if (ABCIsConst)
1559       Res = Xor(A, Nor(Xnor(A, C), B));
1560     break;
1561   case 0xe3:
1562     if (ABCIsConst)
1563       Res = Xor(A, Nor(And(A, C), B));
1564     break;
1565   case 0xe4:
1566     if (ABCIsConst)
1567       Res = Xor(A, Nor(Xnor(A, B), C));
1568     break;
1569   case 0xe5:
1570     if (ABCIsConst)
1571       Res = Xor(A, Nor(And(A, B), C));
1572     break;
1573   case 0xe6:
1574     if (ABCIsConst)
1575       Res = Or(And(A, B), Xor(B, C));
1576     break;
1577   case 0xe7:
1578     if (ABCIsConst)
1579       Res = Or(Xnor(A, B), Xnor(A, C));
1580     break;
1581   case 0xe8:
1582     if (ABCIsConst)
1583       Res = Xor(Or(A, B), Nor(Xnor(A, B), C));
1584     break;
1585   case 0xe9:
1586     if (ABCIsConst)
1587       Res = Xor(Xor(A, B), Nand(Nand(A, B), C));
1588     break;
1589   case 0xea:
1590     if (ABCIsConst)
1591       Res = Or(And(A, B), C);
1592     break;
1593   case 0xeb:
1594     if (ABCIsConst)
1595       Res = Or(Xnor(A, B), C);
1596     break;
1597   case 0xec:
1598     if (ABCIsConst)
1599       Res = Or(And(A, C), B);
1600     break;
1601   case 0xed:
1602     if (ABCIsConst)
1603       Res = Or(Xnor(A, C), B);
1604     break;
1605   case 0xee:
1606     Res = Or(B, C);
1607     break;
1608   case 0xef:
1609     if (ABCIsConst)
1610       Res = Nand(A, Nor(B, C));
1611     break;
1612   case 0xf0:
1613     Res = A;
1614     break;
1615   case 0xf1:
1616     if (ABCIsConst)
1617       Res = Or(A, Nor(B, C));
1618     break;
1619   case 0xf2:
1620     if (ABCIsConst)
1621       Res = Or(A, Nor(B, Not(C)));
1622     break;
1623   case 0xf3:
1624     if (ABIsConst)
1625       Res = Or(A, Not(B));
1626     break;
1627   case 0xf4:
1628     if (ABCIsConst)
1629       Res = Or(A, Nor(C, Not(B)));
1630     break;
1631   case 0xf5:
1632     if (ACIsConst)
1633       Res = Or(A, Not(C));
1634     break;
1635   case 0xf6:
1636     if (ABCIsConst)
1637       Res = Or(A, Xor(B, C));
1638     break;
1639   case 0xf7:
1640     if (ABCIsConst)
1641       Res = Or(A, Nand(B, C));
1642     break;
1643   case 0xf8:
1644     if (ABCIsConst)
1645       Res = Or(A, And(B, C));
1646     break;
1647   case 0xf9:
1648     if (ABCIsConst)
1649       Res = Or(A, Xnor(B, C));
1650     break;
1651   case 0xfa:
1652     Res = Or(A, C);
1653     break;
1654   case 0xfb:
1655     if (ABCIsConst)
1656       Res = Nand(Nor(A, C), B);
1657     break;
1658   case 0xfc:
1659     Res = Or(A, B);
1660     break;
1661   case 0xfd:
1662     if (ABCIsConst)
1663       Res = Nand(Nor(A, B), C);
1664     break;
1665   case 0xfe:
1666     if (ABCIsConst)
1667       Res = Or(Or(A, B), C);
1668     break;
1669   case 0xff:
1670     Res = {Constant::getAllOnesValue(Ty), 0xff};
1671     break;
1672   }
1673 
1674   assert((Res.first == nullptr || Res.second == Imm) &&
1675          "Simplification of ternary logic does not verify!");
1676   return Res.first;
1677 }
1678 
1679 static Value *simplifyX86insertps(const IntrinsicInst &II,
1680                                   InstCombiner::BuilderTy &Builder) {
1681   auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
1682   if (!CInt)
1683     return nullptr;
1684 
1685   auto *VecTy = cast<FixedVectorType>(II.getType());
1686   assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
1687 
1688   // The immediate permute control byte looks like this:
1689   //    [3:0] - zero mask for each 32-bit lane
1690   //    [5:4] - select one 32-bit destination lane
1691   //    [7:6] - select one 32-bit source lane
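  // Illustrative example (no zero mask): Imm = 0x10 selects source lane 0 and
  // destination lane 1, so with vector operands %a and %b the call becomes
  //   shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 3>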
1692 
1693   uint8_t Imm = CInt->getZExtValue();
1694   uint8_t ZMask = Imm & 0xf;
1695   uint8_t DestLane = (Imm >> 4) & 0x3;
1696   uint8_t SourceLane = (Imm >> 6) & 0x3;
1697 
1698   ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
1699 
1700   // If all zero mask bits are set, this was just a weird way to
1701   // generate a zero vector.
1702   if (ZMask == 0xf)
1703     return ZeroVector;
1704 
1705   // Initialize by passing all of the first source bits through.
1706   int ShuffleMask[4] = {0, 1, 2, 3};
1707 
1708   // We may replace the second operand with the zero vector.
1709   Value *V1 = II.getArgOperand(1);
1710 
1711   if (ZMask) {
1712     // If the zero mask is being used with a single input or the zero mask
1713     // overrides the destination lane, this is a shuffle with the zero vector.
1714     if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
1715         (ZMask & (1 << DestLane))) {
1716       V1 = ZeroVector;
1717       // We may still move 32-bits of the first source vector from one lane
1718       // to another.
1719       ShuffleMask[DestLane] = SourceLane;
1720       // The zero mask may override the previous insert operation.
1721       for (unsigned i = 0; i < 4; ++i)
1722         if ((ZMask >> i) & 0x1)
1723           ShuffleMask[i] = i + 4;
1724     } else {
1725       // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
1726       return nullptr;
1727     }
1728   } else {
1729     // Replace the selected destination lane with the selected source lane.
1730     ShuffleMask[DestLane] = SourceLane + 4;
1731   }
1732 
1733   return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
1734 }
1735 
1736 /// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
1737 /// or conversion to a shuffle vector.
1738 static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
1739                                ConstantInt *CILength, ConstantInt *CIIndex,
1740                                InstCombiner::BuilderTy &Builder) {
1741   auto LowConstantHighUndef = [&](uint64_t Val) {
1742     Type *IntTy64 = Type::getInt64Ty(II.getContext());
1743     Constant *Args[] = {ConstantInt::get(IntTy64, Val),
1744                         UndefValue::get(IntTy64)};
1745     return ConstantVector::get(Args);
1746   };
1747 
1748   // See if we're dealing with constant values.
1749   auto *C0 = dyn_cast<Constant>(Op0);
1750   auto *CI0 =
1751       C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
1752          : nullptr;
1753 
1754   // Attempt to constant fold.
1755   if (CILength && CIIndex) {
1756     // From AMD documentation: "The bit index and field length are each six
1757     // bits in length other bits of the field are ignored."
1758     APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
1759     APInt APLength = CILength->getValue().zextOrTrunc(6);
1760 
1761     unsigned Index = APIndex.getZExtValue();
1762 
1763     // From AMD documentation: "a value of zero in the field length is
1764     // defined as length of 64".
1765     unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
1766 
1767     // From AMD documentation: "If the sum of the bit index + length field
1768     // is greater than 64, the results are undefined".
1769     unsigned End = Index + Length;
1770 
1771     // Note that both field index and field length are 8-bit quantities.
1772     // Since variables 'Index' and 'Length' are unsigned values
1773     // obtained from zero-extending field index and field length
1774     // respectively, their sum should never wrap around.
1775     if (End > 64)
1776       return UndefValue::get(II.getType());
1777 
1778     // If we are inserting whole bytes, we can convert this to a shuffle.
1779     // Lowering can recognize EXTRQI shuffle masks.
1780     if ((Length % 8) == 0 && (Index % 8) == 0) {
1781       // Convert bit indices to byte indices.
1782       Length /= 8;
1783       Index /= 8;
1784 
1785       Type *IntTy8 = Type::getInt8Ty(II.getContext());
1786       auto *ShufTy = FixedVectorType::get(IntTy8, 16);
1787 
1788       SmallVector<int, 16> ShuffleMask;
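           // Result bytes [0,Length) come from Op0 starting at byte Index,
           // bytes [Length,8) come from the zero vector operand, and bytes
           // [8,16) are left undefined.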
1789       for (int i = 0; i != (int)Length; ++i)
1790         ShuffleMask.push_back(i + Index);
1791       for (int i = Length; i != 8; ++i)
1792         ShuffleMask.push_back(i + 16);
1793       for (int i = 8; i != 16; ++i)
1794         ShuffleMask.push_back(-1);
1795 
1796       Value *SV = Builder.CreateShuffleVector(
1797           Builder.CreateBitCast(Op0, ShufTy),
1798           ConstantAggregateZero::get(ShufTy), ShuffleMask);
1799       return Builder.CreateBitCast(SV, II.getType());
1800     }
1801 
1802     // Constant Fold - shift Index'th bit to lowest position and mask off
1803     // Length bits.
1804     if (CI0) {
1805       APInt Elt = CI0->getValue();
1806       Elt.lshrInPlace(Index);
1807       Elt = Elt.zextOrTrunc(Length);
1808       return LowConstantHighUndef(Elt.getZExtValue());
1809     }
1810 
1811     // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
1812     if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
1813       Value *Args[] = {Op0, CILength, CIIndex};
1814       Module *M = II.getModule();
1815       Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
1816       return Builder.CreateCall(F, Args);
1817     }
1818   }
1819 
1820   // Constant Fold - extraction from zero is always {zero, undef}.
1821   if (CI0 && CI0->isZero())
1822     return LowConstantHighUndef(0);
1823 
1824   return nullptr;
1825 }
1826 
1827 /// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
1828 /// folding or conversion to a shuffle vector.
1829 static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
1830                                  APInt APLength, APInt APIndex,
1831                                  InstCombiner::BuilderTy &Builder) {
1832   // From AMD documentation: "The bit index and field length are each six bits
1833   // in length; other bits of the field are ignored."
1834   APIndex = APIndex.zextOrTrunc(6);
1835   APLength = APLength.zextOrTrunc(6);
1836 
1837   // Attempt to constant fold.
1838   unsigned Index = APIndex.getZExtValue();
1839 
1840   // From AMD documentation: "a value of zero in the field length is
1841   // defined as length of 64".
1842   unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
1843 
1844   // From AMD documentation: "If the sum of the bit index + length field
1845   // is greater than 64, the results are undefined".
1846   unsigned End = Index + Length;
1847 
1848   // Note that both field index and field length are 8-bit quantities.
1849   // Since variables 'Index' and 'Length' are unsigned values
1850   // obtained from zero-extending field index and field length
1851   // respectively, their sum should never wrap around.
1852   if (End > 64)
1853     return UndefValue::get(II.getType());
1854 
1855   // If we are inserting whole bytes, we can convert this to a shuffle.
1856   // Lowering can recognize INSERTQI shuffle masks.
1857   if ((Length % 8) == 0 && (Index % 8) == 0) {
1858     // Convert bit indices to byte indices.
1859     Length /= 8;
1860     Index /= 8;
1861 
1862     Type *IntTy8 = Type::getInt8Ty(II.getContext());
1863     auto *ShufTy = FixedVectorType::get(IntTy8, 16);
1864 
1865     SmallVector<int, 16> ShuffleMask;
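         // Result bytes [0,Index) keep the low bytes of Op0, bytes
         // [Index,Index+Length) take the inserted field from the low bytes of
         // Op1, bytes [Index+Length,8) keep the remaining low bytes of Op0,
         // and bytes [8,16) are left undefined.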
1866     for (int i = 0; i != (int)Index; ++i)
1867       ShuffleMask.push_back(i);
1868     for (int i = 0; i != (int)Length; ++i)
1869       ShuffleMask.push_back(i + 16);
1870     for (int i = Index + Length; i != 8; ++i)
1871       ShuffleMask.push_back(i);
1872     for (int i = 8; i != 16; ++i)
1873       ShuffleMask.push_back(-1);
1874 
1875     Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
1876                                             Builder.CreateBitCast(Op1, ShufTy),
1877                                             ShuffleMask);
1878     return Builder.CreateBitCast(SV, II.getType());
1879   }
1880 
1881   // See if we're dealing with constant values.
1882   auto *C0 = dyn_cast<Constant>(Op0);
1883   auto *C1 = dyn_cast<Constant>(Op1);
1884   auto *CI00 =
1885       C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
1886          : nullptr;
1887   auto *CI10 =
1888       C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
1889          : nullptr;
1890 
1891   // Constant Fold - insert bottom Length bits starting at the Index'th bit.
1892   if (CI00 && CI10) {
1893     APInt V00 = CI00->getValue();
1894     APInt V10 = CI10->getValue();
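         // Clear the destination field in V00, then shift the low Length bits
         // of V10 into that position and merge the two.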
1895     APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
1896     V00 = V00 & ~Mask;
1897     V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
1898     APInt Val = V00 | V10;
1899     Type *IntTy64 = Type::getInt64Ty(II.getContext());
1900     Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
1901                         UndefValue::get(IntTy64)};
1902     return ConstantVector::get(Args);
1903   }
1904 
1905   // If we were an INSERTQ call, we'll save demanded elements if we convert to
1906   // INSERTQI.
1907   if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
1908     Type *IntTy8 = Type::getInt8Ty(II.getContext());
1909     Constant *CILength = ConstantInt::get(IntTy8, Length, false);
1910     Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
1911 
1912     Value *Args[] = {Op0, Op1, CILength, CIIndex};
1913     Module *M = II.getModule();
1914     Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
1915     return Builder.CreateCall(F, Args);
1916   }
1917 
1918   return nullptr;
1919 }
1920 
1921 /// Attempt to convert pshufb* to shufflevector if the mask is constant.
1922 static Value *simplifyX86pshufb(const IntrinsicInst &II,
1923                                 InstCombiner::BuilderTy &Builder) {
1924   auto *V = dyn_cast<Constant>(II.getArgOperand(1));
1925   if (!V)
1926     return nullptr;
1927 
1928   auto *VecTy = cast<FixedVectorType>(II.getType());
1929   unsigned NumElts = VecTy->getNumElements();
1930   assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
1931          "Unexpected number of elements in shuffle mask!");
1932 
1933   // Construct a shuffle mask from constant integers or UNDEFs.
1934   int Indexes[64];
1935 
1936   // Each byte in the shuffle control mask forms an index to permute the
1937   // corresponding byte in the destination operand.
1938   for (unsigned I = 0; I < NumElts; ++I) {
1939     Constant *COp = V->getAggregateElement(I);
1940     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
1941       return nullptr;
1942 
1943     if (isa<UndefValue>(COp)) {
1944       Indexes[I] = -1;
1945       continue;
1946     }
1947 
1948     int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
1949 
1950     // If the most significant bit (bit[7]) of each byte of the shuffle
1951     // control mask is set, then zero is written in the result byte.
1952     // The zero vector is in the right-hand side of the resulting
1953     // shufflevector.
1954 
1955     // Within each 128-bit lane, the index is the least significant 4 bits
1956     // of the respective control byte, offset by the lane base (I & 0xF0).
1957     Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
1958     Indexes[I] = Index;
1959   }
1960 
1961   auto V1 = II.getArgOperand(0);
1962   auto V2 = Constant::getNullValue(VecTy);
1963   return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts));
1964 }
1965 
1966 /// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
1967 static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
1968                                     InstCombiner::BuilderTy &Builder) {
1969   auto *V = dyn_cast<Constant>(II.getArgOperand(1));
1970   if (!V)
1971     return nullptr;
1972 
1973   auto *VecTy = cast<FixedVectorType>(II.getType());
1974   unsigned NumElts = VecTy->getNumElements();
1975   bool IsPD = VecTy->getScalarType()->isDoubleTy();
1976   unsigned NumLaneElts = IsPD ? 2 : 4;
1977   assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
1978 
1979   // Construct a shuffle mask from constant integers or UNDEFs.
1980   int Indexes[16];
1981 
1982   // The intrinsics only read one or two bits, clear the rest.
1983   for (unsigned I = 0; I < NumElts; ++I) {
1984     Constant *COp = V->getAggregateElement(I);
1985     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
1986       return nullptr;
1987 
1988     if (isa<UndefValue>(COp)) {
1989       Indexes[I] = -1;
1990       continue;
1991     }
1992 
1993     APInt Index = cast<ConstantInt>(COp)->getValue();
1994     Index = Index.zextOrTrunc(32).getLoBits(2);
1995 
1996     // The PD variants use bit 1 to select the per-lane element index, so
1997     // shift down to convert to a generic shuffle mask index.
1998     if (IsPD)
1999       Index.lshrInPlace(1);
2000 
2001     // The _256 variants are a bit trickier since the mask bits always index
2002     // into the corresponding 128 half. In order to convert to a generic
2003     // into the corresponding 128-bit half. In order to convert to a generic
2004     Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
2005 
2006     Indexes[I] = Index.getZExtValue();
2007   }
2008 
2009   auto V1 = II.getArgOperand(0);
2010   return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, NumElts));
2011 }
2012 
2013 /// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
2014 static Value *simplifyX86vpermv(const IntrinsicInst &II,
2015                                 InstCombiner::BuilderTy &Builder) {
2016   auto *V = dyn_cast<Constant>(II.getArgOperand(1));
2017   if (!V)
2018     return nullptr;
2019 
2020   auto *VecTy = cast<FixedVectorType>(II.getType());
2021   unsigned Size = VecTy->getNumElements();
2022   assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
2023          "Unexpected shuffle mask size");
2024 
2025   // Construct a shuffle mask from constant integers or UNDEFs.
2026   int Indexes[64];
2027 
2028   for (unsigned I = 0; I < Size; ++I) {
2029     Constant *COp = V->getAggregateElement(I);
2030     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2031       return nullptr;
2032 
2033     if (isa<UndefValue>(COp)) {
2034       Indexes[I] = -1;
2035       continue;
2036     }
2037 
2038     uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
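         // Size is a power of two, so masking with Size - 1 wraps the index
         // into range.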
2039     Index &= Size - 1;
2040     Indexes[I] = Index;
2041   }
2042 
2043   auto V1 = II.getArgOperand(0);
2044   return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, Size));
2045 }
2046 
2047 std::optional<Instruction *>
2048 X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
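       // Helper to demand only the lowest DemandedWidth elements of a
       // Width-element vector operand, letting the rest be simplified away.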
2049   auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
2050                                              unsigned DemandedWidth) {
2051     APInt UndefElts(Width, 0);
2052     APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
2053     return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
2054   };
2055 
2056   Intrinsic::ID IID = II.getIntrinsicID();
2057   switch (IID) {
2058   case Intrinsic::x86_bmi_bextr_32:
2059   case Intrinsic::x86_bmi_bextr_64:
2060   case Intrinsic::x86_tbm_bextri_u32:
2061   case Intrinsic::x86_tbm_bextri_u64:
2062     // If the RHS is a constant we can try some simplifications.
2063     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
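           // BEXTR control operand: bits [7:0] hold the start bit and bits
           // [15:8] hold the number of bits to extract.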
2064       uint64_t Shift = C->getZExtValue();
2065       uint64_t Length = (Shift >> 8) & 0xff;
2066       Shift &= 0xff;
2067       unsigned BitWidth = II.getType()->getIntegerBitWidth();
2068       // If the length is 0 or the shift is out of range, replace with zero.
2069       if (Length == 0 || Shift >= BitWidth) {
2070         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2071       }
2072       // If the LHS is also a constant, we can completely constant fold this.
2073       if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2074         uint64_t Result = InC->getZExtValue() >> Shift;
2075         if (Length > BitWidth)
2076           Length = BitWidth;
2077         Result &= maskTrailingOnes<uint64_t>(Length);
2078         return IC.replaceInstUsesWith(II,
2079                                       ConstantInt::get(II.getType(), Result));
2080       }
2081       // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
2082       // are only masking bits that a shift already cleared?
2083     }
2084     break;
2085 
2086   case Intrinsic::x86_bmi_bzhi_32:
2087   case Intrinsic::x86_bmi_bzhi_64:
2088     // If the RHS is a constant we can try some simplifications.
2089     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
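           // BZHI clears all bits at positions >= Index, where Index is taken
           // from bits [7:0] of the control operand.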
2090       uint64_t Index = C->getZExtValue() & 0xff;
2091       unsigned BitWidth = II.getType()->getIntegerBitWidth();
2092       if (Index >= BitWidth) {
2093         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2094       }
2095       if (Index == 0) {
2096         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2097       }
2098       // If the LHS is also a constant, we can completely constant fold this.
2099       if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2100         uint64_t Result = InC->getZExtValue();
2101         Result &= maskTrailingOnes<uint64_t>(Index);
2102         return IC.replaceInstUsesWith(II,
2103                                       ConstantInt::get(II.getType(), Result));
2104       }
2105       // TODO should we convert this to an AND if the RHS is constant?
2106     }
2107     break;
2108   case Intrinsic::x86_bmi_pext_32:
2109   case Intrinsic::x86_bmi_pext_64:
2110     if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2111       if (MaskC->isNullValue()) {
2112         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2113       }
2114       if (MaskC->isAllOnesValue()) {
2115         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2116       }
2117 
2118       unsigned MaskIdx, MaskLen;
2119       if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2120         // Any single contiguous sequence of 1s anywhere in the mask simply
2121         // describes a subset of the input bits shifted to the appropriate
2122         // position. Replace with the straightforward IR.
2123         Value *Input = II.getArgOperand(0);
2124         Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
2125         Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2126         Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
2127         return IC.replaceInstUsesWith(II, Shifted);
2128       }
2129 
2130       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2131         uint64_t Src = SrcC->getZExtValue();
2132         uint64_t Mask = MaskC->getZExtValue();
2133         uint64_t Result = 0;
2134         uint64_t BitToSet = 1;
2135 
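             // PEXT: for each set bit of the mask (LSB to MSB), test the
             // corresponding source bit and pack it into the next low bit of
             // the result.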
2136         while (Mask) {
2137           // Isolate lowest set bit.
2138           uint64_t BitToTest = Mask & -Mask;
2139           if (BitToTest & Src)
2140             Result |= BitToSet;
2141 
2142           BitToSet <<= 1;
2143           // Clear lowest set bit.
2144           Mask &= Mask - 1;
2145         }
2146 
2147         return IC.replaceInstUsesWith(II,
2148                                       ConstantInt::get(II.getType(), Result));
2149       }
2150     }
2151     break;
2152   case Intrinsic::x86_bmi_pdep_32:
2153   case Intrinsic::x86_bmi_pdep_64:
2154     if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2155       if (MaskC->isNullValue()) {
2156         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2157       }
2158       if (MaskC->isAllOnesValue()) {
2159         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2160       }
2161 
2162       unsigned MaskIdx, MaskLen;
2163       if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2164         // Any single contiguous sequence of 1s anywhere in the mask simply
2165         // describes a subset of the input bits shifted to the appropriate
2166         // position. Replace with the straightforward IR.
2167         Value *Input = II.getArgOperand(0);
2168         Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2169         Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
2170         Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
2171         return IC.replaceInstUsesWith(II, Masked);
2172       }
2173 
2174       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2175         uint64_t Src = SrcC->getZExtValue();
2176         uint64_t Mask = MaskC->getZExtValue();
2177         uint64_t Result = 0;
2178         uint64_t BitToTest = 1;
2179 
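             // PDEP: for each set bit of the mask (LSB to MSB), deposit the
             // next low bit of the source into that mask bit position.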
2180         while (Mask) {
2181           // Isolate lowest set bit.
2182           uint64_t BitToSet = Mask & -Mask;
2183           if (BitToTest & Src)
2184             Result |= BitToSet;
2185 
2186           BitToTest <<= 1;
2187           // Clear lowest set bit.
2188           Mask &= Mask - 1;
2189         }
2190 
2191         return IC.replaceInstUsesWith(II,
2192                                       ConstantInt::get(II.getType(), Result));
2193       }
2194     }
2195     break;
2196 
2197   case Intrinsic::x86_sse_cvtss2si:
2198   case Intrinsic::x86_sse_cvtss2si64:
2199   case Intrinsic::x86_sse_cvttss2si:
2200   case Intrinsic::x86_sse_cvttss2si64:
2201   case Intrinsic::x86_sse2_cvtsd2si:
2202   case Intrinsic::x86_sse2_cvtsd2si64:
2203   case Intrinsic::x86_sse2_cvttsd2si:
2204   case Intrinsic::x86_sse2_cvttsd2si64:
2205   case Intrinsic::x86_avx512_vcvtss2si32:
2206   case Intrinsic::x86_avx512_vcvtss2si64:
2207   case Intrinsic::x86_avx512_vcvtss2usi32:
2208   case Intrinsic::x86_avx512_vcvtss2usi64:
2209   case Intrinsic::x86_avx512_vcvtsd2si32:
2210   case Intrinsic::x86_avx512_vcvtsd2si64:
2211   case Intrinsic::x86_avx512_vcvtsd2usi32:
2212   case Intrinsic::x86_avx512_vcvtsd2usi64:
2213   case Intrinsic::x86_avx512_cvttss2si:
2214   case Intrinsic::x86_avx512_cvttss2si64:
2215   case Intrinsic::x86_avx512_cvttss2usi:
2216   case Intrinsic::x86_avx512_cvttss2usi64:
2217   case Intrinsic::x86_avx512_cvttsd2si:
2218   case Intrinsic::x86_avx512_cvttsd2si64:
2219   case Intrinsic::x86_avx512_cvttsd2usi:
2220   case Intrinsic::x86_avx512_cvttsd2usi64: {
2221     // These intrinsics only demand the 0th element of their input vectors. If
2222     // we can simplify the input based on that, do so now.
2223     Value *Arg = II.getArgOperand(0);
2224     unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
2225     if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
2226       return IC.replaceOperand(II, 0, V);
2227     }
2228     break;
2229   }
2230 
2231   case Intrinsic::x86_mmx_pmovmskb:
2232   case Intrinsic::x86_sse_movmsk_ps:
2233   case Intrinsic::x86_sse2_movmsk_pd:
2234   case Intrinsic::x86_sse2_pmovmskb_128:
2235   case Intrinsic::x86_avx_movmsk_pd_256:
2236   case Intrinsic::x86_avx_movmsk_ps_256:
2237   case Intrinsic::x86_avx2_pmovmskb:
2238     if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
2239       return IC.replaceInstUsesWith(II, V);
2240     }
2241     break;
2242 
2243   case Intrinsic::x86_sse_comieq_ss:
2244   case Intrinsic::x86_sse_comige_ss:
2245   case Intrinsic::x86_sse_comigt_ss:
2246   case Intrinsic::x86_sse_comile_ss:
2247   case Intrinsic::x86_sse_comilt_ss:
2248   case Intrinsic::x86_sse_comineq_ss:
2249   case Intrinsic::x86_sse_ucomieq_ss:
2250   case Intrinsic::x86_sse_ucomige_ss:
2251   case Intrinsic::x86_sse_ucomigt_ss:
2252   case Intrinsic::x86_sse_ucomile_ss:
2253   case Intrinsic::x86_sse_ucomilt_ss:
2254   case Intrinsic::x86_sse_ucomineq_ss:
2255   case Intrinsic::x86_sse2_comieq_sd:
2256   case Intrinsic::x86_sse2_comige_sd:
2257   case Intrinsic::x86_sse2_comigt_sd:
2258   case Intrinsic::x86_sse2_comile_sd:
2259   case Intrinsic::x86_sse2_comilt_sd:
2260   case Intrinsic::x86_sse2_comineq_sd:
2261   case Intrinsic::x86_sse2_ucomieq_sd:
2262   case Intrinsic::x86_sse2_ucomige_sd:
2263   case Intrinsic::x86_sse2_ucomigt_sd:
2264   case Intrinsic::x86_sse2_ucomile_sd:
2265   case Intrinsic::x86_sse2_ucomilt_sd:
2266   case Intrinsic::x86_sse2_ucomineq_sd:
2267   case Intrinsic::x86_avx512_vcomi_ss:
2268   case Intrinsic::x86_avx512_vcomi_sd:
2269   case Intrinsic::x86_avx512_mask_cmp_ss:
2270   case Intrinsic::x86_avx512_mask_cmp_sd: {
2271     // These intrinsics only demand the 0th element of their input vectors. If
2272     // we can simplify the input based on that, do so now.
2273     bool MadeChange = false;
2274     Value *Arg0 = II.getArgOperand(0);
2275     Value *Arg1 = II.getArgOperand(1);
2276     unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
2277     if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
2278       IC.replaceOperand(II, 0, V);
2279       MadeChange = true;
2280     }
2281     if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
2282       IC.replaceOperand(II, 1, V);
2283       MadeChange = true;
2284     }
2285     if (MadeChange) {
2286       return &II;
2287     }
2288     break;
2289   }
2290 
2291   case Intrinsic::x86_avx512_add_ps_512:
2292   case Intrinsic::x86_avx512_div_ps_512:
2293   case Intrinsic::x86_avx512_mul_ps_512:
2294   case Intrinsic::x86_avx512_sub_ps_512:
2295   case Intrinsic::x86_avx512_add_pd_512:
2296   case Intrinsic::x86_avx512_div_pd_512:
2297   case Intrinsic::x86_avx512_mul_pd_512:
2298   case Intrinsic::x86_avx512_sub_pd_512:
2299     // If the rounding mode is CUR_DIRECTION (4), we can turn these into
2300     // regular IR operations.
2301     if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
2302       if (R->getValue() == 4) {
2303         Value *Arg0 = II.getArgOperand(0);
2304         Value *Arg1 = II.getArgOperand(1);
2305 
2306         Value *V;
2307         switch (IID) {
2308         default:
2309           llvm_unreachable("Case stmts out of sync!");
2310         case Intrinsic::x86_avx512_add_ps_512:
2311         case Intrinsic::x86_avx512_add_pd_512:
2312           V = IC.Builder.CreateFAdd(Arg0, Arg1);
2313           break;
2314         case Intrinsic::x86_avx512_sub_ps_512:
2315         case Intrinsic::x86_avx512_sub_pd_512:
2316           V = IC.Builder.CreateFSub(Arg0, Arg1);
2317           break;
2318         case Intrinsic::x86_avx512_mul_ps_512:
2319         case Intrinsic::x86_avx512_mul_pd_512:
2320           V = IC.Builder.CreateFMul(Arg0, Arg1);
2321           break;
2322         case Intrinsic::x86_avx512_div_ps_512:
2323         case Intrinsic::x86_avx512_div_pd_512:
2324           V = IC.Builder.CreateFDiv(Arg0, Arg1);
2325           break;
2326         }
2327 
2328         return IC.replaceInstUsesWith(II, V);
2329       }
2330     }
2331     break;
2332 
2333   case Intrinsic::x86_avx512_mask_add_ss_round:
2334   case Intrinsic::x86_avx512_mask_div_ss_round:
2335   case Intrinsic::x86_avx512_mask_mul_ss_round:
2336   case Intrinsic::x86_avx512_mask_sub_ss_round:
2337   case Intrinsic::x86_avx512_mask_add_sd_round:
2338   case Intrinsic::x86_avx512_mask_div_sd_round:
2339   case Intrinsic::x86_avx512_mask_mul_sd_round:
2340   case Intrinsic::x86_avx512_mask_sub_sd_round:
2341     // If the rounding mode is CUR_DIRECTION (4), we can turn these into
2342     // regular IR operations.
2343     if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
2344       if (R->getValue() == 4) {
2345         // Extract the element as scalars.
2346         Value *Arg0 = II.getArgOperand(0);
2347         Value *Arg1 = II.getArgOperand(1);
2348         Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
2349         Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
2350 
2351         Value *V;
2352         switch (IID) {
2353         default:
2354           llvm_unreachable("Case stmts out of sync!");
2355         case Intrinsic::x86_avx512_mask_add_ss_round:
2356         case Intrinsic::x86_avx512_mask_add_sd_round:
2357           V = IC.Builder.CreateFAdd(LHS, RHS);
2358           break;
2359         case Intrinsic::x86_avx512_mask_sub_ss_round:
2360         case Intrinsic::x86_avx512_mask_sub_sd_round:
2361           V = IC.Builder.CreateFSub(LHS, RHS);
2362           break;
2363         case Intrinsic::x86_avx512_mask_mul_ss_round:
2364         case Intrinsic::x86_avx512_mask_mul_sd_round:
2365           V = IC.Builder.CreateFMul(LHS, RHS);
2366           break;
2367         case Intrinsic::x86_avx512_mask_div_ss_round:
2368         case Intrinsic::x86_avx512_mask_div_sd_round:
2369           V = IC.Builder.CreateFDiv(LHS, RHS);
2370           break;
2371         }
2372 
2373         // Handle the masking aspect of the intrinsic.
2374         Value *Mask = II.getArgOperand(3);
2375         auto *C = dyn_cast<ConstantInt>(Mask);
2376         // We don't need a select if we know the mask bit is a 1.
2377         if (!C || !C->getValue()[0]) {
2378           // Cast the mask to an i1 vector and then extract the lowest element.
2379           auto *MaskTy = FixedVectorType::get(
2380               IC.Builder.getInt1Ty(),
2381               cast<IntegerType>(Mask->getType())->getBitWidth());
2382           Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
2383           Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
2384           // Extract the lowest element from the passthru operand.
2385           Value *Passthru =
2386               IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
2387           V = IC.Builder.CreateSelect(Mask, V, Passthru);
2388         }
2389 
2390         // Insert the result back into the original argument 0.
2391         V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
2392 
2393         return IC.replaceInstUsesWith(II, V);
2394       }
2395     }
2396     break;
2397 
2398   // Constant fold ashr( <A x Bi>, Ci ).
2399   // Constant fold lshr( <A x Bi>, Ci ).
2400   // Constant fold shl( <A x Bi>, Ci ).
2401   case Intrinsic::x86_sse2_psrai_d:
2402   case Intrinsic::x86_sse2_psrai_w:
2403   case Intrinsic::x86_avx2_psrai_d:
2404   case Intrinsic::x86_avx2_psrai_w:
2405   case Intrinsic::x86_avx512_psrai_q_128:
2406   case Intrinsic::x86_avx512_psrai_q_256:
2407   case Intrinsic::x86_avx512_psrai_d_512:
2408   case Intrinsic::x86_avx512_psrai_q_512:
2409   case Intrinsic::x86_avx512_psrai_w_512:
2410   case Intrinsic::x86_sse2_psrli_d:
2411   case Intrinsic::x86_sse2_psrli_q:
2412   case Intrinsic::x86_sse2_psrli_w:
2413   case Intrinsic::x86_avx2_psrli_d:
2414   case Intrinsic::x86_avx2_psrli_q:
2415   case Intrinsic::x86_avx2_psrli_w:
2416   case Intrinsic::x86_avx512_psrli_d_512:
2417   case Intrinsic::x86_avx512_psrli_q_512:
2418   case Intrinsic::x86_avx512_psrli_w_512:
2419   case Intrinsic::x86_sse2_pslli_d:
2420   case Intrinsic::x86_sse2_pslli_q:
2421   case Intrinsic::x86_sse2_pslli_w:
2422   case Intrinsic::x86_avx2_pslli_d:
2423   case Intrinsic::x86_avx2_pslli_q:
2424   case Intrinsic::x86_avx2_pslli_w:
2425   case Intrinsic::x86_avx512_pslli_d_512:
2426   case Intrinsic::x86_avx512_pslli_q_512:
2427   case Intrinsic::x86_avx512_pslli_w_512:
2428     if (Value *V = simplifyX86immShift(II, IC.Builder)) {
2429       return IC.replaceInstUsesWith(II, V);
2430     }
2431     break;
2432 
2433   case Intrinsic::x86_sse2_psra_d:
2434   case Intrinsic::x86_sse2_psra_w:
2435   case Intrinsic::x86_avx2_psra_d:
2436   case Intrinsic::x86_avx2_psra_w:
2437   case Intrinsic::x86_avx512_psra_q_128:
2438   case Intrinsic::x86_avx512_psra_q_256:
2439   case Intrinsic::x86_avx512_psra_d_512:
2440   case Intrinsic::x86_avx512_psra_q_512:
2441   case Intrinsic::x86_avx512_psra_w_512:
2442   case Intrinsic::x86_sse2_psrl_d:
2443   case Intrinsic::x86_sse2_psrl_q:
2444   case Intrinsic::x86_sse2_psrl_w:
2445   case Intrinsic::x86_avx2_psrl_d:
2446   case Intrinsic::x86_avx2_psrl_q:
2447   case Intrinsic::x86_avx2_psrl_w:
2448   case Intrinsic::x86_avx512_psrl_d_512:
2449   case Intrinsic::x86_avx512_psrl_q_512:
2450   case Intrinsic::x86_avx512_psrl_w_512:
2451   case Intrinsic::x86_sse2_psll_d:
2452   case Intrinsic::x86_sse2_psll_q:
2453   case Intrinsic::x86_sse2_psll_w:
2454   case Intrinsic::x86_avx2_psll_d:
2455   case Intrinsic::x86_avx2_psll_q:
2456   case Intrinsic::x86_avx2_psll_w:
2457   case Intrinsic::x86_avx512_psll_d_512:
2458   case Intrinsic::x86_avx512_psll_q_512:
2459   case Intrinsic::x86_avx512_psll_w_512: {
2460     if (Value *V = simplifyX86immShift(II, IC.Builder)) {
2461       return IC.replaceInstUsesWith(II, V);
2462     }
2463 
2464     // These shifts use only the first 64 bits of the 128-bit vector
2465     // operand to compute the shift amount.
2466     Value *Arg1 = II.getArgOperand(1);
2467     assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
2468            "Unexpected packed shift size");
2469     unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();
2470 
2471     if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
2472       return IC.replaceOperand(II, 1, V);
2473     }
2474     break;
2475   }
2476 
2477   case Intrinsic::x86_avx2_psllv_d:
2478   case Intrinsic::x86_avx2_psllv_d_256:
2479   case Intrinsic::x86_avx2_psllv_q:
2480   case Intrinsic::x86_avx2_psllv_q_256:
2481   case Intrinsic::x86_avx512_psllv_d_512:
2482   case Intrinsic::x86_avx512_psllv_q_512:
2483   case Intrinsic::x86_avx512_psllv_w_128:
2484   case Intrinsic::x86_avx512_psllv_w_256:
2485   case Intrinsic::x86_avx512_psllv_w_512:
2486   case Intrinsic::x86_avx2_psrav_d:
2487   case Intrinsic::x86_avx2_psrav_d_256:
2488   case Intrinsic::x86_avx512_psrav_q_128:
2489   case Intrinsic::x86_avx512_psrav_q_256:
2490   case Intrinsic::x86_avx512_psrav_d_512:
2491   case Intrinsic::x86_avx512_psrav_q_512:
2492   case Intrinsic::x86_avx512_psrav_w_128:
2493   case Intrinsic::x86_avx512_psrav_w_256:
2494   case Intrinsic::x86_avx512_psrav_w_512:
2495   case Intrinsic::x86_avx2_psrlv_d:
2496   case Intrinsic::x86_avx2_psrlv_d_256:
2497   case Intrinsic::x86_avx2_psrlv_q:
2498   case Intrinsic::x86_avx2_psrlv_q_256:
2499   case Intrinsic::x86_avx512_psrlv_d_512:
2500   case Intrinsic::x86_avx512_psrlv_q_512:
2501   case Intrinsic::x86_avx512_psrlv_w_128:
2502   case Intrinsic::x86_avx512_psrlv_w_256:
2503   case Intrinsic::x86_avx512_psrlv_w_512:
2504     if (Value *V = simplifyX86varShift(II, IC.Builder)) {
2505       return IC.replaceInstUsesWith(II, V);
2506     }
2507     break;
2508 
2509   case Intrinsic::x86_sse2_packssdw_128:
2510   case Intrinsic::x86_sse2_packsswb_128:
2511   case Intrinsic::x86_avx2_packssdw:
2512   case Intrinsic::x86_avx2_packsswb:
2513   case Intrinsic::x86_avx512_packssdw_512:
2514   case Intrinsic::x86_avx512_packsswb_512:
2515     if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
2516       return IC.replaceInstUsesWith(II, V);
2517     }
2518     break;
2519 
2520   case Intrinsic::x86_sse2_packuswb_128:
2521   case Intrinsic::x86_sse41_packusdw:
2522   case Intrinsic::x86_avx2_packusdw:
2523   case Intrinsic::x86_avx2_packuswb:
2524   case Intrinsic::x86_avx512_packusdw_512:
2525   case Intrinsic::x86_avx512_packuswb_512:
2526     if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
2527       return IC.replaceInstUsesWith(II, V);
2528     }
2529     break;
2530 
2531   case Intrinsic::x86_sse2_pmadd_wd:
2532   case Intrinsic::x86_avx2_pmadd_wd:
2533   case Intrinsic::x86_avx512_pmaddw_d_512:
2534     if (Value *V = simplifyX86pmadd(II, IC.Builder, true)) {
2535       return IC.replaceInstUsesWith(II, V);
2536     }
2537     break;
2538 
2539   case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
2540   case Intrinsic::x86_avx2_pmadd_ub_sw:
2541   case Intrinsic::x86_avx512_pmaddubs_w_512:
2542     if (Value *V = simplifyX86pmadd(II, IC.Builder, false)) {
2543       return IC.replaceInstUsesWith(II, V);
2544     }
2545     break;
2546 
2547   case Intrinsic::x86_pclmulqdq:
2548   case Intrinsic::x86_pclmulqdq_256:
2549   case Intrinsic::x86_pclmulqdq_512: {
2550     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
2551       unsigned Imm = C->getZExtValue();
2552 
2553       bool MadeChange = false;
2554       Value *Arg0 = II.getArgOperand(0);
2555       Value *Arg1 = II.getArgOperand(1);
2556       unsigned VWidth =
2557           cast<FixedVectorType>(Arg0->getType())->getNumElements();
2558 
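           // Imm bit 0 selects the low or high quadword of each 128-bit lane
           // of Arg0, and bit 4 does the same for Arg1; only the selected
           // quadword of each lane is demanded.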
2559       APInt UndefElts1(VWidth, 0);
2560       APInt DemandedElts1 =
2561           APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
2562       if (Value *V =
2563               IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
2564         IC.replaceOperand(II, 0, V);
2565         MadeChange = true;
2566       }
2567 
2568       APInt UndefElts2(VWidth, 0);
2569       APInt DemandedElts2 =
2570           APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
2571       if (Value *V =
2572               IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
2573         IC.replaceOperand(II, 1, V);
2574         MadeChange = true;
2575       }
2576 
2577       // If all demanded elements of either input are undef, the result is zero.
2578       if (DemandedElts1.isSubsetOf(UndefElts1) ||
2579           DemandedElts2.isSubsetOf(UndefElts2)) {
2580         return IC.replaceInstUsesWith(II,
2581                                       ConstantAggregateZero::get(II.getType()));
2582       }
2583 
2584       if (MadeChange) {
2585         return &II;
2586       }
2587     }
2588     break;
2589   }
2590 
2591   case Intrinsic::x86_sse41_insertps:
2592     if (Value *V = simplifyX86insertps(II, IC.Builder)) {
2593       return IC.replaceInstUsesWith(II, V);
2594     }
2595     break;
2596 
2597   case Intrinsic::x86_sse4a_extrq: {
2598     Value *Op0 = II.getArgOperand(0);
2599     Value *Op1 = II.getArgOperand(1);
2600     unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
2601     unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
2602     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2603            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
2604            VWidth1 == 16 && "Unexpected operand sizes");
2605 
2606     // See if we're dealing with constant values.
2607     auto *C1 = dyn_cast<Constant>(Op1);
2608     auto *CILength =
2609         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
2610            : nullptr;
2611     auto *CIIndex =
2612         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
2613            : nullptr;
2614 
2615     // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
2616     if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
2617       return IC.replaceInstUsesWith(II, V);
2618     }
2619 
2620     // EXTRQ only uses the lowest 64 bits of the first 128-bit vector
2621     // operand and the lowest 16 bits of the second.
2622     bool MadeChange = false;
2623     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
2624       IC.replaceOperand(II, 0, V);
2625       MadeChange = true;
2626     }
2627     if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
2628       IC.replaceOperand(II, 1, V);
2629       MadeChange = true;
2630     }
2631     if (MadeChange) {
2632       return &II;
2633     }
2634     break;
2635   }
2636 
2637   case Intrinsic::x86_sse4a_extrqi: {
2638     // EXTRQI: Extract Length bits starting from Index. Zero-pad the remaining
2639     // bits of the lower 64 bits. The upper 64 bits are undefined.
2640     Value *Op0 = II.getArgOperand(0);
2641     unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
2642     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
2643            "Unexpected operand size");
2644 
2645     // See if we're dealing with constant values.
2646     auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
2647     auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
2648 
2649     // Attempt to simplify to a constant or shuffle vector.
2650     if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
2651       return IC.replaceInstUsesWith(II, V);
2652     }
2653 
2654     // EXTRQI only uses the lowest 64 bits of the first 128-bit vector
2655     // operand.
2656     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
2657       return IC.replaceOperand(II, 0, V);
2658     }
2659     break;
2660   }
2661 
2662   case Intrinsic::x86_sse4a_insertq: {
2663     Value *Op0 = II.getArgOperand(0);
2664     Value *Op1 = II.getArgOperand(1);
2665     unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
2666     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2667            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
2668            cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
2669            "Unexpected operand size");
2670 
2671     // See if we're dealing with constant values.
2672     auto *C1 = dyn_cast<Constant>(Op1);
2673     auto *CI11 =
2674         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
2675            : nullptr;
2676 
2677     // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
2678     if (CI11) {
2679       const APInt &V11 = CI11->getValue();
2680       APInt Len = V11.zextOrTrunc(6);
2681       APInt Idx = V11.lshr(8).zextOrTrunc(6);
2682       if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
2683         return IC.replaceInstUsesWith(II, V);
2684       }
2685     }
2686 
2687     // INSERTQ only uses the lowest 64 bits of the first 128-bit vector
2688     // operand.
2689     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
2690       return IC.replaceOperand(II, 0, V);
2691     }
2692     break;
2693   }
2694 
2695   case Intrinsic::x86_sse4a_insertqi: {
2696     // INSERTQI: Extract the lowest Length bits from the lower half of the
2697     // second source and insert them over the first source starting at bit
2698     // Index. The upper 64 bits are undefined.
2699     Value *Op0 = II.getArgOperand(0);
2700     Value *Op1 = II.getArgOperand(1);
2701     unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
2702     unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
2703     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2704            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
2705            VWidth1 == 2 && "Unexpected operand sizes");
2706 
2707     // See if we're dealing with constant values.
2708     auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
2709     auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
2710 
2711     // Attempt to simplify to a constant or shuffle vector.
2712     if (CILength && CIIndex) {
2713       APInt Len = CILength->getValue().zextOrTrunc(6);
2714       APInt Idx = CIIndex->getValue().zextOrTrunc(6);
2715       if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
2716         return IC.replaceInstUsesWith(II, V);
2717       }
2718     }
2719 
2720     // INSERTQI only uses the lowest 64 bits of the first two 128-bit vector
2721     // operands.
2722     bool MadeChange = false;
2723     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
2724       IC.replaceOperand(II, 0, V);
2725       MadeChange = true;
2726     }
2727     if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
2728       IC.replaceOperand(II, 1, V);
2729       MadeChange = true;
2730     }
2731     if (MadeChange) {
2732       return &II;
2733     }
2734     break;
2735   }
2736 
2737   case Intrinsic::x86_sse41_pblendvb:
2738   case Intrinsic::x86_sse41_blendvps:
2739   case Intrinsic::x86_sse41_blendvpd:
2740   case Intrinsic::x86_avx_blendv_ps_256:
2741   case Intrinsic::x86_avx_blendv_pd_256:
2742   case Intrinsic::x86_avx2_pblendvb: {
2743     // fold (blend A, A, Mask) -> A
2744     Value *Op0 = II.getArgOperand(0);
2745     Value *Op1 = II.getArgOperand(1);
2746     Value *Mask = II.getArgOperand(2);
2747     if (Op0 == Op1) {
2748       return IC.replaceInstUsesWith(II, Op0);
2749     }
2750 
2751     // Zero Mask - select 1st argument.
2752     if (isa<ConstantAggregateZero>(Mask)) {
2753       return IC.replaceInstUsesWith(II, Op0);
2754     }
2755 
2756     // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
2757     if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
2758       Constant *NewSelector =
2759           getNegativeIsTrueBoolVec(ConstantMask, IC.getDataLayout());
2760       return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
2761     }
2762 
2763     // Convert to a vector select if we can bypass casts and find a boolean
2764     // vector condition value.
2765     Value *BoolVec;
2766     Mask = InstCombiner::peekThroughBitcast(Mask);
2767     if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
2768         BoolVec->getType()->isVectorTy() &&
2769         BoolVec->getType()->getScalarSizeInBits() == 1) {
2770       auto *MaskTy = cast<FixedVectorType>(Mask->getType());
2771       auto *OpTy = cast<FixedVectorType>(II.getType());
2772       assert(MaskTy->getPrimitiveSizeInBits() ==
2773                  OpTy->getPrimitiveSizeInBits() &&
2774              "Not expecting mask and operands with different sizes");
2775       unsigned NumMaskElts = MaskTy->getNumElements();
2776       unsigned NumOperandElts = OpTy->getNumElements();
2777 
2778       if (NumMaskElts == NumOperandElts) {
2779         return SelectInst::Create(BoolVec, Op1, Op0);
2780       }
2781 
2782       // If the mask has fewer elements than the operands, each mask bit maps
2783       // to multiple elements of the operands, so bitcast back and forth.
2784       if (NumMaskElts < NumOperandElts) {
2785         Value *CastOp0 = IC.Builder.CreateBitCast(Op0, MaskTy);
2786         Value *CastOp1 = IC.Builder.CreateBitCast(Op1, MaskTy);
2787         Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
2788         return new BitCastInst(Sel, II.getType());
2789       }
2790     }
2791 
2792     break;
2793   }
2794 
2795   case Intrinsic::x86_ssse3_pshuf_b_128:
2796   case Intrinsic::x86_avx2_pshuf_b:
2797   case Intrinsic::x86_avx512_pshuf_b_512:
2798     if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
2799       return IC.replaceInstUsesWith(II, V);
2800     }
2801     break;
2802 
2803   case Intrinsic::x86_avx_vpermilvar_ps:
2804   case Intrinsic::x86_avx_vpermilvar_ps_256:
2805   case Intrinsic::x86_avx512_vpermilvar_ps_512:
2806   case Intrinsic::x86_avx_vpermilvar_pd:
2807   case Intrinsic::x86_avx_vpermilvar_pd_256:
2808   case Intrinsic::x86_avx512_vpermilvar_pd_512:
2809     if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
2810       return IC.replaceInstUsesWith(II, V);
2811     }
2812     break;
2813 
2814   case Intrinsic::x86_avx2_permd:
2815   case Intrinsic::x86_avx2_permps:
2816   case Intrinsic::x86_avx512_permvar_df_256:
2817   case Intrinsic::x86_avx512_permvar_df_512:
2818   case Intrinsic::x86_avx512_permvar_di_256:
2819   case Intrinsic::x86_avx512_permvar_di_512:
2820   case Intrinsic::x86_avx512_permvar_hi_128:
2821   case Intrinsic::x86_avx512_permvar_hi_256:
2822   case Intrinsic::x86_avx512_permvar_hi_512:
2823   case Intrinsic::x86_avx512_permvar_qi_128:
2824   case Intrinsic::x86_avx512_permvar_qi_256:
2825   case Intrinsic::x86_avx512_permvar_qi_512:
2826   case Intrinsic::x86_avx512_permvar_sf_512:
2827   case Intrinsic::x86_avx512_permvar_si_512:
2828     if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
2829       return IC.replaceInstUsesWith(II, V);
2830     }
2831     break;
2832 
2833   case Intrinsic::x86_avx_maskload_ps:
2834   case Intrinsic::x86_avx_maskload_pd:
2835   case Intrinsic::x86_avx_maskload_ps_256:
2836   case Intrinsic::x86_avx_maskload_pd_256:
2837   case Intrinsic::x86_avx2_maskload_d:
2838   case Intrinsic::x86_avx2_maskload_q:
2839   case Intrinsic::x86_avx2_maskload_d_256:
2840   case Intrinsic::x86_avx2_maskload_q_256:
2841     if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
2842       return I;
2843     }
2844     break;
2845 
2846   case Intrinsic::x86_sse2_maskmov_dqu:
2847   case Intrinsic::x86_avx_maskstore_ps:
2848   case Intrinsic::x86_avx_maskstore_pd:
2849   case Intrinsic::x86_avx_maskstore_ps_256:
2850   case Intrinsic::x86_avx_maskstore_pd_256:
2851   case Intrinsic::x86_avx2_maskstore_d:
2852   case Intrinsic::x86_avx2_maskstore_q:
2853   case Intrinsic::x86_avx2_maskstore_d_256:
2854   case Intrinsic::x86_avx2_maskstore_q_256:
2855     if (simplifyX86MaskedStore(II, IC)) {
2856       return nullptr;
2857     }
2858     break;
2859 
2860   case Intrinsic::x86_addcarry_32:
2861   case Intrinsic::x86_addcarry_64:
2862     if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
2863       return IC.replaceInstUsesWith(II, V);
2864     }
2865     break;
2866 
2867   case Intrinsic::x86_avx512_pternlog_d_128:
2868   case Intrinsic::x86_avx512_pternlog_d_256:
2869   case Intrinsic::x86_avx512_pternlog_d_512:
2870   case Intrinsic::x86_avx512_pternlog_q_128:
2871   case Intrinsic::x86_avx512_pternlog_q_256:
2872   case Intrinsic::x86_avx512_pternlog_q_512:
2873     if (Value *V = simplifyTernarylogic(II, IC.Builder)) {
2874       return IC.replaceInstUsesWith(II, V);
2875     }
2876     break;
2877   default:
2878     break;
2879   }
2880   return std::nullopt;
2881 }
2882 
2883 std::optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
2884     InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
2885     bool &KnownBitsComputed) const {
2886   switch (II.getIntrinsicID()) {
2887   default:
2888     break;
2889   case Intrinsic::x86_mmx_pmovmskb:
2890   case Intrinsic::x86_sse_movmsk_ps:
2891   case Intrinsic::x86_sse2_movmsk_pd:
2892   case Intrinsic::x86_sse2_pmovmskb_128:
2893   case Intrinsic::x86_avx_movmsk_ps_256:
2894   case Intrinsic::x86_avx_movmsk_pd_256:
2895   case Intrinsic::x86_avx2_pmovmskb: {
2896     // MOVMSK copies the vector elements' sign bits to the low bits
2897     // and zeros the high bits.
2898     unsigned ArgWidth;
2899     if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
2900       ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
2901     } else {
2902       auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
2903       ArgWidth = ArgType->getNumElements();
2904     }
2905 
2906     // If we don't need any of the low bits then return zero;
2907     // we already know that DemandedMask is non-zero.
2908     APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
2909     Type *VTy = II.getType();
2910     if (DemandedElts.isZero()) {
2911       return ConstantInt::getNullValue(VTy);
2912     }
2913 
2914     // We know that the upper bits are set to zero.
2915     Known.Zero.setBitsFrom(ArgWidth);
2916     KnownBitsComputed = true;
2917     break;
2918   }
2919   }
2920   return std::nullopt;
2921 }
2922 
2923 std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
2924     InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
2925     APInt &UndefElts2, APInt &UndefElts3,
2926     std::function<void(Instruction *, unsigned, APInt, APInt &)>
2927         simplifyAndSetOp) const {
2928   unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
2929   switch (II.getIntrinsicID()) {
2930   default:
2931     break;
2932   case Intrinsic::x86_xop_vfrcz_ss:
2933   case Intrinsic::x86_xop_vfrcz_sd:
2934     // The instructions for these intrinsics are specified to zero the upper
2935     // bits rather than pass them through like other scalar intrinsics do. So
2936     // we shouldn't just use Arg0 if DemandedElts[0] is clear like we do for
2937     // other intrinsics. Instead we should return a zero vector.
2938     if (!DemandedElts[0]) {
2939       IC.addToWorklist(&II);
2940       return ConstantAggregateZero::get(II.getType());
2941     }
2942 
2943     // Only the lower element is used.
2944     DemandedElts = 1;
2945     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2946 
2947     // Only the lower element is undefined. The high elements are zero.
2948     UndefElts = UndefElts[0];
2949     break;
2950 
2951   // Unary scalar-as-vector operations that work column-wise.
2952   case Intrinsic::x86_sse_rcp_ss:
2953   case Intrinsic::x86_sse_rsqrt_ss:
2954     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2955 
2956     // If lowest element of a scalar op isn't used then use Arg0.
2957     if (!DemandedElts[0]) {
2958       IC.addToWorklist(&II);
2959       return II.getArgOperand(0);
2960     }
2961     // TODO: If only the low element is used, lower SQRT to FSQRT (with
2962     // rounding/exception checks).
2963     break;
2964 
2965   // Binary scalar-as-vector operations that work column-wise. The high
2966   // elements come from operand 0. The low element is a function of both
2967   // operands.
2968   case Intrinsic::x86_sse_min_ss:
2969   case Intrinsic::x86_sse_max_ss:
2970   case Intrinsic::x86_sse_cmp_ss:
2971   case Intrinsic::x86_sse2_min_sd:
2972   case Intrinsic::x86_sse2_max_sd:
2973   case Intrinsic::x86_sse2_cmp_sd: {
2974     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2975 
2976     // If lowest element of a scalar op isn't used then use Arg0.
2977     if (!DemandedElts[0]) {
2978       IC.addToWorklist(&II);
2979       return II.getArgOperand(0);
2980     }
2981 
2982     // Only lower element is used for operand 1.
2983     DemandedElts = 1;
2984     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
2985 
2986     // Lower element is undefined if both lower elements are undefined.
2987     // Consider things like undef&0.  The result is known zero, not undef.
2988     if (!UndefElts2[0])
2989       UndefElts.clearBit(0);
2990 
2991     break;
2992   }
2993 
2994   // Binary scalar-as-vector operations that work column-wise. The high
2995   // elements come from operand 0 and the low element comes from operand 1.
2996   case Intrinsic::x86_sse41_round_ss:
2997   case Intrinsic::x86_sse41_round_sd: {
2998     // Don't use the low element of operand 0.
2999     APInt DemandedElts2 = DemandedElts;
3000     DemandedElts2.clearBit(0);
3001     simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
3002 
3003     // If lowest element of a scalar op isn't used then use Arg0.
3004     if (!DemandedElts[0]) {
3005       IC.addToWorklist(&II);
3006       return II.getArgOperand(0);
3007     }
3008 
3009     // Only lower element is used for operand 1.
3010     DemandedElts = 1;
3011     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3012 
3013     // Take the high undef elements from operand 0 and take the lower element
3014     // from operand 1.
3015     UndefElts.clearBit(0);
3016     UndefElts |= UndefElts2[0];
3017     break;
3018   }
3019 
3020   // Three input scalar-as-vector operations that work column-wise. The high
3021   // elements come from operand 0 and the low element is a function of all
3022   // three inputs.
3023   case Intrinsic::x86_avx512_mask_add_ss_round:
3024   case Intrinsic::x86_avx512_mask_div_ss_round:
3025   case Intrinsic::x86_avx512_mask_mul_ss_round:
3026   case Intrinsic::x86_avx512_mask_sub_ss_round:
3027   case Intrinsic::x86_avx512_mask_max_ss_round:
3028   case Intrinsic::x86_avx512_mask_min_ss_round:
3029   case Intrinsic::x86_avx512_mask_add_sd_round:
3030   case Intrinsic::x86_avx512_mask_div_sd_round:
3031   case Intrinsic::x86_avx512_mask_mul_sd_round:
3032   case Intrinsic::x86_avx512_mask_sub_sd_round:
3033   case Intrinsic::x86_avx512_mask_max_sd_round:
3034   case Intrinsic::x86_avx512_mask_min_sd_round:
3035     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3036 
3037     // If lowest element of a scalar op isn't used then use Arg0.
3038     if (!DemandedElts[0]) {
3039       IC.addToWorklist(&II);
3040       return II.getArgOperand(0);
3041     }
3042 
3043     // Only the lower element is used for operands 1 and 2.
3044     DemandedElts = 1;
3045     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3046     simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);
3047 
3048     // Lower element is undefined if all three lower elements are undefined.
3049     // Consider things like undef&0.  The result is known zero, not undef.
3050     if (!UndefElts2[0] || !UndefElts3[0])
3051       UndefElts.clearBit(0);
3052     break;
3053 
3054   // TODO: Add fmaddsub support?
3055   case Intrinsic::x86_sse3_addsub_pd:
3056   case Intrinsic::x86_sse3_addsub_ps:
3057   case Intrinsic::x86_avx_addsub_pd_256:
3058   case Intrinsic::x86_avx_addsub_ps_256: {
3059     // If none of the even or none of the odd lanes are required, turn this
3060     // into a generic FP math instruction.
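         // ADDSUB subtracts in the even-index lanes and adds in the odd-index
         // lanes, so SubMask covers the even elements and AddMask the odd ones.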
3061     APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
3062     APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
3063     bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
3064     bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
3065     if (IsSubOnly || IsAddOnly) {
3066       assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
3067       IRBuilderBase::InsertPointGuard Guard(IC.Builder);
3068       IC.Builder.SetInsertPoint(&II);
3069       Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
3070       return IC.Builder.CreateBinOp(
3071           IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
3072     }
3073 
3074     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3075     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3076     UndefElts &= UndefElts2;
3077     break;
3078   }
3079 
3080   // General per-element vector operations.
3081   case Intrinsic::x86_avx2_psllv_d:
3082   case Intrinsic::x86_avx2_psllv_d_256:
3083   case Intrinsic::x86_avx2_psllv_q:
3084   case Intrinsic::x86_avx2_psllv_q_256:
3085   case Intrinsic::x86_avx2_psrlv_d:
3086   case Intrinsic::x86_avx2_psrlv_d_256:
3087   case Intrinsic::x86_avx2_psrlv_q:
3088   case Intrinsic::x86_avx2_psrlv_q_256:
3089   case Intrinsic::x86_avx2_psrav_d:
3090   case Intrinsic::x86_avx2_psrav_d_256: {
3091     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3092     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3093     UndefElts &= UndefElts2;
3094     break;
3095   }
3096 
3097   case Intrinsic::x86_sse2_packssdw_128:
3098   case Intrinsic::x86_sse2_packsswb_128:
3099   case Intrinsic::x86_sse2_packuswb_128:
3100   case Intrinsic::x86_sse41_packusdw:
3101   case Intrinsic::x86_avx2_packssdw:
3102   case Intrinsic::x86_avx2_packsswb:
3103   case Intrinsic::x86_avx2_packusdw:
3104   case Intrinsic::x86_avx2_packuswb:
3105   case Intrinsic::x86_avx512_packssdw_512:
3106   case Intrinsic::x86_avx512_packsswb_512:
3107   case Intrinsic::x86_avx512_packusdw_512:
3108   case Intrinsic::x86_avx512_packuswb_512: {
3109     auto *Ty0 = II.getArgOperand(0)->getType();
3110     unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
3111     assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
3112 
3113     unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
3114     unsigned VWidthPerLane = VWidth / NumLanes;
3115     unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
3116 
3117     // Per lane, pack the elements of the first input and then the second.
3118     // e.g.
3119     // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
3120     // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
3121     for (int OpNum = 0; OpNum != 2; ++OpNum) {
3122       APInt OpDemandedElts(InnerVWidth, 0);
3123       for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3124         unsigned LaneIdx = Lane * VWidthPerLane;
3125         for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
3126           unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
3127           if (DemandedElts[Idx])
3128             OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
3129         }
3130       }
3131 
3132       // Demand elements from the operand.
3133       APInt OpUndefElts(InnerVWidth, 0);
3134       simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);
3135 
3136       // Pack the operand's UNDEF elements, one lane at a time.
3137       OpUndefElts = OpUndefElts.zext(VWidth);
3138       for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3139         APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
3140         LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
3141         LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
3142         UndefElts |= LaneElts;
3143       }
3144     }
3145     break;
3146   }
3147 
3148   case Intrinsic::x86_sse2_pmadd_wd:
3149   case Intrinsic::x86_avx2_pmadd_wd:
3150   case Intrinsic::x86_avx512_pmaddw_d_512:
3151   case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
3152   case Intrinsic::x86_avx2_pmadd_ub_sw:
3153   case Intrinsic::x86_avx512_pmaddubs_w_512: {
3154     // PMADD - demand both src elements that map to each dst element.
3155     auto *ArgTy = II.getArgOperand(0)->getType();
3156     unsigned InnerVWidth = cast<FixedVectorType>(ArgTy)->getNumElements();
3157     assert((VWidth * 2) == InnerVWidth && "Unexpected input size");
3158     APInt OpDemandedElts = APIntOps::ScaleBitMask(DemandedElts, InnerVWidth);
3159     APInt Op0UndefElts(InnerVWidth, 0);
3160     APInt Op1UndefElts(InnerVWidth, 0);
3161     simplifyAndSetOp(&II, 0, OpDemandedElts, Op0UndefElts);
3162     simplifyAndSetOp(&II, 1, OpDemandedElts, Op1UndefElts);
3163     break;
3164   }
3165 
3166   // PSHUFB
3167   case Intrinsic::x86_ssse3_pshuf_b_128:
3168   case Intrinsic::x86_avx2_pshuf_b:
3169   case Intrinsic::x86_avx512_pshuf_b_512:
3170   // PERMILVAR
3171   case Intrinsic::x86_avx_vpermilvar_ps:
3172   case Intrinsic::x86_avx_vpermilvar_ps_256:
3173   case Intrinsic::x86_avx512_vpermilvar_ps_512:
3174   case Intrinsic::x86_avx_vpermilvar_pd:
3175   case Intrinsic::x86_avx_vpermilvar_pd_256:
3176   case Intrinsic::x86_avx512_vpermilvar_pd_512:
3177   // PERMV
3178   case Intrinsic::x86_avx2_permd:
3179   case Intrinsic::x86_avx2_permps: {
3180     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
3181     break;
3182   }
3183 
3184   // SSE4A instructions leave the upper 64 bits of the 128-bit result
3185   // in an undefined state.
3186   case Intrinsic::x86_sse4a_extrq:
3187   case Intrinsic::x86_sse4a_extrqi:
3188   case Intrinsic::x86_sse4a_insertq:
3189   case Intrinsic::x86_sse4a_insertqi:
3190     UndefElts.setHighBits(VWidth / 2);
3191     break;
3192   }
3193   return std::nullopt;
3194 }
3195