//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// A note about the cost model numbers used below: the numbers correspond to
/// some "generic" X86 CPU rather than a specific CPU model. Usually the
/// numbers correspond to the CPU where the feature first appeared. For
/// example, if we check Subtarget.hasSSE42() in the lookups below, the cost
/// is based on Nehalem as that was the first CPU to support that feature
/// level and thus most likely has the worst case cost, although we may
/// discard an outlying worst cost from one CPU (e.g. Atom).
///
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem / Silvermont
///   AVX     - Sandy Bridge / Jaguar / Bulldozer
///   AVX2    - Haswell / Ryzen
///   AVX-512 - Xeon Phi / Skylake
///
/// And some examples of target-dependent instruction costs (latency):
///                   divss     sqrtss          rsqrtss
///   AMD K7          11-16     19              3
///   Piledriver      9-24      13-15           5
///   Jaguar          14        16              2
///   Pentium II,III  18        30              2
///   Nehalem         7-14      7-18            3
///   Haswell         10-13     11              5
///
/// Interpreting the 4 TargetCostKind types:
/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
/// values reported by the CPU scheduler models (and llvm-mca).
/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
/// actual encoding size of the instruction.
/// TCK_SizeAndLatency should match the worst case micro-op counts reported by
/// the CPU scheduler models (and llvm-mca), to ensure that they are
/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values,
/// which are often used as the cost thresholds where TCK_SizeAndLatency is
/// requested.
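///
/// As a worked illustration of these four kinds (an example, not an
/// authoritative measurement): the AVX512/Skylake table below costs
/// ISD::FDIV on MVT::f32 as { 3, 11, 1, 1 }, i.e. a reciprocal throughput of
/// 3, a latency of 11, a code size of 1 instruction and a size-and-latency
/// of 1 micro-op.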
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

// Helper struct to store/access costs for each cost kind.
// TODO: Move this to allow other targets to use it?
struct CostKindCosts {
  unsigned RecipThroughputCost = ~0U;
  unsigned LatencyCost = ~0U;
  unsigned CodeSizeCost = ~0U;
  unsigned SizeAndLatencyCost = ~0U;

  std::optional<unsigned>
  operator[](TargetTransformInfo::TargetCostKind Kind) const {
    unsigned Cost = ~0U;
    switch (Kind) {
    case TargetTransformInfo::TCK_RecipThroughput:
      Cost = RecipThroughputCost;
      break;
    case TargetTransformInfo::TCK_Latency:
      Cost = LatencyCost;
      break;
    case TargetTransformInfo::TCK_CodeSize:
      Cost = CodeSizeCost;
      break;
    case TargetTransformInfo::TCK_SizeAndLatency:
      Cost = SizeAndLatencyCost;
      break;
    }
    if (Cost == ~0U)
      return std::nullopt;
    return Cost;
  }
};
using CostKindTblEntry = CostTblEntryT<CostKindCosts>;
using TypeConversionCostKindTblEntry =
    TypeConversionCostTblEntryT<CostKindCosts>;
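
// A minimal illustrative sketch (ExampleTable is hypothetical; the pattern
// mirrors the real lookups below) of how a per-kind cost is queried:
//
//   static const CostKindTblEntry ExampleTable[] = {
//     // ISD       Type         { RThru, Latency, CodeSize, SizeAndLatency }
//     { ISD::SHL,  MVT::v16i8,  { 1, 6, 1, 2 } },
//   };
//   if (const auto *Entry = CostTableLookup(ExampleTable, ISD, LT.second))
//     if (auto KindCost = Entry->Cost[CostKind])
//       return LT.first * *KindCost;
//
// An unset (~0U) value makes operator[] return std::nullopt, so the lookup
// falls through to later tables or the base implementation.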

TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  //   instructions is inefficient. Once the problem is fixed, we should
  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

std::optional<unsigned> X86TTIImpl::getCacheSize(
  TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024;  //  32 KByte
  case TargetTransformInfo::CacheLevel::L2D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
  TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    [[fallthrough]];
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    if (!Vector && ST->hasEGPR())
      return 32;
    return 16;
  }
  return 8;
}

bool X86TTIImpl::hasConditionalLoadStoreForType(Type *Ty) const {
  if (!ST->hasCF())
    return false;
  if (!Ty)
    return true;
  // Conditional faulting is supported by CFCMOV, which only accepts
  // 16/32/64-bit operands.
  // TODO: Support f32/f64 with VMOVSS/VMOVSD with zero mask when it's
  // profitable.
  auto *VTy = dyn_cast<FixedVectorType>(Ty);
  if (!Ty->isIntegerTy() && (!VTy || VTy->getNumElements() != 1))
    return false;
  auto *ScalarTy = Ty->getScalarType();
  switch (cast<IntegerType>(ScalarTy)->getBitWidth()) {
  default:
    return false;
  case 16:
  case 32:
  case 64:
    return true;
  }
}

TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512)
      return TypeSize::getFixed(512);
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return TypeSize::getFixed(256);
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return TypeSize::getFixed(128);
    return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
      .getFixedValue();
}

unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller handle the loop instead, which saves the
  // overflow check and memory check cost.
  if (VF.isScalar())
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

InstructionCost X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // vXi8 multiplications are always promoted to vXi16.
  // Sub-128-bit types can be extended/packed more efficiently.
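  // For example, mul <8 x i8> is costed below as a zext to <8 x i16>, an
  // <8 x i16> multiply, and a trunc back to <8 x i8>.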
  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
      Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
    Type *WideVecTy =
        VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
    return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
  }

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
      (LT.second.getScalarType() == MVT::i32 ||
       LT.second.getScalarType() == MVT::i64)) {
    // Check if the operands can be represented as a smaller datatype.
    bool Op1Signed = false, Op2Signed = false;
    unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
    unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
    unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
    bool SignedMode = Op1Signed || Op2Signed;

    // If both vXi32 are representable as i15 and at least one is constant,
    // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
    // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
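    // For example (illustrative): if both operands are sign-extended from
    // vXi16, the odd i16 lanes can be zeroed so each PMADDWD result lane
    // computes just lo16(a) * lo16(b) + 0 * 0.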
    if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
        LT.second.getScalarType() == MVT::i32) {
      bool Op1Constant =
          isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
      bool Op2Constant =
          isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
      bool Op1Sext = isa<SExtInst>(Args[0]) &&
                     (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
      bool Op2Sext = isa<SExtInst>(Args[1]) &&
                     (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));

      bool IsZeroExtended = !Op1Signed || !Op2Signed;
      bool IsConstant = Op1Constant || Op2Constant;
      bool IsSext = Op1Sext || Op2Sext;
      if (IsConstant || IsZeroExtended || IsSext)
        LT.second =
            MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
    }

    // Check if the vXi32 operands can be shrunk into a smaller datatype.
    // This should match the codegen from reduceVMULWidth.
    // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
    if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }

    // If both vXi64 are representable as (unsigned) i32, then we can perform
    // the multiply with a single PMULUDQ instruction.
    // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
    if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
      ISD = X86ISD::PMULUDQ;
  }

  // Vector multiply by pow2 will be simplified to shifts.
  // Vector multiply by -pow2 will be simplified to shifts/negates.
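  // For example: X * 8 -> X << 3, and X * -8 -> 0 - (X << 3).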
  if (ISD == ISD::MUL && Op2Info.isConstant() &&
      (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
    InstructionCost Cost =
        getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
                               Op1Info.getNoProps(), Op2Info.getNoProps());
    if (Op2Info.isNegatedPowerOf2())
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
    return Cost;
  }

  // On X86, vector signed division by a power-of-two constant is normally
  // expanded to the sequence SRA + SRL + ADD + SRA.
  // The OperandValue properties may not be the same as that of the previous
  // operation; conservatively assume OP_None.
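  // For example (vXi32 sdiv by 8, illustrative):
  //   Sgn = X >>s 31; Bias = Sgn >>u 29; Res = (X + Bias) >>s 3
  // i.e. two arithmetic shifts, one logical shift and one add, which is
  // exactly the sequence costed below.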
  if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    InstructionCost Cost =
        2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());

    if (ISD == ISD::SREM) {
      // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
      Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                     Op1Info.getNoProps(),
                                     Op2Info.getNoProps());
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
                                     Op1Info.getNoProps(),
                                     Op2Info.getNoProps());
    }

    return Cost;
  }

  // Vector unsigned division/remainder will be simplified to shifts/masks.
  if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
    // UREM
    return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
                                  Op1Info.getNoProps(), Op2Info.getNoProps());
  }

  static const CostKindTblEntry GFNIUniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8,  { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL,  MVT::v16i8,  { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA,  MVT::v16i8,  { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL,  MVT::v32i8,  { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL,  MVT::v32i8,  { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA,  MVT::v32i8,  { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL,  MVT::v64i8,  { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL,  MVT::v64i8,  { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA,  MVT::v64i8,  { 1, 6, 1, 2 } }, // gf2p8affineqb
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
    if (const auto *Entry =
            CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8,  { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,  { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,  { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL,  MVT::v32i8,  { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL,  MVT::v32i8,  { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v32i8,  { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL,  MVT::v64i8,  { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL,  MVT::v64i8,  { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v64i8,  { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL,  MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA,  MVT::v16i16, { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL,  MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL,  MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA,  MVT::v32i16, { 1, 1, 1, 1 } }, // psraw
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v64i8,  {  2, 12,  5,  6 } }, // psllw + pand.
    { ISD::SRL,  MVT::v64i8,  {  2, 12,  5,  6 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v64i8,  {  3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v16i16, {  2,  7,  4,  4 } }, // psllw + split.
    { ISD::SRL,  MVT::v16i16, {  2,  7,  4,  4 } }, // psrlw + split.
    { ISD::SRA,  MVT::v16i16, {  2,  7,  4,  4 } }, // psraw + split.

    { ISD::SHL,  MVT::v8i32,  {  1,  1,  1,  1 } }, // pslld
    { ISD::SRL,  MVT::v8i32,  {  1,  1,  1,  1 } }, // psrld
    { ISD::SRA,  MVT::v8i32,  {  1,  1,  1,  1 } }, // psrad
    { ISD::SHL,  MVT::v16i32, {  1,  1,  1,  1 } }, // pslld
    { ISD::SRL,  MVT::v16i32, {  1,  1,  1,  1 } }, // psrld
    { ISD::SRA,  MVT::v16i32, {  1,  1,  1,  1 } }, // psrad

    { ISD::SRA,  MVT::v2i64,  {  1,  1,  1,  1 } }, // psraq
    { ISD::SHL,  MVT::v4i64,  {  1,  1,  1,  1 } }, // psllq
    { ISD::SRL,  MVT::v4i64,  {  1,  1,  1,  1 } }, // psrlq
    { ISD::SRA,  MVT::v4i64,  {  1,  1,  1,  1 } }, // psraq
    { ISD::SHL,  MVT::v8i64,  {  1,  1,  1,  1 } }, // psllq
    { ISD::SRL,  MVT::v8i64,  {  1,  1,  1,  1 } }, // psrlq
    { ISD::SRA,  MVT::v8i64,  {  1,  1,  1,  1 } }, // psraq

    { ISD::SDIV, MVT::v16i32, {  6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32, {  8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, {  5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32, {  7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8, {  1,  8,  2,  3 } }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8, {  1,  8,  2,  3 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8, {  2, 10,  5,  6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL,  MVT::v32i8, {  2,  8,  2,  4 } }, // psllw + pand.
    { ISD::SRL,  MVT::v32i8, {  2,  8,  2,  4 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v32i8, {  3, 10,  5,  9 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v8i16, {  1,  1,  1,  1 } }, // psllw
    { ISD::SRL,  MVT::v8i16, {  1,  1,  1,  1 } }, // psrlw
    { ISD::SRA,  MVT::v8i16, {  1,  1,  1,  1 } }, // psraw
    { ISD::SHL,  MVT::v16i16,{  2,  2,  1,  2 } }, // psllw
    { ISD::SRL,  MVT::v16i16,{  2,  2,  1,  2 } }, // psrlw
    { ISD::SRA,  MVT::v16i16,{  2,  2,  1,  2 } }, // psraw

    { ISD::SHL,  MVT::v4i32, {  1,  1,  1,  1 } }, // pslld
    { ISD::SRL,  MVT::v4i32, {  1,  1,  1,  1 } }, // psrld
    { ISD::SRA,  MVT::v4i32, {  1,  1,  1,  1 } }, // psrad
    { ISD::SHL,  MVT::v8i32, {  2,  2,  1,  2 } }, // pslld
    { ISD::SRL,  MVT::v8i32, {  2,  2,  1,  2 } }, // psrld
    { ISD::SRA,  MVT::v8i32, {  2,  2,  1,  2 } }, // psrad

    { ISD::SHL,  MVT::v2i64, {  1,  1,  1,  1 } }, // psllq
    { ISD::SRL,  MVT::v2i64, {  1,  1,  1,  1 } }, // psrlq
    { ISD::SRA,  MVT::v2i64, {  2,  3,  3,  3 } }, // psrad + shuffle.
    { ISD::SHL,  MVT::v4i64, {  2,  2,  1,  2 } }, // psllq
    { ISD::SRL,  MVT::v4i64, {  2,  2,  1,  2 } }, // psrlq
    { ISD::SRA,  MVT::v4i64, {  4,  4,  3,  6 } }, // psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, {  6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32, {  8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, {  5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32, {  7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8, {  2,  7,  2,  3 } }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8, {  2,  7,  2,  3 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8, {  3,  9,  5,  6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL,  MVT::v32i8, {  4,  7,  7,  8 } }, // 2*(psllw + pand) + split.
    { ISD::SRL,  MVT::v32i8, {  4,  7,  7,  8 } }, // 2*(psrlw + pand) + split.
    { ISD::SRA,  MVT::v32i8, {  7,  7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SHL,  MVT::v8i16, {  1,  2,  1,  1 } }, // psllw.
    { ISD::SRL,  MVT::v8i16, {  1,  2,  1,  1 } }, // psrlw.
    { ISD::SRA,  MVT::v8i16, {  1,  2,  1,  1 } }, // psraw.
    { ISD::SHL,  MVT::v16i16,{  3,  6,  4,  5 } }, // psllw + split.
    { ISD::SRL,  MVT::v16i16,{  3,  6,  4,  5 } }, // psrlw + split.
    { ISD::SRA,  MVT::v16i16,{  3,  6,  4,  5 } }, // psraw + split.

    { ISD::SHL,  MVT::v4i32, {  1,  2,  1,  1 } }, // pslld.
    { ISD::SRL,  MVT::v4i32, {  1,  2,  1,  1 } }, // psrld.
    { ISD::SRA,  MVT::v4i32, {  1,  2,  1,  1 } }, // psrad.
    { ISD::SHL,  MVT::v8i32, {  3,  6,  4,  5 } }, // pslld + split.
    { ISD::SRL,  MVT::v8i32, {  3,  6,  4,  5 } }, // psrld + split.
    { ISD::SRA,  MVT::v8i32, {  3,  6,  4,  5 } }, // psrad + split.

    { ISD::SHL,  MVT::v2i64, {  1,  2,  1,  1 } }, // psllq.
    { ISD::SRL,  MVT::v2i64, {  1,  2,  1,  1 } }, // psrlq.
    { ISD::SRA,  MVT::v2i64, {  2,  3,  3,  3 } }, // psrad + shuffle.
    { ISD::SHL,  MVT::v4i64, {  3,  6,  4,  5 } }, // 2 x psllq + split.
    { ISD::SRL,  MVT::v4i64, {  3,  6,  4,  5 } }, // 2 x psrlq + split.
    { ISD::SRA,  MVT::v4i64, {  5,  7,  8,  9 } }, // 2 x psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8, {  1,  7,  2,  3 } }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8, {  1,  7,  2,  3 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8, {  3,  9,  5,  6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v8i16, {  1,  1,  1,  1 } }, // psllw.
    { ISD::SRL,  MVT::v8i16, {  1,  1,  1,  1 } }, // psrlw.
    { ISD::SRA,  MVT::v8i16, {  1,  1,  1,  1 } }, // psraw.

    { ISD::SHL,  MVT::v4i32, {  1,  1,  1,  1 } }, // pslld
    { ISD::SRL,  MVT::v4i32, {  1,  1,  1,  1 } }, // psrld.
    { ISD::SRA,  MVT::v4i32, {  1,  1,  1,  1 } }, // psrad.

    { ISD::SHL,  MVT::v2i64, {  1,  1,  1,  1 } }, // psllq.
    { ISD::SRL,  MVT::v2i64, {  1,  1,  1,  1 } }, // psrlq.
    { ISD::SRA,  MVT::v2i64, {  3,  5,  6,  6 } }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v4i32, {  6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, {  8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, {  5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, {  7 } }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, {  6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16, {  8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, {  6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, {  8 } }, // vpmulhuw+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v16i16, {  6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16, {  8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, {  6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16, {  8 } }, // vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v8i32,  { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32,  { 19 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32,  { 19 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8,  { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v32i8,  { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8,  { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v8i32,  { 32 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v8i32,  { 38 } }, // vpmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  { 32 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32,  { 42 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  if (Op2Info.isConstant() && ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE41ConstCostTable[] = {
    { ISD::SDIV, MVT::v4i32,  { 15 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v4i32,  { 20 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE41())
    if (const auto *Entry =
            CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v8i16,  {  6 } }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16,  {  8 } }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v8i16,  {  6 } }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16,  {  8 } }, // pmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v4i32,  { 19 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32,  { 24 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32,  { 15 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32,  { 20 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformCostTable[] = {
    { ISD::SHL,  MVT::v16i8,  { 3, 5, 5, 7 } }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,  { 3,10, 5, 8 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,  { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL,  MVT::v32i8,  { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL,  MVT::v32i8,  { 4, 8, 7, 9 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v32i8,  { 5,10,10,13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL,  MVT::v64i8,  { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL,  MVT::v64i8,  { 4, 8, 7,10 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v64i8,  { 5,10,10,15 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
    { ISD::SRL,  MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
    { ISD::SRA,  MVT::v32i16, { 2, 4, 2, 3 } }, // psraw
  };

  if (ST->hasBWI() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformCostTable[] = {
    { ISD::SHL,  MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split.
    { ISD::SRL,  MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split.
    { ISD::SRA,  MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.

    { ISD::SHL,  MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL,  MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA,  MVT::v16i32, { 2, 4, 2, 3 } }, // psrad

    { ISD::SRA,  MVT::v2i64,  { 1, 2, 1, 2 } }, // psraq
    { ISD::SHL,  MVT::v4i64,  { 1, 4, 1, 2 } }, // psllq
    { ISD::SRL,  MVT::v4i64,  { 1, 4, 1, 2 } }, // psrlq
    { ISD::SRA,  MVT::v4i64,  { 1, 4, 1, 2 } }, // psraq
    { ISD::SHL,  MVT::v8i64,  { 1, 4, 1, 2 } }, // psllq
    { ISD::SRL,  MVT::v8i64,  { 1, 4, 1, 2 } }, // psrlq
    { ISD::SRA,  MVT::v8i64,  { 1, 4, 1, 2 } }, // psraq
  };

  if (ST->hasAVX512() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v16i8,  { 3, 5, 5, 7 } }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,  { 3, 9, 5, 8 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,  { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL,  MVT::v32i8,  { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL,  MVT::v32i8,  { 4, 8, 7, 9 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v32i8,  { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v8i16,  { 1, 2, 1, 2 } }, // psllw.
    { ISD::SRL,  MVT::v8i16,  { 1, 2, 1, 2 } }, // psrlw.
    { ISD::SRA,  MVT::v8i16,  { 1, 2, 1, 2 } }, // psraw.
    { ISD::SHL,  MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
    { ISD::SRL,  MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
    { ISD::SRA,  MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.

    { ISD::SHL,  MVT::v4i32,  { 1, 2, 1, 2 } }, // pslld
    { ISD::SRL,  MVT::v4i32,  { 1, 2, 1, 2 } }, // psrld
    { ISD::SRA,  MVT::v4i32,  { 1, 2, 1, 2 } }, // psrad
    { ISD::SHL,  MVT::v8i32,  { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL,  MVT::v8i32,  { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA,  MVT::v8i32,  { 2, 4, 2, 3 } }, // psrad

    { ISD::SHL,  MVT::v2i64,  { 1, 2, 1, 2 } }, // psllq
    { ISD::SRL,  MVT::v2i64,  { 1, 2, 1, 2 } }, // psrlq
    { ISD::SRA,  MVT::v2i64,  { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL,  MVT::v4i64,  { 2, 4, 1, 2 } }, // psllq
    { ISD::SRL,  MVT::v4i64,  { 2, 4, 1, 2 } }, // psrlq
    { ISD::SRA,  MVT::v4i64,  { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
  };

  if (ST->hasAVX2() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformCostTable[] = {
    { ISD::SHL,  MVT::v16i8,  {  4, 4, 6, 8 } }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,  {  4, 8, 5, 8 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,  {  6, 6, 9,13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL,  MVT::v32i8,  {  7, 8,11,14 } }, // psllw + pand + split.
    { ISD::SRL,  MVT::v32i8,  {  7, 9,10,14 } }, // psrlw + pand + split.
    { ISD::SRA,  MVT::v32i8,  { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split.

    { ISD::SHL,  MVT::v8i16,  {  1, 3, 1, 2 } }, // psllw.
    { ISD::SRL,  MVT::v8i16,  {  1, 3, 1, 2 } }, // psrlw.
    { ISD::SRA,  MVT::v8i16,  {  1, 3, 1, 2 } }, // psraw.
    { ISD::SHL,  MVT::v16i16, {  3, 7, 5, 7 } }, // psllw + split.
    { ISD::SRL,  MVT::v16i16, {  3, 7, 5, 7 } }, // psrlw + split.
    { ISD::SRA,  MVT::v16i16, {  3, 7, 5, 7 } }, // psraw + split.

    { ISD::SHL,  MVT::v4i32,  {  1, 3, 1, 2 } }, // pslld.
    { ISD::SRL,  MVT::v4i32,  {  1, 3, 1, 2 } }, // psrld.
    { ISD::SRA,  MVT::v4i32,  {  1, 3, 1, 2 } }, // psrad.
    { ISD::SHL,  MVT::v8i32,  {  3, 7, 5, 7 } }, // pslld + split.
    { ISD::SRL,  MVT::v8i32,  {  3, 7, 5, 7 } }, // psrld + split.
    { ISD::SRA,  MVT::v8i32,  {  3, 7, 5, 7 } }, // psrad + split.

    { ISD::SHL,  MVT::v2i64,  {  1, 3, 1, 2 } }, // psllq.
    { ISD::SRL,  MVT::v2i64,  {  1, 3, 1, 2 } }, // psrlq.
    { ISD::SRA,  MVT::v2i64,  {  3, 4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL,  MVT::v4i64,  {  3, 7, 4, 6 } }, // psllq + split.
    { ISD::SRL,  MVT::v4i64,  {  3, 7, 4, 6 } }, // psrlq + split.
    { ISD::SRA,  MVT::v4i64,  {  6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split.
  };

  // XOP has faster vXi8 shifts.
  if (ST->hasAVX() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v16i8, {  9, 10, 6, 9 } }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8, {  9, 13, 5, 9 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence.

    { ISD::SHL,  MVT::v8i16, {  2, 2, 1, 2 } }, // psllw.
    { ISD::SRL,  MVT::v8i16, {  2, 2, 1, 2 } }, // psrlw.
    { ISD::SRA,  MVT::v8i16, {  2, 2, 1, 2 } }, // psraw.

    { ISD::SHL,  MVT::v4i32, {  2, 2, 1, 2 } }, // pslld
    { ISD::SRL,  MVT::v4i32, {  2, 2, 1, 2 } }, // psrld.
    { ISD::SRA,  MVT::v4i32, {  2, 2, 1, 2 } }, // psrad.

    { ISD::SHL,  MVT::v2i64, {  2, 2, 1, 2 } }, // psllq.
    { ISD::SRL,  MVT::v2i64, {  2, 2, 1, 2 } }, // psrlq.
    { ISD::SRA,  MVT::v2i64, {  5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
  };

  if (ST->hasSSE2() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512DQCostTable[] = {
    { ISD::MUL,  MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL,  MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL,  MVT::v8i64, { 3, 15, 1, 3 } }  // pmullq
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWCostTable[] = {
    { ISD::SHL,   MVT::v16i8,   {  4,  8, 4, 5 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL,   MVT::v16i8,   {  4,  8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA,   MVT::v16i8,   {  4,  8, 4, 5 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL,   MVT::v32i8,   {  4, 23,11,16 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL,   MVT::v32i8,   {  4, 30,12,18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA,   MVT::v32i8,   {  6, 13,24,30 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL,   MVT::v64i8,   {  6, 19,13,15 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL,   MVT::v64i8,   {  7, 27,15,18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA,   MVT::v64i8,   { 15, 15,30,30 } }, // extend/vpsravw/pack sequence.

    { ISD::SHL,   MVT::v8i16,   {  1,  1, 1, 1 } }, // vpsllvw
    { ISD::SRL,   MVT::v8i16,   {  1,  1, 1, 1 } }, // vpsrlvw
    { ISD::SRA,   MVT::v8i16,   {  1,  1, 1, 1 } }, // vpsravw
    { ISD::SHL,   MVT::v16i16,  {  1,  1, 1, 1 } }, // vpsllvw
    { ISD::SRL,   MVT::v16i16,  {  1,  1, 1, 1 } }, // vpsrlvw
    { ISD::SRA,   MVT::v16i16,  {  1,  1, 1, 1 } }, // vpsravw
    { ISD::SHL,   MVT::v32i16,  {  1,  1, 1, 1 } }, // vpsllvw
    { ISD::SRL,   MVT::v32i16,  {  1,  1, 1, 1 } }, // vpsrlvw
    { ISD::SRA,   MVT::v32i16,  {  1,  1, 1, 1 } }, // vpsravw

    { ISD::ADD,   MVT::v64i8,   {  1,  1, 1, 1 } }, // paddb
    { ISD::ADD,   MVT::v32i16,  {  1,  1, 1, 1 } }, // paddw

    { ISD::ADD,   MVT::v32i8,   {  1,  1, 1, 1 } }, // paddb
    { ISD::ADD,   MVT::v16i16,  {  1,  1, 1, 1 } }, // paddw
    { ISD::ADD,   MVT::v8i32,   {  1,  1, 1, 1 } }, // paddd
    { ISD::ADD,   MVT::v4i64,   {  1,  1, 1, 1 } }, // paddq

    { ISD::SUB,   MVT::v64i8,   {  1,  1, 1, 1 } }, // psubb
    { ISD::SUB,   MVT::v32i16,  {  1,  1, 1, 1 } }, // psubw

    { ISD::MUL,   MVT::v16i8,   {  4, 12, 4, 5 } }, // extend/pmullw/trunc
    { ISD::MUL,   MVT::v32i8,   {  3, 10, 7,10 } }, // pmaddubsw
    { ISD::MUL,   MVT::v64i8,   {  3, 11, 7,10 } }, // pmaddubsw
    { ISD::MUL,   MVT::v32i16,  {  1,  5, 1, 1 } }, // pmullw

    { ISD::SUB,   MVT::v32i8,   {  1,  1, 1, 1 } }, // psubb
    { ISD::SUB,   MVT::v16i16,  {  1,  1, 1, 1 } }, // psubw
    { ISD::SUB,   MVT::v8i32,   {  1,  1, 1, 1 } }, // psubd
    { ISD::SUB,   MVT::v4i64,   {  1,  1, 1, 1 } }, // psubq
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512CostTable[] = {
    { ISD::SHL,     MVT::v64i8,   { 15, 19,27,33 } }, // vpblendv+split sequence.
    { ISD::SRL,     MVT::v64i8,   { 15, 19,30,36 } }, // vpblendv+split sequence.
    { ISD::SRA,     MVT::v64i8,   { 37, 37,51,63 } }, // vpblendv+split sequence.

    { ISD::SHL,     MVT::v32i16,  { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRL,     MVT::v32i16,  { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRA,     MVT::v32i16,  { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence.

    { ISD::SHL,     MVT::v4i32,   {  1,  1, 1, 1 } },
    { ISD::SRL,     MVT::v4i32,   {  1,  1, 1, 1 } },
    { ISD::SRA,     MVT::v4i32,   {  1,  1, 1, 1 } },
    { ISD::SHL,     MVT::v8i32,   {  1,  1, 1, 1 } },
    { ISD::SRL,     MVT::v8i32,   {  1,  1, 1, 1 } },
    { ISD::SRA,     MVT::v8i32,   {  1,  1, 1, 1 } },
    { ISD::SHL,     MVT::v16i32,  {  1,  1, 1, 1 } },
    { ISD::SRL,     MVT::v16i32,  {  1,  1, 1, 1 } },
    { ISD::SRA,     MVT::v16i32,  {  1,  1, 1, 1 } },

    { ISD::SHL,     MVT::v2i64,   {  1,  1, 1, 1 } },
    { ISD::SRL,     MVT::v2i64,   {  1,  1, 1, 1 } },
    { ISD::SRA,     MVT::v2i64,   {  1,  1, 1, 1 } },
    { ISD::SHL,     MVT::v4i64,   {  1,  1, 1, 1 } },
    { ISD::SRL,     MVT::v4i64,   {  1,  1, 1, 1 } },
    { ISD::SRA,     MVT::v4i64,   {  1,  1, 1, 1 } },
    { ISD::SHL,     MVT::v8i64,   {  1,  1, 1, 1 } },
    { ISD::SRL,     MVT::v8i64,   {  1,  1, 1, 1 } },
    { ISD::SRA,     MVT::v8i64,   {  1,  1, 1, 1 } },

    { ISD::ADD,     MVT::v64i8,   {  3,  7, 5, 5 } }, // 2*paddb + split
    { ISD::ADD,     MVT::v32i16,  {  3,  7, 5, 5 } }, // 2*paddw + split

    { ISD::SUB,     MVT::v64i8,   {  3,  7, 5, 5 } }, // 2*psubb + split
    { ISD::SUB,     MVT::v32i16,  {  3,  7, 5, 5 } }, // 2*psubw + split

    { ISD::AND,     MVT::v32i8,   {  1,  1, 1, 1 } },
    { ISD::AND,     MVT::v16i16,  {  1,  1, 1, 1 } },
    { ISD::AND,     MVT::v8i32,   {  1,  1, 1, 1 } },
    { ISD::AND,     MVT::v4i64,   {  1,  1, 1, 1 } },

    { ISD::OR,      MVT::v32i8,   {  1,  1, 1, 1 } },
    { ISD::OR,      MVT::v16i16,  {  1,  1, 1, 1 } },
    { ISD::OR,      MVT::v8i32,   {  1,  1, 1, 1 } },
    { ISD::OR,      MVT::v4i64,   {  1,  1, 1, 1 } },

    { ISD::XOR,     MVT::v32i8,   {  1,  1, 1, 1 } },
    { ISD::XOR,     MVT::v16i16,  {  1,  1, 1, 1 } },
    { ISD::XOR,     MVT::v8i32,   {  1,  1, 1, 1 } },
    { ISD::XOR,     MVT::v4i64,   {  1,  1, 1, 1 } },

    { ISD::MUL,     MVT::v16i32,  {  1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL,     MVT::v8i32,   {  1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL,     MVT::v4i32,   {  1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL,     MVT::v8i64,   {  6,  9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL,     MVT::i64,     {  1 } }, // Skylake from http://www.agner.org/

    { X86ISD::PMULUDQ, MVT::v8i64, { 1,  5, 1, 1 } },

    { ISD::FNEG,    MVT::v8f64,   {  1,  1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD,    MVT::v8f64,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD,    MVT::v4f64,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB,    MVT::v8f64,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB,    MVT::v4f64,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL,    MVT::v8f64,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL,    MVT::v4f64,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL,    MVT::v2f64,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL,    MVT::f64,     {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV,    MVT::f64,     {  4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV,    MVT::v2f64,   {  4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV,    MVT::v4f64,   {  8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV,    MVT::v8f64,   { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/

    { ISD::FNEG,    MVT::v16f32,  {  1,  1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD,    MVT::v16f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD,    MVT::v8f32,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB,    MVT::v16f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB,    MVT::v8f32,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL,    MVT::v16f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL,    MVT::v8f32,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL,    MVT::v4f32,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL,    MVT::f32,     {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV,    MVT::f32,     {  3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV,    MVT::v4f32,   {  3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV,    MVT::v8f32,   {  5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV,    MVT::v16f32,  { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi64/vXi32 on AVX2 are legal even though we declare them as
    // custom, so that we can detect the cases where the shift amount is a
    // scalar splat.
    { ISD::SHL,     MVT::v4i32,  { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL,     MVT::v4i32,  { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA,     MVT::v4i32,  { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL,     MVT::v8i32,  { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL,     MVT::v8i32,  { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA,     MVT::v8i32,  { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL,     MVT::v2i64,  { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL,     MVT::v2i64,  { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
    { ISD::SHL,     MVT::v4i64,  { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL,     MVT::v4i64,  { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
  }

  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        Op2Info.isConstant())
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

  static const CostKindTblEntry XOPShiftCostTable[] = {
    // 128-bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL,     MVT::v16i8,  { 1, 3, 1, 1 } },
    { ISD::SRL,     MVT::v16i8,  { 2, 3, 1, 1 } },
    { ISD::SRA,     MVT::v16i8,  { 2, 3, 1, 1 } },
    { ISD::SHL,     MVT::v8i16,  { 1, 3, 1, 1 } },
    { ISD::SRL,     MVT::v8i16,  { 2, 3, 1, 1 } },
    { ISD::SRA,     MVT::v8i16,  { 2, 3, 1, 1 } },
    { ISD::SHL,     MVT::v4i32,  { 1, 3, 1, 1 } },
    { ISD::SRL,     MVT::v4i32,  { 2, 3, 1, 1 } },
    { ISD::SRA,     MVT::v4i32,  { 2, 3, 1, 1 } },
    { ISD::SHL,     MVT::v2i64,  { 1, 3, 1, 1 } },
    { ISD::SRL,     MVT::v2i64,  { 2, 3, 1, 1 } },
    { ISD::SRA,     MVT::v2i64,  { 2, 3, 1, 1 } },
    // 256-bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL,     MVT::v32i8,  { 4, 7, 5, 6 } },
    { ISD::SRL,     MVT::v32i8,  { 6, 7, 5, 6 } },
    { ISD::SRA,     MVT::v32i8,  { 6, 7, 5, 6 } },
    { ISD::SHL,     MVT::v16i16, { 4, 7, 5, 6 } },
    { ISD::SRL,     MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SRA,     MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SHL,     MVT::v8i32,  { 4, 7, 5, 6 } },
    { ISD::SRL,     MVT::v8i32,  { 6, 7, 5, 6 } },
    { ISD::SRA,     MVT::v8i32,  { 6, 7, 5, 6 } },
    { ISD::SHL,     MVT::v4i64,  { 4, 7, 5, 6 } },
    { ISD::SRL,     MVT::v4i64,  { 6, 7, 5, 6 } },
    { ISD::SRA,     MVT::v4i64,  { 6, 7, 5, 6 } },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

  if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
    MVT VT = LT.second;
    // A vector shift left by a non-uniform constant can be lowered into a
    // vector multiply.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }

  static const CostKindTblEntry GLMCostTable[] = {
    { ISD::FDIV,  MVT::f32,   { 18, 19, 1, 1 } }, // divss
    { ISD::FDIV,  MVT::v4f32, { 35, 36, 1, 1 } }, // divps
    { ISD::FDIV,  MVT::f64,   { 33, 34, 1, 1 } }, // divsd
    { ISD::FDIV,  MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SLMCostTable[] = {
    { ISD::MUL,   MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
    { ISD::MUL,   MVT::v8i16, {  2,  5, 1, 1 } }, // pmullw
    { ISD::FMUL,  MVT::f64,   {  2,  5, 1, 1 } }, // mulsd
    { ISD::FMUL,  MVT::f32,   {  1,  4, 1, 1 } }, // mulss
    { ISD::FMUL,  MVT::v2f64, {  4,  7, 1, 1 } }, // mulpd
    { ISD::FMUL,  MVT::v4f32, {  2,  5, 1, 1 } }, // mulps
    { ISD::FDIV,  MVT::f32,   { 17, 19, 1, 1 } }, // divss
    { ISD::FDIV,  MVT::v4f32, { 39, 39, 1, 6 } }, // divps
    { ISD::FDIV,  MVT::f64,   { 32, 34, 1, 1 } }, // divsd
    { ISD::FDIV,  MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
    { ISD::FADD,  MVT::v2f64, {  2,  4, 1, 1 } }, // addpd
    { ISD::FSUB,  MVT::v2f64, {  2,  4, 1, 1 } }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long
    // multiplies(3), shifts(3) and adds(2).
    // SLM pmuludq throughput is 2 and addq throughput is 4, thus:
    //   3*2 (pmuludq throughput) + 3*1 (shift throughput) +
    //   2*4 (addq throughput) = 17.
    { ISD::MUL,   MVT::v2i64, { 17, 22, 9, 9 } },
    // SLM addq/subq throughput is 4.
    { ISD::ADD,   MVT::v2i64, {  4,  2, 1, 2 } },
    { ISD::SUB,   MVT::v2i64, {  4,  2, 1, 2 } },
  };

  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2CostTable[] = {
    { ISD::SHL,  MVT::v16i8,   {  6, 21,11,16 } }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v32i8,   {  6, 23,11,22 } }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v8i16,   {  5, 18, 5,10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SHL,  MVT::v16i16,  {  8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRL,  MVT::v16i8,   {  6, 27,12,18 } }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v32i8,   {  8, 30,12,24 } }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v8i16,   {  5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL,  MVT::v16i16,  {  8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRA,  MVT::v16i8,   { 17, 17,24,30 } }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v32i8,   { 18, 20,24,43 } }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v8i16,   {  5, 11, 5,10 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v16i16,  {  8, 10,10,14 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v2i64,   {  4,  5, 5, 5 } }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64,   {  8,  8, 5, 9 } }, // srl/xor/sub sequence.

    { ISD::SUB,  MVT::v32i8,   {  1,  1, 1, 2 } }, // psubb
    { ISD::ADD,  MVT::v32i8,   {  1,  1, 1, 2 } }, // paddb
    { ISD::SUB,  MVT::v16i16,  {  1,  1, 1, 2 } }, // psubw
    { ISD::ADD,  MVT::v16i16,  {  1,  1, 1, 2 } }, // paddw
    { ISD::SUB,  MVT::v8i32,   {  1,  1, 1, 2 } }, // psubd
    { ISD::ADD,  MVT::v8i32,   {  1,  1, 1, 2 } }, // paddd
    { ISD::SUB,  MVT::v4i64,   {  1,  1, 1, 2 } }, // psubq
    { ISD::ADD,  MVT::v4i64,   {  1,  1, 1, 2 } }, // paddq

    { ISD::MUL,  MVT::v16i8,   {  5, 18, 6,12 } }, // extend/pmullw/pack
    { ISD::MUL,  MVT::v32i8,   {  4,  8, 8,16 } }, // pmaddubsw
    { ISD::MUL,  MVT::v16i16,  {  2,  5, 1, 2 } }, // pmullw
    { ISD::MUL,  MVT::v8i32,   {  4, 10, 1, 2 } }, // pmulld
    { ISD::MUL,  MVT::v4i32,   {  2, 10, 1, 2 } }, // pmulld
    { ISD::MUL,  MVT::v4i64,   {  6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL,  MVT::v2i64,   {  6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add

    { X86ISD::PMULUDQ, MVT::v4i64, { 1,  5, 1, 1 } },

    { ISD::FNEG, MVT::v4f64,   {  1,  1, 1, 2 } }, // vxorpd
    { ISD::FNEG, MVT::v8f32,   {  1,  1, 1, 2 } }, // vxorps

    { ISD::FADD, MVT::f64,     {  1,  4, 1, 1 } }, // vaddsd
    { ISD::FADD, MVT::f32,     {  1,  4, 1, 1 } }, // vaddss
    { ISD::FADD, MVT::v2f64,   {  1,  4, 1, 1 } }, // vaddpd
    { ISD::FADD, MVT::v4f32,   {  1,  4, 1, 1 } }, // vaddps
    { ISD::FADD, MVT::v4f64,   {  1,  4, 1, 2 } }, // vaddpd
    { ISD::FADD, MVT::v8f32,   {  1,  4, 1, 2 } }, // vaddps

    { ISD::FSUB, MVT::f64,     {  1,  4, 1, 1 } }, // vsubsd
    { ISD::FSUB, MVT::f32,     {  1,  4, 1, 1 } }, // vsubss
    { ISD::FSUB, MVT::v2f64,   {  1,  4, 1, 1 } }, // vsubpd
    { ISD::FSUB, MVT::v4f32,   {  1,  4, 1, 1 } }, // vsubps
    { ISD::FSUB, MVT::v4f64,   {  1,  4, 1, 2 } }, // vsubpd
    { ISD::FSUB, MVT::v8f32,   {  1,  4, 1, 2 } }, // vsubps

    { ISD::FMUL, MVT::f64,     {  1,  5, 1, 1 } }, // vmulsd
    { ISD::FMUL, MVT::f32,     {  1,  5, 1, 1 } }, // vmulss
    { ISD::FMUL, MVT::v2f64,   {  1,  5, 1, 1 } }, // vmulpd
    { ISD::FMUL, MVT::v4f32,   {  1,  5, 1, 1 } }, // vmulps
    { ISD::FMUL, MVT::v4f64,   {  1,  5, 1, 2 } }, // vmulpd
    { ISD::FMUL, MVT::v8f32,   {  1,  5, 1, 2 } }, // vmulps

    { ISD::FDIV, MVT::f32,     {  7, 13, 1, 1 } }, // vdivss
    { ISD::FDIV, MVT::v4f32,   {  7, 13, 1, 1 } }, // vdivps
    { ISD::FDIV, MVT::v8f32,   { 14, 21, 1, 3 } }, // vdivps
1179     { ISD::FDIV, MVT::v8f32,   { 14, 21, 1, 3 } }, // vdivps
1180     { ISD::FDIV, MVT::f64,     { 14, 20, 1, 1 } }, // vdivsd
1181     { ISD::FDIV, MVT::v2f64,   { 14, 20, 1, 1 } }, // vdivpd
1182     { ISD::FDIV, MVT::v4f64,   { 28, 35, 1, 3 } }, // vdivpd
1183   };
1184 
1185   // Look for AVX2 lowering tricks for custom cases.
1186   if (ST->hasAVX2())
1187     if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
1188       if (auto KindCost = Entry->Cost[CostKind])
1189         return LT.first * *KindCost;
1190 
1191   static const CostKindTblEntry AVX1CostTable[] = {
1192     // We don't have to scalarize unsupported ops. We can issue two half-sized
1193     // operations, extracting and reinserting the upper YMM half around them.
1194     // Two ops + 1 extract + 1 insert = 4.
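    // e.g. a v8i32 multiply on AVX1 becomes vextractf128 + 2*pmulld +
    // vinsertf128 rather than eight scalar multiplies.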
1195     { ISD::MUL,     MVT::v32i8,   { 10, 11, 18, 19 } }, // pmaddubsw + split
1196     { ISD::MUL,     MVT::v16i8,   {  5,  6,  8, 12 } }, // 2*pmaddubsw/3*and/psllw/or
1197     { ISD::MUL,     MVT::v16i16,  {  4,  8,  5,  6 } }, // pmullw + split
1198     { ISD::MUL,     MVT::v8i32,   {  5,  8,  5, 10 } }, // pmulld + split
1199     { ISD::MUL,     MVT::v4i32,   {  2,  5,  1,  3 } }, // pmulld
1200     { ISD::MUL,     MVT::v4i64,   { 12, 15, 19, 20 } },
1201 
1202     { ISD::AND,     MVT::v32i8,   {  1,  1, 1, 2 } }, // vandps
1203     { ISD::AND,     MVT::v16i16,  {  1,  1, 1, 2 } }, // vandps
1204     { ISD::AND,     MVT::v8i32,   {  1,  1, 1, 2 } }, // vandps
1205     { ISD::AND,     MVT::v4i64,   {  1,  1, 1, 2 } }, // vandps
1206 
1207     { ISD::OR,      MVT::v32i8,   {  1,  1, 1, 2 } }, // vorps
1208     { ISD::OR,      MVT::v16i16,  {  1,  1, 1, 2 } }, // vorps
1209     { ISD::OR,      MVT::v8i32,   {  1,  1, 1, 2 } }, // vorps
1210     { ISD::OR,      MVT::v4i64,   {  1,  1, 1, 2 } }, // vorps
1211 
1212     { ISD::XOR,     MVT::v32i8,   {  1,  1, 1, 2 } }, // vxorps
1213     { ISD::XOR,     MVT::v16i16,  {  1,  1, 1, 2 } }, // vxorps
1214     { ISD::XOR,     MVT::v8i32,   {  1,  1, 1, 2 } }, // vxorps
1215     { ISD::XOR,     MVT::v4i64,   {  1,  1, 1, 2 } }, // vxorps
1216 
1217     { ISD::SUB,     MVT::v32i8,   {  4,  2, 5, 6 } }, // psubb + split
1218     { ISD::ADD,     MVT::v32i8,   {  4,  2, 5, 6 } }, // paddb + split
1219     { ISD::SUB,     MVT::v16i16,  {  4,  2, 5, 6 } }, // psubw + split
1220     { ISD::ADD,     MVT::v16i16,  {  4,  2, 5, 6 } }, // paddw + split
1221     { ISD::SUB,     MVT::v8i32,   {  4,  2, 5, 6 } }, // psubd + split
1222     { ISD::ADD,     MVT::v8i32,   {  4,  2, 5, 6 } }, // paddd + split
1223     { ISD::SUB,     MVT::v4i64,   {  4,  2, 5, 6 } }, // psubq + split
1224     { ISD::ADD,     MVT::v4i64,   {  4,  2, 5, 6 } }, // paddq + split
1225     { ISD::SUB,     MVT::v2i64,   {  1,  1, 1, 1 } }, // psubq
1226     { ISD::ADD,     MVT::v2i64,   {  1,  1, 1, 1 } }, // paddq
1227 
1228     { ISD::SHL,     MVT::v16i8,   { 10, 21,11,17 } }, // pblendvb sequence.
1229     { ISD::SHL,     MVT::v32i8,   { 22, 22,27,40 } }, // pblendvb sequence + split.
1230     { ISD::SHL,     MVT::v8i16,   {  6,  9,11,11 } }, // pblendvb sequence.
1231     { ISD::SHL,     MVT::v16i16,  { 13, 16,24,25 } }, // pblendvb sequence + split.
1232     { ISD::SHL,     MVT::v4i32,   {  3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
1233     { ISD::SHL,     MVT::v8i32,   {  9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
1234     { ISD::SHL,     MVT::v2i64,   {  2,  4, 4, 6 } }, // Shift each lane + blend.
1235     { ISD::SHL,     MVT::v4i64,   {  6,  7,11,15 } }, // Shift each lane + blend + split.
1236 
1237     { ISD::SRL,     MVT::v16i8,   { 11, 27,12,18 } }, // pblendvb sequence.
1238     { ISD::SRL,     MVT::v32i8,   { 23, 23,30,43 } }, // pblendvb sequence + split.
1239     { ISD::SRL,     MVT::v8i16,   { 13, 16,14,22 } }, // pblendvb sequence.
1240     { ISD::SRL,     MVT::v16i16,  { 28, 30,31,48 } }, // pblendvb sequence + split.
1241     { ISD::SRL,     MVT::v4i32,   {  6,  7,12,16 } }, // Shift each lane + blend.
1242     { ISD::SRL,     MVT::v8i32,   { 14, 14,26,34 } }, // Shift each lane + blend + split.
1243     { ISD::SRL,     MVT::v2i64,   {  2,  4, 4, 6 } }, // Shift each lane + blend.
1244     { ISD::SRL,     MVT::v4i64,   {  6,  7,11,15 } }, // Shift each lane + blend + split.
1245 
1246     { ISD::SRA,     MVT::v16i8,   { 21, 22,24,36 } }, // pblendvb sequence.
1247     { ISD::SRA,     MVT::v32i8,   { 44, 45,51,76 } }, // pblendvb sequence + split.
1248     { ISD::SRA,     MVT::v8i16,   { 13, 16,14,22 } }, // pblendvb sequence.
1249     { ISD::SRA,     MVT::v16i16,  { 28, 30,31,48 } }, // pblendvb sequence + split.
1250     { ISD::SRA,     MVT::v4i32,   {  6,  7,12,16 } }, // Shift each lane + blend.
1251     { ISD::SRA,     MVT::v8i32,   { 14, 14,26,34 } }, // Shift each lane + blend + split.
1252     { ISD::SRA,     MVT::v2i64,   {  5,  6,10,14 } }, // Shift each lane + blend.
1253     { ISD::SRA,     MVT::v4i64,   { 12, 12,22,30 } }, // Shift each lane + blend + split.
1254 
1255     { ISD::FNEG,    MVT::v4f64,   {  2,  2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1256     { ISD::FNEG,    MVT::v8f32,   {  2,  2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1257 
1258     { ISD::FADD,    MVT::f64,     {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1259     { ISD::FADD,    MVT::f32,     {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1260     { ISD::FADD,    MVT::v2f64,   {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1261     { ISD::FADD,    MVT::v4f32,   {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1262     { ISD::FADD,    MVT::v4f64,   {  2,  5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1263     { ISD::FADD,    MVT::v8f32,   {  2,  5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1264 
1265     { ISD::FSUB,    MVT::f64,     {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1266     { ISD::FSUB,    MVT::f32,     {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1267     { ISD::FSUB,    MVT::v2f64,   {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1268     { ISD::FSUB,    MVT::v4f32,   {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1269     { ISD::FSUB,    MVT::v4f64,   {  2,  5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1270     { ISD::FSUB,    MVT::v8f32,   {  2,  5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1271 
1272     { ISD::FMUL,    MVT::f64,     {  2,  5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1273     { ISD::FMUL,    MVT::f32,     {  1,  5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1274     { ISD::FMUL,    MVT::v2f64,   {  2,  5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1275     { ISD::FMUL,    MVT::v4f32,   {  1,  5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1276     { ISD::FMUL,    MVT::v4f64,   {  4,  5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1277     { ISD::FMUL,    MVT::v8f32,   {  2,  5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1278 
1279     { ISD::FDIV,    MVT::f32,     { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1280     { ISD::FDIV,    MVT::v4f32,   { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1281     { ISD::FDIV,    MVT::v8f32,   { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
1282     { ISD::FDIV,    MVT::f64,     { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1283     { ISD::FDIV,    MVT::v2f64,   { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1284     { ISD::FDIV,    MVT::v4f64,   { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
1285   };
1286 
1287   if (ST->hasAVX())
1288     if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
1289       if (auto KindCost = Entry->Cost[CostKind])
1290         return LT.first * *KindCost;
1291 
1292   static const CostKindTblEntry SSE42CostTable[] = {
1293     { ISD::FADD, MVT::f64,    {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1294     { ISD::FADD, MVT::f32,    {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1295     { ISD::FADD, MVT::v2f64,  {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1296     { ISD::FADD, MVT::v4f32,  {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1297 
1298     { ISD::FSUB, MVT::f64,    {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1299     { ISD::FSUB, MVT::f32 ,   {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1300     { ISD::FSUB, MVT::v2f64,  {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1301     { ISD::FSUB, MVT::v4f32,  {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1302 
1303     { ISD::FMUL, MVT::f64,    {  1,  5, 1, 1 } }, // Nehalem from http://www.agner.org/
1304     { ISD::FMUL, MVT::f32,    {  1,  5, 1, 1 } }, // Nehalem from http://www.agner.org/
1305     { ISD::FMUL, MVT::v2f64,  {  1,  5, 1, 1 } }, // Nehalem from http://www.agner.org/
1306     { ISD::FMUL, MVT::v4f32,  {  1,  5, 1, 1 } }, // Nehalem from http://www.agner.org/
1307 
1308     { ISD::FDIV,  MVT::f32,   { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1309     { ISD::FDIV,  MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1310     { ISD::FDIV,  MVT::f64,   { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1311     { ISD::FDIV,  MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1312 
1313     { ISD::MUL,   MVT::v2i64, {  6, 10,10,10 } }  // 3*pmuludq/3*shift/2*add
1314   };
1315 
1316   if (ST->hasSSE42())
1317     if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1318       if (auto KindCost = Entry->Cost[CostKind])
1319         return LT.first * *KindCost;
1320 
1321   static const CostKindTblEntry SSE41CostTable[] = {
1322     { ISD::SHL,  MVT::v16i8,  { 15, 24,17,22 } }, // pblendvb sequence.
1323     { ISD::SHL,  MVT::v8i16,  { 11, 14,11,11 } }, // pblendvb sequence.
1324     { ISD::SHL,  MVT::v4i32,  { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1325 
1326     { ISD::SRL,  MVT::v16i8,  { 16, 27,18,24 } }, // pblendvb sequence.
1327     { ISD::SRL,  MVT::v8i16,  { 22, 26,23,27 } }, // pblendvb sequence.
1328     { ISD::SRL,  MVT::v4i32,  { 16, 17,15,19 } }, // Shift each lane + blend.
1329     { ISD::SRL,  MVT::v2i64,  {  4,  6, 5, 7 } }, // splat+shuffle sequence.
1330 
1331     { ISD::SRA,  MVT::v16i8,  { 38, 41,30,36 } }, // pblendvb sequence.
1332     { ISD::SRA,  MVT::v8i16,  { 22, 26,23,27 } }, // pblendvb sequence.
1333     { ISD::SRA,  MVT::v4i32,  { 16, 17,15,19 } }, // Shift each lane + blend.
1334     { ISD::SRA,  MVT::v2i64,  {  8, 17, 5, 7 } }, // splat+shuffle sequence.
1335 
1336     { ISD::MUL,  MVT::v4i32,  {  2, 11, 1, 1 } }  // pmulld (Nehalem from agner.org)
1337   };
1338 
1339   if (ST->hasSSE41())
1340     if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1341       if (auto KindCost = Entry->Cost[CostKind])
1342         return LT.first * *KindCost;
1343 
1344   static const CostKindTblEntry SSSE3CostTable[] = {
1345     { ISD::MUL,  MVT::v16i8,  {  5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or
1346   };
1347 
1348   if (ST->hasSSSE3())
1349     if (const auto *Entry = CostTableLookup(SSSE3CostTable, ISD, LT.second))
1350       if (auto KindCost = Entry->Cost[CostKind])
1351         return LT.first * *KindCost;
1352 
1353   static const CostKindTblEntry SSE2CostTable[] = {
1354     // We don't correctly identify costs of casts because they are marked as
1355     // custom.
1356     { ISD::SHL,  MVT::v16i8,  { 13, 21,26,28 } }, // cmpgtb sequence.
1357     { ISD::SHL,  MVT::v8i16,  { 24, 27,16,20 } }, // cmpgtw sequence.
1358     { ISD::SHL,  MVT::v4i32,  { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1359     { ISD::SHL,  MVT::v2i64,  {  4,  6, 5, 7 } }, // splat+shuffle sequence.
1360 
1361     { ISD::SRL,  MVT::v16i8,  { 14, 28,27,30 } }, // cmpgtb sequence.
1362     { ISD::SRL,  MVT::v8i16,  { 16, 19,31,31 } }, // cmpgtw sequence.
1363     { ISD::SRL,  MVT::v4i32,  { 12, 12,15,19 } }, // Shift each lane + blend.
1364     { ISD::SRL,  MVT::v2i64,  {  4,  6, 5, 7 } }, // splat+shuffle sequence.
1365 
1366     { ISD::SRA,  MVT::v16i8,  { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1367     { ISD::SRA,  MVT::v8i16,  { 16, 19,31,31 } }, // cmpgtw sequence.
1368     { ISD::SRA,  MVT::v4i32,  { 12, 12,15,19 } }, // Shift each lane + blend.
1369     { ISD::SRA,  MVT::v2i64,  {  8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1370 
1371     { ISD::AND,  MVT::v16i8,  {  1,  1, 1, 1 } }, // pand
1372     { ISD::AND,  MVT::v8i16,  {  1,  1, 1, 1 } }, // pand
1373     { ISD::AND,  MVT::v4i32,  {  1,  1, 1, 1 } }, // pand
1374     { ISD::AND,  MVT::v2i64,  {  1,  1, 1, 1 } }, // pand
1375 
1376     { ISD::OR,   MVT::v16i8,  {  1,  1, 1, 1 } }, // por
1377     { ISD::OR,   MVT::v8i16,  {  1,  1, 1, 1 } }, // por
1378     { ISD::OR,   MVT::v4i32,  {  1,  1, 1, 1 } }, // por
1379     { ISD::OR,   MVT::v2i64,  {  1,  1, 1, 1 } }, // por
1380 
1381     { ISD::XOR,  MVT::v16i8,  {  1,  1, 1, 1 } }, // pxor
1382     { ISD::XOR,  MVT::v8i16,  {  1,  1, 1, 1 } }, // pxor
1383     { ISD::XOR,  MVT::v4i32,  {  1,  1, 1, 1 } }, // pxor
1384     { ISD::XOR,  MVT::v2i64,  {  1,  1, 1, 1 } }, // pxor
1385 
1386     { ISD::ADD,  MVT::v2i64,  {  1,  2, 1, 2 } }, // paddq
1387     { ISD::SUB,  MVT::v2i64,  {  1,  2, 1, 2 } }, // psubq
1388 
1389     { ISD::MUL,  MVT::v16i8,  {  6, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1390     { ISD::MUL,  MVT::v8i16,  {  1,  5, 1, 1 } }, // pmullw
1391     { ISD::MUL,  MVT::v4i32,  {  6,  8, 7, 7 } }, // 3*pmuludq/4*shuffle
1392     { ISD::MUL,  MVT::v2i64,  {  7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1393 
1394     { X86ISD::PMULUDQ, MVT::v2i64, { 1,  5, 1, 1 } },
1395 
1396     { ISD::FDIV, MVT::f32,    { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1397     { ISD::FDIV, MVT::v4f32,  { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1398     { ISD::FDIV, MVT::f64,    { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1399     { ISD::FDIV, MVT::v2f64,  { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1400 
1401     { ISD::FNEG, MVT::f32,    {  1,  1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1402     { ISD::FNEG, MVT::f64,    {  1,  1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1403     { ISD::FNEG, MVT::v4f32,  {  1,  1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1404     { ISD::FNEG, MVT::v2f64,  {  1,  1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1405 
1406     { ISD::FADD, MVT::f32,    {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1407     { ISD::FADD, MVT::f64,    {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1408     { ISD::FADD, MVT::v2f64,  {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1409 
1410     { ISD::FSUB, MVT::f32,    {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1411     { ISD::FSUB, MVT::f64,    {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1412     { ISD::FSUB, MVT::v2f64,  {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1413 
1414     { ISD::FMUL, MVT::f64,    {  2,  5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1415     { ISD::FMUL, MVT::v2f64,  {  2,  5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1416   };
1417 
1418   if (ST->hasSSE2())
1419     if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1420       if (auto KindCost = Entry->Cost[CostKind])
1421         return LT.first * *KindCost;
1422 
1423   static const CostKindTblEntry SSE1CostTable[] = {
1424     { ISD::FDIV, MVT::f32,   { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1425     { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1426 
1427     { ISD::FNEG, MVT::f32,   {  2,  2, 1, 2 } }, // Pentium III from http://www.agner.org/
1428     { ISD::FNEG, MVT::v4f32, {  2,  2, 1, 2 } }, // Pentium III from http://www.agner.org/
1429 
1430     { ISD::FADD, MVT::f32,   {  1,  3, 1, 1 } }, // Pentium III from http://www.agner.org/
1431     { ISD::FADD, MVT::v4f32, {  2,  3, 1, 1 } }, // Pentium III from http://www.agner.org/
1432 
1433     { ISD::FSUB, MVT::f32,   {  1,  3, 1, 1 } }, // Pentium III from http://www.agner.org/
1434     { ISD::FSUB, MVT::v4f32, {  2,  3, 1, 1 } }, // Pentium III from http://www.agner.org/
1435 
1436     { ISD::FMUL, MVT::f32,   {  2,  5, 1, 1 } }, // Pentium III from http://www.agner.org/
1437     { ISD::FMUL, MVT::v4f32, {  2,  5, 1, 1 } }, // Pentium III from http://www.agner.org/
1438   };
1439 
1440   if (ST->hasSSE1())
1441     if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1442       if (auto KindCost = Entry->Cost[CostKind])
1443         return LT.first * *KindCost;
1444 
1445   static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1446     { ISD::ADD,  MVT::i64,  {  1 } }, // Core (Merom) from http://www.agner.org/
1447     { ISD::SUB,  MVT::i64,  {  1 } }, // Core (Merom) from http://www.agner.org/
1448     { ISD::MUL,  MVT::i64,  {  2,  6,  1,  2 } },
1449   };
1450 
1451   if (ST->is64Bit())
1452     if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1453       if (auto KindCost = Entry->Cost[CostKind])
1454         return LT.first * *KindCost;
1455 
1456   static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1457     { ISD::ADD,  MVT::i8,  {  1 } }, // Pentium III from http://www.agner.org/
1458     { ISD::ADD,  MVT::i16, {  1 } }, // Pentium III from http://www.agner.org/
1459     { ISD::ADD,  MVT::i32, {  1 } }, // Pentium III from http://www.agner.org/
1460 
1461     { ISD::SUB,  MVT::i8,  {  1 } }, // Pentium III from http://www.agner.org/
1462     { ISD::SUB,  MVT::i16, {  1 } }, // Pentium III from http://www.agner.org/
1463     { ISD::SUB,  MVT::i32, {  1 } }, // Pentium III from http://www.agner.org/
1464 
1465     { ISD::MUL,  MVT::i8,  {  3,  4, 1, 1 } },
1466     { ISD::MUL,  MVT::i16, {  2,  4, 1, 1 } },
1467     { ISD::MUL,  MVT::i32, {  1,  4, 1, 1 } },
1468 
1469     { ISD::FNEG, MVT::f64, {  2,  2, 1, 3 } }, // (x87)
1470     { ISD::FADD, MVT::f64, {  2,  3, 1, 1 } }, // (x87)
1471     { ISD::FSUB, MVT::f64, {  2,  3, 1, 1 } }, // (x87)
1472     { ISD::FMUL, MVT::f64, {  2,  5, 1, 1 } }, // (x87)
1473     { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1474   };
1475 
1476   if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1477     if (auto KindCost = Entry->Cost[CostKind])
1478       return LT.first * *KindCost;
1479 
1480   // It is not a good idea to vectorize division. We have to scalarize it and
1481   // in the process we will often end up having to spill regular registers.
1482   // The overhead of division is going to dominate most kernels anyway, so
1483   // try hard to prevent vectorization of division - it is generally a bad
1484   // idea. Assume somewhat arbitrarily that we have to be able to hide "20
1485   // cycles" for each lane.
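  // e.g. a v4i32 sdiv with scalar cost C is costed as 20 * 4 * C, which is
  // high enough that the vectorizers will almost always keep the division
  // scalar.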
1486   if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1487       (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1488        ISD == ISD::UREM)) {
1489     InstructionCost ScalarCost =
1490         getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1491                                Op1Info.getNoProps(), Op2Info.getNoProps());
1492     return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1493   }
1494 
1495   // Handle some basic single instruction code size cases.
1496   if (CostKind == TTI::TCK_CodeSize) {
1497     switch (ISD) {
1498     case ISD::FADD:
1499     case ISD::FSUB:
1500     case ISD::FMUL:
1501     case ISD::FDIV:
1502     case ISD::FNEG:
1503     case ISD::AND:
1504     case ISD::OR:
1505     case ISD::XOR:
1506       return LT.first;
1508     }
1509   }
1510 
1511   // Fallback to the default implementation.
1512   return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1513                                        Args, CxtI);
1514 }
1515 
1516 InstructionCost
1517 X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
1518                             unsigned Opcode1, const SmallBitVector &OpcodeMask,
1519                             TTI::TargetCostKind CostKind) const {
1520   if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1521     return TTI::TCC_Basic;
1522   return InstructionCost::getInvalid();
1523 }
1524 
1525 InstructionCost X86TTIImpl::getShuffleCost(
1526     TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef<int> Mask,
1527     TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
1528     ArrayRef<const Value *> Args, const Instruction *CxtI) {
1529   // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1530   // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1531   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
1532 
1533   Kind = improveShuffleKindFromMask(Kind, Mask, BaseTp, Index, SubTp);
1534 
1535   // If all args are constant then this will be constant folded away.
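  // e.g. a shuffle of two constant vectors always folds to another constant
  // vector, so there is no runtime cost to model.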
1536   if (!Args.empty() &&
1537       all_of(Args, [](const Value *Arg) { return isa<Constant>(Arg); }))
1538     return TTI::TCC_Free;
1539 
1540   // Recognize a basic concat_vector shuffle.
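  // e.g. a two-source shuffle of <4 x i32> inputs with mask <0,1,2,3,4,5,6,7>
  // just concatenates the sources, which is costed below as inserting the
  // second source into the top half of an <8 x i32> result.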
1541   if (Kind == TTI::SK_PermuteTwoSrc &&
1542       Mask.size() == (2 * BaseTp->getElementCount().getKnownMinValue()) &&
1543       ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1544     return getShuffleCost(TTI::SK_InsertSubvector,
1545                           VectorType::getDoubleElementsVectorType(BaseTp), Mask,
1546                           CostKind, Mask.size() / 2, BaseTp);
1547 
1548   // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1549   if (Kind == TTI::SK_Transpose)
1550     Kind = TTI::SK_PermuteTwoSrc;
1551 
1552   if (Kind == TTI::SK_Broadcast) {
1553     // For broadcasts we are splatting the first element from the first input
1554     // register, so we only need to reference that input; all the output
1555     // registers are the same.
1556     LT.first = 1;
1557 
1558     // If we're broadcasting a load then AVX/AVX2 can do this for free.
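    // e.g. vbroadcastss (%rax), %ymm0 performs the splat as part of the load
    // itself; AVX2 extends this to 8 and 16-bit elements via vpbroadcastb/w.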
1559     using namespace PatternMatch;
1560     if (!Args.empty() && match(Args[0], m_OneUse(m_Load(m_Value()))) &&
1561         (ST->hasAVX2() ||
1562          (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32)))
1563       return TTI::TCC_Free;
1564   }
1565 
1566   // Attempt to detect a cheaper in-lane shuffle, avoiding a 128-bit
1567   // subvector permutation. Also attempt to detect a shuffle mask with a
1568   // single defined element.
1569   bool IsInLaneShuffle = false;
1570   bool IsSingleElementMask = false;
1571   if (BaseTp->getPrimitiveSizeInBits() > 0 &&
1572       (BaseTp->getPrimitiveSizeInBits() % 128) == 0 &&
1573       BaseTp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
1574       Mask.size() == BaseTp->getElementCount().getKnownMinValue()) {
1575     unsigned NumLanes = BaseTp->getPrimitiveSizeInBits() / 128;
1576     unsigned NumEltsPerLane = Mask.size() / NumLanes;
1577     if ((Mask.size() % NumLanes) == 0) {
1578       IsInLaneShuffle = all_of(enumerate(Mask), [&](const auto &P) {
1579         return P.value() == PoisonMaskElem ||
1580                ((P.value() % Mask.size()) / NumEltsPerLane) ==
1581                    (P.index() / NumEltsPerLane);
1582       });
1583       IsSingleElementMask =
1584           (Mask.size() - 1) == static_cast<unsigned>(count_if(Mask, [](int M) {
1585             return M == PoisonMaskElem;
1586           }));
1587     }
1588   }
1589 
1590   // Treat <X x bfloat> shuffles as <X x half>.
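  // (Shuffles only move bits around, so bf16 and f16 vectors cost the same.)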
1591   if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
1592     LT.second = LT.second.changeVectorElementType(MVT::f16);
1593 
1594   // Subvector extractions are free if they start at the beginning of a
1595   // vector and cheap if the subvectors are aligned.
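  // e.g. extracting the low <2 x i64> half of a <4 x i64> vector is a plain
  // subregister read, while extracting the high half costs one vextractf128.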
1596   if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1597     int NumElts = LT.second.getVectorNumElements();
1598     if ((Index % NumElts) == 0)
1599       return TTI::TCC_Free;
1600     std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1601     if (SubLT.second.isVector()) {
1602       int NumSubElts = SubLT.second.getVectorNumElements();
1603       if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1604         return SubLT.first;
1605       // Handle some cases for widening legalization. For now we only handle
1606       // cases where the original subvector was naturally aligned and evenly
1607       // fit in its legalized subvector type.
1608       // FIXME: Remove some of the alignment restrictions.
1609       // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1610       // vectors.
1611       int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1612       if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1613           (NumSubElts % OrigSubElts) == 0 &&
1614           LT.second.getVectorElementType() ==
1615               SubLT.second.getVectorElementType() &&
1616           LT.second.getVectorElementType().getSizeInBits() ==
1617               BaseTp->getElementType()->getPrimitiveSizeInBits()) {
1618         assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1619                "Unexpected number of elements!");
1620         auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1621                                            LT.second.getVectorNumElements());
1622         auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1623                                            SubLT.second.getVectorNumElements());
1624         int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1625         InstructionCost ExtractCost = getShuffleCost(
1626             TTI::SK_ExtractSubvector, VecTy, {}, CostKind, ExtractIndex, SubTy);
1627 
1628         // If the original size is 32 bits or more, we can use pshufd.
1629         // Otherwise, if we have SSSE3, we can use pshufb.
1630         if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1631           return ExtractCost + 1; // pshufd or pshufb
1632 
1633         assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1634                "Unexpected vector size");
1635 
1636         return ExtractCost + 2; // worst case pshufhw + pshufd
1637       }
1638     }
1639     // If the extract subvector is not optimal, treat it as single op shuffle.
1640     Kind = TTI::SK_PermuteSingleSrc;
1641   }
1642 
1643   // Subvector insertions are cheap if the subvectors are aligned.
1644   // Note that in general, the insertion starting at the beginning of a vector
1645   // isn't free, because we need to preserve the rest of the wide vector,
1646   // but if the destination vector legalizes to the same width as the subvector
1647   // then the insertion will simplify to a (free) register copy.
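  // e.g. on SSE2 a <4 x i64> vector legalizes to two v2i64 registers, so
  // inserting a <2 x i64> subvector at index 0 simply redefines the low
  // register and is free.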
1648   if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1649     int NumElts = LT.second.getVectorNumElements();
1650     std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1651     if (SubLT.second.isVector()) {
1652       int NumSubElts = SubLT.second.getVectorNumElements();
1653       bool MatchingTypes =
1654           NumElts == NumSubElts &&
1655           (SubTp->getElementCount().getKnownMinValue() % NumSubElts) == 0;
1656       if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1657         return MatchingTypes ? TTI::TCC_Free : SubLT.first;
1658     }
1659 
1660     // Attempt to match MOVSS (Idx == 0) or INSERTPS pattern. This will have
1661     // been matched by improveShuffleKindFromMask as a SK_InsertSubvector of
1662     // v1f32 (legalised to f32) into a v4f32.
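    // e.g. movss merges a scalar into element 0 of a v4f32; with SSE4.1,
    // insertps can place it into any lane - a single instruction either way.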
1663     if (LT.first == 1 && LT.second == MVT::v4f32 && SubLT.first == 1 &&
1664         SubLT.second == MVT::f32 && (Index == 0 || ST->hasSSE41()))
1665       return 1;
1666 
1667     // If the insertion isn't aligned, treat it like a 2-op shuffle.
1668     Kind = TTI::SK_PermuteTwoSrc;
1669   }
1670 
1671   // Handle some common (illegal) sub-vector types as they are often very cheap
1672   // to shuffle even on targets without PSHUFB.
1673   EVT VT = TLI->getValueType(DL, BaseTp);
1674   if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1675       !ST->hasSSSE3()) {
1676      static const CostKindTblEntry SSE2SubVectorShuffleTbl[] = {
1677       {TTI::SK_Broadcast,        MVT::v4i16, {1,1,1,1}}, // pshuflw
1678       {TTI::SK_Broadcast,        MVT::v2i16, {1,1,1,1}}, // pshuflw
1679       {TTI::SK_Broadcast,        MVT::v8i8,  {2,2,2,2}}, // punpck/pshuflw
1680       {TTI::SK_Broadcast,        MVT::v4i8,  {2,2,2,2}}, // punpck/pshuflw
1681       {TTI::SK_Broadcast,        MVT::v2i8,  {1,1,1,1}}, // punpck
1682 
1683       {TTI::SK_Reverse,          MVT::v4i16, {1,1,1,1}}, // pshuflw
1684       {TTI::SK_Reverse,          MVT::v2i16, {1,1,1,1}}, // pshuflw
1685       {TTI::SK_Reverse,          MVT::v4i8,  {3,3,3,3}}, // punpck/pshuflw/packus
1686       {TTI::SK_Reverse,          MVT::v2i8,  {1,1,1,1}}, // punpck
1687 
1688       {TTI::SK_Splice,           MVT::v4i16, {2,2,2,2}}, // punpck+psrldq
1689       {TTI::SK_Splice,           MVT::v2i16, {2,2,2,2}}, // punpck+psrldq
1690       {TTI::SK_Splice,           MVT::v4i8,  {2,2,2,2}}, // punpck+psrldq
1691       {TTI::SK_Splice,           MVT::v2i8,  {2,2,2,2}}, // punpck+psrldq
1692 
1693       {TTI::SK_PermuteTwoSrc,    MVT::v4i16, {2,2,2,2}}, // punpck/pshuflw
1694       {TTI::SK_PermuteTwoSrc,    MVT::v2i16, {2,2,2,2}}, // punpck/pshuflw
1695       {TTI::SK_PermuteTwoSrc,    MVT::v8i8,  {7,7,7,7}}, // punpck/pshuflw
1696       {TTI::SK_PermuteTwoSrc,    MVT::v4i8,  {4,4,4,4}}, // punpck/pshuflw
1697       {TTI::SK_PermuteTwoSrc,    MVT::v2i8,  {2,2,2,2}}, // punpck
1698 
1699       {TTI::SK_PermuteSingleSrc, MVT::v4i16, {1,1,1,1}}, // pshuflw
1700       {TTI::SK_PermuteSingleSrc, MVT::v2i16, {1,1,1,1}}, // pshuflw
1701       {TTI::SK_PermuteSingleSrc, MVT::v8i8,  {5,5,5,5}}, // punpck/pshuflw
1702       {TTI::SK_PermuteSingleSrc, MVT::v4i8,  {3,3,3,3}}, // punpck/pshuflw
1703       {TTI::SK_PermuteSingleSrc, MVT::v2i8,  {1,1,1,1}}, // punpck
1704     };
1705 
1706     if (ST->hasSSE2())
1707       if (const auto *Entry =
1708               CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1709         if (auto KindCost = Entry->Cost[CostKind])
1710           return LT.first * *KindCost;
1711   }
1712 
1713   // We are going to permute multiple sources and the result will be in multiple
1714   // destinations. We provide an accurate cost only for splits where the element
1715   // type remains the same.
1716   if (LT.first != 1) {
1717     MVT LegalVT = LT.second;
1718     if (LegalVT.isVector() &&
1719         LegalVT.getVectorElementType().getSizeInBits() ==
1720             BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1721         LegalVT.getVectorNumElements() <
1722             cast<FixedVectorType>(BaseTp)->getNumElements()) {
1723       unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1724       unsigned LegalVTSize = LegalVT.getStoreSize();
1725       // Number of source vectors after legalization:
1726       unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1727       // Number of destination vectors after legalization:
1728       InstructionCost NumOfDests = LT.first;
1729 
1730       auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1731                                               LegalVT.getVectorNumElements());
1732 
1733       if (!Mask.empty() && NumOfDests.isValid()) {
1734         // Try to perform better estimation of the permutation.
1735         // 1. Split the source/destination vectors into real registers.
1736         // 2. Do the mask analysis to identify which real registers are
1737         // permuted. If more than one source register is used to build the
1738         // destination register, the cost for this destination register is
1739         // (Number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only one
1740         // source register is used, build the mask and calculate the cost as
1741         // the cost of PermuteSingleSrc.
1742         // Also, for the single register permute we try to identify if the
1743         // destination register is just a copy of the source register or the
1744         // copy of the previous destination register (the cost is
1745         // TTI::TCC_Basic). If the source register is just reused, the cost for
1746         // this operation is TTI::TCC_Free.
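        // e.g. a v16i32 two-source shuffle on SSE legalizes to four v4i32
        // destination registers, and each one is costed as a copy, a
        // single-source permute, or a chain of two-source permutes, depending
        // on how many source registers feed it.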
1747         NumOfDests =
1748             getTypeLegalizationCost(
1749                 FixedVectorType::get(BaseTp->getElementType(), Mask.size()))
1750                 .first;
1751         unsigned E = *NumOfDests.getValue();
1752         unsigned NormalizedVF =
1753             LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1754         unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1755         unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1756         SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1757         copy(Mask, NormalizedMask.begin());
1758         unsigned PrevSrcReg = 0;
1759         ArrayRef<int> PrevRegMask;
1760         InstructionCost Cost = 0;
1761         processShuffleMasks(
1762             NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1763             [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1764              &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1765               if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1766                 // Check if the previous register can be just copied to the next
1767                 // one.
1768                 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1769                     PrevRegMask != RegMask)
1770                   Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1771                                          RegMask, CostKind, 0, nullptr);
1772                 else
1773                   // Just a copy of previous destination register.
1774                   Cost += TTI::TCC_Basic;
1775                 return;
1776               }
1777               if (SrcReg != DestReg &&
1778                   any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
1779                 // Just a copy of the source register.
1780                 Cost += TTI::TCC_Free;
1781               }
1782               PrevSrcReg = SrcReg;
1783               PrevRegMask = RegMask;
1784             },
1785             [this, SingleOpTy, CostKind,
1786              &Cost](ArrayRef<int> RegMask, unsigned /*Unused*/,
1787                     unsigned /*Unused*/, bool /*Unused*/) {
1788               Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
1789                                      CostKind, 0, nullptr);
1790             });
1791         return Cost;
1792       }
1793 
1794       InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1795       return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1796                                             {}, CostKind, 0, nullptr);
1797     }
1798 
1799     return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
1800   }
1801 
1802   // If we're just moving a single element around (probably as an alternative to
1803   // extracting it), we can assume this is cheap.
1804   if (LT.first == 1 && IsInLaneShuffle && IsSingleElementMask)
1805     return TTI::TCC_Basic;
1806 
1807   static const CostKindTblEntry AVX512VBMIShuffleTbl[] = {
1808     { TTI::SK_Reverse, MVT::v64i8,          { 1, 1, 1, 1 } }, // vpermb
1809     { TTI::SK_Reverse, MVT::v32i8,          { 1, 1, 1, 1 } }, // vpermb
1810     { TTI::SK_PermuteSingleSrc, MVT::v64i8, { 1, 1, 1, 1 } }, // vpermb
1811     { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 1, 1, 1, 1 } }, // vpermb
1812     { TTI::SK_PermuteTwoSrc, MVT::v64i8,    { 2, 2, 2, 2 } }, // vpermt2b
1813     { TTI::SK_PermuteTwoSrc, MVT::v32i8,    { 2, 2, 2, 2 } }, // vpermt2b
1814     { TTI::SK_PermuteTwoSrc, MVT::v16i8,    { 2, 2, 2, 2 } }  // vpermt2b
1815   };
1816 
1817   if (ST->hasVBMI())
1818     if (const auto *Entry =
1819             CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1820       if (auto KindCost = Entry->Cost[CostKind])
1821         return LT.first * *KindCost;
1822 
1823   static const CostKindTblEntry AVX512BWShuffleTbl[] = {
1824     { TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
1825     { TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
1826     { TTI::SK_Broadcast, MVT::v64i8,  { 1, 1, 1, 1 } }, // vpbroadcastb
1827 
1828     { TTI::SK_Reverse, MVT::v32i16,   { 2, 2, 2, 2 } }, // vpermw
1829     { TTI::SK_Reverse, MVT::v32f16,   { 2, 2, 2, 2 } }, // vpermw
1830     { TTI::SK_Reverse, MVT::v16i16,   { 2, 2, 2, 2 } }, // vpermw
1831     { TTI::SK_Reverse, MVT::v64i8,    { 2, 2, 2, 2 } }, // pshufb + vshufi64x2
1832 
1833     { TTI::SK_PermuteSingleSrc, MVT::v32i16, { 2, 2, 2, 2 } }, // vpermw
1834     { TTI::SK_PermuteSingleSrc, MVT::v32f16, { 2, 2, 2, 2 } }, // vpermw
1835     { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
1836     { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw
1837     { TTI::SK_PermuteSingleSrc, MVT::v64i8,  { 8, 8, 8, 8 } },  // extend to v32i16
1838 
1839     { TTI::SK_PermuteTwoSrc, MVT::v32i16,{  2,  2,  2,  2 } }, // vpermt2w
1840     { TTI::SK_PermuteTwoSrc, MVT::v32f16,{  2,  2,  2,  2 } }, // vpermt2w
1841     { TTI::SK_PermuteTwoSrc, MVT::v16i16,{  2,  2,  2,  2 } }, // vpermt2w
1842     { TTI::SK_PermuteTwoSrc, MVT::v8i16, {  2,  2,  2,  2 } },  // vpermt2w
1843     { TTI::SK_PermuteTwoSrc, MVT::v64i8, { 19, 19, 19, 19 } }, // 6 * v32i8 + 1
1844 
1845     { TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vblendmw
1846     { TTI::SK_Select, MVT::v64i8,  { 1, 1, 1, 1 } }, // vblendmb
1847 
1848     { TTI::SK_Splice, MVT::v32i16, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1849     { TTI::SK_Splice, MVT::v32f16, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1850     { TTI::SK_Splice, MVT::v64i8,  { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1851   };
1852 
1853   if (ST->hasBWI())
1854     if (const auto *Entry =
1855             CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1856       if (auto KindCost = Entry->Cost[CostKind])
1857         return LT.first * *KindCost;
1858 
1859   static const CostKindTblEntry AVX512ShuffleTbl[] = {
1860       {TTI::SK_Broadcast, MVT::v8f64,  { 1, 1, 1, 1 } }, // vbroadcastsd
1861       {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss
1862       {TTI::SK_Broadcast, MVT::v8i64,  { 1, 1, 1, 1 } }, // vpbroadcastq
1863       {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd
1864       {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
1865       {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
1866       {TTI::SK_Broadcast, MVT::v64i8,  { 1, 1, 1, 1 } }, // vpbroadcastb
1867 
1868       {TTI::SK_Reverse, MVT::v8f64,  { 1, 3, 1, 1 } }, // vpermpd
1869       {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1870       {TTI::SK_Reverse, MVT::v8i64,  { 1, 3, 1, 1 } }, // vpermq
1871       {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1872       {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1873       {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1874       {TTI::SK_Reverse, MVT::v64i8,  { 7, 7, 7, 7 } }, // per mca
1875 
1876       {TTI::SK_Splice, MVT::v8f64,  { 1, 1, 1, 1 } }, // vpalignd
1877       {TTI::SK_Splice, MVT::v4f64,  { 1, 1, 1, 1 } }, // vpalignd
1878       {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1879       {TTI::SK_Splice, MVT::v8f32,  { 1, 1, 1, 1 } }, // vpalignd
1880       {TTI::SK_Splice, MVT::v8i64,  { 1, 1, 1, 1 } }, // vpalignd
1881       {TTI::SK_Splice, MVT::v4i64,  { 1, 1, 1, 1 } }, // vpalignd
1882       {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1883       {TTI::SK_Splice, MVT::v8i32,  { 1, 1, 1, 1 } }, // vpalignd
1884       {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1885       {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1886       {TTI::SK_Splice, MVT::v64i8,  { 4, 4, 4, 4 } }, // split + palignr
1887 
1888       {TTI::SK_PermuteSingleSrc, MVT::v8f64,  { 1, 3, 1, 1 } }, // vpermpd
1889       {TTI::SK_PermuteSingleSrc, MVT::v4f64,  { 1, 3, 1, 1 } }, // vpermpd
1890       {TTI::SK_PermuteSingleSrc, MVT::v2f64,  { 1, 3, 1, 1 } }, // vpermpd
1891       {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1892       {TTI::SK_PermuteSingleSrc, MVT::v8f32,  { 1, 3, 1, 1 } }, // vpermps
1893       {TTI::SK_PermuteSingleSrc, MVT::v4f32,  { 1, 3, 1, 1 } }, // vpermps
1894       {TTI::SK_PermuteSingleSrc, MVT::v8i64,  { 1, 3, 1, 1 } }, // vpermq
1895       {TTI::SK_PermuteSingleSrc, MVT::v4i64,  { 1, 3, 1, 1 } }, // vpermq
1896       {TTI::SK_PermuteSingleSrc, MVT::v2i64,  { 1, 3, 1, 1 } }, // vpermq
1897       {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1898       {TTI::SK_PermuteSingleSrc, MVT::v8i32,  { 1, 3, 1, 1 } }, // vpermd
1899       {TTI::SK_PermuteSingleSrc, MVT::v4i32,  { 1, 3, 1, 1 } }, // vpermd
1900       {TTI::SK_PermuteSingleSrc, MVT::v16i8,  { 1, 3, 1, 1 } }, // pshufb
1901 
1902       {TTI::SK_PermuteTwoSrc, MVT::v8f64,  { 1, 3, 1, 1 } }, // vpermt2pd
1903       {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1904       {TTI::SK_PermuteTwoSrc, MVT::v8i64,  { 1, 3, 1, 1 } }, // vpermt2q
1905       {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1906       {TTI::SK_PermuteTwoSrc, MVT::v4f64,  { 1, 3, 1, 1 } }, // vpermt2pd
1907       {TTI::SK_PermuteTwoSrc, MVT::v8f32,  { 1, 3, 1, 1 } }, // vpermt2ps
1908       {TTI::SK_PermuteTwoSrc, MVT::v4i64,  { 1, 3, 1, 1 } }, // vpermt2q
1909       {TTI::SK_PermuteTwoSrc, MVT::v8i32,  { 1, 3, 1, 1 } }, // vpermt2d
1910       {TTI::SK_PermuteTwoSrc, MVT::v2f64,  { 1, 3, 1, 1 } }, // vpermt2pd
1911       {TTI::SK_PermuteTwoSrc, MVT::v4f32,  { 1, 3, 1, 1 } }, // vpermt2ps
1912       {TTI::SK_PermuteTwoSrc, MVT::v2i64,  { 1, 3, 1, 1 } }, // vpermt2q
1913       {TTI::SK_PermuteTwoSrc, MVT::v4i32,  { 1, 3, 1, 1 } }, // vpermt2d
1914 
1915       // FIXME: This just applies the type legalization cost rules above
1916       // assuming these completely split.
1917       {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1918       {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1919       {TTI::SK_PermuteSingleSrc, MVT::v64i8,  { 14, 14, 14, 14 } },
1920       {TTI::SK_PermuteTwoSrc,    MVT::v32i16, { 42, 42, 42, 42 } },
1921       {TTI::SK_PermuteTwoSrc,    MVT::v32f16, { 42, 42, 42, 42 } },
1922       {TTI::SK_PermuteTwoSrc,    MVT::v64i8,  { 42, 42, 42, 42 } },
1923 
1924       {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1925       {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1926       {TTI::SK_Select, MVT::v64i8,  { 1, 1, 1, 1 } }, // vpternlogq
1927       {TTI::SK_Select, MVT::v8f64,  { 1, 1, 1, 1 } }, // vblendmpd
1928       {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1929       {TTI::SK_Select, MVT::v8i64,  { 1, 1, 1, 1 } }, // vblendmq
1930       {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1931   };
1932 
1933   if (ST->hasAVX512())
1934     if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1935       if (auto KindCost = Entry->Cost[CostKind])
1936         return LT.first * *KindCost;
1937 
1938   static const CostKindTblEntry AVX2InLaneShuffleTbl[] = {
1939     { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 1, 1, 1, 1 } }, // vpshufb
1940     { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 1, 1, 1, 1 } }, // vpshufb
1941     { TTI::SK_PermuteSingleSrc, MVT::v32i8,  { 1, 1, 1, 1 } }, // vpshufb
1942 
1943     { TTI::SK_PermuteTwoSrc,    MVT::v4f64,  { 2, 2, 2, 2 } }, // 2*vshufpd + vblendpd
1944     { TTI::SK_PermuteTwoSrc,    MVT::v8f32,  { 2, 2, 2, 2 } }, // 2*vshufps + vblendps
1945     { TTI::SK_PermuteTwoSrc,    MVT::v4i64,  { 2, 2, 2, 2 } }, // 2*vpshufd + vpblendd
1946     { TTI::SK_PermuteTwoSrc,    MVT::v8i32,  { 2, 2, 2, 2 } }, // 2*vpshufd + vpblendd
1947     { TTI::SK_PermuteTwoSrc,    MVT::v16i16, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
1948     { TTI::SK_PermuteTwoSrc,    MVT::v16f16, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
1949     { TTI::SK_PermuteTwoSrc,    MVT::v32i8,  { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
1950   };
1951 
1952   if (IsInLaneShuffle && ST->hasAVX2())
1953     if (const auto *Entry =
1954             CostTableLookup(AVX2InLaneShuffleTbl, Kind, LT.second))
1955       if (auto KindCost = Entry->Cost[CostKind])
1956         return LT.first * *KindCost;
1957 
1958   static const CostKindTblEntry AVX2ShuffleTbl[] = {
1959     { TTI::SK_Broadcast, MVT::v4f64,  { 1, 1, 1, 1 } }, // vbroadcastpd
1960     { TTI::SK_Broadcast, MVT::v8f32,  { 1, 1, 1, 1 } }, // vbroadcastps
1961     { TTI::SK_Broadcast, MVT::v4i64,  { 1, 1, 1, 1 } }, // vpbroadcastq
1962     { TTI::SK_Broadcast, MVT::v8i32,  { 1, 1, 1, 1 } }, // vpbroadcastd
1963     { TTI::SK_Broadcast, MVT::v16i16, { 1, 1, 1, 1 } }, // vpbroadcastw
1964     { TTI::SK_Broadcast, MVT::v16f16, { 1, 1, 1, 1 } }, // vpbroadcastw
1965     { TTI::SK_Broadcast, MVT::v32i8,  { 1, 1, 1, 1 } }, // vpbroadcastb
1966 
1967     { TTI::SK_Reverse, MVT::v4f64,    { 1, 1, 1, 1 } }, // vpermpd
1968     { TTI::SK_Reverse, MVT::v8f32,    { 1, 1, 1, 1 } }, // vpermps
1969     { TTI::SK_Reverse, MVT::v4i64,    { 1, 1, 1, 1 } }, // vpermq
1970     { TTI::SK_Reverse, MVT::v8i32,    { 1, 1, 1, 1 } }, // vpermd
1971     { TTI::SK_Reverse, MVT::v16i16,   { 2, 2, 2, 2 } }, // vperm2i128 + pshufb
1972     { TTI::SK_Reverse, MVT::v16f16,   { 2, 2, 2, 2 } }, // vperm2i128 + pshufb
1973     { TTI::SK_Reverse, MVT::v32i8,    { 2, 2, 2, 2 } }, // vperm2i128 + pshufb
1974 
1975     { TTI::SK_Select, MVT::v16i16,    { 1, 1, 1, 1 } }, // vpblendvb
1976     { TTI::SK_Select, MVT::v16f16,    { 1, 1, 1, 1 } }, // vpblendvb
1977     { TTI::SK_Select, MVT::v32i8,     { 1, 1, 1, 1 } }, // vpblendvb
1978 
1979     { TTI::SK_Splice, MVT::v8i32,     { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
1980     { TTI::SK_Splice, MVT::v8f32,     { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
1981     { TTI::SK_Splice, MVT::v16i16,    { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
1982     { TTI::SK_Splice, MVT::v16f16,    { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
1983     { TTI::SK_Splice, MVT::v32i8,     { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
1984 
1985     { TTI::SK_PermuteSingleSrc, MVT::v4f64,  { 1, 1, 1, 1 } }, // vpermpd
1986     { TTI::SK_PermuteSingleSrc, MVT::v8f32,  { 1, 1, 1, 1 } }, // vpermps
1987     { TTI::SK_PermuteSingleSrc, MVT::v4i64,  { 1, 1, 1, 1 } }, // vpermq
1988     { TTI::SK_PermuteSingleSrc, MVT::v8i32,  { 1, 1, 1, 1 } }, // vpermd
1989     { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 4, 4, 4, 4 } },
1990     { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 4, 4, 4, 4 } },
1991     { TTI::SK_PermuteSingleSrc, MVT::v32i8,  { 4, 4, 4, 4 } },
1992 
1993     { TTI::SK_PermuteTwoSrc, MVT::v4f64,  { 3, 3, 3, 3 } }, // 2*vpermpd + vblendpd
1994     { TTI::SK_PermuteTwoSrc, MVT::v8f32,  { 3, 3, 3, 3 } }, // 2*vpermps + vblendps
1995     { TTI::SK_PermuteTwoSrc, MVT::v4i64,  { 3, 3, 3, 3 } }, // 2*vpermq + vpblendd
1996     { TTI::SK_PermuteTwoSrc, MVT::v8i32,  { 3, 3, 3, 3 } }, // 2*vpermd + vpblendd
1997     { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 7, 7, 7, 7 } },
1998     { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 7, 7, 7, 7 } },
1999     { TTI::SK_PermuteTwoSrc, MVT::v32i8,  { 7, 7, 7, 7 } },
2000   };
2001 
2002   if (ST->hasAVX2())
2003     if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
2004       if (auto KindCost = Entry->Cost[CostKind])
2005         return LT.first * *KindCost;
2006 
2007   static const CostKindTblEntry XOPShuffleTbl[] = {
2008     { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2pd
2009     { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2ps
2010     { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2pd
2011     { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2ps
2012     { TTI::SK_PermuteSingleSrc, MVT::v16i16,{ 4, 4, 4, 4 } }, // vextractf128 + 2*vpperm
2013                                                              // + vinsertf128
2014     { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } }, // vextractf128 + 2*vpperm
2015                                                              // + vinsertf128
2016 
2017     { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 6*vpperm
2018                                                             // + vinsertf128
2019 
2020     { TTI::SK_PermuteTwoSrc, MVT::v8i16,  { 1, 1, 1, 1 } }, // vpperm
2021     { TTI::SK_PermuteTwoSrc, MVT::v32i8,  { 9, 9, 9, 9 } }, // 2*vextractf128 + 6*vpperm
2022                                                             // + vinsertf128
2023     { TTI::SK_PermuteTwoSrc, MVT::v16i8,  { 1, 1, 1, 1 } }, // vpperm
2024   };
2025 
2026   if (ST->hasXOP())
2027     if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
2028       if (auto KindCost = Entry->Cost[CostKind])
2029         return LT.first * *KindCost;
2030 
2031   static const CostKindTblEntry AVX1InLaneShuffleTbl[] = {
2032     { TTI::SK_PermuteSingleSrc, MVT::v4f64,  { 1, 1, 1, 1 } }, // vpermilpd
2033     { TTI::SK_PermuteSingleSrc, MVT::v4i64,  { 1, 1, 1, 1 } }, // vpermilpd
2034     { TTI::SK_PermuteSingleSrc, MVT::v8f32,  { 1, 1, 1, 1 } }, // vpermilps
2035     { TTI::SK_PermuteSingleSrc, MVT::v8i32,  { 1, 1, 1, 1 } }, // vpermilps
2036 
2037     { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2038                                                                // + vpor + vinsertf128
2039     { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2040                                                                // + vpor + vinsertf128
2041     { TTI::SK_PermuteSingleSrc, MVT::v32i8,  { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2042                                                                // + vpor + vinsertf128
2043 
2044     { TTI::SK_PermuteTwoSrc, MVT::v4f64,  { 2, 2, 2, 2 } }, // 2*vshufpd + vblendpd
2045     { TTI::SK_PermuteTwoSrc, MVT::v8f32,  { 2, 2, 2, 2 } }, // 2*vshufps + vblendps
2046     { TTI::SK_PermuteTwoSrc, MVT::v4i64,  { 2, 2, 2, 2 } }, // 2*vpermilpd + vblendpd
2047     { TTI::SK_PermuteTwoSrc, MVT::v8i32,  { 2, 2, 2, 2 } }, // 2*vpermilps + vblendps
2048     { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2049                                                             // + 2*vpor + vinsertf128
2050     { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2051                                                             // + 2*vpor + vinsertf128
2052     { TTI::SK_PermuteTwoSrc, MVT::v32i8,  { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2053                                                             // + 2*vpor + vinsertf128
2054   };
2055 
2056   if (IsInLaneShuffle && ST->hasAVX())
2057     if (const auto *Entry =
2058             CostTableLookup(AVX1InLaneShuffleTbl, Kind, LT.second))
2059       if (auto KindCost = Entry->Cost[CostKind])
2060         return LT.first * *KindCost;
2061 
2062   static const CostKindTblEntry AVX1ShuffleTbl[] = {
2063       {TTI::SK_Broadcast, MVT::v4f64,  {2,2,2,2}}, // vperm2f128 + vpermilpd
2064       {TTI::SK_Broadcast, MVT::v8f32,  {2,2,2,2}}, // vperm2f128 + vpermilps
2065       {TTI::SK_Broadcast, MVT::v4i64,  {2,2,2,2}}, // vperm2f128 + vpermilpd
2066       {TTI::SK_Broadcast, MVT::v8i32,  {2,2,2,2}}, // vperm2f128 + vpermilps
2067       {TTI::SK_Broadcast, MVT::v16i16, {3,3,3,3}}, // vpshuflw + vpshufd + vinsertf128
2068       {TTI::SK_Broadcast, MVT::v16f16, {3,3,3,3}}, // vpshuflw + vpshufd + vinsertf128
2069       {TTI::SK_Broadcast, MVT::v32i8,  {2,2,2,2}}, // vpshufb + vinsertf128
2070 
2071       {TTI::SK_Reverse, MVT::v4f64,  {2,2,2,2}}, // vperm2f128 + vpermilpd
2072       {TTI::SK_Reverse, MVT::v8f32,  {2,2,2,2}}, // vperm2f128 + vpermilps
2073       {TTI::SK_Reverse, MVT::v4i64,  {2,2,2,2}}, // vperm2f128 + vpermilpd
2074       {TTI::SK_Reverse, MVT::v8i32,  {2,2,2,2}}, // vperm2f128 + vpermilps
2075       {TTI::SK_Reverse, MVT::v16i16, {4,4,4,4}}, // vextractf128 + 2*pshufb
2076                                                  // + vinsertf128
2077       {TTI::SK_Reverse, MVT::v16f16, {4,4,4,4}}, // vextractf128 + 2*pshufb
2078                                                  // + vinsertf128
2079       {TTI::SK_Reverse, MVT::v32i8,  {4,4,4,4}}, // vextractf128 + 2*pshufb
2080                                                  // + vinsertf128
2081 
2082       {TTI::SK_Select, MVT::v4i64,  {1,1,1,1}}, // vblendpd
2083       {TTI::SK_Select, MVT::v4f64,  {1,1,1,1}}, // vblendpd
2084       {TTI::SK_Select, MVT::v8i32,  {1,1,1,1}}, // vblendps
2085       {TTI::SK_Select, MVT::v8f32,  {1,1,1,1}}, // vblendps
2086       {TTI::SK_Select, MVT::v16i16, {3,3,3,3}}, // vpand + vpandn + vpor
2087       {TTI::SK_Select, MVT::v16f16, {3,3,3,3}}, // vpand + vpandn + vpor
2088       {TTI::SK_Select, MVT::v32i8,  {3,3,3,3}}, // vpand + vpandn + vpor
2089 
2090       {TTI::SK_Splice, MVT::v4i64,  {2,2,2,2}}, // vperm2f128 + shufpd
2091       {TTI::SK_Splice, MVT::v4f64,  {2,2,2,2}}, // vperm2f128 + shufpd
2092       {TTI::SK_Splice, MVT::v8i32,  {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2093       {TTI::SK_Splice, MVT::v8f32,  {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2094       {TTI::SK_Splice, MVT::v16i16, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2095       {TTI::SK_Splice, MVT::v16f16, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2096       {TTI::SK_Splice, MVT::v32i8,  {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2097 
2098       {TTI::SK_PermuteSingleSrc, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vshufpd
2099       {TTI::SK_PermuteSingleSrc, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vshufpd
2100       {TTI::SK_PermuteSingleSrc, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2101       {TTI::SK_PermuteSingleSrc, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2102       {TTI::SK_PermuteSingleSrc, MVT::v16i16,{8,8,8,8}}, // vextractf128 + 4*pshufb
2103                                                          // + 2*por + vinsertf128
2104       {TTI::SK_PermuteSingleSrc, MVT::v16f16,{8,8,8,8}}, // vextractf128 + 4*pshufb
2105                                                          // + 2*por + vinsertf128
2106       {TTI::SK_PermuteSingleSrc, MVT::v32i8, {8,8,8,8}}, // vextractf128 + 4*pshufb
2107                                                          // + 2*por + vinsertf128
2108 
2109       {TTI::SK_PermuteTwoSrc, MVT::v4f64, {3,3,3,3}},   // 2*vperm2f128 + vshufpd
2110       {TTI::SK_PermuteTwoSrc, MVT::v4i64, {3,3,3,3}},   // 2*vperm2f128 + vshufpd
2111       {TTI::SK_PermuteTwoSrc, MVT::v8f32, {4,4,4,4}},   // 2*vperm2f128 + 2*vshufps
2112       {TTI::SK_PermuteTwoSrc, MVT::v8i32, {4,4,4,4}},   // 2*vperm2f128 + 2*vshufps
2113       {TTI::SK_PermuteTwoSrc, MVT::v16i16,{15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2114                                                           // + 4*por + vinsertf128
2115       {TTI::SK_PermuteTwoSrc, MVT::v16f16,{15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2116                                                           // + 4*por + vinsertf128
2117       {TTI::SK_PermuteTwoSrc, MVT::v32i8, {15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2118                                                           // + 4*por + vinsertf128
2119   };
2120 
2121   if (ST->hasAVX())
2122     if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
2123       if (auto KindCost = Entry->Cost[CostKind])
2124         return LT.first * *KindCost;
2125 
2126   static const CostKindTblEntry SSE41ShuffleTbl[] = {
2127       {TTI::SK_Select, MVT::v2i64, {1,1,1,1}}, // pblendw
2128       {TTI::SK_Select, MVT::v2f64, {1,1,1,1}}, // movsd
2129       {TTI::SK_Select, MVT::v4i32, {1,1,1,1}}, // pblendw
2130       {TTI::SK_Select, MVT::v4f32, {1,1,1,1}}, // blendps
2131       {TTI::SK_Select, MVT::v8i16, {1,1,1,1}}, // pblendw
2132       {TTI::SK_Select, MVT::v8f16, {1,1,1,1}}, // pblendw
2133       {TTI::SK_Select, MVT::v16i8, {1,1,1,1}}  // pblendvb
2134   };
2135 
2136   if (ST->hasSSE41())
2137     if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
2138       if (auto KindCost = Entry->Cost[CostKind])
2139         return LT.first * *KindCost;
2140 
2141   static const CostKindTblEntry SSSE3ShuffleTbl[] = {
2142       {TTI::SK_Broadcast, MVT::v8i16, {1, 1, 1, 1}}, // pshufb
2143       {TTI::SK_Broadcast, MVT::v8f16, {1, 1, 1, 1}}, // pshufb
2144       {TTI::SK_Broadcast, MVT::v16i8, {1, 1, 1, 1}}, // pshufb
2145 
2146       {TTI::SK_Reverse, MVT::v8i16, {1, 1, 1, 1}}, // pshufb
2147       {TTI::SK_Reverse, MVT::v8f16, {1, 1, 1, 1}}, // pshufb
2148       {TTI::SK_Reverse, MVT::v16i8, {1, 1, 1, 1}}, // pshufb
2149 
2150       {TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por
2151       {TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por
2152       {TTI::SK_Select, MVT::v16i8, {3, 3, 3, 3}}, // 2*pshufb + por
2153 
2154       {TTI::SK_Splice, MVT::v4i32, {1, 1, 1, 1}}, // palignr
2155       {TTI::SK_Splice, MVT::v4f32, {1, 1, 1, 1}}, // palignr
2156       {TTI::SK_Splice, MVT::v8i16, {1, 1, 1, 1}}, // palignr
2157       {TTI::SK_Splice, MVT::v8f16, {1, 1, 1, 1}}, // palignr
2158       {TTI::SK_Splice, MVT::v16i8, {1, 1, 1, 1}}, // palignr
2159 
2160       {TTI::SK_PermuteSingleSrc, MVT::v8i16, {1, 1, 1, 1}}, // pshufb
2161       {TTI::SK_PermuteSingleSrc, MVT::v8f16, {1, 1, 1, 1}}, // pshufb
2162       {TTI::SK_PermuteSingleSrc, MVT::v16i8, {1, 1, 1, 1}}, // pshufb
2163 
2164       {TTI::SK_PermuteTwoSrc, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por
2165       {TTI::SK_PermuteTwoSrc, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por
2166       {TTI::SK_PermuteTwoSrc, MVT::v16i8, {3, 3, 3, 3}}, // 2*pshufb + por
2167   };
2168 
2169   if (ST->hasSSSE3())
2170     if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2171       if (auto KindCost = Entry->Cost[CostKind])
2172         return LT.first * *KindCost;
2173 
2174   static const CostKindTblEntry SSE2ShuffleTbl[] = {
2175       {TTI::SK_Broadcast, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2176       {TTI::SK_Broadcast, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2177       {TTI::SK_Broadcast, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2178       {TTI::SK_Broadcast, MVT::v8i16, {2, 2, 2, 2}}, // pshuflw + pshufd
2179       {TTI::SK_Broadcast, MVT::v8f16, {2, 2, 2, 2}}, // pshuflw + pshufd
2180       {TTI::SK_Broadcast, MVT::v16i8, {3, 3, 3, 3}}, // unpck + pshuflw + pshufd
2181 
2182       {TTI::SK_Reverse, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2183       {TTI::SK_Reverse, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2184       {TTI::SK_Reverse, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2185       {TTI::SK_Reverse, MVT::v8i16, {3, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
2186       {TTI::SK_Reverse, MVT::v8f16, {3, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
2187       {TTI::SK_Reverse, MVT::v16i8, {9, 9, 9, 9}}, // 2*pshuflw + 2*pshufhw
2188                                                    // + 2*pshufd + 2*unpck + packus
2189 
2190       {TTI::SK_Select, MVT::v2i64, {1, 1, 1, 1}}, // movsd
2191       {TTI::SK_Select, MVT::v2f64, {1, 1, 1, 1}}, // movsd
2192       {TTI::SK_Select, MVT::v4i32, {2, 2, 2, 2}}, // 2*shufps
2193       {TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // pand + pandn + por
2194       {TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // pand + pandn + por
2195       {TTI::SK_Select, MVT::v16i8, {3, 3, 3, 3}}, // pand + pandn + por
2196 
2197       {TTI::SK_Splice, MVT::v2i64, {1, 1, 1, 1}}, // shufpd
2198       {TTI::SK_Splice, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2199       {TTI::SK_Splice, MVT::v4i32, {2, 2, 2, 2}}, // 2*{unpck,movsd,pshufd}
2200       {TTI::SK_Splice, MVT::v8i16, {3, 3, 3, 3}}, // psrldq + pslldq + por
2201       {TTI::SK_Splice, MVT::v8f16, {3, 3, 3, 3}}, // psrldq + pslldq + por
2202       {TTI::SK_Splice, MVT::v16i8, {3, 3, 3, 3}}, // psrldq + pslldq + por
2203 
2204       {TTI::SK_PermuteSingleSrc, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2205       {TTI::SK_PermuteSingleSrc, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2206       {TTI::SK_PermuteSingleSrc, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2207       {TTI::SK_PermuteSingleSrc, MVT::v8i16, {3, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
2208                                                             // + pshufd/unpck
2209       {TTI::SK_PermuteSingleSrc, MVT::v8f16, {3, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
2210                                                             // + pshufd/unpck
2211       {TTI::SK_PermuteSingleSrc, MVT::v16i8, {8, 10, 10, 10}}, // 2*pshuflw + 2*pshufhw
2212                                                                // + 2*pshufd + 2*unpck + 2*packus
2213 
2214       {TTI::SK_PermuteTwoSrc, MVT::v2f64, {1, 1, 1, 1}},     // shufpd
2215       {TTI::SK_PermuteTwoSrc, MVT::v2i64, {1, 1, 1, 1}},     // shufpd
2216       {TTI::SK_PermuteTwoSrc, MVT::v4i32, {2, 2, 2, 2}},     // 2*{unpck,movsd,pshufd}
2217       {TTI::SK_PermuteTwoSrc, MVT::v8i16, {6, 8, 8, 8}},     // blend+permute
2218       {TTI::SK_PermuteTwoSrc, MVT::v8f16, {6, 8, 8, 8}},     // blend+permute
2219       {TTI::SK_PermuteTwoSrc, MVT::v16i8, {11, 13, 13, 13}}, // blend+permute
2220   };
2221 
2222   static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2223       {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2224   };
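     // The zero cost above reflects that the broadcast folds into the load
     // itself (movddup can take a memory operand), so no separate shuffle
     // instruction is issued.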
2225 
2226   if (ST->hasSSE2()) {
2227     bool IsLoad =
2228         llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2229     if (ST->hasSSE3() && IsLoad)
2230       if (const auto *Entry =
2231               CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2232         assert(isLegalBroadcastLoad(BaseTp->getElementType(),
2233                                     LT.second.getVectorElementCount()) &&
2234                "Table entry missing from isLegalBroadcastLoad()");
2235         return LT.first * Entry->Cost;
2236       }
2237 
2238     if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2239       if (auto KindCost = Entry->Cost[CostKind])
2240         return LT.first * *KindCost;
2241   }
2242 
2243   static const CostKindTblEntry SSE1ShuffleTbl[] = {
2244     { TTI::SK_Broadcast,        MVT::v4f32, {1,1,1,1} }, // shufps
2245     { TTI::SK_Reverse,          MVT::v4f32, {1,1,1,1} }, // shufps
2246     { TTI::SK_Select,           MVT::v4f32, {2,2,2,2} }, // 2*shufps
2247     { TTI::SK_Splice,           MVT::v4f32, {2,2,2,2} }, // 2*shufps
2248     { TTI::SK_PermuteSingleSrc, MVT::v4f32, {1,1,1,1} }, // shufps
2249     { TTI::SK_PermuteTwoSrc,    MVT::v4f32, {2,2,2,2} }, // 2*shufps
2250   };
2251 
2252   if (ST->hasSSE1()) {
2253     if (LT.first == 1 && LT.second == MVT::v4f32 && Mask.size() == 4) {
2254       // SHUFPS: each pair of mask indices must come from one source register.
2255       auto MatchSHUFPS = [](int X, int Y) {
2256         return X < 0 || Y < 0 || ((X & 4) == (Y & 4));
2257       };
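           // e.g. mask <0,1,6,7> matches (low pair from LHS, high pair from
           // RHS), while <0,4,1,5> does not, as each pair mixes both sources.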
2258       if (MatchSHUFPS(Mask[0], Mask[1]) && MatchSHUFPS(Mask[2], Mask[3]))
2259         return 1;
2260     }
2261     if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2262       if (auto KindCost = Entry->Cost[CostKind])
2263         return LT.first * *KindCost;
2264   }
2265 
2266   return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
2267 }
2268 
2269 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2270                                              Type *Src,
2271                                              TTI::CastContextHint CCH,
2272                                              TTI::TargetCostKind CostKind,
2273                                              const Instruction *I) {
2274   int ISD = TLI->InstructionOpcodeToISD(Opcode);
2275   assert(ISD && "Invalid opcode");
2276 
2277   // The cost tables include both specific, custom (non-legal) src/dst type
2278   // conversions and generic, legalized types. We test for customs first, before
2279   // falling back to legalization.
2280   // FIXME: Need a better design of the cost table to handle non-simple types of
2281   // potential massive combinations (elem_num x src_type x dst_type).
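     // For example, a v16i8 -> v16i16 zext on AVX2 is matched directly by its
     // custom table entry, while an unlisted conversion is costed on its
     // legalized types and, failing that, by the BaseT implementation.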
2282   static const TypeConversionCostKindTblEntry AVX512BWConversionTbl[] = {
2283     { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8,  { 1, 1, 1, 1 } },
2284     { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8,  { 1, 1, 1, 1 } },
2285 
2286     // Mask sign extend has an instruction.
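         // (vpmovm2b / vpmovm2w for the vXi8 / vXi16 cases below.)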
2287     { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,   { 1, 1, 1, 1 } },
2288     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v2i1,   { 1, 1, 1, 1 } },
2289     { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,   { 1, 1, 1, 1 } },
2290     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v2i1,   { 1, 1, 1, 1 } },
2291     { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,   { 1, 1, 1, 1 } },
2292     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v4i1,   { 1, 1, 1, 1 } },
2293     { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,   { 1, 1, 1, 1 } },
2294     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v4i1,   { 1, 1, 1, 1 } },
2295     { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,   { 1, 1, 1, 1 } },
2296     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v8i1,   { 1, 1, 1, 1 } },
2297     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,   { 1, 1, 1, 1 } },
2298     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1,  { 1, 1, 1, 1 } },
2299     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  { 1, 1, 1, 1 } },
2300     { ISD::SIGN_EXTEND, MVT::v32i8,  MVT::v32i1,  { 1, 1, 1, 1 } },
2301     { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1,  { 1, 1, 1, 1 } },
2302     { ISD::SIGN_EXTEND, MVT::v64i8,  MVT::v64i1,  { 1, 1, 1, 1 } },
2303     { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1,  { 1, 1, 1, 1 } },
2304 
2305     // Mask zero extend is a sext + shift.
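         // e.g. for vXi16: vpmovm2w to produce 0/-1 lanes, then vpsrlw $15 to
         // leave 0/1.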
2306     { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,   { 2, 1, 1, 1 } },
2307     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v2i1,   { 2, 1, 1, 1 } },
2308     { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,   { 2, 1, 1, 1 } },
2309     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v2i1,   { 2, 1, 1, 1 } },
2310     { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,   { 2, 1, 1, 1 } },
2311     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v4i1,   { 2, 1, 1, 1 } },
2312     { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,   { 2, 1, 1, 1 } },
2313     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v4i1,   { 2, 1, 1, 1 } },
2314     { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,   { 2, 1, 1, 1 } },
2315     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v8i1,   { 2, 1, 1, 1 } },
2316     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,   { 2, 1, 1, 1 } },
2317     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1,  { 2, 1, 1, 1 } },
2318     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  { 2, 1, 1, 1 } },
2319     { ISD::ZERO_EXTEND, MVT::v32i8,  MVT::v32i1,  { 2, 1, 1, 1 } },
2320     { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1,  { 2, 1, 1, 1 } },
2321     { ISD::ZERO_EXTEND, MVT::v64i8,  MVT::v64i1,  { 2, 1, 1, 1 } },
2322     { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1,  { 2, 1, 1, 1 } },
2323 
2324     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   { 2, 1, 1, 1 } },
2325     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v16i8,  { 2, 1, 1, 1 } },
2326     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  { 2, 1, 1, 1 } },
2327     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v8i16,  { 2, 1, 1, 1 } },
2328     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   { 2, 1, 1, 1 } },
2329     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v16i8,  { 2, 1, 1, 1 } },
2330     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i16,  { 2, 1, 1, 1 } },
2331     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v8i16,  { 2, 1, 1, 1 } },
2332     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i8,   { 2, 1, 1, 1 } },
2333     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v16i8,  { 2, 1, 1, 1 } },
2334     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i16,  { 2, 1, 1, 1 } },
2335     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i8,  { 2, 1, 1, 1 } },
2336     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i16, { 2, 1, 1, 1 } },
2337     { ISD::TRUNCATE,    MVT::v32i1,  MVT::v32i8,  { 2, 1, 1, 1 } },
2338     { ISD::TRUNCATE,    MVT::v32i1,  MVT::v32i16, { 2, 1, 1, 1 } },
2339     { ISD::TRUNCATE,    MVT::v64i1,  MVT::v64i8,  { 2, 1, 1, 1 } },
2340     { ISD::TRUNCATE,    MVT::v64i1,  MVT::v32i16, { 2, 1, 1, 1 } },
2341 
2342     { ISD::TRUNCATE,    MVT::v32i8,  MVT::v32i16, { 2, 1, 1, 1 } },
2343     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, { 2, 1, 1, 1 } }, // widen to zmm
2344     { ISD::TRUNCATE,    MVT::v2i8,   MVT::v2i16,  { 2, 1, 1, 1 } }, // vpmovwb
2345     { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i16,  { 2, 1, 1, 1 } }, // vpmovwb
2346     { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i16,  { 2, 1, 1, 1 } }, // vpmovwb
2347   };
2348 
2349   static const TypeConversionCostKindTblEntry AVX512DQConversionTbl[] = {
2350     // Mask sign extend has an instruction.
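         // (vpmovm2d / vpmovm2q for the vXi32 / vXi64 cases below.)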
2351     { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,   { 1, 1, 1, 1 } },
2352     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v2i1,   { 1, 1, 1, 1 } },
2353     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,   { 1, 1, 1, 1 } },
2354     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   { 1, 1, 1, 1 } },
2355     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   { 1, 1, 1, 1 } },
2356     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v16i1,  { 1, 1, 1, 1 } },
2357     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i1,   { 1, 1, 1, 1 } },
2358     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1,  { 1, 1, 1, 1 } },
2359 
2360     // Mask zero extend is a sext + shift.
2361     { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,   { 2, 1, 1, 1 } },
2362     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v2i1,   { 2, 1, 1, 1 } },
2363     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,   { 2, 1, 1, 1 } },
2364     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   { 2, 1, 1, 1 } },
2365     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   { 2, 1, 1, 1 } },
2366     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v16i1,  { 2, 1, 1, 1 } },
2367     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i1,   { 2, 1, 1, 1 } },
2368     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1,  { 2, 1, 1, 1 } },
2369 
2370     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i64,  { 2, 1, 1, 1 } },
2371     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v4i32,  { 2, 1, 1, 1 } },
2372     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i32,  { 2, 1, 1, 1 } },
2373     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i64,  { 2, 1, 1, 1 } },
2374     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  { 2, 1, 1, 1 } },
2375     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i64,  { 2, 1, 1, 1 } },
2376     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i32, { 2, 1, 1, 1 } },
2377     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v8i64,  { 2, 1, 1, 1 } },
2378 
2379     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  { 1, 1, 1, 1 } },
2380     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  { 1, 1, 1, 1 } },
2381 
2382     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  { 1, 1, 1, 1 } },
2383     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  { 1, 1, 1, 1 } },
2384 
2385     { ISD::FP_TO_SINT,  MVT::v8i64,  MVT::v8f32,  { 1, 1, 1, 1 } },
2386     { ISD::FP_TO_SINT,  MVT::v8i64,  MVT::v8f64,  { 1, 1, 1, 1 } },
2387 
2388     { ISD::FP_TO_UINT,  MVT::v8i64,  MVT::v8f32,  { 1, 1, 1, 1 } },
2389     { ISD::FP_TO_UINT,  MVT::v8i64,  MVT::v8f64,  { 1, 1, 1, 1 } },
2390   };
2391 
2392   // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2393   // 256-bit wide vectors.
2394 
2395   static const TypeConversionCostKindTblEntry AVX512FConversionTbl[] = {
2396     { ISD::FP_EXTEND, MVT::v8f64,   MVT::v8f32,   { 1, 1, 1, 1 } },
2397     { ISD::FP_EXTEND, MVT::v8f64,   MVT::v16f32,  { 3, 1, 1, 1 } },
2398     { ISD::FP_EXTEND, MVT::v16f64,  MVT::v16f32,  { 4, 1, 1, 1 } }, // 2*vcvtps2pd+vextractf64x4
2399     { ISD::FP_EXTEND, MVT::v16f32,  MVT::v16f16,  { 1, 1, 1, 1 } }, // vcvtph2ps
2400     { ISD::FP_EXTEND, MVT::v8f64,   MVT::v8f16,   { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
2401     { ISD::FP_ROUND,  MVT::v8f32,   MVT::v8f64,   { 1, 1, 1, 1 } },
2402     { ISD::FP_ROUND,  MVT::v16f16,  MVT::v16f32,  { 1, 1, 1, 1 } }, // vcvtps2ph
2403 
2404     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i8,    { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2405     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i8,    { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2406     { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i8,    { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2407     { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i8,   { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2408     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i16,   { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2409     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i16,   { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2410     { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i16,   { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2411     { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i16,  { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2412     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i32,   { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2413     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i32,   { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2414     { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i32,   { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2415     { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i32,  { 2, 1, 1, 1 } }, // vpslld+vptestmd
2416     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i64,   { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2417     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i64,   { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2418     { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i64,   { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2419     { ISD::TRUNCATE,  MVT::v2i8,    MVT::v2i32,   { 2, 1, 1, 1 } }, // vpmovdb
2420     { ISD::TRUNCATE,  MVT::v4i8,    MVT::v4i32,   { 2, 1, 1, 1 } }, // vpmovdb
2421     { ISD::TRUNCATE,  MVT::v16i8,   MVT::v16i32,  { 2, 1, 1, 1 } }, // vpmovdb
2422     { ISD::TRUNCATE,  MVT::v32i8,   MVT::v16i32,  { 2, 1, 1, 1 } }, // vpmovdb
2423     { ISD::TRUNCATE,  MVT::v64i8,   MVT::v16i32,  { 2, 1, 1, 1 } }, // vpmovdb
2424     { ISD::TRUNCATE,  MVT::v16i16,  MVT::v16i32,  { 2, 1, 1, 1 } }, // vpmovdw
2425     { ISD::TRUNCATE,  MVT::v32i16,  MVT::v16i32,  { 2, 1, 1, 1 } }, // vpmovdw
2426     { ISD::TRUNCATE,  MVT::v2i8,    MVT::v2i64,   { 2, 1, 1, 1 } }, // vpmovqb
2427     { ISD::TRUNCATE,  MVT::v2i16,   MVT::v2i64,   { 1, 1, 1, 1 } }, // vpshufb
2428     { ISD::TRUNCATE,  MVT::v8i8,    MVT::v8i64,   { 2, 1, 1, 1 } }, // vpmovqb
2429     { ISD::TRUNCATE,  MVT::v16i8,   MVT::v8i64,   { 2, 1, 1, 1 } }, // vpmovqb
2430     { ISD::TRUNCATE,  MVT::v32i8,   MVT::v8i64,   { 2, 1, 1, 1 } }, // vpmovqb
2431     { ISD::TRUNCATE,  MVT::v64i8,   MVT::v8i64,   { 2, 1, 1, 1 } }, // vpmovqb
2432     { ISD::TRUNCATE,  MVT::v8i16,   MVT::v8i64,   { 2, 1, 1, 1 } }, // vpmovqw
2433     { ISD::TRUNCATE,  MVT::v16i16,  MVT::v8i64,   { 2, 1, 1, 1 } }, // vpmovqw
2434     { ISD::TRUNCATE,  MVT::v32i16,  MVT::v8i64,   { 2, 1, 1, 1 } }, // vpmovqw
2435     { ISD::TRUNCATE,  MVT::v8i32,   MVT::v8i64,   { 1, 1, 1, 1 } }, // vpmovqd
2436     { ISD::TRUNCATE,  MVT::v4i32,   MVT::v4i64,   { 1, 1, 1, 1 } }, // zmm vpmovqd
2437     { ISD::TRUNCATE,  MVT::v16i8,   MVT::v16i64,  { 5, 1, 1, 1 } }, // 2*vpmovqd+concat+vpmovdb
2438 
2439     { ISD::TRUNCATE,  MVT::v16i8,  MVT::v16i16,   { 3, 1, 1, 1 } }, // extend to v16i32
2440     { ISD::TRUNCATE,  MVT::v32i8,  MVT::v32i16,   { 8, 1, 1, 1 } },
2441     { ISD::TRUNCATE,  MVT::v64i8,  MVT::v32i16,   { 8, 1, 1, 1 } },
2442 
2443     // Sign extend is zmm vpternlogd+vpmovdb.
2444     // Zero extend is zmm broadcast load+vpmovdw.
2445     { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,   { 3, 1, 1, 1 } },
2446     { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,   { 4, 1, 1, 1 } },
2447     { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,   { 3, 1, 1, 1 } },
2448     { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,   { 4, 1, 1, 1 } },
2449     { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,   { 3, 1, 1, 1 } },
2450     { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,   { 4, 1, 1, 1 } },
2451     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1,  { 3, 1, 1, 1 } },
2452     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1,  { 4, 1, 1, 1 } },
2453 
2454     // Sign extend is zmm vpternlogd+vpmovdw.
2455     // Zero extend is zmm vpternlogd+vpmovdw+vpsrlw.
2456     { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,   { 3, 1, 1, 1 } },
2457     { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,   { 4, 1, 1, 1 } },
2458     { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,   { 3, 1, 1, 1 } },
2459     { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,   { 4, 1, 1, 1 } },
2460     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,   { 3, 1, 1, 1 } },
2461     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,   { 4, 1, 1, 1 } },
2462     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  { 3, 1, 1, 1 } },
2463     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  { 4, 1, 1, 1 } },
2464 
2465     { ISD::SIGN_EXTEND, MVT::v2i32,  MVT::v2i1,   { 1, 1, 1, 1 } }, // zmm vpternlogd
2466     { ISD::ZERO_EXTEND, MVT::v2i32,  MVT::v2i1,   { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2467     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,   { 1, 1, 1, 1 } }, // zmm vpternlogd
2468     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,   { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2469     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   { 1, 1, 1, 1 } }, // zmm vpternlogd
2470     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2471     { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,   { 1, 1, 1, 1 } }, // zmm vpternlogq
2472     { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,   { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2473     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   { 1, 1, 1, 1 } }, // zmm vpternlogq
2474     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2475 
2476     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1,  { 1, 1, 1, 1 } }, // vpternlogd
2477     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1,  { 2, 1, 1, 1 } }, // vpternlogd+psrld
2478     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i1,   { 1, 1, 1, 1 } }, // vpternlogq
2479     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i1,   { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2480 
2481     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  { 1, 1, 1, 1 } },
2482     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  { 1, 1, 1, 1 } },
2483     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2484     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2485     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i8,   { 1, 1, 1, 1 } },
2486     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i8,   { 1, 1, 1, 1 } },
2487     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i16,  { 1, 1, 1, 1 } },
2488     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i16,  { 1, 1, 1, 1 } },
2489     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i32,  { 1, 1, 1, 1 } },
2490     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i32,  { 1, 1, 1, 1 } },
2491 
2492     { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8,  { 3, 1, 1, 1 } }, // FIXME: May not be right
2493     { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8,  { 3, 1, 1, 1 } }, // FIXME: May not be right
2494 
2495     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   { 4, 1, 1, 1 } },
2496     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i1,  { 3, 1, 1, 1 } },
2497     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v16i8,  { 2, 1, 1, 1 } },
2498     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i8,  { 1, 1, 1, 1 } },
2499     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  { 2, 1, 1, 1 } },
2500     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2501     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  { 1, 1, 1, 1 } },
2502     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2503 
2504     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   { 4, 1, 1, 1 } },
2505     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i1,  { 3, 1, 1, 1 } },
2506     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v16i8,  { 2, 1, 1, 1 } },
2507     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i8,  { 1, 1, 1, 1 } },
2508     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  { 2, 1, 1, 1 } },
2509     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2510     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  { 1, 1, 1, 1 } },
2511     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2512     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  {26, 1, 1, 1 } },
2513     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  { 5, 1, 1, 1 } },
2514 
2515     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v16f32, { 2, 1, 1, 1 } },
2516     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v16f64, { 7, 1, 1, 1 } },
2517     { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v32f64, {15, 1, 1, 1 } },
2518     { ISD::FP_TO_SINT,  MVT::v64i8,  MVT::v64f32, {11, 1, 1, 1 } },
2519     { ISD::FP_TO_SINT,  MVT::v64i8,  MVT::v64f64, {31, 1, 1, 1 } },
2520     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v8f64,  { 3, 1, 1, 1 } },
2521     { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f64, { 7, 1, 1, 1 } },
2522     { ISD::FP_TO_SINT,  MVT::v32i16, MVT::v32f32, { 5, 1, 1, 1 } },
2523     { ISD::FP_TO_SINT,  MVT::v32i16, MVT::v32f64, {15, 1, 1, 1 } },
2524     { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f64,  { 1, 1, 1, 1 } },
2525     { ISD::FP_TO_SINT,  MVT::v16i32, MVT::v16f64, { 3, 1, 1, 1 } },
2526 
2527     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f64,  { 1, 1, 1, 1 } },
2528     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v8f64,  { 3, 1, 1, 1 } },
2529     { ISD::FP_TO_UINT,  MVT::v8i8,   MVT::v8f64,  { 3, 1, 1, 1 } },
2530     { ISD::FP_TO_UINT,  MVT::v16i32, MVT::v16f32, { 1, 1, 1, 1 } },
2531     { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, { 3, 1, 1, 1 } },
2532     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v16f32, { 3, 1, 1, 1 } },
2533   };
2534 
2535   static const TypeConversionCostKindTblEntry AVX512BWVLConversionTbl[] = {
2536     // Mask sign extend has an instruction.
2537     { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,   { 1, 1, 1, 1 } },
2538     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v2i1,   { 1, 1, 1, 1 } },
2539     { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,   { 1, 1, 1, 1 } },
2540     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v2i1,   { 1, 1, 1, 1 } },
2541     { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,   { 1, 1, 1, 1 } },
2542     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v4i1,   { 1, 1, 1, 1 } },
2543     { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,   { 1, 1, 1, 1 } },
2544     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v4i1,   { 1, 1, 1, 1 } },
2545     { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,   { 1, 1, 1, 1 } },
2546     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v8i1,   { 1, 1, 1, 1 } },
2547     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,   { 1, 1, 1, 1 } },
2548     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1,  { 1, 1, 1, 1 } },
2549     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  { 1, 1, 1, 1 } },
2550     { ISD::SIGN_EXTEND, MVT::v32i8,  MVT::v32i1,  { 1, 1, 1, 1 } },
2551     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1,  { 1, 1, 1, 1 } },
2552     { ISD::SIGN_EXTEND, MVT::v32i8,  MVT::v64i1,  { 1, 1, 1, 1 } },
2553     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1,  { 1, 1, 1, 1 } },
2554 
2555     // Mask zero extend is a sext + shift.
2556     { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,   { 2, 1, 1, 1 } },
2557     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v2i1,   { 2, 1, 1, 1 } },
2558     { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,   { 2, 1, 1, 1 } },
2559     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v2i1,   { 2, 1, 1, 1 } },
2560     { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,   { 2, 1, 1, 1 } },
2561     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v4i1,   { 2, 1, 1, 1 } },
2562     { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,   { 2, 1, 1, 1 } },
2563     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v4i1,   { 2, 1, 1, 1 } },
2564     { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,   { 2, 1, 1, 1 } },
2565     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v8i1,   { 2, 1, 1, 1 } },
2566     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,   { 2, 1, 1, 1 } },
2567     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1,  { 2, 1, 1, 1 } },
2568     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  { 2, 1, 1, 1 } },
2569     { ISD::ZERO_EXTEND, MVT::v32i8,  MVT::v32i1,  { 2, 1, 1, 1 } },
2570     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1,  { 2, 1, 1, 1 } },
2571     { ISD::ZERO_EXTEND, MVT::v32i8,  MVT::v64i1,  { 2, 1, 1, 1 } },
2572     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1,  { 2, 1, 1, 1 } },
2573 
2574     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   { 2, 1, 1, 1 } },
2575     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v16i8,  { 2, 1, 1, 1 } },
2576     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  { 2, 1, 1, 1 } },
2577     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v8i16,  { 2, 1, 1, 1 } },
2578     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   { 2, 1, 1, 1 } },
2579     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v16i8,  { 2, 1, 1, 1 } },
2580     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i16,  { 2, 1, 1, 1 } },
2581     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v8i16,  { 2, 1, 1, 1 } },
2582     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i8,   { 2, 1, 1, 1 } },
2583     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v16i8,  { 2, 1, 1, 1 } },
2584     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i16,  { 2, 1, 1, 1 } },
2585     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i8,  { 2, 1, 1, 1 } },
2586     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i16, { 2, 1, 1, 1 } },
2587     { ISD::TRUNCATE,    MVT::v32i1,  MVT::v32i8,  { 2, 1, 1, 1 } },
2588     { ISD::TRUNCATE,    MVT::v32i1,  MVT::v16i16, { 2, 1, 1, 1 } },
2589     { ISD::TRUNCATE,    MVT::v64i1,  MVT::v32i8,  { 2, 1, 1, 1 } },
2590     { ISD::TRUNCATE,    MVT::v64i1,  MVT::v16i16, { 2, 1, 1, 1 } },
2591 
2592     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, { 2, 1, 1, 1 } },
2593   };
2594 
2595   static const TypeConversionCostKindTblEntry AVX512DQVLConversionTbl[] = {
2596     // Mask sign extend has an instruction.
2597     { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,   { 1, 1, 1, 1 } },
2598     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v2i1,   { 1, 1, 1, 1 } },
2599     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,   { 1, 1, 1, 1 } },
2600     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i1,  { 1, 1, 1, 1 } },
2601     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   { 1, 1, 1, 1 } },
2602     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i1,   { 1, 1, 1, 1 } },
2603     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i1,  { 1, 1, 1, 1 } },
2604     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   { 1, 1, 1, 1 } },
2605 
2606     // Mask zero extend is a sext + shift.
2607     { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,   { 2, 1, 1, 1 } },
2608     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v2i1,   { 2, 1, 1, 1 } },
2609     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,   { 2, 1, 1, 1 } },
2610     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i1,  { 2, 1, 1, 1 } },
2611     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   { 2, 1, 1, 1 } },
2612     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i1,   { 2, 1, 1, 1 } },
2613     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i1,  { 2, 1, 1, 1 } },
2614     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   { 2, 1, 1, 1 } },
2615 
2616     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v4i64,  { 2, 1, 1, 1 } },
2617     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v8i32,  { 2, 1, 1, 1 } },
2618     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i64,  { 2, 1, 1, 1 } },
2619     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v4i32,  { 2, 1, 1, 1 } },
2620     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i32,  { 2, 1, 1, 1 } },
2621     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i64,  { 2, 1, 1, 1 } },
2622     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v4i64,  { 2, 1, 1, 1 } },
2623     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  { 2, 1, 1, 1 } },
2624 
2625     { ISD::SINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  { 1, 1, 1, 1 } },
2626     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  { 1, 1, 1, 1 } },
2627     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  { 1, 1, 1, 1 } },
2628     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  { 1, 1, 1, 1 } },
2629 
2630     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  { 1, 1, 1, 1 } },
2631     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  { 1, 1, 1, 1 } },
2632     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  { 1, 1, 1, 1 } },
2633     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  { 1, 1, 1, 1 } },
2634 
2635     { ISD::FP_TO_SINT,  MVT::v2i64,  MVT::v4f32,  { 1, 1, 1, 1 } },
2636     { ISD::FP_TO_SINT,  MVT::v4i64,  MVT::v4f32,  { 1, 1, 1, 1 } },
2637     { ISD::FP_TO_SINT,  MVT::v2i64,  MVT::v2f64,  { 1, 1, 1, 1 } },
2638     { ISD::FP_TO_SINT,  MVT::v4i64,  MVT::v4f64,  { 1, 1, 1, 1 } },
2639 
2640     { ISD::FP_TO_UINT,  MVT::v2i64,  MVT::v4f32,  { 1, 1, 1, 1 } },
2641     { ISD::FP_TO_UINT,  MVT::v4i64,  MVT::v4f32,  { 1, 1, 1, 1 } },
2642     { ISD::FP_TO_UINT,  MVT::v2i64,  MVT::v2f64,  { 1, 1, 1, 1 } },
2643     { ISD::FP_TO_UINT,  MVT::v4i64,  MVT::v4f64,  { 1, 1, 1, 1 } },
2644   };
2645 
2646   static const TypeConversionCostKindTblEntry AVX512VLConversionTbl[] = {
2647     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i8,    { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2648     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i8,    { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2649     { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i8,    { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2650     { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i8,   { 8, 1, 1, 1 } }, // split+2*v8i8
2651     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i16,   { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2652     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i16,   { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2653     { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i16,   { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2654     { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i16,  { 8, 1, 1, 1 } }, // split+2*v8i16
2655     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i32,   { 2, 1, 1, 1 } }, // vpslld+vptestmd
2656     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i32,   { 2, 1, 1, 1 } }, // vpslld+vptestmd
2657     { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i32,   { 2, 1, 1, 1 } }, // vpslld+vptestmd
2658     { ISD::TRUNCATE,  MVT::v16i1,   MVT::v8i32,   { 2, 1, 1, 1 } }, // vpslld+vptestmd
2659     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i64,   { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2660     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i64,   { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2661     { ISD::TRUNCATE,  MVT::v4i32,   MVT::v4i64,   { 1, 1, 1, 1 } }, // vpmovqd
2662     { ISD::TRUNCATE,  MVT::v4i8,    MVT::v4i64,   { 2, 1, 1, 1 } }, // vpmovqb
2663     { ISD::TRUNCATE,  MVT::v4i16,   MVT::v4i64,   { 2, 1, 1, 1 } }, // vpmovqw
2664     { ISD::TRUNCATE,  MVT::v8i8,    MVT::v8i32,   { 2, 1, 1, 1 } }, // vpmovwb
2665 
2666     // Sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb.
2667     // Zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb.
2668     { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,   { 5, 1, 1, 1 } },
2669     { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,   { 6, 1, 1, 1 } },
2670     { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,   { 5, 1, 1, 1 } },
2671     { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,   { 6, 1, 1, 1 } },
2672     { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,   { 5, 1, 1, 1 } },
2673     { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,   { 6, 1, 1, 1 } },
2674     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1,  {10, 1, 1, 1 } },
2675     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1,  {12, 1, 1, 1 } },
2676 
2677     // Sign extend is vpcmpeq+maskedmove+vpmovdw.
2678     // Zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw.
2679     { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,   { 4, 1, 1, 1 } },
2680     { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,   { 5, 1, 1, 1 } },
2681     { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,   { 4, 1, 1, 1 } },
2682     { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,   { 5, 1, 1, 1 } },
2683     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,   { 4, 1, 1, 1 } },
2684     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,   { 5, 1, 1, 1 } },
2685     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  {10, 1, 1, 1 } },
2686     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  {12, 1, 1, 1 } },
2687 
2688     { ISD::SIGN_EXTEND, MVT::v2i32,  MVT::v2i1,   { 1, 1, 1, 1 } }, // vpternlogd
2689     { ISD::ZERO_EXTEND, MVT::v2i32,  MVT::v2i1,   { 2, 1, 1, 1 } }, // vpternlogd+psrld
2690     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,   { 1, 1, 1, 1 } }, // vpternlogd
2691     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,   { 2, 1, 1, 1 } }, // vpternlogd+psrld
2692     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   { 1, 1, 1, 1 } }, // vpternlogd
2693     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   { 2, 1, 1, 1 } }, // vpternlogd+psrld
2694     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i1,  { 1, 1, 1, 1 } }, // vpternlogd
2695     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i1,  { 2, 1, 1, 1 } }, // vpternlogd+psrld
2696 
2697     { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,   { 1, 1, 1, 1 } }, // vpternlogq
2698     { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,   { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2699     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   { 1, 1, 1, 1 } }, // vpternlogq
2700     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2701 
2702     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i8,  { 1, 1, 1, 1 } },
2703     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i8,  { 1, 1, 1, 1 } },
2704     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i8,  { 1, 1, 1, 1 } },
2705     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i8,  { 1, 1, 1, 1 } },
2706     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  { 1, 1, 1, 1 } },
2707     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  { 1, 1, 1, 1 } },
2708     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i16,  { 1, 1, 1, 1 } },
2709     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i16,  { 1, 1, 1, 1 } },
2710     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  { 1, 1, 1, 1 } },
2711     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  { 1, 1, 1, 1 } },
2712     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  { 1, 1, 1, 1 } },
2713     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  { 1, 1, 1, 1 } },
2714 
2715     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  { 1, 1, 1, 1 } },
2716     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  { 1, 1, 1, 1 } },
2717     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  { 1, 1, 1, 1 } },
2718     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  { 1, 1, 1, 1 } },
2719 
2720     { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    { 1, 1, 1, 1 } },
2721     { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    { 1, 1, 1, 1 } },
2722     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  { 1, 1, 1, 1 } },
2723     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  { 1, 1, 1, 1 } },
2724     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  { 1, 1, 1, 1 } },
2725     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  { 1, 1, 1, 1 } },
2726     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  { 1, 1, 1, 1 } },
2727     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  { 1, 1, 1, 1 } },
2728     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  { 1, 1, 1, 1 } },
2729     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  { 1, 1, 1, 1 } },
2730     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  { 5, 1, 1, 1 } },
2731     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  { 5, 1, 1, 1 } },
2732     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  { 5, 1, 1, 1 } },
2733 
2734     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v8f32,  { 2, 1, 1, 1 } },
2735     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v16f32, { 2, 1, 1, 1 } },
2736     { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v32f32, { 5, 1, 1, 1 } },
2737 
2738     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    { 1, 1, 1, 1 } },
2739     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    { 1, 1, 1, 1 } },
2740     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  { 1, 1, 1, 1 } },
2741     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  { 1, 1, 1, 1 } },
2742     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  { 1, 1, 1, 1 } },
2743     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  { 1, 1, 1, 1 } },
2744     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f64,  { 1, 1, 1, 1 } },
2745   };
2746 
2747   static const TypeConversionCostKindTblEntry AVX2ConversionTbl[] = {
2748     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   { 3, 1, 1, 1 } },
2749     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   { 3, 1, 1, 1 } },
2750     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   { 3, 1, 1, 1 } },
2751     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   { 3, 1, 1, 1 } },
2752     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  { 1, 1, 1, 1 } },
2753     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  { 1, 1, 1, 1 } },
2754 
2755     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i8,  { 2, 1, 1, 1 } },
2756     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i8,  { 2, 1, 1, 1 } },
2757     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i8,  { 2, 1, 1, 1 } },
2758     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i8,  { 2, 1, 1, 1 } },
2759     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  { 2, 1, 1, 1 } },
2760     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  { 2, 1, 1, 1 } },
2761     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i16,  { 2, 1, 1, 1 } },
2762     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i16,  { 2, 1, 1, 1 } },
2763     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  { 2, 1, 1, 1 } },
2764     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  { 2, 1, 1, 1 } },
2765     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2766     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2767     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  { 2, 1, 1, 1 } },
2768     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  { 2, 1, 1, 1 } },
2769 
2770     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  { 2, 1, 1, 1 } },
2771 
2772     { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, { 4, 1, 1, 1 } },
2773     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, { 4, 1, 1, 1 } },
2774     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i16,  { 1, 1, 1, 1 } },
2775     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i32,  { 1, 1, 1, 1 } },
2776     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v2i64,  { 1, 1, 1, 1 } },
2777     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i32,  { 4, 1, 1, 1 } },
2778     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i64,  { 4, 1, 1, 1 } },
2779     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i32,  { 1, 1, 1, 1 } },
2780     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v2i64,  { 1, 1, 1, 1 } },
2781     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i64,  { 5, 1, 1, 1 } },
2782     { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  { 1, 1, 1, 1 } },
2783     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  { 2, 1, 1, 1 } },
2784 
2785     { ISD::FP_EXTEND,   MVT::v8f64,  MVT::v8f32,  { 3, 1, 1, 1 } },
2786     { ISD::FP_ROUND,    MVT::v8f32,  MVT::v8f64,  { 3, 1, 1, 1 } },
2787 
2788     { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v8f32,  { 1, 1, 1, 1 } },
2789     { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f64,  { 1, 1, 1, 1 } },
2790     { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f32,  { 1, 1, 1, 1 } },
2791     { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f64,  { 3, 1, 1, 1 } },
2792 
2793     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    { 3, 1, 1, 1 } },
2794     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    { 3, 1, 1, 1 } },
2795     { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v8f32,  { 1, 1, 1, 1 } },
2796     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  { 3, 1, 1, 1 } },
2797     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  { 4, 1, 1, 1 } },
2798     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  { 4, 1, 1, 1 } },
2799     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  { 3, 1, 1, 1 } },
2800     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v4f64,  { 4, 1, 1, 1 } },
2801 
2802     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  { 2, 1, 1, 1 } },
2803     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  { 2, 1, 1, 1 } },
2804     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  { 2, 1, 1, 1 } },
2805     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  { 2, 1, 1, 1 } },
2806     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  { 1, 1, 1, 1 } },
2807     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  { 1, 1, 1, 1 } },
2808     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  { 3, 1, 1, 1 } },
2809 
2810     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  { 2, 1, 1, 1 } },
2811     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  { 2, 1, 1, 1 } },
2812     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  { 2, 1, 1, 1 } },
2813     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  { 2, 1, 1, 1 } },
2814     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  { 2, 1, 1, 1 } },
2815     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i32,  { 1, 1, 1, 1 } },
2816     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  { 2, 1, 1, 1 } },
2817     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  { 2, 1, 1, 1 } },
2818     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  { 2, 1, 1, 1 } },
2819     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  { 4, 1, 1, 1 } },
2820   };
2821 
2822   static const TypeConversionCostKindTblEntry AVXConversionTbl[] = {
2823     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   { 4, 1, 1, 1 } },
2824     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   { 4, 1, 1, 1 } },
2825     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   { 4, 1, 1, 1 } },
2826     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   { 4, 1, 1, 1 } },
2827     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  { 4, 1, 1, 1 } },
2828     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  { 4, 1, 1, 1 } },
2829 
2830     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i8,  { 3, 1, 1, 1 } },
2831     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i8,  { 3, 1, 1, 1 } },
2832     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i8,  { 3, 1, 1, 1 } },
2833     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i8,  { 3, 1, 1, 1 } },
2834     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  { 3, 1, 1, 1 } },
2835     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  { 3, 1, 1, 1 } },
2836     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i16,  { 3, 1, 1, 1 } },
2837     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i16,  { 3, 1, 1, 1 } },
2838     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  { 3, 1, 1, 1 } },
2839     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  { 3, 1, 1, 1 } },
2840     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  { 3, 1, 1, 1 } },
2841     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  { 3, 1, 1, 1 } },
2842 
2843     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i64,  { 4, 1, 1, 1 } },
2844     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  { 5, 1, 1, 1 } },
2845     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i16, { 4, 1, 1, 1 } },
2846     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i64,  { 9, 1, 1, 1 } },
2847     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i64, {11, 1, 1, 1 } },
2848 
2849     { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, { 6, 1, 1, 1 } },
2850     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, { 6, 1, 1, 1 } },
2851     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, { 2, 1, 1, 1 } }, // and+extract+packuswb
2852     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i32,  { 5, 1, 1, 1 } },
2853     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  { 5, 1, 1, 1 } },
2854     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i64,  { 5, 1, 1, 1 } },
2855     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i64,  { 3, 1, 1, 1 } }, // and+extract+2*packusdw
2856     { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  { 2, 1, 1, 1 } },
2857 
2858     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i1,   { 3, 1, 1, 1 } },
2859     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i1,   { 3, 1, 1, 1 } },
2860     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i1,   { 8, 1, 1, 1 } },
2861     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  { 4, 1, 1, 1 } },
2862     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v16i8,  { 2, 1, 1, 1 } },
2863     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  { 4, 1, 1, 1 } },
2864     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v8i16,  { 2, 1, 1, 1 } },
2865     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  { 2, 1, 1, 1 } },
2866     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  { 2, 1, 1, 1 } },
2867     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  { 4, 1, 1, 1 } },
2868     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v2i64,  { 5, 1, 1, 1 } },
2869     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  { 8, 1, 1, 1 } },
2870 
2871     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i1,   { 7, 1, 1, 1 } },
2872     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i1,   { 7, 1, 1, 1 } },
2873     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i1,   { 6, 1, 1, 1 } },
2874     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  { 4, 1, 1, 1 } },
2875     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v16i8,  { 2, 1, 1, 1 } },
2876     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  { 4, 1, 1, 1 } },
2877     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v8i16,  { 2, 1, 1, 1 } },
2878     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  { 4, 1, 1, 1 } },
2879     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i32,  { 4, 1, 1, 1 } },
2880     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  { 5, 1, 1, 1 } },
2881     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  { 6, 1, 1, 1 } },
2882     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  { 8, 1, 1, 1 } },
2883     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  {10, 1, 1, 1 } },
2884     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  {10, 1, 1, 1 } },
2885     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  {18, 1, 1, 1 } },
2886     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  { 5, 1, 1, 1 } },
2887     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  {10, 1, 1, 1 } },
2888 
2889     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v8f32,  { 2, 1, 1, 1 } },
2890     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v4f64,  { 2, 1, 1, 1 } },
2891     { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v8f32,  { 2, 1, 1, 1 } },
2892     { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v4f64,  { 2, 1, 1, 1 } },
2893     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v8f32,  { 2, 1, 1, 1 } },
2894     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v4f64,  { 2, 1, 1, 1 } },
2895     { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v8f32,  { 2, 1, 1, 1 } },
2896     { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v4f64,  { 2, 1, 1, 1 } },
2897     { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f64,  { 2, 1, 1, 1 } },
2898     { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f32,  { 2, 1, 1, 1 } },
2899     { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f64,  { 5, 1, 1, 1 } },
2900 
2901     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v8f32,  { 2, 1, 1, 1 } },
2902     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v4f64,  { 2, 1, 1, 1 } },
2903     { ISD::FP_TO_UINT,  MVT::v32i8,  MVT::v8f32,  { 2, 1, 1, 1 } },
2904     { ISD::FP_TO_UINT,  MVT::v32i8,  MVT::v4f64,  { 2, 1, 1, 1 } },
2905     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v8f32,  { 2, 1, 1, 1 } },
2906     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v4f64,  { 2, 1, 1, 1 } },
2907     { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v8f32,  { 2, 1, 1, 1 } },
2908     { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v4f64,  { 2, 1, 1, 1 } },
2909     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  { 3, 1, 1, 1 } },
2910     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  { 4, 1, 1, 1 } },
2911     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  { 6, 1, 1, 1 } },
2912     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  { 7, 1, 1, 1 } },
2913     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v4f64,  { 7, 1, 1, 1 } },
2914 
2915     { ISD::FP_EXTEND,   MVT::v4f64,  MVT::v4f32,  { 1, 1, 1, 1 } },
2916     { ISD::FP_ROUND,    MVT::v4f32,  MVT::v4f64,  { 1, 1, 1, 1 } },
2917   };
2918 
2919   static const TypeConversionCostKindTblEntry SSE41ConversionTbl[] = {
2920     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8,   { 1, 1, 1, 1 } },
2921     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8,   { 1, 1, 1, 1 } },
2922     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8,   { 1, 1, 1, 1 } },
2923     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8,   { 1, 1, 1, 1 } },
2924     { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8,   { 1, 1, 1, 1 } },
2925     { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8,   { 1, 1, 1, 1 } },
2926     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16,   { 1, 1, 1, 1 } },
2927     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16,   { 1, 1, 1, 1 } },
2928     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16,   { 1, 1, 1, 1 } },
2929     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16,   { 1, 1, 1, 1 } },
2930     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32,   { 1, 1, 1, 1 } },
2931     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32,   { 1, 1, 1, 1 } },
2932 
2933     // These truncates end up widening elements.
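         // i.e. the narrow i1 results are produced by first widening the
         // source elements, hence the single PMOVZX cost for each entry.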
2934     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   { 1, 1, 1, 1 } }, // PMOVZXBQ
2935     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  { 1, 1, 1, 1 } }, // PMOVZXWQ
2936     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   { 1, 1, 1, 1 } }, // PMOVZXBD
2937 
2938     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i32,  { 2, 1, 1, 1 } },
2939     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i32,  { 2, 1, 1, 1 } },
2940     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v2i64,  { 2, 1, 1, 1 } },
2941 
2942     { ISD::SINT_TO_FP,  MVT::f32,    MVT::i32,    { 1, 1, 1, 1 } },
2943     { ISD::SINT_TO_FP,  MVT::f64,    MVT::i32,    { 1, 1, 1, 1 } },
2944     { ISD::SINT_TO_FP,  MVT::f32,    MVT::i64,    { 1, 1, 1, 1 } },
2945     { ISD::SINT_TO_FP,  MVT::f64,    MVT::i64,    { 1, 1, 1, 1 } },
2946     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  { 1, 1, 1, 1 } },
2947     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  { 1, 1, 1, 1 } },
2948     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  { 1, 1, 1, 1 } },
2949     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  { 1, 1, 1, 1 } },
2950     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  { 1, 1, 1, 1 } },
2951     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  { 1, 1, 1, 1 } },
2952     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  { 2, 1, 1, 1 } },
2953 
2954     { ISD::UINT_TO_FP,  MVT::f32,    MVT::i32,    { 1, 1, 1, 1 } },
2955     { ISD::UINT_TO_FP,  MVT::f64,    MVT::i32,    { 1, 1, 1, 1 } },
2956     { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    { 4, 1, 1, 1 } },
2957     { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    { 4, 1, 1, 1 } },
2958     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  { 1, 1, 1, 1 } },
2959     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  { 1, 1, 1, 1 } },
2960     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  { 1, 1, 1, 1 } },
2961     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  { 1, 1, 1, 1 } },
2962     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  { 3, 1, 1, 1 } },
2963     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  { 3, 1, 1, 1 } },
2964     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  { 2, 1, 1, 1 } },
2965     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v2i64,  {12, 1, 1, 1 } },
2966     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  {22, 1, 1, 1 } },
2967     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  { 4, 1, 1, 1 } },
2968 
2969     { ISD::FP_TO_SINT,  MVT::i32,    MVT::f32,    { 1, 1, 1, 1 } },
2970     { ISD::FP_TO_SINT,  MVT::i64,    MVT::f32,    { 1, 1, 1, 1 } },
2971     { ISD::FP_TO_SINT,  MVT::i32,    MVT::f64,    { 1, 1, 1, 1 } },
2972     { ISD::FP_TO_SINT,  MVT::i64,    MVT::f64,    { 1, 1, 1, 1 } },
2973     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v4f32,  { 2, 1, 1, 1 } },
2974     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v2f64,  { 2, 1, 1, 1 } },
2975     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v4f32,  { 1, 1, 1, 1 } },
2976     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v2f64,  { 1, 1, 1, 1 } },
2977     { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f32,  { 1, 1, 1, 1 } },
2978     { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v2f64,  { 1, 1, 1, 1 } },
2979 
2980     { ISD::FP_TO_UINT,  MVT::i32,    MVT::f32,    { 1, 1, 1, 1 } },
2981     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    { 4, 1, 1, 1 } },
2982     { ISD::FP_TO_UINT,  MVT::i32,    MVT::f64,    { 1, 1, 1, 1 } },
2983     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    { 4, 1, 1, 1 } },
2984     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v4f32,  { 2, 1, 1, 1 } },
2985     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v2f64,  { 2, 1, 1, 1 } },
2986     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v4f32,  { 1, 1, 1, 1 } },
2987     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v2f64,  { 1, 1, 1, 1 } },
2988     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  { 4, 1, 1, 1 } },
2989     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  { 4, 1, 1, 1 } },
2990   };
2991 
2992   static const TypeConversionCostKindTblEntry SSE2ConversionTbl[] = {
2993     // These are somewhat magic numbers justified by comparing the
2994     // output of llvm-mca for our various supported scheduler models
2995     // and taking the worst case scenario.
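    // A sketch of how such an entry can be cross-checked (hypothetical
    // invocation; the snippet and CPU are illustrative, not normative):
    //   echo 'cvtsi2ss %eax, %xmm0' | \
    //     llvm-mca -mtriple=x86_64-- -mcpu=nehalem -iterations=100
    // repeated per scheduler model, keeping the worst reciprocal throughput.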
2996     { ISD::SINT_TO_FP,  MVT::f32,    MVT::i32,    { 3, 1, 1, 1 } },
2997     { ISD::SINT_TO_FP,  MVT::f64,    MVT::i32,    { 3, 1, 1, 1 } },
2998     { ISD::SINT_TO_FP,  MVT::f32,    MVT::i64,    { 3, 1, 1, 1 } },
2999     { ISD::SINT_TO_FP,  MVT::f64,    MVT::i64,    { 3, 1, 1, 1 } },
3000     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  { 3, 1, 1, 1 } },
3001     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  { 4, 1, 1, 1 } },
3002     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  { 3, 1, 1, 1 } },
3003     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  { 4, 1, 1, 1 } },
3004     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  { 3, 1, 1, 1 } },
3005     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  { 4, 1, 1, 1 } },
3006     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v2i64,  { 8, 1, 1, 1 } },
3007     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  { 8, 1, 1, 1 } },
3008 
3009     { ISD::UINT_TO_FP,  MVT::f32,    MVT::i32,    { 3, 1, 1, 1 } },
3010     { ISD::UINT_TO_FP,  MVT::f64,    MVT::i32,    { 3, 1, 1, 1 } },
3011     { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    { 8, 1, 1, 1 } },
3012     { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    { 9, 1, 1, 1 } },
3013     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  { 4, 1, 1, 1 } },
3014     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  { 4, 1, 1, 1 } },
3015     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  { 4, 1, 1, 1 } },
3016     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  { 4, 1, 1, 1 } },
3017     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  { 7, 1, 1, 1 } },
3018     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  { 7, 1, 1, 1 } },
3019     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  { 5, 1, 1, 1 } },
3020     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  {15, 1, 1, 1 } },
3021     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v2i64,  {18, 1, 1, 1 } },
3022 
3023     { ISD::FP_TO_SINT,  MVT::i32,    MVT::f32,    { 4, 1, 1, 1 } },
3024     { ISD::FP_TO_SINT,  MVT::i64,    MVT::f32,    { 4, 1, 1, 1 } },
3025     { ISD::FP_TO_SINT,  MVT::i32,    MVT::f64,    { 4, 1, 1, 1 } },
3026     { ISD::FP_TO_SINT,  MVT::i64,    MVT::f64,    { 4, 1, 1, 1 } },
3027     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v4f32,  { 6, 1, 1, 1 } },
3028     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v2f64,  { 6, 1, 1, 1 } },
3029     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v4f32,  { 5, 1, 1, 1 } },
3030     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v2f64,  { 5, 1, 1, 1 } },
3031     { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f32,  { 4, 1, 1, 1 } },
3032     { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v2f64,  { 4, 1, 1, 1 } },
3033 
3034     { ISD::FP_TO_UINT,  MVT::i32,    MVT::f32,    { 4, 1, 1, 1 } },
3035     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    { 4, 1, 1, 1 } },
3036     { ISD::FP_TO_UINT,  MVT::i32,    MVT::f64,    { 4, 1, 1, 1 } },
3037     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    {15, 1, 1, 1 } },
3038     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v4f32,  { 6, 1, 1, 1 } },
3039     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v2f64,  { 6, 1, 1, 1 } },
3040     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v4f32,  { 5, 1, 1, 1 } },
3041     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v2f64,  { 5, 1, 1, 1 } },
3042     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  { 8, 1, 1, 1 } },
3043     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  { 8, 1, 1, 1 } },
3044 
3045     { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v16i8,  { 4, 1, 1, 1 } },
3046     { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v16i8,  { 4, 1, 1, 1 } },
3047     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v16i8,  { 2, 1, 1, 1 } },
3048     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v16i8,  { 3, 1, 1, 1 } },
3049     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v16i8,  { 1, 1, 1, 1 } },
3050     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v16i8,  { 2, 1, 1, 1 } },
3051     { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v8i16,  { 2, 1, 1, 1 } },
3052     { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v8i16,  { 3, 1, 1, 1 } },
3053     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v8i16,  { 1, 1, 1, 1 } },
3054     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v8i16,  { 2, 1, 1, 1 } },
3055     { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v4i32,  { 1, 1, 1, 1 } },
3056     { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v4i32,  { 2, 1, 1, 1 } },
3057 
3058     // These truncates are really widening elements.
3059     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i32,  { 1, 1, 1, 1 } }, // PSHUFD
3060     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  { 2, 1, 1, 1 } }, // PUNPCKLWD+DQ
3061     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   { 3, 1, 1, 1 } }, // PUNPCKLBW+WD+PSHUFD
3062     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i16,  { 1, 1, 1, 1 } }, // PUNPCKLWD
3063     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   { 2, 1, 1, 1 } }, // PUNPCKLBW+WD
3064     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i8,   { 1, 1, 1, 1 } }, // PUNPCKLBW
3065 
3066     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i16,  { 2, 1, 1, 1 } }, // PAND+PACKUSWB
3067     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, { 3, 1, 1, 1 } },
3068     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i32,  { 3, 1, 1, 1 } }, // PAND+2*PACKUSWB
3069     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, { 7, 1, 1, 1 } },
3070     { ISD::TRUNCATE,    MVT::v2i16,  MVT::v2i32,  { 1, 1, 1, 1 } },
3071     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i32,  { 3, 1, 1, 1 } },
3072     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  { 5, 1, 1, 1 } },
3073     { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, {10, 1, 1, 1 } },
3074     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v2i64,  { 4, 1, 1, 1 } }, // PAND+3*PACKUSWB
3075     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v2i64,  { 2, 1, 1, 1 } }, // PSHUFD+PSHUFLW
3076     { ISD::TRUNCATE,    MVT::v4i32,  MVT::v2i64,  { 1, 1, 1, 1 } }, // PSHUFD
3077   };
3078 
3079   static const TypeConversionCostKindTblEntry F16ConversionTbl[] = {
3080     { ISD::FP_ROUND,  MVT::f16,     MVT::f32,     { 1, 1, 1, 1 } },
3081     { ISD::FP_ROUND,  MVT::v8f16,   MVT::v8f32,   { 1, 1, 1, 1 } },
3082     { ISD::FP_ROUND,  MVT::v4f16,   MVT::v4f32,   { 1, 1, 1, 1 } },
3083     { ISD::FP_EXTEND, MVT::f32,     MVT::f16,     { 1, 1, 1, 1 } },
3084     { ISD::FP_EXTEND, MVT::f64,     MVT::f16,     { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3085     { ISD::FP_EXTEND, MVT::v8f32,   MVT::v8f16,   { 1, 1, 1, 1 } },
3086     { ISD::FP_EXTEND, MVT::v4f32,   MVT::v4f16,   { 1, 1, 1, 1 } },
3087     { ISD::FP_EXTEND, MVT::v4f64,   MVT::v4f16,   { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3088   };
3089 
3090   // Attempt to map directly to (simple) MVT types to let us match custom entries.
3091   EVT SrcTy = TLI->getValueType(DL, Src);
3092   EVT DstTy = TLI->getValueType(DL, Dst);
3093 
3094   // The function getSimpleVT only handles simple value types.
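  // For example (illustrative types): <4 x i32> maps to the simple
  // MVT::v4i32 and can hit the tables directly, whereas an odd type such
  // as <3 x i17> has no simple MVT and falls through to the
  // legalized-type handling further below.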
3095   if (SrcTy.isSimple() && DstTy.isSimple()) {
3096     MVT SimpleSrcTy = SrcTy.getSimpleVT();
3097     MVT SimpleDstTy = DstTy.getSimpleVT();
3098 
3099     if (ST->useAVX512Regs()) {
3100       if (ST->hasBWI())
3101         if (const auto *Entry = ConvertCostTableLookup(
3102                 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3103           if (auto KindCost = Entry->Cost[CostKind])
3104             return *KindCost;
3105 
3106       if (ST->hasDQI())
3107         if (const auto *Entry = ConvertCostTableLookup(
3108                 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3109           if (auto KindCost = Entry->Cost[CostKind])
3110             return *KindCost;
3111 
3112       if (ST->hasAVX512())
3113         if (const auto *Entry = ConvertCostTableLookup(
3114                 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3115           if (auto KindCost = Entry->Cost[CostKind])
3116             return *KindCost;
3117     }
3118 
3119     if (ST->hasBWI())
3120       if (const auto *Entry = ConvertCostTableLookup(
3121               AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3122         if (auto KindCost = Entry->Cost[CostKind])
3123           return *KindCost;
3124 
3125     if (ST->hasDQI())
3126       if (const auto *Entry = ConvertCostTableLookup(
3127               AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3128         if (auto KindCost = Entry->Cost[CostKind])
3129           return *KindCost;
3130 
3131     if (ST->hasAVX512())
3132       if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3133                                                      SimpleDstTy, SimpleSrcTy))
3134         if (auto KindCost = Entry->Cost[CostKind])
3135           return *KindCost;
3136 
3137     if (ST->hasAVX2()) {
3138       if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3139                                                      SimpleDstTy, SimpleSrcTy))
3140         if (auto KindCost = Entry->Cost[CostKind])
3141           return *KindCost;
3142     }
3143 
3144     if (ST->hasAVX()) {
3145       if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3146                                                      SimpleDstTy, SimpleSrcTy))
3147         if (auto KindCost = Entry->Cost[CostKind])
3148           return *KindCost;
3149     }
3150 
3151     if (ST->hasF16C()) {
3152       if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3153                                                      SimpleDstTy, SimpleSrcTy))
3154         if (auto KindCost = Entry->Cost[CostKind])
3155           return *KindCost;
3156     }
3157 
3158     if (ST->hasSSE41()) {
3159       if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3160                                                      SimpleDstTy, SimpleSrcTy))
3161         if (auto KindCost = Entry->Cost[CostKind])
3162           return *KindCost;
3163     }
3164 
3165     if (ST->hasSSE2()) {
3166       if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3167                                                      SimpleDstTy, SimpleSrcTy))
3168         if (auto KindCost = Entry->Cost[CostKind])
3169           return *KindCost;
3170     }
3171 
3172     if ((ISD == ISD::FP_ROUND && SimpleDstTy == MVT::f16) ||
3173         (ISD == ISD::FP_EXTEND && SimpleSrcTy == MVT::f16)) {
3174       // fp16 conversions not covered by any table entries require a libcall.
3175       // Return a large (arbitrary) number to model this.
3176       return InstructionCost(64);
3177     }
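    // For instance, without F16C an 'fpext half %x to float' becomes a
    // call to a compiler-rt helper (e.g. __extendhfsf2), so the flat cost
    // of 64 is a deliberately pessimistic stand-in for a libcall.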
3178   }
3179 
3180   // Fall back to legalized types.
3181   std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
3182   std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
3183 
3184   // If we're truncating to the same legalized type, just assume it's free.
3185   if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
3186     return TTI::TCC_Free;
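  // Sketch of the case above: if source and destination both legalize to
  // the same MVT (e.g. two illegal wide integer vectors that get
  // split/promoted identically), the truncate vanishes during
  // legalization, hence free.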
3187 
3188   if (ST->useAVX512Regs()) {
3189     if (ST->hasBWI())
3190       if (const auto *Entry = ConvertCostTableLookup(
3191               AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
3192         if (auto KindCost = Entry->Cost[CostKind])
3193           return std::max(LTSrc.first, LTDest.first) * *KindCost;
3194 
3195     if (ST->hasDQI())
3196       if (const auto *Entry = ConvertCostTableLookup(
3197               AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3198         if (auto KindCost = Entry->Cost[CostKind])
3199           return std::max(LTSrc.first, LTDest.first) * *KindCost;
3200 
3201     if (ST->hasAVX512())
3202       if (const auto *Entry = ConvertCostTableLookup(
3203               AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3204         if (auto KindCost = Entry->Cost[CostKind])
3205           return std::max(LTSrc.first, LTDest.first) * *KindCost;
3206   }
3207 
3208   if (ST->hasBWI())
3209     if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3210                                                    LTDest.second, LTSrc.second))
3211       if (auto KindCost = Entry->Cost[CostKind])
3212         return std::max(LTSrc.first, LTDest.first) * *KindCost;
3213 
3214   if (ST->hasDQI())
3215     if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3216                                                    LTDest.second, LTSrc.second))
3217       if (auto KindCost = Entry->Cost[CostKind])
3218         return std::max(LTSrc.first, LTDest.first) * *KindCost;
3219 
3220   if (ST->hasAVX512())
3221     if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3222                                                    LTDest.second, LTSrc.second))
3223       if (auto KindCost = Entry->Cost[CostKind])
3224         return std::max(LTSrc.first, LTDest.first) * *KindCost;
3225 
3226   if (ST->hasAVX2())
3227     if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3228                                                    LTDest.second, LTSrc.second))
3229       if (auto KindCost = Entry->Cost[CostKind])
3230         return std::max(LTSrc.first, LTDest.first) * *KindCost;
3231 
3232   if (ST->hasAVX())
3233     if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3234                                                    LTDest.second, LTSrc.second))
3235       if (auto KindCost = Entry->Cost[CostKind])
3236         return std::max(LTSrc.first, LTDest.first) * *KindCost;
3237 
3238   if (ST->hasF16C()) {
3239     if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3240                                                    LTDest.second, LTSrc.second))
3241       if (auto KindCost = Entry->Cost[CostKind])
3242         return std::max(LTSrc.first, LTDest.first) * *KindCost;
3243   }
3244 
3245   if (ST->hasSSE41())
3246     if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3247                                                    LTDest.second, LTSrc.second))
3248       if (auto KindCost = Entry->Cost[CostKind])
3249         return std::max(LTSrc.first, LTDest.first) * *KindCost;
3250 
3251   if (ST->hasSSE2())
3252     if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3253                                                    LTDest.second, LTSrc.second))
3254       if (auto KindCost = Entry->Cost[CostKind])
3255         return std::max(LTSrc.first, LTDest.first) * *KindCost;
3256 
3257   // Fallback: for i8/i16 sitofp/uitofp cases we need to extend the source to
3258   // i32 first and cost the conversion from there.
3259   if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3260       1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3261     Type *ExtSrc = Src->getWithNewBitWidth(32);
3262     unsigned ExtOpc =
3263         (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3264 
3265     // For scalar loads the extend would be free.
3266     InstructionCost ExtCost = 0;
3267     if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3268       ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3269 
3270     return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3271                                       TTI::CastContextHint::None, CostKind);
3272   }
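  // Illustrative example: 'sitofp <4 x i16> %x to <4 x float>' is costed
  // as a sext to <4 x i32> plus a sitofp <4 x i32> -> <4 x float>, which
  // mirrors the typical lowering.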
3273 
3274   // Fallback: for fptosi/fptoui to i8/i16 we convert via i32 first and then
3275   // pay for truncating the result.
3276   if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3277       1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3278     Type *TruncDst = Dst->getWithNewBitWidth(32);
3279     return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3280            getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3281                             TTI::CastContextHint::None, CostKind);
3282   }
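  // Illustrative example: 'fptoui <4 x float> %x to <4 x i8>' is costed
  // as fptosi to <4 x i32> plus a trunc to <4 x i8>; the signed convert
  // is safe here since the i8/i16 ranges fit inside i32.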
3283 
3284   // TODO: Allow non-throughput costs that aren't binary.
3285   auto AdjustCost = [&CostKind](InstructionCost Cost,
3286                                 InstructionCost N = 1) -> InstructionCost {
3287     if (CostKind != TTI::TCK_RecipThroughput)
3288       return Cost == 0 ? 0 : N;
3289     return Cost * N;
3290   };
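  // E.g. for a base cost of 4 with N = 1: TCK_RecipThroughput yields 4,
  // while the other cost kinds collapse to the binary value 1 (or 0 when
  // the base cost itself is 0).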
3291   return AdjustCost(
3292       BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3293 }
3294 
3295 InstructionCost X86TTIImpl::getCmpSelInstrCost(
3296     unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
3297     TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
3298     TTI::OperandValueInfo Op2Info, const Instruction *I) {
3299   // Early out if this type isn't scalar/vector integer/float.
3300   if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3301     return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3302                                      Op1Info, Op2Info, I);
3303 
3304   // Legalize the type.
3305   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3306 
3307   MVT MTy = LT.second;
3308 
3309   int ISD = TLI->InstructionOpcodeToISD(Opcode);
3310   assert(ISD && "Invalid opcode");
3311 
3312   InstructionCost ExtraCost = 0;
3313   if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3314     // Some vector comparison predicates cost extra instructions.
3315     // TODO: Adjust ExtraCost based on CostKind?
3316     // TODO: Should we invert this and assume worst case cmp costs
3317     // and reduce for particular predicates?
3318     if (MTy.isVector() &&
3319         !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3320           (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3321           ST->hasBWI())) {
3322       // Fall back to I's predicate if a specific one wasn't specified.
3323       CmpInst::Predicate Pred = VecPred;
3324       if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3325                 Pred == CmpInst::BAD_FCMP_PREDICATE))
3326         Pred = cast<CmpInst>(I)->getPredicate();
3327 
3328       bool CmpWithConstant = false;
3329       if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3330         CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
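      // Compares against a constant tend to be cheaper: the extra NOT or
      // sign-bit XOR on one side can be folded into an adjusted constant
      // at compile time, which is why several cases below drop ExtraCost
      // by one when CmpWithConstant is set.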
3331 
3332       switch (Pred) {
3333       case CmpInst::Predicate::ICMP_NE:
3334         // xor(cmpeq(x,y),-1)
3335         ExtraCost = CmpWithConstant ? 0 : 1;
3336         break;
3337       case CmpInst::Predicate::ICMP_SGE:
3338       case CmpInst::Predicate::ICMP_SLE:
3339         // xor(cmpgt(x,y),-1)
3340         ExtraCost = CmpWithConstant ? 0 : 1;
3341         break;
3342       case CmpInst::Predicate::ICMP_ULT:
3343       case CmpInst::Predicate::ICMP_UGT:
3344         // cmpgt(xor(x,signbit),xor(y,signbit))
3345         // xor(cmpeq(pmaxu(x,y),x),-1)
3346         ExtraCost = CmpWithConstant ? 1 : 2;
3347         break;
3348       case CmpInst::Predicate::ICMP_ULE:
3349       case CmpInst::Predicate::ICMP_UGE:
3350         if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3351             (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3352           // cmpeq(psubus(x,y),0)
3353           // cmpeq(pminu(x,y),x)
3354           ExtraCost = 1;
3355         } else {
3356           // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3357           ExtraCost = CmpWithConstant ? 2 : 3;
3358         }
3359         break;
3360       case CmpInst::Predicate::FCMP_ONE:
3361       case CmpInst::Predicate::FCMP_UEQ:
3362         // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3363         // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3364         if (CondTy && !ST->hasAVX())
3365           return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3366                                     CmpInst::Predicate::FCMP_UNO, CostKind,
3367                                     Op1Info, Op2Info) +
3368                  getCmpSelInstrCost(Opcode, ValTy, CondTy,
3369                                     CmpInst::Predicate::FCMP_OEQ, CostKind,
3370                                     Op1Info, Op2Info) +
3371                  getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
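        // On SSE this expansion corresponds to e.g. cmpunordps + cmpeqps
        // + orps, since ueq is (uno || oeq); FCMP_ONE is costed
        // identically via its analogous ordered expansion.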
3372 
3373         break;
3374       case CmpInst::Predicate::BAD_ICMP_PREDICATE:
3375       case CmpInst::Predicate::BAD_FCMP_PREDICATE:
3376         // Assume worst case scenario and add the maximum extra cost.
3377         ExtraCost = 3;
3378         break;
3379       default:
3380         break;
3381       }
3382     }
3383   }
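  // Worked example (assuming a plain SSE2 target): 'icmp ugt <16 x i8>'
  // has no native unsigned compare, so it is modelled as the base pcmpgtb
  // cost plus ExtraCost = 2 for the two sign-bit flips (ICMP_UGT above).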
3384 
3385   static const CostKindTblEntry SLMCostTbl[] = {
3386     // slm pcmpeq/pcmpgt throughput is 2
3387     { ISD::SETCC,   MVT::v2i64,   { 2, 5, 1, 2 } },
3388     // slm pblendvb/blendvpd/blendvps throughput is 4
3389     { ISD::SELECT,  MVT::v2f64,   { 4, 4, 1, 3 } }, // blendvpd
3390     { ISD::SELECT,  MVT::v4f32,   { 4, 4, 1, 3 } }, // blendvps
3391     { ISD::SELECT,  MVT::v2i64,   { 4, 4, 1, 3 } }, // pblendvb
3392     { ISD::SELECT,  MVT::v8i32,   { 4, 4, 1, 3 } }, // pblendvb
3393     { ISD::SELECT,  MVT::v8i16,   { 4, 4, 1, 3 } }, // pblendvb
3394     { ISD::SELECT,  MVT::v16i8,   { 4, 4, 1, 3 } }, // pblendvb
3395   };
3396 
3397   static const CostKindTblEntry AVX512BWCostTbl[] = {
3398     { ISD::SETCC,   MVT::v32i16,  { 1, 1, 1, 1 } },
3399     { ISD::SETCC,   MVT::v16i16,  { 1, 1, 1, 1 } },
3400     { ISD::SETCC,   MVT::v64i8,   { 1, 1, 1, 1 } },
3401     { ISD::SETCC,   MVT::v32i8,   { 1, 1, 1, 1 } },
3402 
3403     { ISD::SELECT,  MVT::v32i16,  { 1, 1, 1, 1 } },
3404     { ISD::SELECT,  MVT::v64i8,   { 1, 1, 1, 1 } },
3405   };
3406 
3407   static const CostKindTblEntry AVX512CostTbl[] = {
3408     { ISD::SETCC,   MVT::v8f64,   { 1, 4, 1, 1 } },
3409     { ISD::SETCC,   MVT::v4f64,   { 1, 4, 1, 1 } },
3410     { ISD::SETCC,   MVT::v16f32,  { 1, 4, 1, 1 } },
3411     { ISD::SETCC,   MVT::v8f32,   { 1, 4, 1, 1 } },
3412 
3413     { ISD::SETCC,   MVT::v8i64,   { 1, 1, 1, 1 } },
3414     { ISD::SETCC,   MVT::v4i64,   { 1, 1, 1, 1 } },
3415     { ISD::SETCC,   MVT::v2i64,   { 1, 1, 1, 1 } },
3416     { ISD::SETCC,   MVT::v16i32,  { 1, 1, 1, 1 } },
3417     { ISD::SETCC,   MVT::v8i32,   { 1, 1, 1, 1 } },
3418     { ISD::SETCC,   MVT::v32i16,  { 3, 7, 5, 5 } },
3419     { ISD::SETCC,   MVT::v64i8,   { 3, 7, 5, 5 } },
3420 
3421     { ISD::SELECT,  MVT::v8i64,   { 1, 1, 1, 1 } },
3422     { ISD::SELECT,  MVT::v4i64,   { 1, 1, 1, 1 } },
3423     { ISD::SELECT,  MVT::v2i64,   { 1, 1, 1, 1 } },
3424     { ISD::SELECT,  MVT::v16i32,  { 1, 1, 1, 1 } },
3425     { ISD::SELECT,  MVT::v8i32,   { 1, 1, 1, 1 } },
3426     { ISD::SELECT,  MVT::v4i32,   { 1, 1, 1, 1 } },
3427     { ISD::SELECT,  MVT::v8f64,   { 1, 1, 1, 1 } },
3428     { ISD::SELECT,  MVT::v4f64,   { 1, 1, 1, 1 } },
3429     { ISD::SELECT,  MVT::v2f64,   { 1, 1, 1, 1 } },
3430     { ISD::SELECT,  MVT::f64,     { 1, 1, 1, 1 } },
3431     { ISD::SELECT,  MVT::v16f32,  { 1, 1, 1, 1 } },
3432     { ISD::SELECT,  MVT::v8f32 ,  { 1, 1, 1, 1 } },
3433     { ISD::SELECT,  MVT::v4f32,   { 1, 1, 1, 1 } },
3434     { ISD::SELECT,  MVT::f32  ,   { 1, 1, 1, 1 } },
3435 
3436     { ISD::SELECT,  MVT::v32i16,  { 2, 2, 4, 4 } },
3437     { ISD::SELECT,  MVT::v16i16,  { 1, 1, 1, 1 } },
3438     { ISD::SELECT,  MVT::v8i16,   { 1, 1, 1, 1 } },
3439     { ISD::SELECT,  MVT::v64i8,   { 2, 2, 4, 4 } },
3440     { ISD::SELECT,  MVT::v32i8,   { 1, 1, 1, 1 } },
3441     { ISD::SELECT,  MVT::v16i8,   { 1, 1, 1, 1 } },
3442   };
3443 
3444   static const CostKindTblEntry AVX2CostTbl[] = {
3445     { ISD::SETCC,   MVT::v4f64,   { 1, 4, 1, 2 } },
3446     { ISD::SETCC,   MVT::v2f64,   { 1, 4, 1, 1 } },
3447     { ISD::SETCC,   MVT::f64,     { 1, 4, 1, 1 } },
3448     { ISD::SETCC,   MVT::v8f32,   { 1, 4, 1, 2 } },
3449     { ISD::SETCC,   MVT::v4f32,   { 1, 4, 1, 1 } },
3450     { ISD::SETCC,   MVT::f32,     { 1, 4, 1, 1 } },
3451 
3452     { ISD::SETCC,   MVT::v4i64,   { 1, 1, 1, 2 } },
3453     { ISD::SETCC,   MVT::v8i32,   { 1, 1, 1, 2 } },
3454     { ISD::SETCC,   MVT::v16i16,  { 1, 1, 1, 2 } },
3455     { ISD::SETCC,   MVT::v32i8,   { 1, 1, 1, 2 } },
3456 
3457     { ISD::SELECT,  MVT::v4f64,   { 2, 2, 1, 2 } }, // vblendvpd
3458     { ISD::SELECT,  MVT::v8f32,   { 2, 2, 1, 2 } }, // vblendvps
3459     { ISD::SELECT,  MVT::v4i64,   { 2, 2, 1, 2 } }, // pblendvb
3460     { ISD::SELECT,  MVT::v8i32,   { 2, 2, 1, 2 } }, // pblendvb
3461     { ISD::SELECT,  MVT::v16i16,  { 2, 2, 1, 2 } }, // pblendvb
3462     { ISD::SELECT,  MVT::v32i8,   { 2, 2, 1, 2 } }, // pblendvb
3463   };
3464 
3465   static const CostKindTblEntry XOPCostTbl[] = {
3466     { ISD::SETCC,   MVT::v4i64,   { 4, 2, 5, 6 } },
3467     { ISD::SETCC,   MVT::v2i64,   { 1, 1, 1, 1 } },
3468   };
3469 
3470   static const CostKindTblEntry AVX1CostTbl[] = {
3471     { ISD::SETCC,   MVT::v4f64,   { 2, 3, 1, 2 } },
3472     { ISD::SETCC,   MVT::v2f64,   { 1, 3, 1, 1 } },
3473     { ISD::SETCC,   MVT::f64,     { 1, 3, 1, 1 } },
3474     { ISD::SETCC,   MVT::v8f32,   { 2, 3, 1, 2 } },
3475     { ISD::SETCC,   MVT::v4f32,   { 1, 3, 1, 1 } },
3476     { ISD::SETCC,   MVT::f32,     { 1, 3, 1, 1 } },
3477 
3478     // AVX1 does not support 256-bit integer vector compares.
3479     { ISD::SETCC,   MVT::v4i64,   { 4, 2, 5, 6 } },
3480     { ISD::SETCC,   MVT::v8i32,   { 4, 2, 5, 6 } },
3481     { ISD::SETCC,   MVT::v16i16,  { 4, 2, 5, 6 } },
3482     { ISD::SETCC,   MVT::v32i8,   { 4, 2, 5, 6 } },
3483 
3484     { ISD::SELECT,  MVT::v4f64,   { 3, 3, 1, 2 } }, // vblendvpd
3485     { ISD::SELECT,  MVT::v8f32,   { 3, 3, 1, 2 } }, // vblendvps
3486     { ISD::SELECT,  MVT::v4i64,   { 3, 3, 1, 2 } }, // vblendvpd
3487     { ISD::SELECT,  MVT::v8i32,   { 3, 3, 1, 2 } }, // vblendvps
3488     { ISD::SELECT,  MVT::v16i16,  { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3489     { ISD::SELECT,  MVT::v32i8,   { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3490   };
3491 
3492   static const CostKindTblEntry SSE42CostTbl[] = {
3493     { ISD::SETCC,   MVT::v2i64,   { 1, 2, 1, 2 } },
3494   };
3495 
3496   static const CostKindTblEntry SSE41CostTbl[] = {
3497     { ISD::SETCC,   MVT::v2f64,   { 1, 5, 1, 1 } },
3498     { ISD::SETCC,   MVT::v4f32,   { 1, 5, 1, 1 } },
3499 
3500     { ISD::SELECT,  MVT::v2f64,   { 2, 2, 1, 2 } }, // blendvpd
3501     { ISD::SELECT,  MVT::f64,     { 2, 2, 1, 2 } }, // blendvpd
3502     { ISD::SELECT,  MVT::v4f32,   { 2, 2, 1, 2 } }, // blendvps
3503     { ISD::SELECT,  MVT::f32  ,   { 2, 2, 1, 2 } }, // blendvps
3504     { ISD::SELECT,  MVT::v2i64,   { 2, 2, 1, 2 } }, // pblendvb
3505     { ISD::SELECT,  MVT::v4i32,   { 2, 2, 1, 2 } }, // pblendvb
3506     { ISD::SELECT,  MVT::v8i16,   { 2, 2, 1, 2 } }, // pblendvb
3507     { ISD::SELECT,  MVT::v16i8,   { 2, 2, 1, 2 } }, // pblendvb
3508   };
3509 
3510   static const CostKindTblEntry SSE2CostTbl[] = {
3511     { ISD::SETCC,   MVT::v2f64,   { 2, 5, 1, 1 } },
3512     { ISD::SETCC,   MVT::f64,     { 1, 5, 1, 1 } },
3513 
3514     { ISD::SETCC,   MVT::v2i64,   { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3515     { ISD::SETCC,   MVT::v4i32,   { 1, 1, 1, 1 } },
3516     { ISD::SETCC,   MVT::v8i16,   { 1, 1, 1, 1 } },
3517     { ISD::SETCC,   MVT::v16i8,   { 1, 1, 1, 1 } },
3518 
3519     { ISD::SELECT,  MVT::v2f64,   { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3520     { ISD::SELECT,  MVT::f64,     { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3521     { ISD::SELECT,  MVT::v2i64,   { 2, 2, 3, 3 } }, // pand + pandn + por
3522     { ISD::SELECT,  MVT::v4i32,   { 2, 2, 3, 3 } }, // pand + pandn + por
3523     { ISD::SELECT,  MVT::v8i16,   { 2, 2, 3, 3 } }, // pand + pandn + por
3524     { ISD::SELECT,  MVT::v16i8,   { 2, 2, 3, 3 } }, // pand + pandn + por
3525   };
3526 
3527   static const CostKindTblEntry SSE1CostTbl[] = {
3528     { ISD::SETCC,   MVT::v4f32,   { 2, 5, 1, 1 } },
3529     { ISD::SETCC,   MVT::f32,     { 1, 5, 1, 1 } },
3530 
3531     { ISD::SELECT,  MVT::v4f32,   { 2, 2, 3, 3 } }, // andps + andnps + orps
3532     { ISD::SELECT,  MVT::f32,     { 2, 2, 3, 3 } }, // andps + andnps + orps
3533   };
3534 
3535   if (ST->useSLMArithCosts())
3536     if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3537       if (auto KindCost = Entry->Cost[CostKind])
3538         return LT.first * (ExtraCost + *KindCost);
3539 
3540   if (ST->hasBWI())
3541     if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3542       if (auto KindCost = Entry->Cost[CostKind])
3543         return LT.first * (ExtraCost + *KindCost);
3544 
3545   if (ST->hasAVX512())
3546     if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3547       if (auto KindCost = Entry->Cost[CostKind])
3548         return LT.first * (ExtraCost + *KindCost);
3549 
3550   if (ST->hasAVX2())
3551     if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3552       if (auto KindCost = Entry->Cost[CostKind])
3553         return LT.first * (ExtraCost + *KindCost);
3554 
3555   if (ST->hasXOP())
3556     if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3557       if (auto KindCost = Entry->Cost[CostKind])
3558         return LT.first * (ExtraCost + *KindCost);
3559 
3560   if (ST->hasAVX())
3561     if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3562       if (auto KindCost = Entry->Cost[CostKind])
3563         return LT.first * (ExtraCost + *KindCost);
3564 
3565   if (ST->hasSSE42())
3566     if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3567       if (auto KindCost = Entry->Cost[CostKind])
3568         return LT.first * (ExtraCost + *KindCost);
3569 
3570   if (ST->hasSSE41())
3571     if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3572       if (auto KindCost = Entry->Cost[CostKind])
3573         return LT.first * (ExtraCost + *KindCost);
3574 
3575   if (ST->hasSSE2())
3576     if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3577       if (auto KindCost = Entry->Cost[CostKind])
3578         return LT.first * (ExtraCost + *KindCost);
3579 
3580   if (ST->hasSSE1())
3581     if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3582       if (auto KindCost = Entry->Cost[CostKind])
3583         return LT.first * (ExtraCost + *KindCost);
3584 
3585   // Assume a 3cy latency for fp select ops.
3586   if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3587     if (ValTy->getScalarType()->isFloatingPointTy())
3588       return 3;
3589 
3590   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3591                                    Op1Info, Op2Info, I);
3592 }
3593 
3594 unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
3595 
3596 InstructionCost
3597 X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
3598                                   TTI::TargetCostKind CostKind) {
3599   // Costs should match the codegen from:
3600   // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3601   // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3602   // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3603   // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3604   // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3605 
3606   // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3607   //       specialized in these tables yet.
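  // These entries can also be sanity-checked with the cost model printer,
  // e.g. (illustrative): opt -passes='print<cost-model>' -disable-output
  // on IR calling the intrinsic in question.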
3608   static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3609     { ISD::FSHL,       MVT::v8i64,   {  1,  1,  1,  1 } },
3610     { ISD::FSHL,       MVT::v4i64,   {  1,  1,  1,  1 } },
3611     { ISD::FSHL,       MVT::v2i64,   {  1,  1,  1,  1 } },
3612     { ISD::FSHL,       MVT::v16i32,  {  1,  1,  1,  1 } },
3613     { ISD::FSHL,       MVT::v8i32,   {  1,  1,  1,  1 } },
3614     { ISD::FSHL,       MVT::v4i32,   {  1,  1,  1,  1 } },
3615     { ISD::FSHL,       MVT::v32i16,  {  1,  1,  1,  1 } },
3616     { ISD::FSHL,       MVT::v16i16,  {  1,  1,  1,  1 } },
3617     { ISD::FSHL,       MVT::v8i16,   {  1,  1,  1,  1 } },
3618     { ISD::ROTL,       MVT::v32i16,  {  1,  1,  1,  1 } },
3619     { ISD::ROTL,       MVT::v16i16,  {  1,  1,  1,  1 } },
3620     { ISD::ROTL,       MVT::v8i16,   {  1,  1,  1,  1 } },
3621     { ISD::ROTR,       MVT::v32i16,  {  1,  1,  1,  1 } },
3622     { ISD::ROTR,       MVT::v16i16,  {  1,  1,  1,  1 } },
3623     { ISD::ROTR,       MVT::v8i16,   {  1,  1,  1,  1 } },
3624     { X86ISD::VROTLI,  MVT::v32i16,  {  1,  1,  1,  1 } },
3625     { X86ISD::VROTLI,  MVT::v16i16,  {  1,  1,  1,  1 } },
3626     { X86ISD::VROTLI,  MVT::v8i16,   {  1,  1,  1,  1 } },
3627   };
3628   static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3629     { ISD::CTPOP,      MVT::v32i16,  {  1,  1,  1,  1 } },
3630     { ISD::CTPOP,      MVT::v64i8,   {  1,  1,  1,  1 } },
3631     { ISD::CTPOP,      MVT::v16i16,  {  1,  1,  1,  1 } },
3632     { ISD::CTPOP,      MVT::v32i8,   {  1,  1,  1,  1 } },
3633     { ISD::CTPOP,      MVT::v8i16,   {  1,  1,  1,  1 } },
3634     { ISD::CTPOP,      MVT::v16i8,   {  1,  1,  1,  1 } },
3635   };
3636   static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3637     { ISD::CTPOP,      MVT::v8i64,   {  1,  1,  1,  1 } },
3638     { ISD::CTPOP,      MVT::v16i32,  {  1,  1,  1,  1 } },
3639     { ISD::CTPOP,      MVT::v4i64,   {  1,  1,  1,  1 } },
3640     { ISD::CTPOP,      MVT::v8i32,   {  1,  1,  1,  1 } },
3641     { ISD::CTPOP,      MVT::v2i64,   {  1,  1,  1,  1 } },
3642     { ISD::CTPOP,      MVT::v4i32,   {  1,  1,  1,  1 } },
3643   };
3644   static const CostKindTblEntry AVX512CDCostTbl[] = {
3645     { ISD::CTLZ,       MVT::v8i64,   {  1,  5,  1,  1 } },
3646     { ISD::CTLZ,       MVT::v16i32,  {  1,  5,  1,  1 } },
3647     { ISD::CTLZ,       MVT::v32i16,  { 18, 27, 23, 27 } },
3648     { ISD::CTLZ,       MVT::v64i8,   {  3, 16,  9, 11 } },
3649     { ISD::CTLZ,       MVT::v4i64,   {  1,  5,  1,  1 } },
3650     { ISD::CTLZ,       MVT::v8i32,   {  1,  5,  1,  1 } },
3651     { ISD::CTLZ,       MVT::v16i16,  {  8, 19, 11, 13 } },
3652     { ISD::CTLZ,       MVT::v32i8,   {  2, 11,  9, 10 } },
3653     { ISD::CTLZ,       MVT::v2i64,   {  1,  5,  1,  1 } },
3654     { ISD::CTLZ,       MVT::v4i32,   {  1,  5,  1,  1 } },
3655     { ISD::CTLZ,       MVT::v8i16,   {  3, 15,  4,  6 } },
3656     { ISD::CTLZ,       MVT::v16i8,   {  2, 10,  9, 10 } },
3657 
3658     { ISD::CTTZ,       MVT::v8i64,   {  2,  8,  6,  7 } },
3659     { ISD::CTTZ,       MVT::v16i32,  {  2,  8,  6,  7 } },
3660     { ISD::CTTZ,       MVT::v4i64,   {  1,  8,  6,  6 } },
3661     { ISD::CTTZ,       MVT::v8i32,   {  1,  8,  6,  6 } },
3662     { ISD::CTTZ,       MVT::v2i64,   {  1,  8,  6,  6 } },
3663     { ISD::CTTZ,       MVT::v4i32,   {  1,  8,  6,  6 } },
3664   };
3665   static const CostKindTblEntry AVX512BWCostTbl[] = {
3666     { ISD::ABS,        MVT::v32i16,  {  1,  1,  1,  1 } },
3667     { ISD::ABS,        MVT::v64i8,   {  1,  1,  1,  1 } },
3668     { ISD::BITREVERSE, MVT::v2i64,   {  3, 10, 10, 11 } },
3669     { ISD::BITREVERSE, MVT::v4i64,   {  3, 11, 10, 11 } },
3670     { ISD::BITREVERSE, MVT::v8i64,   {  3, 12, 10, 14 } },
3671     { ISD::BITREVERSE, MVT::v4i32,   {  3, 10, 10, 11 } },
3672     { ISD::BITREVERSE, MVT::v8i32,   {  3, 11, 10, 11 } },
3673     { ISD::BITREVERSE, MVT::v16i32,  {  3, 12, 10, 14 } },
3674     { ISD::BITREVERSE, MVT::v8i16,   {  3, 10, 10, 11 } },
3675     { ISD::BITREVERSE, MVT::v16i16,  {  3, 11, 10, 11 } },
3676     { ISD::BITREVERSE, MVT::v32i16,  {  3, 12, 10, 14 } },
3677     { ISD::BITREVERSE, MVT::v16i8,   {  2,  5,  9,  9 } },
3678     { ISD::BITREVERSE, MVT::v32i8,   {  2,  5,  9,  9 } },
3679     { ISD::BITREVERSE, MVT::v64i8,   {  2,  5,  9, 12 } },
3680     { ISD::BSWAP,      MVT::v2i64,   {  1,  1,  1,  2 } },
3681     { ISD::BSWAP,      MVT::v4i64,   {  1,  1,  1,  2 } },
3682     { ISD::BSWAP,      MVT::v8i64,   {  1,  1,  1,  2 } },
3683     { ISD::BSWAP,      MVT::v4i32,   {  1,  1,  1,  2 } },
3684     { ISD::BSWAP,      MVT::v8i32,   {  1,  1,  1,  2 } },
3685     { ISD::BSWAP,      MVT::v16i32,  {  1,  1,  1,  2 } },
3686     { ISD::BSWAP,      MVT::v8i16,   {  1,  1,  1,  2 } },
3687     { ISD::BSWAP,      MVT::v16i16,  {  1,  1,  1,  2 } },
3688     { ISD::BSWAP,      MVT::v32i16,  {  1,  1,  1,  2 } },
3689     { ISD::CTLZ,       MVT::v8i64,   {  8, 22, 23, 23 } },
3690     { ISD::CTLZ,       MVT::v16i32,  {  8, 23, 25, 25 } },
3691     { ISD::CTLZ,       MVT::v32i16,  {  4, 15, 15, 16 } },
3692     { ISD::CTLZ,       MVT::v64i8,   {  3, 12, 10,  9 } },
3693     { ISD::CTPOP,      MVT::v2i64,   {  3,  7, 10, 10 } },
3694     { ISD::CTPOP,      MVT::v4i64,   {  3,  7, 10, 10 } },
3695     { ISD::CTPOP,      MVT::v8i64,   {  3,  8, 10, 12 } },
3696     { ISD::CTPOP,      MVT::v4i32,   {  7, 11, 14, 14 } },
3697     { ISD::CTPOP,      MVT::v8i32,   {  7, 11, 14, 14 } },
3698     { ISD::CTPOP,      MVT::v16i32,  {  7, 12, 14, 16 } },
3699     { ISD::CTPOP,      MVT::v8i16,   {  2,  7, 11, 11 } },
3700     { ISD::CTPOP,      MVT::v16i16,  {  2,  7, 11, 11 } },
3701     { ISD::CTPOP,      MVT::v32i16,  {  3,  7, 11, 13 } },
3702     { ISD::CTPOP,      MVT::v16i8,   {  2,  4,  8,  8 } },
3703     { ISD::CTPOP,      MVT::v32i8,   {  2,  4,  8,  8 } },
3704     { ISD::CTPOP,      MVT::v64i8,   {  2,  5,  8, 10 } },
3705     { ISD::CTTZ,       MVT::v8i16,   {  3,  9, 14, 14 } },
3706     { ISD::CTTZ,       MVT::v16i16,  {  3,  9, 14, 14 } },
3707     { ISD::CTTZ,       MVT::v32i16,  {  3, 10, 14, 16 } },
3708     { ISD::CTTZ,       MVT::v16i8,   {  2,  6, 11, 11 } },
3709     { ISD::CTTZ,       MVT::v32i8,   {  2,  6, 11, 11 } },
3710     { ISD::CTTZ,       MVT::v64i8,   {  3,  7, 11, 13 } },
3711     { ISD::ROTL,       MVT::v32i16,  {  2,  8,  6,  8 } },
3712     { ISD::ROTL,       MVT::v16i16,  {  2,  8,  6,  7 } },
3713     { ISD::ROTL,       MVT::v8i16,   {  2,  7,  6,  7 } },
3714     { ISD::ROTL,       MVT::v64i8,   {  5,  6, 11, 12 } },
3715     { ISD::ROTL,       MVT::v32i8,   {  5, 15,  7, 10 } },
3716     { ISD::ROTL,       MVT::v16i8,   {  5, 15,  7, 10 } },
3717     { ISD::ROTR,       MVT::v32i16,  {  2,  8,  6,  8 } },
3718     { ISD::ROTR,       MVT::v16i16,  {  2,  8,  6,  7 } },
3719     { ISD::ROTR,       MVT::v8i16,   {  2,  7,  6,  7 } },
3720     { ISD::ROTR,       MVT::v64i8,   {  5,  6, 12, 14 } },
3721     { ISD::ROTR,       MVT::v32i8,   {  5, 14,  6,  9 } },
3722     { ISD::ROTR,       MVT::v16i8,   {  5, 14,  6,  9 } },
3723     { X86ISD::VROTLI,  MVT::v32i16,  {  2,  5,  3,  3 } },
3724     { X86ISD::VROTLI,  MVT::v16i16,  {  1,  5,  3,  3 } },
3725     { X86ISD::VROTLI,  MVT::v8i16,   {  1,  5,  3,  3 } },
3726     { X86ISD::VROTLI,  MVT::v64i8,   {  2,  9,  3,  4 } },
3727     { X86ISD::VROTLI,  MVT::v32i8,   {  1,  9,  3,  4 } },
3728     { X86ISD::VROTLI,  MVT::v16i8,   {  1,  8,  3,  4 } },
3729     { ISD::SADDSAT,    MVT::v32i16,  {  1,  1,  1,  1 } },
3730     { ISD::SADDSAT,    MVT::v64i8,   {  1,  1,  1,  1 } },
3731     { ISD::SMAX,       MVT::v32i16,  {  1,  1,  1,  1 } },
3732     { ISD::SMAX,       MVT::v64i8,   {  1,  1,  1,  1 } },
3733     { ISD::SMIN,       MVT::v32i16,  {  1,  1,  1,  1 } },
3734     { ISD::SMIN,       MVT::v64i8,   {  1,  1,  1,  1 } },
3735     { ISD::SMULO,      MVT::v32i16,  {  3,  6,  4,  4 } },
3736     { ISD::SMULO,      MVT::v64i8,   {  8, 21, 17, 18 } },
3737     { ISD::UMULO,      MVT::v32i16,  {  2,  5,  3,  3 } },
3738     { ISD::UMULO,      MVT::v64i8,   {  8, 15, 15, 16 } },
3739     { ISD::SSUBSAT,    MVT::v32i16,  {  1,  1,  1,  1 } },
3740     { ISD::SSUBSAT,    MVT::v64i8,   {  1,  1,  1,  1 } },
3741     { ISD::UADDSAT,    MVT::v32i16,  {  1,  1,  1,  1 } },
3742     { ISD::UADDSAT,    MVT::v64i8,   {  1,  1,  1,  1 } },
3743     { ISD::UMAX,       MVT::v32i16,  {  1,  1,  1,  1 } },
3744     { ISD::UMAX,       MVT::v64i8,   {  1,  1,  1,  1 } },
3745     { ISD::UMIN,       MVT::v32i16,  {  1,  1,  1,  1 } },
3746     { ISD::UMIN,       MVT::v64i8,   {  1,  1,  1,  1 } },
3747     { ISD::USUBSAT,    MVT::v32i16,  {  1,  1,  1,  1 } },
3748     { ISD::USUBSAT,    MVT::v64i8,   {  1,  1,  1,  1 } },
3749   };
3750   static const CostKindTblEntry AVX512CostTbl[] = {
3751     { ISD::ABS,        MVT::v8i64,   {  1,  1,  1,  1 } },
3752     { ISD::ABS,        MVT::v4i64,   {  1,  1,  1,  1 } },
3753     { ISD::ABS,        MVT::v2i64,   {  1,  1,  1,  1 } },
3754     { ISD::ABS,        MVT::v16i32,  {  1,  1,  1,  1 } },
3755     { ISD::ABS,        MVT::v8i32,   {  1,  1,  1,  1 } },
3756     { ISD::ABS,        MVT::v32i16,  {  2,  7,  4,  4 } },
3757     { ISD::ABS,        MVT::v16i16,  {  1,  1,  1,  1 } },
3758     { ISD::ABS,        MVT::v64i8,   {  2,  7,  4,  4 } },
3759     { ISD::ABS,        MVT::v32i8,   {  1,  1,  1,  1 } },
3760     { ISD::BITREVERSE, MVT::v8i64,   {  9, 13, 20, 20 } },
3761     { ISD::BITREVERSE, MVT::v16i32,  {  9, 13, 20, 20 } },
3762     { ISD::BITREVERSE, MVT::v32i16,  {  9, 13, 20, 20 } },
3763     { ISD::BITREVERSE, MVT::v64i8,   {  6, 11, 17, 17 } },
3764     { ISD::BSWAP,      MVT::v8i64,   {  4,  7,  5,  5 } },
3765     { ISD::BSWAP,      MVT::v16i32,  {  4,  7,  5,  5 } },
3766     { ISD::BSWAP,      MVT::v32i16,  {  4,  7,  5,  5 } },
3767     { ISD::CTLZ,       MVT::v8i64,   { 10, 28, 32, 32 } },
3768     { ISD::CTLZ,       MVT::v16i32,  { 12, 30, 38, 38 } },
3769     { ISD::CTLZ,       MVT::v32i16,  {  8, 15, 29, 29 } },
3770     { ISD::CTLZ,       MVT::v64i8,   {  6, 11, 19, 19 } },
3771     { ISD::CTPOP,      MVT::v8i64,   { 16, 16, 19, 19 } },
3772     { ISD::CTPOP,      MVT::v16i32,  { 24, 19, 27, 27 } },
3773     { ISD::CTPOP,      MVT::v32i16,  { 18, 15, 22, 22 } },
3774     { ISD::CTPOP,      MVT::v64i8,   { 12, 11, 16, 16 } },
3775     { ISD::CTTZ,       MVT::v8i64,   {  2,  8,  6,  7 } },
3776     { ISD::CTTZ,       MVT::v16i32,  {  2,  8,  6,  7 } },
3777     { ISD::CTTZ,       MVT::v32i16,  {  7, 17, 27, 27 } },
3778     { ISD::CTTZ,       MVT::v64i8,   {  6, 13, 21, 21 } },
3779     { ISD::ROTL,       MVT::v8i64,   {  1,  1,  1,  1 } },
3780     { ISD::ROTL,       MVT::v4i64,   {  1,  1,  1,  1 } },
3781     { ISD::ROTL,       MVT::v2i64,   {  1,  1,  1,  1 } },
3782     { ISD::ROTL,       MVT::v16i32,  {  1,  1,  1,  1 } },
3783     { ISD::ROTL,       MVT::v8i32,   {  1,  1,  1,  1 } },
3784     { ISD::ROTL,       MVT::v4i32,   {  1,  1,  1,  1 } },
3785     { ISD::ROTR,       MVT::v8i64,   {  1,  1,  1,  1 } },
3786     { ISD::ROTR,       MVT::v4i64,   {  1,  1,  1,  1 } },
3787     { ISD::ROTR,       MVT::v2i64,   {  1,  1,  1,  1 } },
3788     { ISD::ROTR,       MVT::v16i32,  {  1,  1,  1,  1 } },
3789     { ISD::ROTR,       MVT::v8i32,   {  1,  1,  1,  1 } },
3790     { ISD::ROTR,       MVT::v4i32,   {  1,  1,  1,  1 } },
3791     { X86ISD::VROTLI,  MVT::v8i64,   {  1,  1,  1,  1 } },
3792     { X86ISD::VROTLI,  MVT::v4i64,   {  1,  1,  1,  1 } },
3793     { X86ISD::VROTLI,  MVT::v2i64,   {  1,  1,  1,  1 } },
3794     { X86ISD::VROTLI,  MVT::v16i32,  {  1,  1,  1,  1 } },
3795     { X86ISD::VROTLI,  MVT::v8i32,   {  1,  1,  1,  1 } },
3796     { X86ISD::VROTLI,  MVT::v4i32,   {  1,  1,  1,  1 } },
3797     { ISD::SADDSAT,    MVT::v2i64,   {  3,  3,  8,  9 } },
3798     { ISD::SADDSAT,    MVT::v4i64,   {  2,  2,  6,  7 } },
3799     { ISD::SADDSAT,    MVT::v8i64,   {  3,  3,  6,  7 } },
3800     { ISD::SADDSAT,    MVT::v4i32,   {  2,  2,  6,  7 } },
3801     { ISD::SADDSAT,    MVT::v8i32,   {  2,  2,  6,  7 } },
3802     { ISD::SADDSAT,    MVT::v16i32,  {  3,  3,  6,  7 } },
3803     { ISD::SADDSAT,    MVT::v32i16,  {  2,  2,  2,  2 } },
3804     { ISD::SADDSAT,    MVT::v64i8,   {  2,  2,  2,  2 } },
3805     { ISD::SMAX,       MVT::v8i64,   {  1,  3,  1,  1 } },
3806     { ISD::SMAX,       MVT::v16i32,  {  1,  1,  1,  1 } },
3807     { ISD::SMAX,       MVT::v32i16,  {  3,  7,  5,  5 } },
3808     { ISD::SMAX,       MVT::v64i8,   {  3,  7,  5,  5 } },
3809     { ISD::SMAX,       MVT::v4i64,   {  1,  3,  1,  1 } },
3810     { ISD::SMAX,       MVT::v2i64,   {  1,  3,  1,  1 } },
3811     { ISD::SMIN,       MVT::v8i64,   {  1,  3,  1,  1 } },
3812     { ISD::SMIN,       MVT::v16i32,  {  1,  1,  1,  1 } },
3813     { ISD::SMIN,       MVT::v32i16,  {  3,  7,  5,  5 } },
3814     { ISD::SMIN,       MVT::v64i8,   {  3,  7,  5,  5 } },
3815     { ISD::SMIN,       MVT::v4i64,   {  1,  3,  1,  1 } },
3816     { ISD::SMIN,       MVT::v2i64,   {  1,  3,  1,  1 } },
3817     { ISD::SMULO,      MVT::v8i64,   { 44, 44, 81, 93 } },
3818     { ISD::SMULO,      MVT::v16i32,  {  5, 12,  9, 11 } },
3819     { ISD::SMULO,      MVT::v32i16,  {  6, 12, 17, 17 } },
3820     { ISD::SMULO,      MVT::v64i8,   { 22, 28, 42, 42 } },
3821     { ISD::SSUBSAT,    MVT::v2i64,   {  2, 13,  9, 10 } },
3822     { ISD::SSUBSAT,    MVT::v4i64,   {  2, 15,  7,  8 } },
3823     { ISD::SSUBSAT,    MVT::v8i64,   {  2, 14,  7,  8 } },
3824     { ISD::SSUBSAT,    MVT::v4i32,   {  2, 14,  7,  8 } },
3825     { ISD::SSUBSAT,    MVT::v8i32,   {  2, 15,  7,  8 } },
3826     { ISD::SSUBSAT,    MVT::v16i32,  {  2, 14,  7,  8 } },
3827     { ISD::SSUBSAT,    MVT::v32i16,  {  2,  2,  2,  2 } },
3828     { ISD::SSUBSAT,    MVT::v64i8,   {  2,  2,  2,  2 } },
3829     { ISD::UMAX,       MVT::v8i64,   {  1,  3,  1,  1 } },
3830     { ISD::UMAX,       MVT::v16i32,  {  1,  1,  1,  1 } },
3831     { ISD::UMAX,       MVT::v32i16,  {  3,  7,  5,  5 } },
3832     { ISD::UMAX,       MVT::v64i8,   {  3,  7,  5,  5 } },
3833     { ISD::UMAX,       MVT::v4i64,   {  1,  3,  1,  1 } },
3834     { ISD::UMAX,       MVT::v2i64,   {  1,  3,  1,  1 } },
3835     { ISD::UMIN,       MVT::v8i64,   {  1,  3,  1,  1 } },
3836     { ISD::UMIN,       MVT::v16i32,  {  1,  1,  1,  1 } },
3837     { ISD::UMIN,       MVT::v32i16,  {  3,  7,  5,  5 } },
3838     { ISD::UMIN,       MVT::v64i8,   {  3,  7,  5,  5 } },
3839     { ISD::UMIN,       MVT::v4i64,   {  1,  3,  1,  1 } },
3840     { ISD::UMIN,       MVT::v2i64,   {  1,  3,  1,  1 } },
3841     { ISD::UMULO,      MVT::v8i64,   { 52, 52, 95, 104} },
3842     { ISD::UMULO,      MVT::v16i32,  {  5, 12,  8, 10 } },
3843     { ISD::UMULO,      MVT::v32i16,  {  5, 13, 16, 16 } },
3844     { ISD::UMULO,      MVT::v64i8,   { 18, 24, 30, 30 } },
3845     { ISD::UADDSAT,    MVT::v2i64,   {  1,  4,  4,  4 } },
3846     { ISD::UADDSAT,    MVT::v4i64,   {  1,  4,  4,  4 } },
3847     { ISD::UADDSAT,    MVT::v8i64,   {  1,  4,  4,  4 } },
3848     { ISD::UADDSAT,    MVT::v4i32,   {  1,  2,  4,  4 } },
3849     { ISD::UADDSAT,    MVT::v8i32,   {  1,  2,  4,  4 } },
3850     { ISD::UADDSAT,    MVT::v16i32,  {  2,  2,  4,  4 } },
3851     { ISD::UADDSAT,    MVT::v32i16,  {  2,  2,  2,  2 } },
3852     { ISD::UADDSAT,    MVT::v64i8,   {  2,  2,  2,  2 } },
3853     { ISD::USUBSAT,    MVT::v2i64,   {  1,  4,  2,  2 } },
3854     { ISD::USUBSAT,    MVT::v4i64,   {  1,  4,  2,  2 } },
3855     { ISD::USUBSAT,    MVT::v8i64,   {  1,  4,  2,  2 } },
3856     { ISD::USUBSAT,    MVT::v8i32,   {  1,  2,  2,  2 } },
3857     { ISD::USUBSAT,    MVT::v16i32,  {  1,  2,  2,  2 } },
3858     { ISD::USUBSAT,    MVT::v32i16,  {  2,  2,  2,  2 } },
3859     { ISD::USUBSAT,    MVT::v64i8,   {  2,  2,  2,  2 } },
3860     { ISD::FMAXNUM,    MVT::f32,     {  2,  2,  3,  3 } },
3861     { ISD::FMAXNUM,    MVT::v4f32,   {  1,  1,  3,  3 } },
3862     { ISD::FMAXNUM,    MVT::v8f32,   {  2,  2,  3,  3 } },
3863     { ISD::FMAXNUM,    MVT::v16f32,  {  4,  4,  3,  3 } },
3864     { ISD::FMAXNUM,    MVT::f64,     {  2,  2,  3,  3 } },
3865     { ISD::FMAXNUM,    MVT::v2f64,   {  1,  1,  3,  3 } },
3866     { ISD::FMAXNUM,    MVT::v4f64,   {  2,  2,  3,  3 } },
3867     { ISD::FMAXNUM,    MVT::v8f64,   {  3,  3,  3,  3 } },
3868     { ISD::FSQRT,      MVT::f32,     {  3, 12,  1,  1 } }, // Skylake from http://www.agner.org/
3869     { ISD::FSQRT,      MVT::v4f32,   {  3, 12,  1,  1 } }, // Skylake from http://www.agner.org/
3870     { ISD::FSQRT,      MVT::v8f32,   {  6, 12,  1,  1 } }, // Skylake from http://www.agner.org/
3871     { ISD::FSQRT,      MVT::v16f32,  { 12, 20,  1,  3 } }, // Skylake from http://www.agner.org/
3872     { ISD::FSQRT,      MVT::f64,     {  6, 18,  1,  1 } }, // Skylake from http://www.agner.org/
3873     { ISD::FSQRT,      MVT::v2f64,   {  6, 18,  1,  1 } }, // Skylake from http://www.agner.org/
3874     { ISD::FSQRT,      MVT::v4f64,   { 12, 18,  1,  1 } }, // Skylake from http://www.agner.org/
3875     { ISD::FSQRT,      MVT::v8f64,   { 24, 32,  1,  3 } }, // Skylake from http://www.agner.org/
3876   };
3877   static const CostKindTblEntry XOPCostTbl[] = {
3878     { ISD::BITREVERSE, MVT::v4i64,   {  3,  6,  5,  6 } },
3879     { ISD::BITREVERSE, MVT::v8i32,   {  3,  6,  5,  6 } },
3880     { ISD::BITREVERSE, MVT::v16i16,  {  3,  6,  5,  6 } },
3881     { ISD::BITREVERSE, MVT::v32i8,   {  3,  6,  5,  6 } },
3882     { ISD::BITREVERSE, MVT::v2i64,   {  2,  7,  1,  1 } },
3883     { ISD::BITREVERSE, MVT::v4i32,   {  2,  7,  1,  1 } },
3884     { ISD::BITREVERSE, MVT::v8i16,   {  2,  7,  1,  1 } },
3885     { ISD::BITREVERSE, MVT::v16i8,   {  2,  7,  1,  1 } },
3886     { ISD::BITREVERSE, MVT::i64,     {  2,  2,  3,  4 } },
3887     { ISD::BITREVERSE, MVT::i32,     {  2,  2,  3,  4 } },
3888     { ISD::BITREVERSE, MVT::i16,     {  2,  2,  3,  4 } },
3889     { ISD::BITREVERSE, MVT::i8,      {  2,  2,  3,  4 } },
3890     // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
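    // i.e. a variable rotate-right is modelled as an extra VPSUB to
    // negate the rotate amounts, followed by the variable VPROT; this is
    // why the ROTR rows below are costed higher than their ROTL
    // counterparts for most cost kinds.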
3891     { ISD::ROTL,       MVT::v4i64,   {  4,  7,  5,  6 } },
3892     { ISD::ROTL,       MVT::v8i32,   {  4,  7,  5,  6 } },
3893     { ISD::ROTL,       MVT::v16i16,  {  4,  7,  5,  6 } },
3894     { ISD::ROTL,       MVT::v32i8,   {  4,  7,  5,  6 } },
3895     { ISD::ROTL,       MVT::v2i64,   {  1,  3,  1,  1 } },
3896     { ISD::ROTL,       MVT::v4i32,   {  1,  3,  1,  1 } },
3897     { ISD::ROTL,       MVT::v8i16,   {  1,  3,  1,  1 } },
3898     { ISD::ROTL,       MVT::v16i8,   {  1,  3,  1,  1 } },
3899     { ISD::ROTR,       MVT::v4i64,   {  4,  7,  8,  9 } },
3900     { ISD::ROTR,       MVT::v8i32,   {  4,  7,  8,  9 } },
3901     { ISD::ROTR,       MVT::v16i16,  {  4,  7,  8,  9 } },
3902     { ISD::ROTR,       MVT::v32i8,   {  4,  7,  8,  9 } },
3903     { ISD::ROTR,       MVT::v2i64,   {  1,  3,  3,  3 } },
3904     { ISD::ROTR,       MVT::v4i32,   {  1,  3,  3,  3 } },
3905     { ISD::ROTR,       MVT::v8i16,   {  1,  3,  3,  3 } },
3906     { ISD::ROTR,       MVT::v16i8,   {  1,  3,  3,  3 } },
3907     { X86ISD::VROTLI,  MVT::v4i64,   {  4,  7,  5,  6 } },
3908     { X86ISD::VROTLI,  MVT::v8i32,   {  4,  7,  5,  6 } },
3909     { X86ISD::VROTLI,  MVT::v16i16,  {  4,  7,  5,  6 } },
3910     { X86ISD::VROTLI,  MVT::v32i8,   {  4,  7,  5,  6 } },
3911     { X86ISD::VROTLI,  MVT::v2i64,   {  1,  3,  1,  1 } },
3912     { X86ISD::VROTLI,  MVT::v4i32,   {  1,  3,  1,  1 } },
3913     { X86ISD::VROTLI,  MVT::v8i16,   {  1,  3,  1,  1 } },
3914     { X86ISD::VROTLI,  MVT::v16i8,   {  1,  3,  1,  1 } },
3915   };
3916   static const CostKindTblEntry AVX2CostTbl[] = {
3917     { ISD::ABS,        MVT::v2i64,   {  2,  4,  3,  5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3918     { ISD::ABS,        MVT::v4i64,   {  2,  4,  3,  5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3919     { ISD::ABS,        MVT::v4i32,   {  1,  1,  1,  1 } },
3920     { ISD::ABS,        MVT::v8i32,   {  1,  1,  1,  2 } },
3921     { ISD::ABS,        MVT::v8i16,   {  1,  1,  1,  1 } },
3922     { ISD::ABS,        MVT::v16i16,  {  1,  1,  1,  2 } },
3923     { ISD::ABS,        MVT::v16i8,   {  1,  1,  1,  1 } },
3924     { ISD::ABS,        MVT::v32i8,   {  1,  1,  1,  2 } },
3925     { ISD::BITREVERSE, MVT::v2i64,   {  3, 11, 10, 11 } },
3926     { ISD::BITREVERSE, MVT::v4i64,   {  5, 11, 10, 17 } },
3927     { ISD::BITREVERSE, MVT::v4i32,   {  3, 11, 10, 11 } },
3928     { ISD::BITREVERSE, MVT::v8i32,   {  5, 11, 10, 17 } },
3929     { ISD::BITREVERSE, MVT::v8i16,   {  3, 11, 10, 11 } },
3930     { ISD::BITREVERSE, MVT::v16i16,  {  5, 11, 10, 17 } },
3931     { ISD::BITREVERSE, MVT::v16i8,   {  3,  6,  9,  9 } },
3932     { ISD::BITREVERSE, MVT::v32i8,   {  4,  5,  9, 15 } },
3933     { ISD::BSWAP,      MVT::v2i64,   {  1,  2,  1,  2 } },
3934     { ISD::BSWAP,      MVT::v4i64,   {  1,  3,  1,  2 } },
3935     { ISD::BSWAP,      MVT::v4i32,   {  1,  2,  1,  2 } },
3936     { ISD::BSWAP,      MVT::v8i32,   {  1,  3,  1,  2 } },
3937     { ISD::BSWAP,      MVT::v8i16,   {  1,  2,  1,  2 } },
3938     { ISD::BSWAP,      MVT::v16i16,  {  1,  3,  1,  2 } },
3939     { ISD::CTLZ,       MVT::v2i64,   {  7, 18, 24, 25 } },
3940     { ISD::CTLZ,       MVT::v4i64,   { 14, 18, 24, 44 } },
3941     { ISD::CTLZ,       MVT::v4i32,   {  5, 16, 19, 20 } },
3942     { ISD::CTLZ,       MVT::v8i32,   { 10, 16, 19, 34 } },
3943     { ISD::CTLZ,       MVT::v8i16,   {  4, 13, 14, 15 } },
3944     { ISD::CTLZ,       MVT::v16i16,  {  6, 14, 14, 24 } },
3945     { ISD::CTLZ,       MVT::v16i8,   {  3, 12,  9, 10 } },
3946     { ISD::CTLZ,       MVT::v32i8,   {  4, 12,  9, 14 } },
3947     { ISD::CTPOP,      MVT::v2i64,   {  3,  9, 10, 10 } },
3948     { ISD::CTPOP,      MVT::v4i64,   {  4,  9, 10, 14 } },
3949     { ISD::CTPOP,      MVT::v4i32,   {  7, 12, 14, 14 } },
3950     { ISD::CTPOP,      MVT::v8i32,   {  7, 12, 14, 18 } },
3951     { ISD::CTPOP,      MVT::v8i16,   {  3,  7, 11, 11 } },
3952     { ISD::CTPOP,      MVT::v16i16,  {  6,  8, 11, 18 } },
3953     { ISD::CTPOP,      MVT::v16i8,   {  2,  5,  8,  8 } },
3954     { ISD::CTPOP,      MVT::v32i8,   {  3,  5,  8, 12 } },
3955     { ISD::CTTZ,       MVT::v2i64,   {  4, 11, 13, 13 } },
3956     { ISD::CTTZ,       MVT::v4i64,   {  5, 11, 13, 20 } },
3957     { ISD::CTTZ,       MVT::v4i32,   {  7, 14, 17, 17 } },
3958     { ISD::CTTZ,       MVT::v8i32,   {  7, 15, 17, 24 } },
3959     { ISD::CTTZ,       MVT::v8i16,   {  4,  9, 14, 14 } },
3960     { ISD::CTTZ,       MVT::v16i16,  {  6,  9, 14, 24 } },
3961     { ISD::CTTZ,       MVT::v16i8,   {  3,  7, 11, 11 } },
3962     { ISD::CTTZ,       MVT::v32i8,   {  5,  7, 11, 18 } },
3963     { ISD::SADDSAT,    MVT::v2i64,   {  4, 13,  8, 11 } },
3964     { ISD::SADDSAT,    MVT::v4i64,   {  3, 10,  8, 12 } },
3965     { ISD::SADDSAT,    MVT::v4i32,   {  2,  6,  7,  9 } },
3966     { ISD::SADDSAT,    MVT::v8i32,   {  4,  6,  7, 13 } },
3967     { ISD::SADDSAT,    MVT::v16i16,  {  1,  1,  1,  2 } },
3968     { ISD::SADDSAT,    MVT::v32i8,   {  1,  1,  1,  2 } },
3969     { ISD::SMAX,       MVT::v2i64,   {  2,  7,  2,  3 } },
3970     { ISD::SMAX,       MVT::v4i64,   {  2,  7,  2,  3 } },
3971     { ISD::SMAX,       MVT::v8i32,   {  1,  1,  1,  2 } },
3972     { ISD::SMAX,       MVT::v16i16,  {  1,  1,  1,  2 } },
3973     { ISD::SMAX,       MVT::v32i8,   {  1,  1,  1,  2 } },
3974     { ISD::SMIN,       MVT::v2i64,   {  2,  7,  2,  3 } },
3975     { ISD::SMIN,       MVT::v4i64,   {  2,  7,  2,  3 } },
3976     { ISD::SMIN,       MVT::v8i32,   {  1,  1,  1,  2 } },
3977     { ISD::SMIN,       MVT::v16i16,  {  1,  1,  1,  2 } },
3978     { ISD::SMIN,       MVT::v32i8,   {  1,  1,  1,  2 } },
3979     { ISD::SMULO,      MVT::v4i64,   { 20, 20, 33, 37 } },
3980     { ISD::SMULO,      MVT::v2i64,   {  8,  8, 13, 15 } },
3981     { ISD::SMULO,      MVT::v8i32,   {  8, 20, 13, 24 } },
3982     { ISD::SMULO,      MVT::v4i32,   {  5, 15, 11, 12 } },
3983     { ISD::SMULO,      MVT::v16i16,  {  4, 14,  8, 14 } },
3984     { ISD::SMULO,      MVT::v8i16,   {  3,  9,  6,  6 } },
3985     { ISD::SMULO,      MVT::v32i8,   {  9, 15, 18, 35 } },
3986     { ISD::SMULO,      MVT::v16i8,   {  6, 22, 14, 21 } },
3987     { ISD::SSUBSAT,    MVT::v2i64,   {  4, 13,  9, 13 } },
3988     { ISD::SSUBSAT,    MVT::v4i64,   {  4, 15,  9, 13 } },
3989     { ISD::SSUBSAT,    MVT::v4i32,   {  3, 14,  9, 11 } },
3990     { ISD::SSUBSAT,    MVT::v8i32,   {  4, 15,  9, 16 } },
3991     { ISD::SSUBSAT,    MVT::v16i16,  {  1,  1,  1,  2 } },
3992     { ISD::SSUBSAT,    MVT::v32i8,   {  1,  1,  1,  2 } },
3993     { ISD::UADDSAT,    MVT::v2i64,   {  2,  8,  6,  6 } },
3994     { ISD::UADDSAT,    MVT::v4i64,   {  3,  8,  6, 10 } },
3995     { ISD::UADDSAT,    MVT::v8i32,   {  2,  2,  4,  8 } },
3996     { ISD::UADDSAT,    MVT::v16i16,  {  1,  1,  1,  2 } },
3997     { ISD::UADDSAT,    MVT::v32i8,   {  1,  1,  1,  2 } },
3998     { ISD::UMAX,       MVT::v2i64,   {  2,  8,  5,  6 } },
3999     { ISD::UMAX,       MVT::v4i64,   {  2,  8,  5,  8 } },
4000     { ISD::UMAX,       MVT::v8i32,   {  1,  1,  1,  2 } },
4001     { ISD::UMAX,       MVT::v16i16,  {  1,  1,  1,  2 } },
4002     { ISD::UMAX,       MVT::v32i8,   {  1,  1,  1,  2 } },
4003     { ISD::UMIN,       MVT::v2i64,   {  2,  8,  5,  6 } },
4004     { ISD::UMIN,       MVT::v4i64,   {  2,  8,  5,  8 } },
4005     { ISD::UMIN,       MVT::v8i32,   {  1,  1,  1,  2 } },
4006     { ISD::UMIN,       MVT::v16i16,  {  1,  1,  1,  2 } },
4007     { ISD::UMIN,       MVT::v32i8,   {  1,  1,  1,  2 } },
4008     { ISD::UMULO,      MVT::v4i64,   { 24, 24, 39, 43 } },
4009     { ISD::UMULO,      MVT::v2i64,   { 10, 10, 15, 19 } },
4010     { ISD::UMULO,      MVT::v8i32,   {  8, 11, 13, 23 } },
4011     { ISD::UMULO,      MVT::v4i32,   {  5, 12, 11, 12 } },
4012     { ISD::UMULO,      MVT::v16i16,  {  4,  6,  8, 13 } },
4013     { ISD::UMULO,      MVT::v8i16,   {  2,  8,  6,  6 } },
4014     { ISD::UMULO,      MVT::v32i8,   {  9, 13, 17, 33 } },
4015     { ISD::UMULO,      MVT::v16i8,   {  6, 19, 13, 20 } },
4016     { ISD::USUBSAT,    MVT::v2i64,   {  2,  7,  6,  6 } },
4017     { ISD::USUBSAT,    MVT::v4i64,   {  3,  7,  6, 10 } },
4018     { ISD::USUBSAT,    MVT::v8i32,   {  2,  2,  2,  4 } },
4019     { ISD::USUBSAT,    MVT::v16i16,  {  1,  1,  1,  2 } },
4020     { ISD::USUBSAT,    MVT::v32i8,   {  1,  1,  1,  2 } },
4021     { ISD::FMAXNUM,    MVT::f32,     {  2,  7,  3,  5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4022     { ISD::FMAXNUM,    MVT::v4f32,   {  2,  7,  3,  5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4023     { ISD::FMAXNUM,    MVT::v8f32,   {  3,  7,  3,  6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4024     { ISD::FMAXNUM,    MVT::f64,     {  2,  7,  3,  5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4025     { ISD::FMAXNUM,    MVT::v2f64,   {  2,  7,  3,  5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4026     { ISD::FMAXNUM,    MVT::v4f64,   {  3,  7,  3,  6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4027     { ISD::FSQRT,      MVT::f32,     {  7, 15,  1,  1 } }, // vsqrtss
4028     { ISD::FSQRT,      MVT::v4f32,   {  7, 15,  1,  1 } }, // vsqrtps
4029     { ISD::FSQRT,      MVT::v8f32,   { 14, 21,  1,  3 } }, // vsqrtps
4030     { ISD::FSQRT,      MVT::f64,     { 14, 21,  1,  1 } }, // vsqrtsd
4031     { ISD::FSQRT,      MVT::v2f64,   { 14, 21,  1,  1 } }, // vsqrtpd
4032     { ISD::FSQRT,      MVT::v4f64,   { 28, 35,  1,  3 } }, // vsqrtpd
4033   };
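  // Reminder: each cost tuple below is indexed by TargetCostKind, in the
  // order { RecipThroughput, Latency, CodeSize, SizeAndLatency }.
  // AVX1 lacks most 256-bit integer ops, so the 256-bit entries here
  // typically model 2 x 128-bit ops plus the extract/insert to split/rejoin.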
4034   static const CostKindTblEntry AVX1CostTbl[] = {
4035     { ISD::ABS,        MVT::v4i64,   {  6,  8,  6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
4036     { ISD::ABS,        MVT::v8i32,   {  3,  6,  4,  5 } },
4037     { ISD::ABS,        MVT::v16i16,  {  3,  6,  4,  5 } },
4038     { ISD::ABS,        MVT::v32i8,   {  3,  6,  4,  5 } },
4039     { ISD::BITREVERSE, MVT::v4i64,   { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4040     { ISD::BITREVERSE, MVT::v2i64,   {  8, 13, 10, 16 } },
4041     { ISD::BITREVERSE, MVT::v8i32,   { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4042     { ISD::BITREVERSE, MVT::v4i32,   {  8, 13, 10, 16 } },
4043     { ISD::BITREVERSE, MVT::v16i16,  { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4044     { ISD::BITREVERSE, MVT::v8i16,   {  8, 13, 10, 16 } },
4045     { ISD::BITREVERSE, MVT::v32i8,   { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
4046     { ISD::BITREVERSE, MVT::v16i8,   {  7,  7,  9, 13 } },
4047     { ISD::BSWAP,      MVT::v4i64,   {  5,  6,  5, 10 } },
4048     { ISD::BSWAP,      MVT::v2i64,   {  2,  2,  1,  3 } },
4049     { ISD::BSWAP,      MVT::v8i32,   {  5,  6,  5, 10 } },
4050     { ISD::BSWAP,      MVT::v4i32,   {  2,  2,  1,  3 } },
4051     { ISD::BSWAP,      MVT::v16i16,  {  5,  6,  5, 10 } },
4052     { ISD::BSWAP,      MVT::v8i16,   {  2,  2,  1,  3 } },
4053     { ISD::CTLZ,       MVT::v4i64,   { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
4054     { ISD::CTLZ,       MVT::v2i64,   { 14, 24, 24, 28 } },
4055     { ISD::CTLZ,       MVT::v8i32,   { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
4056     { ISD::CTLZ,       MVT::v4i32,   { 12, 20, 19, 23 } },
4057     { ISD::CTLZ,       MVT::v16i16,  { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
4058     { ISD::CTLZ,       MVT::v8i16,   {  9, 16, 14, 18 } },
4059     { ISD::CTLZ,       MVT::v32i8,   { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4060     { ISD::CTLZ,       MVT::v16i8,   {  7, 12,  9, 13 } },
4061     { ISD::CTPOP,      MVT::v4i64,   { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4062     { ISD::CTPOP,      MVT::v2i64,   {  7, 14, 10, 14 } },
4063     { ISD::CTPOP,      MVT::v8i32,   { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4064     { ISD::CTPOP,      MVT::v4i32,   {  9, 20, 14, 18 } },
4065     { ISD::CTPOP,      MVT::v16i16,  { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
4066     { ISD::CTPOP,      MVT::v8i16,   {  8, 18, 11, 15 } },
4067     { ISD::CTPOP,      MVT::v32i8,   { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
4068     { ISD::CTPOP,      MVT::v16i8,   {  6, 12,  8, 12 } },
4069     { ISD::CTTZ,       MVT::v4i64,   { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
4070     { ISD::CTTZ,       MVT::v2i64,   {  9, 19, 13, 17 } },
4071     { ISD::CTTZ,       MVT::v8i32,   { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
4072     { ISD::CTTZ,       MVT::v4i32,   { 11, 24, 17, 21 } },
4073     { ISD::CTTZ,       MVT::v16i16,  { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4074     { ISD::CTTZ,       MVT::v8i16,   {  9, 21, 14, 18 } },
4075     { ISD::CTTZ,       MVT::v32i8,   { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
4076     { ISD::CTTZ,       MVT::v16i8,   {  8, 16, 11, 15 } },
4077     { ISD::SADDSAT,    MVT::v2i64,   {  6, 13,  8, 11 } },
4078     { ISD::SADDSAT,    MVT::v4i64,   { 13, 20, 15, 25 } }, // 2 x 128-bit Op + extract/insert
4079     { ISD::SADDSAT,    MVT::v8i32,   { 12, 18, 14, 24 } }, // 2 x 128-bit Op + extract/insert
4080     { ISD::SADDSAT,    MVT::v16i16,  {  3,  3,  5,  6 } }, // 2 x 128-bit Op + extract/insert
4081     { ISD::SADDSAT,    MVT::v32i8,   {  3,  3,  5,  6 } }, // 2 x 128-bit Op + extract/insert
4082     { ISD::SMAX,       MVT::v4i64,   {  6,  9,  6, 12 } }, // 2 x 128-bit Op + extract/insert
4083     { ISD::SMAX,       MVT::v2i64,   {  3,  7,  2,  4 } },
4084     { ISD::SMAX,       MVT::v8i32,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
4085     { ISD::SMAX,       MVT::v16i16,  {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
4086     { ISD::SMAX,       MVT::v32i8,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
4087     { ISD::SMIN,       MVT::v4i64,   {  6,  9,  6, 12 } }, // 2 x 128-bit Op + extract/insert
4088     { ISD::SMIN,       MVT::v2i64,   {  3,  7,  2,  3 } },
4089     { ISD::SMIN,       MVT::v8i32,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
4090     { ISD::SMIN,       MVT::v16i16,  {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
4091     { ISD::SMIN,       MVT::v32i8,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
4092     { ISD::SMULO,      MVT::v4i64,   { 20, 20, 33, 37 } },
4093     { ISD::SMULO,      MVT::v2i64,   {  9,  9, 13, 17 } },
4094     { ISD::SMULO,      MVT::v8i32,   { 15, 20, 24, 29 } },
4095     { ISD::SMULO,      MVT::v4i32,   {  7, 15, 11, 13 } },
4096     { ISD::SMULO,      MVT::v16i16,  {  8, 14, 14, 15 } },
4097     { ISD::SMULO,      MVT::v8i16,   {  3,  9,  6,  6 } },
4098     { ISD::SMULO,      MVT::v32i8,   { 20, 20, 37, 39 } },
4099     { ISD::SMULO,      MVT::v16i8,   {  9, 22, 18, 21 } },
4100     { ISD::SSUBSAT,    MVT::v2i64,   {  7, 13,  9, 13 } },
4101     { ISD::SSUBSAT,    MVT::v4i64,   { 15, 21, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4102     { ISD::SSUBSAT,    MVT::v8i32,   { 15, 19, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4103     { ISD::SSUBSAT,    MVT::v16i16,  {  3,  3,  5,  6 } }, // 2 x 128-bit Op + extract/insert
4104     { ISD::SSUBSAT,    MVT::v32i8,   {  3,  3,  5,  6 } }, // 2 x 128-bit Op + extract/insert
4105     { ISD::UADDSAT,    MVT::v2i64,   {  3,  8,  6,  6 } },
4106     { ISD::UADDSAT,    MVT::v4i64,   {  8, 11, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4107     { ISD::UADDSAT,    MVT::v8i32,   {  6,  6, 10, 11 } }, // 2 x 128-bit Op + extract/insert
4108     { ISD::UADDSAT,    MVT::v16i16,  {  3,  3,  5,  6 } }, // 2 x 128-bit Op + extract/insert
4109     { ISD::UADDSAT,    MVT::v32i8,   {  3,  3,  5,  6 } }, // 2 x 128-bit Op + extract/insert
4110     { ISD::UMAX,       MVT::v4i64,   {  9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4111     { ISD::UMAX,       MVT::v2i64,   {  4,  8,  5,  7 } },
4112     { ISD::UMAX,       MVT::v8i32,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
4113     { ISD::UMAX,       MVT::v16i16,  {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
4114     { ISD::UMAX,       MVT::v32i8,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
4115     { ISD::UMIN,       MVT::v4i64,   {  9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4116     { ISD::UMIN,       MVT::v2i64,   {  4,  8,  5,  7 } },
4117     { ISD::UMIN,       MVT::v8i32,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
4118     { ISD::UMIN,       MVT::v16i16,  {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
4119     { ISD::UMIN,       MVT::v32i8,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
4120     { ISD::UMULO,      MVT::v4i64,   { 24, 26, 39, 45 } },
4121     { ISD::UMULO,      MVT::v2i64,   { 10, 12, 15, 20 } },
4122     { ISD::UMULO,      MVT::v8i32,   { 14, 15, 23, 28 } },
4123     { ISD::UMULO,      MVT::v4i32,   {  7, 12, 11, 13 } },
4124     { ISD::UMULO,      MVT::v16i16,  {  7, 11, 13, 14 } },
4125     { ISD::UMULO,      MVT::v8i16,   {  3,  8,  6,  6 } },
4126     { ISD::UMULO,      MVT::v32i8,   { 19, 19, 35, 37 } },
4127     { ISD::UMULO,      MVT::v16i8,   {  9, 19, 17, 20 } },
4128     { ISD::USUBSAT,    MVT::v2i64,   {  3,  7,  6,  6 } },
4129     { ISD::USUBSAT,    MVT::v4i64,   {  8, 10, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4130     { ISD::USUBSAT,    MVT::v8i32,   {  4,  4,  7,  8 } }, // 2 x 128-bit Op + extract/insert
4132     { ISD::USUBSAT,    MVT::v16i16,  {  3,  3,  5,  6 } }, // 2 x 128-bit Op + extract/insert
4133     { ISD::USUBSAT,    MVT::v32i8,   {  3,  3,  5,  6 } }, // 2 x 128-bit Op + extract/insert
4134     { ISD::FMAXNUM,    MVT::f32,     {  3,  6,  3,  5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4135     { ISD::FMAXNUM,    MVT::v4f32,   {  3,  6,  3,  5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4136     { ISD::FMAXNUM,    MVT::v8f32,   {  5,  7,  3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4137     { ISD::FMAXNUM,    MVT::f64,     {  3,  6,  3,  5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4138     { ISD::FMAXNUM,    MVT::v2f64,   {  3,  6,  3,  5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4139     { ISD::FMAXNUM,    MVT::v4f64,   {  5,  7,  3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4140     { ISD::FSQRT,      MVT::f32,     { 21, 21,  1,  1 } }, // vsqrtss
4141     { ISD::FSQRT,      MVT::v4f32,   { 21, 21,  1,  1 } }, // vsqrtps
4142     { ISD::FSQRT,      MVT::v8f32,   { 42, 42,  1,  3 } }, // vsqrtps
4143     { ISD::FSQRT,      MVT::f64,     { 27, 27,  1,  1 } }, // vsqrtsd
4144     { ISD::FSQRT,      MVT::v2f64,   { 27, 27,  1,  1 } }, // vsqrtpd
4145     { ISD::FSQRT,      MVT::v4f64,   { 54, 54,  1,  3 } }, // vsqrtpd
4146   };
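  // With GFNI, GF2P8AFFINEQB against a constant bit-permutation matrix can
  // reverse the bits of each byte in a single instruction; wider element
  // types additionally need a byte shuffle to swap the bytes per element.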
4147   static const CostKindTblEntry GFNICostTbl[] = {
4148     { ISD::BITREVERSE, MVT::i8,      {  3,  3,  3,  4 } }, // gf2p8affineqb
4149     { ISD::BITREVERSE, MVT::i16,     {  3,  3,  4,  6 } }, // gf2p8affineqb
4150     { ISD::BITREVERSE, MVT::i32,     {  3,  3,  4,  5 } }, // gf2p8affineqb
4151     { ISD::BITREVERSE, MVT::i64,     {  3,  3,  4,  6 } }, // gf2p8affineqb
4152     { ISD::BITREVERSE, MVT::v16i8,   {  1,  6,  1,  2 } }, // gf2p8affineqb
4153     { ISD::BITREVERSE, MVT::v32i8,   {  1,  6,  1,  2 } }, // gf2p8affineqb
4154     { ISD::BITREVERSE, MVT::v64i8,   {  1,  6,  1,  2 } }, // gf2p8affineqb
4155     { ISD::BITREVERSE, MVT::v8i16,   {  1,  8,  2,  4 } }, // gf2p8affineqb
4156     { ISD::BITREVERSE, MVT::v16i16,  {  1,  9,  2,  4 } }, // gf2p8affineqb
4157     { ISD::BITREVERSE, MVT::v32i16,  {  1,  9,  2,  4 } }, // gf2p8affineqb
4158     { ISD::BITREVERSE, MVT::v4i32,   {  1,  8,  2,  4 } }, // gf2p8affineqb
4159     { ISD::BITREVERSE, MVT::v8i32,   {  1,  9,  2,  4 } }, // gf2p8affineqb
4160     { ISD::BITREVERSE, MVT::v16i32,  {  1,  9,  2,  4 } }, // gf2p8affineqb
4161     { ISD::BITREVERSE, MVT::v2i64,   {  1,  8,  2,  4 } }, // gf2p8affineqb
4162     { ISD::BITREVERSE, MVT::v4i64,   {  1,  9,  2,  4 } }, // gf2p8affineqb
4163     { ISD::BITREVERSE, MVT::v8i64,   {  1,  9,  2,  4 } }, // gf2p8affineqb
4164     { X86ISD::VROTLI,  MVT::v16i8,   {  1,  6,  1,  2 } }, // gf2p8affineqb
4165     { X86ISD::VROTLI,  MVT::v32i8,   {  1,  6,  1,  2 } }, // gf2p8affineqb
4166     { X86ISD::VROTLI,  MVT::v64i8,   {  1,  6,  1,  2 } }, // gf2p8affineqb
4167   };
4168   static const CostKindTblEntry GLMCostTbl[] = {
4169     { ISD::FSQRT,      MVT::f32,     { 19, 20, 1, 1 } }, // sqrtss
4170     { ISD::FSQRT,      MVT::v4f32,   { 37, 41, 1, 5 } }, // sqrtps
4171     { ISD::FSQRT,      MVT::f64,     { 34, 35, 1, 1 } }, // sqrtsd
4172     { ISD::FSQRT,      MVT::v2f64,   { 67, 71, 1, 5 } }, // sqrtpd
4173   };
4174   static const CostKindTblEntry SLMCostTbl[] = {
4175     { ISD::BSWAP,      MVT::v2i64,   {  5,  5, 1, 5 } },
4176     { ISD::BSWAP,      MVT::v4i32,   {  5,  5, 1, 5 } },
4177     { ISD::BSWAP,      MVT::v8i16,   {  5,  5, 1, 5 } },
4178     { ISD::FSQRT,      MVT::f32,     { 20, 20, 1, 1 } }, // sqrtss
4179     { ISD::FSQRT,      MVT::v4f32,   { 40, 41, 1, 5 } }, // sqrtps
4180     { ISD::FSQRT,      MVT::f64,     { 35, 35, 1, 1 } }, // sqrtsd
4181     { ISD::FSQRT,      MVT::v2f64,   { 70, 71, 1, 5 } }, // sqrtpd
4182   };
4183   static const CostKindTblEntry SSE42CostTbl[] = {
4184     { ISD::FMAXNUM,    MVT::f32,     {  5,  5,  7,  7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4185     { ISD::FMAXNUM,    MVT::v4f32,   {  4,  4,  4,  5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4186     { ISD::FMAXNUM,    MVT::f64,     {  5,  5,  7,  7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4187     { ISD::FMAXNUM,    MVT::v2f64,   {  4,  4,  4,  5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4188     { ISD::FSQRT,      MVT::f32,     { 18, 18,  1,  1 } }, // Nehalem from http://www.agner.org/
4189     { ISD::FSQRT,      MVT::v4f32,   { 18, 18,  1,  1 } }, // Nehalem from http://www.agner.org/
4190   };
4191   static const CostKindTblEntry SSE41CostTbl[] = {
4192     { ISD::ABS,        MVT::v2i64,   {  3,  4,  3,  5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
4193     { ISD::SADDSAT,    MVT::v2i64,   { 10, 14, 17, 21 } },
4194     { ISD::SADDSAT,    MVT::v4i32,   {  5, 11,  8, 10 } },
4195     { ISD::SSUBSAT,    MVT::v2i64,   { 12, 19, 25, 29 } },
4196     { ISD::SSUBSAT,    MVT::v4i32,   {  6, 14, 10, 12 } },
4197     { ISD::SMAX,       MVT::v2i64,   {  3,  7,  2,  3 } },
4198     { ISD::SMAX,       MVT::v4i32,   {  1,  1,  1,  1 } },
4199     { ISD::SMAX,       MVT::v16i8,   {  1,  1,  1,  1 } },
4200     { ISD::SMIN,       MVT::v2i64,   {  3,  7,  2,  3 } },
4201     { ISD::SMIN,       MVT::v4i32,   {  1,  1,  1,  1 } },
4202     { ISD::SMIN,       MVT::v16i8,   {  1,  1,  1,  1 } },
4203     { ISD::SMULO,      MVT::v2i64,   {  9, 11, 13, 17 } },
4204     { ISD::SMULO,      MVT::v4i32,   { 20, 24, 13, 19 } },
4205     { ISD::SMULO,      MVT::v8i16,   {  5,  9,  8,  8 } },
4206     { ISD::SMULO,      MVT::v16i8,   { 13, 22, 24, 25 } },
4207     { ISD::UADDSAT,    MVT::v2i64,   {  6, 13, 14, 14 } },
4208     { ISD::UADDSAT,    MVT::v4i32,   {  2,  2,  4,  4 } },
4209     { ISD::USUBSAT,    MVT::v2i64,   {  6, 10, 14, 14 } },
4210     { ISD::USUBSAT,    MVT::v4i32,   {  1,  2,  2,  2 } },
4211     { ISD::UMAX,       MVT::v2i64,   {  2, 11,  6,  7 } },
4212     { ISD::UMAX,       MVT::v4i32,   {  1,  1,  1,  1 } },
4213     { ISD::UMAX,       MVT::v8i16,   {  1,  1,  1,  1 } },
4214     { ISD::UMIN,       MVT::v2i64,   {  2, 11,  6,  7 } },
4215     { ISD::UMIN,       MVT::v4i32,   {  1,  1,  1,  1 } },
4216     { ISD::UMIN,       MVT::v8i16,   {  1,  1,  1,  1 } },
4217     { ISD::UMULO,      MVT::v2i64,   { 14, 20, 15, 20 } },
4218     { ISD::UMULO,      MVT::v4i32,   { 19, 22, 12, 18 } },
4219     { ISD::UMULO,      MVT::v8i16,   {  4,  9,  7,  7 } },
4220     { ISD::UMULO,      MVT::v16i8,   { 13, 19, 18, 20 } },
4221   };
4222   static const CostKindTblEntry SSSE3CostTbl[] = {
4223     { ISD::ABS,        MVT::v4i32,   {  1,  2,  1,  1 } },
4224     { ISD::ABS,        MVT::v8i16,   {  1,  2,  1,  1 } },
4225     { ISD::ABS,        MVT::v16i8,   {  1,  2,  1,  1 } },
4226     { ISD::BITREVERSE, MVT::v2i64,   { 16, 20, 11, 21 } },
4227     { ISD::BITREVERSE, MVT::v4i32,   { 16, 20, 11, 21 } },
4228     { ISD::BITREVERSE, MVT::v8i16,   { 16, 20, 11, 21 } },
4229     { ISD::BITREVERSE, MVT::v16i8,   { 11, 12, 10, 16 } },
4230     { ISD::BSWAP,      MVT::v2i64,   {  2,  3,  1,  5 } },
4231     { ISD::BSWAP,      MVT::v4i32,   {  2,  3,  1,  5 } },
4232     { ISD::BSWAP,      MVT::v8i16,   {  2,  3,  1,  5 } },
4233     { ISD::CTLZ,       MVT::v2i64,   { 18, 28, 28, 35 } },
4234     { ISD::CTLZ,       MVT::v4i32,   { 15, 20, 22, 28 } },
4235     { ISD::CTLZ,       MVT::v8i16,   { 13, 17, 16, 22 } },
4236     { ISD::CTLZ,       MVT::v16i8,   { 11, 15, 10, 16 } },
4237     { ISD::CTPOP,      MVT::v2i64,   { 13, 19, 12, 18 } },
4238     { ISD::CTPOP,      MVT::v4i32,   { 18, 24, 16, 22 } },
4239     { ISD::CTPOP,      MVT::v8i16,   { 13, 18, 14, 20 } },
4240     { ISD::CTPOP,      MVT::v16i8,   { 11, 12, 10, 16 } },
4241     { ISD::CTTZ,       MVT::v2i64,   { 13, 25, 15, 22 } },
4242     { ISD::CTTZ,       MVT::v4i32,   { 18, 26, 19, 25 } },
4243     { ISD::CTTZ,       MVT::v8i16,   { 13, 20, 17, 23 } },
4244     { ISD::CTTZ,       MVT::v16i8,   { 11, 16, 13, 19 } }
4245   };
4246   static const CostKindTblEntry SSE2CostTbl[] = {
4247     { ISD::ABS,        MVT::v2i64,   {  3,  6,  5,  5 } },
4248     { ISD::ABS,        MVT::v4i32,   {  1,  4,  4,  4 } },
4249     { ISD::ABS,        MVT::v8i16,   {  1,  2,  3,  3 } },
4250     { ISD::ABS,        MVT::v16i8,   {  1,  2,  3,  3 } },
4251     { ISD::BITREVERSE, MVT::v2i64,   { 16, 20, 32, 32 } },
4252     { ISD::BITREVERSE, MVT::v4i32,   { 16, 20, 30, 30 } },
4253     { ISD::BITREVERSE, MVT::v8i16,   { 16, 20, 25, 25 } },
4254     { ISD::BITREVERSE, MVT::v16i8,   { 11, 12, 21, 21 } },
4255     { ISD::BSWAP,      MVT::v2i64,   {  5,  6, 11, 11 } },
4256     { ISD::BSWAP,      MVT::v4i32,   {  5,  5,  9,  9 } },
4257     { ISD::BSWAP,      MVT::v8i16,   {  5,  5,  4,  5 } },
4258     { ISD::CTLZ,       MVT::v2i64,   { 10, 45, 36, 38 } },
4259     { ISD::CTLZ,       MVT::v4i32,   { 10, 45, 38, 40 } },
4260     { ISD::CTLZ,       MVT::v8i16,   {  9, 38, 32, 34 } },
4261     { ISD::CTLZ,       MVT::v16i8,   {  8, 39, 29, 32 } },
4262     { ISD::CTPOP,      MVT::v2i64,   { 12, 26, 16, 18 } },
4263     { ISD::CTPOP,      MVT::v4i32,   { 15, 29, 21, 23 } },
4264     { ISD::CTPOP,      MVT::v8i16,   { 13, 25, 18, 20 } },
4265     { ISD::CTPOP,      MVT::v16i8,   { 10, 21, 14, 16 } },
4266     { ISD::CTTZ,       MVT::v2i64,   { 14, 28, 19, 21 } },
4267     { ISD::CTTZ,       MVT::v4i32,   { 18, 31, 24, 26 } },
4268     { ISD::CTTZ,       MVT::v8i16,   { 16, 27, 21, 23 } },
4269     { ISD::CTTZ,       MVT::v16i8,   { 13, 23, 17, 19 } },
4270     { ISD::SADDSAT,    MVT::v2i64,   { 12, 14, 24, 24 } },
4271     { ISD::SADDSAT,    MVT::v4i32,   {  6, 11, 11, 12 } },
4272     { ISD::SADDSAT,    MVT::v8i16,   {  1,  2,  1,  1 } },
4273     { ISD::SADDSAT,    MVT::v16i8,   {  1,  2,  1,  1 } },
4274     { ISD::SMAX,       MVT::v2i64,   {  4,  8, 15, 15 } },
4275     { ISD::SMAX,       MVT::v4i32,   {  2,  4,  5,  5 } },
4276     { ISD::SMAX,       MVT::v8i16,   {  1,  1,  1,  1 } },
4277     { ISD::SMAX,       MVT::v16i8,   {  2,  4,  5,  5 } },
4278     { ISD::SMIN,       MVT::v2i64,   {  4,  8, 15, 15 } },
4279     { ISD::SMIN,       MVT::v4i32,   {  2,  4,  5,  5 } },
4280     { ISD::SMIN,       MVT::v8i16,   {  1,  1,  1,  1 } },
4281     { ISD::SMIN,       MVT::v16i8,   {  2,  4,  5,  5 } },
4282     { ISD::SMULO,      MVT::v2i64,   { 30, 33, 13, 23 } },
4283     { ISD::SMULO,      MVT::v4i32,   { 20, 24, 23, 23 } },
4284     { ISD::SMULO,      MVT::v8i16,   {  5, 10,  8,  8 } },
4285     { ISD::SMULO,      MVT::v16i8,   { 13, 23, 24, 25 } },
4286     { ISD::SSUBSAT,    MVT::v2i64,   { 16, 19, 31, 31 } },
4287     { ISD::SSUBSAT,    MVT::v4i32,   {  6, 14, 12, 13 } },
4288     { ISD::SSUBSAT,    MVT::v8i16,   {  1,  2,  1,  1 } },
4289     { ISD::SSUBSAT,    MVT::v16i8,   {  1,  2,  1,  1 } },
4290     { ISD::UADDSAT,    MVT::v2i64,   {  7, 13, 14, 14 } },
4291     { ISD::UADDSAT,    MVT::v4i32,   {  4,  5,  7,  7 } },
4292     { ISD::UADDSAT,    MVT::v8i16,   {  1,  2,  1,  1 } },
4293     { ISD::UADDSAT,    MVT::v16i8,   {  1,  2,  1,  1 } },
4294     { ISD::UMAX,       MVT::v2i64,   {  4,  8, 15, 15 } },
4295     { ISD::UMAX,       MVT::v4i32,   {  2,  5,  8,  8 } },
4296     { ISD::UMAX,       MVT::v8i16,   {  1,  3,  3,  3 } },
4297     { ISD::UMAX,       MVT::v16i8,   {  1,  1,  1,  1 } },
4298     { ISD::UMIN,       MVT::v2i64,   {  4,  8, 15, 15 } },
4299     { ISD::UMIN,       MVT::v4i32,   {  2,  5,  8,  8 } },
4300     { ISD::UMIN,       MVT::v8i16,   {  1,  3,  3,  3 } },
4301     { ISD::UMIN,       MVT::v16i8,   {  1,  1,  1,  1 } },
4302     { ISD::UMULO,      MVT::v2i64,   { 30, 33, 15, 29 } },
4303     { ISD::UMULO,      MVT::v4i32,   { 19, 22, 14, 18 } },
4304     { ISD::UMULO,      MVT::v8i16,   {  4,  9,  7,  7 } },
4305     { ISD::UMULO,      MVT::v16i8,   { 13, 19, 20, 20 } },
4306     { ISD::USUBSAT,    MVT::v2i64,   {  7, 10, 14, 14 } },
4307     { ISD::USUBSAT,    MVT::v4i32,   {  4,  4,  7,  7 } },
4308     { ISD::USUBSAT,    MVT::v8i16,   {  1,  2,  1,  1 } },
4309     { ISD::USUBSAT,    MVT::v16i8,   {  1,  2,  1,  1 } },
4310     { ISD::FMAXNUM,    MVT::f64,     {  5,  5,  7,  7 } },
4311     { ISD::FMAXNUM,    MVT::v2f64,   {  4,  6,  6,  6 } },
4312     { ISD::FSQRT,      MVT::f64,     { 32, 32,  1,  1 } }, // Nehalem from http://www.agner.org/
4313     { ISD::FSQRT,      MVT::v2f64,   { 32, 32,  1,  1 } }, // Nehalem from http://www.agner.org/
4314   };
4315   static const CostKindTblEntry SSE1CostTbl[] = {
4316     { ISD::FMAXNUM,    MVT::f32,     {  5,  5,  7,  7 } },
4317     { ISD::FMAXNUM,    MVT::v4f32,   {  4,  6,  6,  6 } },
4318     { ISD::FSQRT,      MVT::f32,     { 28, 30,  1,  2 } }, // Pentium III from http://www.agner.org/
4319     { ISD::FSQRT,      MVT::v4f32,   { 56, 56,  1,  2 } }, // Pentium III from http://www.agner.org/
4320   };
4321   static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
4322     { ISD::CTTZ,       MVT::i64,     {  1,  1,  1,  1 } },
4323   };
4324   static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
4325     { ISD::CTTZ,       MVT::i32,     {  1,  1,  1,  1 } },
4326     { ISD::CTTZ,       MVT::i16,     {  2,  1,  1,  1 } },
4327     { ISD::CTTZ,       MVT::i8,      {  2,  1,  1,  1 } },
4328   };
4329   static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
4330     { ISD::CTLZ,       MVT::i64,     {  1,  1,  1,  1 } },
4331   };
4332   static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
4333     { ISD::CTLZ,       MVT::i32,     {  1,  1,  1,  1 } },
4334     { ISD::CTLZ,       MVT::i16,     {  2,  1,  1,  1 } },
4335     { ISD::CTLZ,       MVT::i8,      {  2,  1,  1,  1 } },
4336   };
4337   static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
4338     { ISD::CTPOP,      MVT::i64,     {  1,  1,  1,  1 } }, // popcnt
4339   };
4340   static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
4341     { ISD::CTPOP,      MVT::i32,     {  1,  1,  1,  1 } }, // popcnt
4342     { ISD::CTPOP,      MVT::i16,     {  1,  1,  2,  2 } }, // popcnt(zext())
4343     { ISD::CTPOP,      MVT::i8,      {  1,  1,  2,  2 } }, // popcnt(zext())
4344   };
4345   static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
4346     { ISD::ABS,        MVT::i64,     {  1,  2,  3,  3 } }, // SUB+CMOV
4347     { ISD::BITREVERSE, MVT::i64,     { 10, 12, 20, 22 } },
4348     { ISD::BSWAP,      MVT::i64,     {  1,  2,  1,  2 } },
4349     { ISD::CTLZ,       MVT::i64,     {  1,  2,  3,  3 } }, // MOV+BSR+XOR
4350     { ISD::CTLZ,       MVT::i32,     {  1,  2,  3,  3 } }, // MOV+BSR+XOR
4351     { ISD::CTLZ,       MVT::i16,     {  2,  2,  3,  3 } }, // MOV+BSR+XOR
4352     { ISD::CTLZ,       MVT::i8,      {  2,  2,  4,  3 } }, // MOV+BSR+XOR
4353     { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{  1,  2,  2,  2 } }, // BSR+XOR
4354     { ISD::CTTZ,       MVT::i64,     {  1,  2,  2,  2 } }, // MOV+BSF
4355     { ISD::CTTZ,       MVT::i32,     {  1,  2,  2,  2 } }, // MOV+BSF
4356     { ISD::CTTZ,       MVT::i16,     {  2,  2,  2,  2 } }, // MOV+BSF
4357     { ISD::CTTZ,       MVT::i8,      {  2,  2,  2,  2 } }, // MOV+BSF
4358     { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{  1,  2,  1,  2 } }, // BSF
4359     { ISD::CTPOP,      MVT::i64,     { 10,  6, 19, 19 } },
4360     { ISD::ROTL,       MVT::i64,     {  2,  3,  1,  3 } },
4361     { ISD::ROTR,       MVT::i64,     {  2,  3,  1,  3 } },
4362     { X86ISD::VROTLI,  MVT::i64,     {  1,  1,  1,  1 } },
4363     { ISD::FSHL,       MVT::i64,     {  4,  4,  1,  4 } },
4364     { ISD::SADDSAT,    MVT::i64,     {  4,  4,  7, 10 } },
4365     { ISD::SSUBSAT,    MVT::i64,     {  4,  5,  8, 11 } },
4366     { ISD::UADDSAT,    MVT::i64,     {  2,  3,  4,  7 } },
4367     { ISD::USUBSAT,    MVT::i64,     {  2,  3,  4,  7 } },
4368     { ISD::SMAX,       MVT::i64,     {  1,  3,  2,  3 } },
4369     { ISD::SMIN,       MVT::i64,     {  1,  3,  2,  3 } },
4370     { ISD::UMAX,       MVT::i64,     {  1,  3,  2,  3 } },
4371     { ISD::UMIN,       MVT::i64,     {  1,  3,  2,  3 } },
4372     { ISD::SADDO,      MVT::i64,     {  2,  2,  4,  6 } },
4373     { ISD::UADDO,      MVT::i64,     {  2,  2,  4,  6 } },
4374     { ISD::SMULO,      MVT::i64,     {  4,  4,  4,  6 } },
4375     { ISD::UMULO,      MVT::i64,     {  8,  8,  4,  7 } },
4376   };
4377   static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
4378     { ISD::ABS,        MVT::i32,     {  1,  2,  3,  3 } }, // SUB+XOR+SRA or SUB+CMOV
4379     { ISD::ABS,        MVT::i16,     {  2,  2,  3,  3 } }, // SUB+XOR+SRA or SUB+CMOV
4380     { ISD::ABS,        MVT::i8,      {  2,  4,  4,  3 } }, // SUB+XOR+SRA
4381     { ISD::BITREVERSE, MVT::i32,     {  9, 12, 17, 19 } },
4382     { ISD::BITREVERSE, MVT::i16,     {  9, 12, 17, 19 } },
4383     { ISD::BITREVERSE, MVT::i8,      {  7,  9, 13, 14 } },
4384     { ISD::BSWAP,      MVT::i32,     {  1,  1,  1,  1 } },
4385     { ISD::BSWAP,      MVT::i16,     {  1,  2,  1,  2 } }, // ROL
4386     { ISD::CTLZ,       MVT::i32,     {  2,  2,  4,  5 } }, // BSR+XOR or BSR+XOR+CMOV
4387     { ISD::CTLZ,       MVT::i16,     {  2,  2,  4,  5 } }, // BSR+XOR or BSR+XOR+CMOV
4388     { ISD::CTLZ,       MVT::i8,      {  2,  2,  5,  6 } }, // BSR+XOR or BSR+XOR+CMOV
4389     { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{  1,  2,  2,  2 } }, // BSR+XOR
4390     { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{  2,  2,  2,  2 } }, // BSR+XOR
4391     { ISD::CTLZ_ZERO_UNDEF, MVT::i8, {  2,  2,  3,  3 } }, // BSR+XOR
4392     { ISD::CTTZ,       MVT::i32,     {  2,  2,  3,  3 } }, // TEST+BSF+CMOV/BRANCH
4393     { ISD::CTTZ,       MVT::i16,     {  2,  2,  2,  3 } }, // TEST+BSF+CMOV/BRANCH
4394     { ISD::CTTZ,       MVT::i8,      {  2,  2,  2,  3 } }, // TEST+BSF+CMOV/BRANCH
4395     { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{  1,  2,  1,  2 } }, // BSF
4396     { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{  2,  2,  1,  2 } }, // BSF
4397     { ISD::CTTZ_ZERO_UNDEF, MVT::i8, {  2,  2,  1,  2 } }, // BSF
4398     { ISD::CTPOP,      MVT::i32,     {  8,  7, 15, 15 } },
4399     { ISD::CTPOP,      MVT::i16,     {  9,  8, 17, 17 } },
4400     { ISD::CTPOP,      MVT::i8,      {  7,  6,  6,  6 } },
4401     { ISD::ROTL,       MVT::i32,     {  2,  3,  1,  3 } },
4402     { ISD::ROTL,       MVT::i16,     {  2,  3,  1,  3 } },
4403     { ISD::ROTL,       MVT::i8,      {  2,  3,  1,  3 } },
4404     { ISD::ROTR,       MVT::i32,     {  2,  3,  1,  3 } },
4405     { ISD::ROTR,       MVT::i16,     {  2,  3,  1,  3 } },
4406     { ISD::ROTR,       MVT::i8,      {  2,  3,  1,  3 } },
4407     { X86ISD::VROTLI,  MVT::i32,     {  1,  1,  1,  1 } },
4408     { X86ISD::VROTLI,  MVT::i16,     {  1,  1,  1,  1 } },
4409     { X86ISD::VROTLI,  MVT::i8,      {  1,  1,  1,  1 } },
4410     { ISD::FSHL,       MVT::i32,     {  4,  4,  1,  4 } },
4411     { ISD::FSHL,       MVT::i16,     {  4,  4,  2,  5 } },
4412     { ISD::FSHL,       MVT::i8,      {  4,  4,  2,  5 } },
4413     { ISD::SADDSAT,    MVT::i32,     {  3,  4,  6,  9 } },
4414     { ISD::SADDSAT,    MVT::i16,     {  4,  4,  7, 10 } },
4415     { ISD::SADDSAT,    MVT::i8,      {  4,  5,  8, 11 } },
4416     { ISD::SSUBSAT,    MVT::i32,     {  4,  4,  7, 10 } },
4417     { ISD::SSUBSAT,    MVT::i16,     {  4,  4,  7, 10 } },
4418     { ISD::SSUBSAT,    MVT::i8,      {  4,  5,  8, 11 } },
4419     { ISD::UADDSAT,    MVT::i32,     {  2,  3,  4,  7 } },
4420     { ISD::UADDSAT,    MVT::i16,     {  2,  3,  4,  7 } },
4421     { ISD::UADDSAT,    MVT::i8,      {  3,  3,  5,  8 } },
4422     { ISD::USUBSAT,    MVT::i32,     {  2,  3,  4,  7 } },
4423     { ISD::USUBSAT,    MVT::i16,     {  2,  3,  4,  7 } },
4424     { ISD::USUBSAT,    MVT::i8,      {  3,  3,  5,  8 } },
4425     { ISD::SMAX,       MVT::i32,     {  1,  2,  2,  3 } },
4426     { ISD::SMAX,       MVT::i16,     {  1,  4,  2,  4 } },
4427     { ISD::SMAX,       MVT::i8,      {  1,  4,  2,  4 } },
4428     { ISD::SMIN,       MVT::i32,     {  1,  2,  2,  3 } },
4429     { ISD::SMIN,       MVT::i16,     {  1,  4,  2,  4 } },
4430     { ISD::SMIN,       MVT::i8,      {  1,  4,  2,  4 } },
4431     { ISD::UMAX,       MVT::i32,     {  1,  2,  2,  3 } },
4432     { ISD::UMAX,       MVT::i16,     {  1,  4,  2,  4 } },
4433     { ISD::UMAX,       MVT::i8,      {  1,  4,  2,  4 } },
4434     { ISD::UMIN,       MVT::i32,     {  1,  2,  2,  3 } },
4435     { ISD::UMIN,       MVT::i16,     {  1,  4,  2,  4 } },
4436     { ISD::UMIN,       MVT::i8,      {  1,  4,  2,  4 } },
4437     { ISD::SADDO,      MVT::i32,     {  2,  2,  4,  6 } },
4438     { ISD::SADDO,      MVT::i16,     {  2,  2,  4,  6 } },
4439     { ISD::SADDO,      MVT::i8,      {  2,  2,  4,  6 } },
4440     { ISD::UADDO,      MVT::i32,     {  2,  2,  4,  6 } },
4441     { ISD::UADDO,      MVT::i16,     {  2,  2,  4,  6 } },
4442     { ISD::UADDO,      MVT::i8,      {  2,  2,  4,  6 } },
4443     { ISD::SMULO,      MVT::i32,     {  2,  2,  4,  6 } },
4444     { ISD::SMULO,      MVT::i16,     {  5,  5,  4,  6 } },
4445     { ISD::SMULO,      MVT::i8,      {  6,  6,  4,  6 } },
4446     { ISD::UMULO,      MVT::i32,     {  6,  6,  4,  8 } },
4447     { ISD::UMULO,      MVT::i16,     {  6,  6,  4,  9 } },
4448     { ISD::UMULO,      MVT::i8,      {  6,  6,  4,  6 } },
4449   };
4450 
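  // Map the intrinsic to an equivalent ISD opcode, then consult the feature
  // tables above. OpTy is the type that gets legalized; for the
  // *_with_overflow intrinsics it is the value half of the returned
  // {value, overflow-bit} pair.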
4451   Type *RetTy = ICA.getReturnType();
4452   Type *OpTy = RetTy;
4453   Intrinsic::ID IID = ICA.getID();
4454   unsigned ISD = ISD::DELETED_NODE;
4455   switch (IID) {
4456   default:
4457     break;
4458   case Intrinsic::abs:
4459     ISD = ISD::ABS;
4460     break;
4461   case Intrinsic::bitreverse:
4462     ISD = ISD::BITREVERSE;
4463     break;
4464   case Intrinsic::bswap:
4465     ISD = ISD::BSWAP;
4466     break;
4467   case Intrinsic::ctlz:
4468     ISD = ISD::CTLZ;
4469     break;
4470   case Intrinsic::ctpop:
4471     ISD = ISD::CTPOP;
4472     break;
4473   case Intrinsic::cttz:
4474     ISD = ISD::CTTZ;
4475     break;
4476   case Intrinsic::fshl:
4477     ISD = ISD::FSHL;
4478     if (!ICA.isTypeBasedOnly()) {
4479       const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4480       if (Args[0] == Args[1]) {
4481         ISD = ISD::ROTL;
4482         // Handle uniform constant rotation amounts.
4483         // TODO: Handle funnel-shift cases.
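        // e.g. fshl(x, x, 7) is a rotate-left by a uniform immediate and is
        // costed below as X86ISD::VROTLI.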
4484         const APInt *Amt;
4485         if (Args[2] &&
4486             PatternMatch::match(Args[2], PatternMatch::m_APIntAllowPoison(Amt)))
4487           ISD = X86ISD::VROTLI;
4488       }
4489     }
4490     break;
4491   case Intrinsic::fshr:
4492     // FSHR has the same costs, so don't duplicate.
4493     ISD = ISD::FSHL;
4494     if (!ICA.isTypeBasedOnly()) {
4495       const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4496       if (Args[0] == Args[1]) {
4497         ISD = ISD::ROTR;
4498         // Handle uniform constant rotation amounts.
4499         // TODO: Handle funnel-shift cases.
4500         const APInt *Amt;
4501         if (Args[2] &&
4502             PatternMatch::match(Args[2], PatternMatch::m_APIntAllowPoison(Amt)))
4503           ISD = X86ISD::VROTLI;
4504       }
4505     }
4506     break;
4507   case Intrinsic::lrint:
4508   case Intrinsic::llrint: {
4509     // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which
4510     // have the same costs as the CVTTP2SI (fptosi) instructions.
4511     const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes();
4512     return getCastInstrCost(Instruction::FPToSI, RetTy, ArgTys[0],
4513                             TTI::CastContextHint::None, CostKind);
4514   }
4515   case Intrinsic::maxnum:
4516   case Intrinsic::minnum:
4517     // FMINNUM has the same costs, so don't duplicate.
4518     ISD = ISD::FMAXNUM;
4519     break;
4520   case Intrinsic::sadd_sat:
4521     ISD = ISD::SADDSAT;
4522     break;
4523   case Intrinsic::smax:
4524     ISD = ISD::SMAX;
4525     break;
4526   case Intrinsic::smin:
4527     ISD = ISD::SMIN;
4528     break;
4529   case Intrinsic::ssub_sat:
4530     ISD = ISD::SSUBSAT;
4531     break;
4532   case Intrinsic::uadd_sat:
4533     ISD = ISD::UADDSAT;
4534     break;
4535   case Intrinsic::umax:
4536     ISD = ISD::UMAX;
4537     break;
4538   case Intrinsic::umin:
4539     ISD = ISD::UMIN;
4540     break;
4541   case Intrinsic::usub_sat:
4542     ISD = ISD::USUBSAT;
4543     break;
4544   case Intrinsic::sqrt:
4545     ISD = ISD::FSQRT;
4546     break;
4547   case Intrinsic::sadd_with_overflow:
4548   case Intrinsic::ssub_with_overflow:
4549     // SSUBO has the same costs, so don't duplicate.
4550     ISD = ISD::SADDO;
4551     OpTy = RetTy->getContainedType(0);
4552     break;
4553   case Intrinsic::uadd_with_overflow:
4554   case Intrinsic::usub_with_overflow:
4555     // USUBO has the same costs, so don't duplicate.
4556     ISD = ISD::UADDO;
4557     OpTy = RetTy->getContainedType(0);
4558     break;
4559   case Intrinsic::smul_with_overflow:
4560     ISD = ISD::SMULO;
4561     OpTy = RetTy->getContainedType(0);
4562     break;
4563   case Intrinsic::umul_with_overflow:
4564     ISD = ISD::UMULO;
4565     OpTy = RetTy->getContainedType(0);
4566     break;
4567   }
4568 
4569   if (ISD != ISD::DELETED_NODE) {
4570     auto adjustTableCost = [&](int ISD, unsigned Cost,
4571                                std::pair<InstructionCost, MVT> LT,
4572                                FastMathFlags FMF) -> InstructionCost {
4573       InstructionCost LegalizationCost = LT.first;
4574       MVT MTy = LT.second;
4575 
4576       // If there are no NaNs to deal with, then this is reduced to a
4577       // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT
4578       // sequence that we assume is used in the non-fast case.
4579       if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4580         if (FMF.noNaNs())
4581           return LegalizationCost;
4582       }
4583 
4584       // For cases where some ops can be folded into a load/store, assume free.
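      // e.g. with fast MOVBE, a scalar BSWAP whose only use is a store, or
      // whose operand is a single-use load, lowers to a single MOVBE.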
4585       if (MTy.isScalarInteger()) {
4586         if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4587           if (const Instruction *II = ICA.getInst()) {
4588             if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4589               return TTI::TCC_Free;
4590             if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4591               if (LI->hasOneUse())
4592                 return TTI::TCC_Free;
4593             }
4594           }
4595         }
4596       }
4597 
4598       return LegalizationCost * (int)Cost;
4599     };
4600 
4601     // Legalize the type.
4602     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4603     MVT MTy = LT.second;
4604 
4605     // Without BMI/LZCNT, see if we're only looking for a *_ZERO_UNDEF cost.
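    // The second operand of ctlz/cttz is an i1 'zero is poison' flag; if it
    // is true (an all-ones ConstantInt), the cheaper *_ZERO_UNDEF entries
    // apply.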
4606     if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4607          (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4608         !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4609       const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4610       if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
4611         if (Cst->isAllOnesValue())
4612           ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF;
4613     }
4614 
4615     // FSQRT is a single instruction.
4616     if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize)
4617       return LT.first;
4618 
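    // Consult the tables from the most specific subtarget feature down to
    // the generic baseline; the first matching entry wins.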
4619     if (ST->useGLMDivSqrtCosts())
4620       if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4621         if (auto KindCost = Entry->Cost[CostKind])
4622           return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4623 
4624     if (ST->useSLMArithCosts())
4625       if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4626         if (auto KindCost = Entry->Cost[CostKind])
4627           return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4628 
4629     if (ST->hasVBMI2())
4630       if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4631         if (auto KindCost = Entry->Cost[CostKind])
4632           return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4633 
4634     if (ST->hasBITALG())
4635       if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4636         if (auto KindCost = Entry->Cost[CostKind])
4637           return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4638 
4639     if (ST->hasVPOPCNTDQ())
4640       if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4641         if (auto KindCost = Entry->Cost[CostKind])
4642           return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4643 
4644     if (ST->hasGFNI())
4645       if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy))
4646         if (auto KindCost = Entry->Cost[CostKind])
4647           return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4648 
4649     if (ST->hasCDI())
4650       if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4651         if (auto KindCost = Entry->Cost[CostKind])
4652           return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4653 
4654     if (ST->hasBWI())
4655       if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4656         if (auto KindCost = Entry->Cost[CostKind])
4657           return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4658 
4659     if (ST->hasAVX512())
4660       if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4661         if (auto KindCost = Entry->Cost[CostKind])
4662           return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4663 
4664     if (ST->hasXOP())
4665       if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4666         if (auto KindCost = Entry->Cost[CostKind])
4667           return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4668 
4669     if (ST->hasAVX2())
4670       if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4671         if (auto KindCost = Entry->Cost[CostKind])
4672           return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4673 
4674     if (ST->hasAVX())
4675       if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4676         if (auto KindCost = Entry->Cost[CostKind])
4677           return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4678 
4679     if (ST->hasSSE42())
4680       if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4681         if (auto KindCost = Entry->Cost[CostKind])
4682           return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4683 
4684     if (ST->hasSSE41())
4685       if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4686         if (auto KindCost = Entry->Cost[CostKind])
4687           return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4688 
4689     if (ST->hasSSSE3())
4690       if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4691         if (auto KindCost = Entry->Cost[CostKind])
4692           return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4693 
4694     if (ST->hasSSE2())
4695       if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4696         if (auto KindCost = Entry->Cost[CostKind])
4697           return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4698 
4699     if (ST->hasSSE1())
4700       if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4701         if (auto KindCost = Entry->Cost[CostKind])
4702           return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4703 
4704     if (ST->hasBMI()) {
4705       if (ST->is64Bit())
4706         if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4707           if (auto KindCost = Entry->Cost[CostKind])
4708             return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4709 
4710       if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4711         if (auto KindCost = Entry->Cost[CostKind])
4712           return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4713     }
4714 
4715     if (ST->hasLZCNT()) {
4716       if (ST->is64Bit())
4717         if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4718           if (auto KindCost = Entry->Cost[CostKind])
4719             return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4720 
4721       if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4722         if (auto KindCost = Entry->Cost[CostKind])
4723           return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4724     }
4725 
4726     if (ST->hasPOPCNT()) {
4727       if (ST->is64Bit())
4728         if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4729           if (auto KindCost = Entry->Cost[CostKind])
4730             return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4731 
4732       if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4733         if (auto KindCost = Entry->Cost[CostKind])
4734           return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4735     }
4736 
4737     if (ST->is64Bit())
4738       if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4739         if (auto KindCost = Entry->Cost[CostKind])
4740           return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4741 
4742     if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4743       if (auto KindCost = Entry->Cost[CostKind])
4744         return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4745 
4746     // Without arg data, we need to compute the expanded costs of custom-lowered
4747     // intrinsics to prevent use of the (very low) default costs.
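    // The expansion being costed is roughly:
    //   fshl(x, y, z) --> or(shl(x, z & (bw-1)), lshr(y, bw - (z & (bw-1))))
    // plus a cmp+select to return x when the shift amount is zero.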
4748     if (ICA.isTypeBasedOnly() &&
4749         (IID == Intrinsic::fshl || IID == Intrinsic::fshr)) {
4750       Type *CondTy = RetTy->getWithNewBitWidth(1);
4751       InstructionCost Cost = 0;
4752       Cost += getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
4753       Cost += getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
4754       Cost += getArithmeticInstrCost(BinaryOperator::Shl, RetTy, CostKind);
4755       Cost += getArithmeticInstrCost(BinaryOperator::LShr, RetTy, CostKind);
4756       Cost += getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
4757       Cost += getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
4758                                  CmpInst::ICMP_EQ, CostKind);
4759       Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
4760                                  CmpInst::ICMP_EQ, CostKind);
4761       return Cost;
4762     }
4763   }
4764 
4765   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
4766 }
4767 
4768 InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4769                                                TTI::TargetCostKind CostKind,
4770                                                unsigned Index, Value *Op0,
4771                                                Value *Op1) {
4772   static const CostTblEntry SLMCostTbl[] = {
4773      { ISD::EXTRACT_VECTOR_ELT,       MVT::i8,      4 },
4774      { ISD::EXTRACT_VECTOR_ELT,       MVT::i16,     4 },
4775      { ISD::EXTRACT_VECTOR_ELT,       MVT::i32,     4 },
4776      { ISD::EXTRACT_VECTOR_ELT,       MVT::i64,     7 }
4777    };
4778 
4779   assert(Val->isVectorTy() && "This must be a vector type");
4780   Type *ScalarType = Val->getScalarType();
4781   InstructionCost RegisterFileMoveCost = 0;
4782 
4783   // Non-immediate extraction/insertion can be handled as a sequence of
4784   // aliased loads+stores via the stack.
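  // e.g. a variable-index extract is modelled as: spill the vector to a
  // stack slot, then reload the addressed scalar; an insert is modelled as a
  // vector spill, a scalar store, and a reload of the whole vector.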
4785   if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4786                        Opcode == Instruction::InsertElement)) {
4787     // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4788     // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4789 
4790     // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4791     assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4792     Align VecAlign = DL.getPrefTypeAlign(Val);
4793     Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4794 
4795     // Extract - store vector to stack, load scalar.
4796     if (Opcode == Instruction::ExtractElement) {
4797       return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4798              getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4799                              CostKind);
4800     }
4801     // Insert - store vector to stack, store scalar, load vector.
4802     if (Opcode == Instruction::InsertElement) {
4803       return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4804              getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4805                              CostKind) +
4806              getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4807     }
4808   }
4809 
4810   if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4811                        Opcode == Instruction::InsertElement)) {
4812     // Extraction of vXi1 elements is now efficiently handled by MOVMSK.
4813     if (Opcode == Instruction::ExtractElement &&
4814         ScalarType->getScalarSizeInBits() == 1 &&
4815         cast<FixedVectorType>(Val)->getNumElements() > 1)
4816       return 1;
4817 
4818     // Legalize the type.
4819     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4820 
4821     // This type is legalized to a scalar type.
4822     if (!LT.second.isVector())
4823       return TTI::TCC_Free;
4824 
4825     // The type may be split. Normalize the index to the new type.
4826     unsigned SizeInBits = LT.second.getSizeInBits();
4827     unsigned NumElts = LT.second.getVectorNumElements();
4828     unsigned SubNumElts = NumElts;
4829     Index = Index % NumElts;
4830 
4831     // For >128-bit vectors, we need to extract the higher 128-bit subvectors.
4832     // For inserts, we also need to insert the subvector back.
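    // e.g. on AVX, extracting element 5 of a 256-bit v8i32 first extracts
    // the upper 128-bit subvector (one register-file move) and then rebases
    // the index to 1 within it.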
4833     if (SizeInBits > 128) {
4834       assert((SizeInBits % 128) == 0 && "Illegal vector");
4835       unsigned NumSubVecs = SizeInBits / 128;
4836       SubNumElts = NumElts / NumSubVecs;
4837       if (SubNumElts <= Index) {
4838         RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4839         Index %= SubNumElts;
4840       }
4841     }
4842 
4843     MVT MScalarTy = LT.second.getScalarType();
4844     auto IsCheapPInsrPExtrInsertPS = [&]() {
4845       // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4846       // Inserting f32 into index0 is just movss.
4847       // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4848       return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4849              (MScalarTy.isInteger() && ST->hasSSE41()) ||
4850              (MScalarTy == MVT::f32 && ST->hasSSE1() && Index == 0 &&
4851               Opcode == Instruction::InsertElement) ||
4852              (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4853               Opcode == Instruction::InsertElement);
4854     };
4855 
4856     if (Index == 0) {
4857       // Floating-point scalars are already located at index #0.
4858       // Many insertions at #0 can fold away for scalar fp-ops, so assume this
4859       // holds for all of them.
4860       if (ScalarType->isFloatingPointTy() &&
4861           (Opcode != Instruction::InsertElement || !Op0 ||
4862            isa<UndefValue>(Op0)))
4863         return RegisterFileMoveCost;
4864 
4865       if (Opcode == Instruction::InsertElement &&
4866           isa_and_nonnull<UndefValue>(Op0)) {
4867         // Consider the gather cost to be cheap.
4868         if (isa_and_nonnull<LoadInst>(Op1))
4869           return RegisterFileMoveCost;
4870         if (!IsCheapPInsrPExtrInsertPS()) {
4871           // mov constant-to-GPR + movd/movq GPR -> XMM.
4872           if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4873             return 2 + RegisterFileMoveCost;
4874           // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4875           return 1 + RegisterFileMoveCost;
4876         }
4877       }
4878 
4879       // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4880       if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4881         return 1 + RegisterFileMoveCost;
4882     }
4883 
4884     int ISD = TLI->InstructionOpcodeToISD(Opcode);
4885     assert(ISD && "Unexpected vector opcode");
4886     if (ST->useSLMArithCosts())
4887       if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
4888         return Entry->Cost + RegisterFileMoveCost;
4889 
4890     // Consider cheap cases.
4891     if (IsCheapPInsrPExtrInsertPS())
4892       return 1 + RegisterFileMoveCost;
4893 
4894     // For extractions we just need to shuffle the element to index 0, which
4895     // should be very cheap (assume cost = 1). For insertions we need to shuffle
4896     // the element to its destination. In both cases we must handle the
4897     // subvector move(s).
4898     // If the vector type is already less than 128-bits then don't reduce it.
4899     // TODO: Under what circumstances should we shuffle using the full width?
4900     InstructionCost ShuffleCost = 1;
4901     if (Opcode == Instruction::InsertElement) {
4902       auto *SubTy = cast<VectorType>(Val);
4903       EVT VT = TLI->getValueType(DL, Val);
4904       if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
4905         SubTy = FixedVectorType::get(ScalarType, SubNumElts);
4906       ShuffleCost =
4907           getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, {}, CostKind, 0, SubTy);
4908     }
4909     int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
4910     return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
4911   }
4912 
4913   return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
4914          RegisterFileMoveCost;
4915 }
4916 
4917 InstructionCost X86TTIImpl::getScalarizationOverhead(
4918     VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4919     TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
4920   assert(DemandedElts.getBitWidth() ==
4921              cast<FixedVectorType>(Ty)->getNumElements() &&
4922          "Vector size mismatch");
4923 
4924   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4925   MVT MScalarTy = LT.second.getScalarType();
4926   unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
4927   InstructionCost Cost = 0;
4928 
4929   constexpr unsigned LaneBitWidth = 128;
4930   assert((LegalVectorBitWidth < LaneBitWidth ||
4931           (LegalVectorBitWidth % LaneBitWidth) == 0) &&
4932          "Illegal vector");
4933 
4934   const int NumLegalVectors = *LT.first.getValue();
4935   assert(NumLegalVectors >= 0 && "Negative cost!");
4936 
4937   // For insertions, an ISD::BUILD_VECTOR-style vector initialization can be much
4938   // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
4939   if (Insert) {
4940     if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4941         (MScalarTy.isInteger() && ST->hasSSE41()) ||
4942         (MScalarTy == MVT::f32 && ST->hasSSE41())) {
4943       // For types we can insert directly, insertion into 128-bit subvectors is
4944       // cheap, followed by a cheap chain of concatenations.
4945       if (LegalVectorBitWidth <= LaneBitWidth) {
4946         Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
4947                                                 /*Extract*/ false, CostKind);
4948       } else {
4949         // For each 128-bit lane: if at least one index is demanded but not
4950         // all indices are demanded, and this lane is not the first 128-bit
4951         // lane of the legalized vector, then this lane needs an extracti128;
4952         // and any lane with at least one demanded index needs an
4953         // inserti128.
4954 
4955         // The following cases will help build a better understanding. Assume
4956         // we insert several elements into a v8i32 vector with AVX2:
4957         // Case#1: inserting at index 1 needs vpinsrd + inserti128.
4958         // Case#2: inserting at index 5 needs extracti128 + vpinsrd +
4959         // inserti128.
4960         // Case#3: inserting at indices 4-7 needs 4*vpinsrd + inserti128.
4961         assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
4962         unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4963         unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4964         unsigned NumLegalElts =
4965             LT.second.getVectorNumElements() * NumLegalVectors;
4966         assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4967                "Vector has been legalized to smaller element count");
4968         assert((NumLegalElts % NumLanesTotal) == 0 &&
4969                "Unexpected elts per lane");
4970         unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4971 
4972         APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4973         auto *LaneTy =
4974             FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4975 
4976         for (unsigned I = 0; I != NumLanesTotal; ++I) {
4977           APInt LaneEltMask = WidenedDemandedElts.extractBits(
4978               NumEltsPerLane, NumEltsPerLane * I);
4979           if (LaneEltMask.isZero())
4980             continue;
4981           // FIXME: we don't need to extract if all non-demanded elements
4982           //        are legalization-inserted padding.
4983           if (!LaneEltMask.isAllOnes())
4984             Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
4985                                    I * NumEltsPerLane, LaneTy);
4986           Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
4987                                                   /*Extract*/ false, CostKind);
4988         }
4989 
4990         APInt AffectedLanes =
4991             APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
4992         APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
4993             AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
4994         for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
4995           for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
4996             unsigned I = NumLegalLanes * LegalVec + Lane;
4997             // No need to insert unaffected lane; or lane 0 of each legal vector
4998             // No need to insert an unaffected lane, nor lane 0 of a legal vector
4999             // iff ALL lanes of that vector were affected and will be inserted.
5000                 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
5001               continue;
5002             Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, {}, CostKind,
5003                                    I * NumEltsPerLane, LaneTy);
5004           }
5005         }
5006       }
5007     } else if (LT.second.isVector()) {
5008       // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
5009       // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
5010       // series of UNPCK followed by CONCAT_VECTORS - all of these can be
5011       // considered cheap.
5012       if (Ty->isIntOrIntVectorTy())
5013         Cost += DemandedElts.popcount();
5014 
5015       // Get the smaller of the legalized or original pow2-extended number of
5016       // vector elements; one less than this is the number of unpacks we'll
5017       // end up performing.
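      // Illustrative worked example (a sketch, not a measured cost): building
      // all elements of a <16 x i8> on SSE2 (no PINSRB) is charged
      // popcount(DemandedElts) = 16 scalar-to-vector transfers plus
      // min(NumElts, Pow2Elts) - 1 = 15 unpack/concat steps, 31 in total.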
5018       unsigned NumElts = LT.second.getVectorNumElements();
5019       unsigned Pow2Elts =
5020           PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
5021       Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
5022     }
5023   }
5024 
5025   if (Extract) {
5026     // vXi1 can be efficiently extracted with MOVMSK.
5027     // TODO: AVX512 predicate mask handling.
5028     // NOTE: This doesn't work well for roundtrip scalarization.
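    // Illustrative example of the ceil-divide below: extracting all bits of a
    // <32 x i1> needs a single VPMOVMSKB with AVX2 (MaxElts = 32), but
    // ceil(32 / 16) = 2 PMOVMSKBs with only SSE2 (MaxElts = 16).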
5029     if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
5030       unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
5031       unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
5032       unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
5033       return MOVMSKCost;
5034     }
5035 
5036     if (LT.second.isVector()) {
5037       unsigned NumLegalElts =
5038           LT.second.getVectorNumElements() * NumLegalVectors;
5039       assert(NumLegalElts >= DemandedElts.getBitWidth() &&
5040              "Vector has been legalized to smaller element count");
5041 
5042       // If we're extracting elements from a 128-bit subvector lane,
5043       // we only need to extract each lane once, not for every element.
5044       if (LegalVectorBitWidth > LaneBitWidth) {
5045         unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
5046         unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
5047         assert((NumLegalElts % NumLanesTotal) == 0 &&
5048                "Unexpected elts per lane");
5049         unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
5050 
5051         // Add cost for each demanded 128-bit subvector extraction.
5052         // Luckily this is a lot easier than for insertion.
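        // Illustrative example (a sketch of the loop below): extracting
        // elements 1 and 5 of a v8i32 legalized to 256 bits touches both
        // 128-bit lanes, so each lane is charged one SK_ExtractSubvector plus
        // the scalarization cost of its single demanded element.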
5053         APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
5054         auto *LaneTy =
5055             FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
5056 
5057         for (unsigned I = 0; I != NumLanesTotal; ++I) {
5058           APInt LaneEltMask = WidenedDemandedElts.extractBits(
5059               NumEltsPerLane, I * NumEltsPerLane);
5060           if (LaneEltMask.isZero())
5061             continue;
5062           Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
5063                                  I * NumEltsPerLane, LaneTy);
5064           Cost += BaseT::getScalarizationOverhead(
5065               LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
5066         }
5067 
5068         return Cost;
5069       }
5070     }
5071 
5072     // Fallback to default extraction.
5073     Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
5074                                             Extract, CostKind);
5075   }
5076 
5077   return Cost;
5078 }
5079 
5080 InstructionCost
5081 X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
5082                                       int VF, const APInt &DemandedDstElts,
5083                                       TTI::TargetCostKind CostKind) {
5084   const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
5085   // We don't differentiate element types here, only element bit width.
5086   EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
5087 
5088   auto bailout = [&]() {
5089     return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
5090                                             DemandedDstElts, CostKind);
5091   };
5092 
5093   // For now, only deal with AVX512 cases.
5094   if (!ST->hasAVX512())
5095     return bailout();
5096 
5097   // Do we have a native shuffle for this element type, or should we promote?
5098   unsigned PromEltTyBits = EltTyBits;
5099   switch (EltTyBits) {
5100   case 32:
5101   case 64:
5102     break; // AVX512F.
5103   case 16:
5104     if (!ST->hasBWI())
5105       PromEltTyBits = 32; // promote to i32, AVX512F.
5106     break;                // AVX512BW
5107   case 8:
5108     if (!ST->hasVBMI())
5109       PromEltTyBits = 32; // promote to i32, AVX512F.
5110     break;                // AVX512VBMI
5111   case 1:
5112     // There is no support for shuffling i1 elements. We *must* promote.
5113     if (ST->hasBWI()) {
5114       if (ST->hasVBMI())
5115         PromEltTyBits = 8; // promote to i8, AVX512VBMI.
5116       else
5117         PromEltTyBits = 16; // promote to i16, AVX512BW.
5118       break;
5119     }
5120     PromEltTyBits = 32; // promote to i32, AVX512F.
5121     break;
5122   default:
5123     return bailout();
5124   }
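  // Illustrative example (assuming an AVX512F-only target, no BWI/VBMI):
  // replicating i8 elements is done in i32, i.e. the source is anyext'd to
  // i32, permuted with a 32-bit shuffle (e.g. VPERMD), and the result is
  // truncated back to i8; the ext/trunc costs are added further below.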
5125   auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
5126 
5127   auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
5128   auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
5129 
5130   int NumDstElements = VF * ReplicationFactor;
5131   auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
5132   auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
5133 
5134   // Legalize the types.
5135   MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
5136   MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
5137   MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
5138   MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
5139   // They should have been legalized into vector types.
5140   if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
5141       !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
5142     return bailout();
5143 
5144   if (PromEltTyBits != EltTyBits) {
5145     // If we have to perform the shuffle with a wider elt type than our data type,
5146     // then we will first need to anyext (we don't care about the new bits)
5147     // the source elements, and then truncate Dst elements.
5148     InstructionCost PromotionCost;
5149     PromotionCost += getCastInstrCost(
5150         Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
5151         TargetTransformInfo::CastContextHint::None, CostKind);
5152     PromotionCost +=
5153         getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
5154                          /*Src=*/PromDstVecTy,
5155                          TargetTransformInfo::CastContextHint::None, CostKind);
5156     return PromotionCost + getReplicationShuffleCost(PromEltTy,
5157                                                      ReplicationFactor, VF,
5158                                                      DemandedDstElts, CostKind);
5159   }
5160 
5161   assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
5162          LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
5163          "We expect that legalization doesn't affect the element width "
5164          "and doesn't coalesce/split elements.");
5165 
5166   unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
5167   unsigned NumDstVectors =
5168       divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
5169 
5170   auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
5171 
5172   // Not all of the produced Dst elements may be demanded. Since each Dst
5173   // vector is formed by a single shuffle, if none of the elements that would
5174   // form a given Dst vector are demanded, we don't need that shuffle, so
5175   // adjust the cost accordingly.
5176   APInt DemandedDstVectors = APIntOps::ScaleBitMask(
5177       DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
5178   unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
5179 
5180   InstructionCost SingleShuffleCost = getShuffleCost(
5181       TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/{}, CostKind,
5182       /*Index=*/0, /*SubTp=*/nullptr);
5183   return NumDstVectorsDemanded * SingleShuffleCost;
5184 }
5185 
5186 InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
5187                                             MaybeAlign Alignment,
5188                                             unsigned AddressSpace,
5189                                             TTI::TargetCostKind CostKind,
5190                                             TTI::OperandValueInfo OpInfo,
5191                                             const Instruction *I) {
5192   // TODO: Handle other cost kinds.
5193   if (CostKind != TTI::TCK_RecipThroughput) {
5194     if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
5195       // A store instruction with indexed (index + scale) addressing costs 2 uops.
5196       // Check the preceding GEP to identify non-const indices.
5197       if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
5198         if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
5199           return TTI::TCC_Basic * 2;
5200       }
5201     }
5202     return TTI::TCC_Basic;
5203   }
5204 
5205   assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
5206          "Invalid Opcode");
5207   // Type legalization can't handle structs
5208   if (TLI->getValueType(DL, Src, true) == MVT::Other)
5209     return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5210                                   CostKind, OpInfo, I);
5211 
5212   // Legalize the type.
5213   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
5214 
5215   auto *VTy = dyn_cast<FixedVectorType>(Src);
5216 
5217   InstructionCost Cost = 0;
5218 
5219   // For a constant store, add the cost of loading the constant into a vector.
5220   if (Opcode == Instruction::Store && OpInfo.isConstant())
5221     Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
5222                             /*AddressSpace=*/0, CostKind, OpInfo);
5223 
5224   // Handle the simple case of non-vectors.
5225   // NOTE: this assumes that legalization never creates a vector from scalars!
5226   if (!VTy || !LT.second.isVector()) {
5227     // Each load/store unit costs 1.
5228     return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
5229   }
5230 
5231   bool IsLoad = Opcode == Instruction::Load;
5232 
5233   Type *EltTy = VTy->getElementType();
5234 
5235   const int EltTyBits = DL.getTypeSizeInBits(EltTy);
5236 
5237   // Source of truth: how many elements were there in the original IR vector?
5238   const unsigned SrcNumElt = VTy->getNumElements();
5239 
5240   // How far have we gotten?
5241   int NumEltRemaining = SrcNumElt;
5242   // Note that we intentionally capture by reference, as NumEltRemaining changes.
5243   auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
5244 
5245   const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
5246 
5247   // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
5248   const unsigned XMMBits = 128;
5249   if (XMMBits % EltTyBits != 0)
5250     // Vector size must be a multiple of the element size. I.e. no padding.
5251     return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5252                                   CostKind, OpInfo, I);
5253   const int NumEltPerXMM = XMMBits / EltTyBits;
5254 
5255   auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
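  // Illustrative trace of the loop below (a sketch, assuming SSE2 and a store
  // of a v3i32): a 16-byte op doesn't fit the remaining 3 elements, so the op
  // size halves to 8 bytes (store 2 elements, cost 1), then to 4 bytes (store
  // the last element, cost 1), plus the scalarization overhead of extracting
  // that third element from the XMM register.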
5256 
5257   for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
5258        NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
5259     // How many elements would a single op deal with at once?
5260     if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
5261       // Vector size must be a multiple of the element size. I.e. no padding.
5262       return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5263                                     CostKind, OpInfo, I);
5264     int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
5265 
5266     assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
5267     assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
5268             (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
5269            "Unless we haven't halved the op size yet, "
5270            "we have less than two op-sized units of work left.");
5271 
5272     auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
5273                           ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
5274                           : XMMVecTy;
5275 
5276     assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
5277            "After halving sizes, the vector elt count is no longer a multiple "
5278            "of number of elements per operation?");
5279     auto *CoalescedVecTy =
5280         CurrNumEltPerOp == 1
5281             ? CurrVecTy
5282             : FixedVectorType::get(
5283                   IntegerType::get(Src->getContext(),
5284                                    EltTyBits * CurrNumEltPerOp),
5285                   CurrVecTy->getNumElements() / CurrNumEltPerOp);
5286     assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
5287                DL.getTypeSizeInBits(CurrVecTy) &&
5288            "coalescing elements doesn't change vector width.");
5289 
5290     while (NumEltRemaining > 0) {
5291       assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
5292 
5293       // Can we use this vector size, as per the remaining element count?
5294       // Iff the vector is naturally aligned, we can do a wide load regardless.
5295       if (NumEltRemaining < CurrNumEltPerOp &&
5296           (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
5297           CurrOpSizeBytes != 1)
5298         break; // Try a smaller vector size.
5299 
5300       // This isn't exactly right. We're using slow unaligned 32-byte accesses
5301       // as a proxy for a double-pumped AVX memory interface such as on
5302       // Sandybridge.
5303       // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
5304       // will be scalarized.
5305       if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
5306         Cost += 2;
5307       else if (CurrOpSizeBytes < 4)
5308         Cost += 2;
5309       else
5310         Cost += 1;
5311 
5312       // If we're loading a uniform value, then we don't need to split the
5313       // load; a single (widest) vector load can be reused by all the splits.
5314       if (IsLoad && OpInfo.isUniform())
5315         return Cost;
5316 
5317       bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
5318 
5319       // If we have fully processed the previous reg, we need to replenish it.
5320       if (SubVecEltsLeft == 0) {
5321         SubVecEltsLeft += CurrVecTy->getNumElements();
5322         // And that's free only for the 0'th subvector of a legalized vector.
5323         if (!Is0thSubVec)
5324           Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
5325                                         : TTI::ShuffleKind::SK_ExtractSubvector,
5326                                  VTy, {}, CostKind, NumEltDone(), CurrVecTy);
5327       }
5328 
5329       // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
5330       // for smaller widths (32/16/8) we have to insert/extract them separately.
5331       // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
5332       // but let's pretend that it is also true for 16/8 bit wide ops...)
5333       if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
5334         int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
5335         assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "Expected op-aligned offset");
5336         int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
5337         APInt DemandedElts =
5338             APInt::getBitsSet(CoalescedVecTy->getNumElements(),
5339                               CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
5340         assert(DemandedElts.popcount() == 1 && "Inserting single value");
5341         Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
5342                                          !IsLoad, CostKind);
5343       }
5344 
5345       SubVecEltsLeft -= CurrNumEltPerOp;
5346       NumEltRemaining -= CurrNumEltPerOp;
5347       Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
5348     }
5349   }
5350 
5351   assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
5352 
5353   return Cost;
5354 }
5355 
5356 InstructionCost
5357 X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
5358                                   unsigned AddressSpace,
5359                                   TTI::TargetCostKind CostKind) {
5360   bool IsLoad = (Instruction::Load == Opcode);
5361   bool IsStore = (Instruction::Store == Opcode);
5362 
5363   auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
5364   if (!SrcVTy)
5365     // For a scalar, take the regular memory op cost, without a mask.
5366     return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
5367 
5368   unsigned NumElem = SrcVTy->getNumElements();
5369   auto *MaskTy =
5370       FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
5371   if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
5372       (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
5373     // Scalarization
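    // Illustrative breakdown (a sketch): a masked load of <4 x i32> that is
    // not legal as a masked op is charged: extraction of the 4 mask lanes,
    // plus 4 * (compare + branch) to test them, plus insertion of the 4
    // loaded values, plus 4 scalar loads.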
5374     APInt DemandedElts = APInt::getAllOnes(NumElem);
5375     InstructionCost MaskSplitCost = getScalarizationOverhead(
5376         MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
5377     InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5378         Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
5379         CmpInst::BAD_ICMP_PREDICATE, CostKind);
5380     InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5381     InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
5382     InstructionCost ValueSplitCost = getScalarizationOverhead(
5383         SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
5384     InstructionCost MemopCost =
5385         NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5386                                          Alignment, AddressSpace, CostKind);
5387     return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
5388   }
5389 
5390   // Legalize the type.
5391   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
5392   auto VT = TLI->getValueType(DL, SrcVTy);
5393   InstructionCost Cost = 0;
5394   MVT Ty = LT.second;
5395   if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64)
5396     // APX masked load/store for scalar is cheap.
5397     return Cost + LT.first;
5398 
5399   if (VT.isSimple() && Ty != VT.getSimpleVT() &&
5400       LT.second.getVectorNumElements() == NumElem)
5401     // Promotion requires ext/trunc for the data and a shuffle for the mask.
5402     Cost +=
5403         getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, {}, CostKind, 0,
5404                        nullptr) +
5405         getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, {}, CostKind, 0, nullptr);
5406 
5407   else if (LT.first * Ty.getVectorNumElements() > NumElem) {
5408     auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
5409                                            Ty.getVectorNumElements());
5410     // Expanding requires filling the mask with zeroes.
5411     Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, {}, CostKind, 0,
5412                            MaskTy);
5413   }
5414 
5415   // Pre-AVX512 - each maskmov load costs ~2 and each maskmov store costs ~8.
5416   if (!ST->hasAVX512())
5417     return Cost + LT.first * (IsLoad ? 2 : 8);
5418 
5419   // AVX-512 masked load/store is cheaper
5420   return Cost + LT.first;
5421 }
5422 
5423 InstructionCost
5424 X86TTIImpl::getPointersChainCost(ArrayRef<const Value *> Ptrs,
5425                                  const Value *Base,
5426                                  const TTI::PointersChainInfo &Info,
5427                                  Type *AccessTy, TTI::TargetCostKind CostKind) {
5428   if (Info.isSameBase() && Info.isKnownStride()) {
5429     // If all the pointers have known stride all the differences are translated
5430     // into constants. X86 memory addressing allows encoding it into
5431     // displacement. So we just need to take the base GEP cost.
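    // For example, pointers p, p+4 and p+8 sharing the base GEP p differ only
    // by constant offsets, which fold into the [base + disp] addressing mode,
    // so only the base GEP itself is charged.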
5432     if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
5433       SmallVector<const Value *> Indices(BaseGEP->indices());
5434       return getGEPCost(BaseGEP->getSourceElementType(),
5435                         BaseGEP->getPointerOperand(), Indices, nullptr,
5436                         CostKind);
5437     }
5438     return TTI::TCC_Free;
5439   }
5440   return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
5441 }
5442 
5443 InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
5444                                                       ScalarEvolution *SE,
5445                                                       const SCEV *Ptr) {
5446   // Address computations in vectorized code with non-consecutive addresses will
5447   // likely result in more instructions compared to scalar code where the
5448   // computation can more often be merged into the index mode. The resulting
5449   // extra micro-ops can significantly decrease throughput.
5450   const unsigned NumVectorInstToHideOverhead = 10;
5451 
5452   // The cost of a strided access computation is hidden by the indexing
5453   // modes of X86 regardless of the stride value. We don't believe that there
5454   // is a difference between constant strided access in general and a
5455   // constant stride value which is less than or equal to 64.
5456   // Even in the case of (loop invariant) stride whose value is not known at
5457   // compile time, the address computation will not incur more than one extra
5458   // ADD instruction.
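  // Illustrative outcomes of the checks below (pre-AVX2 targets only): a
  // non-strided (gather-like) vector address is charged
  // NumVectorInstToHideOverhead (10); a strided access whose stride is not a
  // compile-time constant is charged a single extra ADD (1); everything else
  // falls through to the base implementation.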
5459   if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
5460     // TODO: AVX2 is the current cut-off because we don't have correct
5461     //       interleaving costs for prior ISAs.
5462     if (!BaseT::isStridedAccess(Ptr))
5463       return NumVectorInstToHideOverhead;
5464     if (!BaseT::getConstantStrideStep(SE, Ptr))
5465       return 1;
5466   }
5467 
5468   return BaseT::getAddressComputationCost(Ty, SE, Ptr);
5469 }
5470 
5471 InstructionCost
5472 X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5473                                        std::optional<FastMathFlags> FMF,
5474                                        TTI::TargetCostKind CostKind) {
5475   if (TTI::requiresOrderedReduction(FMF))
5476     return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5477 
5478   // We use the Intel Architecture Code Analyzer (IACA) to measure the
5479   // throughput and use that as the cost.
5480 
5481   static const CostTblEntry SLMCostTbl[] = {
5482     { ISD::FADD,  MVT::v2f64,   3 },
5483     { ISD::ADD,   MVT::v2i64,   5 },
5484   };
5485 
5486   static const CostTblEntry SSE2CostTbl[] = {
5487     { ISD::FADD,  MVT::v2f64,   2 },
5488     { ISD::FADD,  MVT::v2f32,   2 },
5489     { ISD::FADD,  MVT::v4f32,   4 },
5490     { ISD::ADD,   MVT::v2i64,   2 },      // The data reported by the IACA tool is "1.6".
5491     { ISD::ADD,   MVT::v2i32,   2 },      // FIXME: chosen to be less than v4i32
5492     { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.3".
5493     { ISD::ADD,   MVT::v2i16,   2 },      // The data reported by the IACA tool is "4.3".
5494     { ISD::ADD,   MVT::v4i16,   3 },      // The data reported by the IACA tool is "4.3".
5495     { ISD::ADD,   MVT::v8i16,   4 },      // The data reported by the IACA tool is "4.3".
5496     { ISD::ADD,   MVT::v2i8,    2 },
5497     { ISD::ADD,   MVT::v4i8,    2 },
5498     { ISD::ADD,   MVT::v8i8,    2 },
5499     { ISD::ADD,   MVT::v16i8,   3 },
5500   };
5501 
5502   static const CostTblEntry AVX1CostTbl[] = {
5503     { ISD::FADD,  MVT::v4f64,   3 },
5504     { ISD::FADD,  MVT::v4f32,   3 },
5505     { ISD::FADD,  MVT::v8f32,   4 },
5506     { ISD::ADD,   MVT::v2i64,   1 },      // The data reported by the IACA tool is "1.5".
5507     { ISD::ADD,   MVT::v4i64,   3 },
5508     { ISD::ADD,   MVT::v8i32,   5 },
5509     { ISD::ADD,   MVT::v16i16,  5 },
5510     { ISD::ADD,   MVT::v32i8,   4 },
5511   };
5512 
5513   int ISD = TLI->InstructionOpcodeToISD(Opcode);
5514   assert(ISD && "Invalid opcode");
5515 
5516   // Before legalizing the type, give a chance to look up illegal narrow types
5517   // in the table.
5518   // FIXME: Is there a better way to do this?
5519   EVT VT = TLI->getValueType(DL, ValTy);
5520   if (VT.isSimple()) {
5521     MVT MTy = VT.getSimpleVT();
5522     if (ST->useSLMArithCosts())
5523       if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5524         return Entry->Cost;
5525 
5526     if (ST->hasAVX())
5527       if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5528         return Entry->Cost;
5529 
5530     if (ST->hasSSE2())
5531       if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5532         return Entry->Cost;
5533   }
5534 
5535   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5536 
5537   MVT MTy = LT.second;
5538 
5539   auto *ValVTy = cast<FixedVectorType>(ValTy);
5540 
5541   // Special case: vXi8 mul reductions are performed as vXi16.
5542   if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5543     auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5544     auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5545     return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5546                             TargetTransformInfo::CastContextHint::None,
5547                             CostKind) +
5548            getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5549   }
5550 
5551   InstructionCost ArithmeticCost = 0;
5552   if (LT.first != 1 && MTy.isVector() &&
5553       MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5554     // Type needs to be split. We need LT.first - 1 arithmetic ops.
5555     auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5556                                             MTy.getVectorNumElements());
5557     ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5558     ArithmeticCost *= LT.first - 1;
5559   }
5560 
5561   if (ST->useSLMArithCosts())
5562     if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5563       return ArithmeticCost + Entry->Cost;
5564 
5565   if (ST->hasAVX())
5566     if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5567       return ArithmeticCost + Entry->Cost;
5568 
5569   if (ST->hasSSE2())
5570     if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5571       return ArithmeticCost + Entry->Cost;
5572 
5573   // FIXME: These assume a naive kshift+binop lowering, which is probably
5574   // conservative in most cases.
5575   static const CostTblEntry AVX512BoolReduction[] = {
5576     { ISD::AND,  MVT::v2i1,   3 },
5577     { ISD::AND,  MVT::v4i1,   5 },
5578     { ISD::AND,  MVT::v8i1,   7 },
5579     { ISD::AND,  MVT::v16i1,  9 },
5580     { ISD::AND,  MVT::v32i1, 11 },
5581     { ISD::AND,  MVT::v64i1, 13 },
5582     { ISD::OR,   MVT::v2i1,   3 },
5583     { ISD::OR,   MVT::v4i1,   5 },
5584     { ISD::OR,   MVT::v8i1,   7 },
5585     { ISD::OR,   MVT::v16i1,  9 },
5586     { ISD::OR,   MVT::v32i1, 11 },
5587     { ISD::OR,   MVT::v64i1, 13 },
5588   };
5589 
5590   static const CostTblEntry AVX2BoolReduction[] = {
5591     { ISD::AND,  MVT::v16i16,  2 }, // vpmovmskb + cmp
5592     { ISD::AND,  MVT::v32i8,   2 }, // vpmovmskb + cmp
5593     { ISD::OR,   MVT::v16i16,  2 }, // vpmovmskb + cmp
5594     { ISD::OR,   MVT::v32i8,   2 }, // vpmovmskb + cmp
5595   };
5596 
5597   static const CostTblEntry AVX1BoolReduction[] = {
5598     { ISD::AND,  MVT::v4i64,   2 }, // vmovmskpd + cmp
5599     { ISD::AND,  MVT::v8i32,   2 }, // vmovmskps + cmp
5600     { ISD::AND,  MVT::v16i16,  4 }, // vextractf128 + vpand + vpmovmskb + cmp
5601     { ISD::AND,  MVT::v32i8,   4 }, // vextractf128 + vpand + vpmovmskb + cmp
5602     { ISD::OR,   MVT::v4i64,   2 }, // vmovmskpd + cmp
5603     { ISD::OR,   MVT::v8i32,   2 }, // vmovmskps + cmp
5604     { ISD::OR,   MVT::v16i16,  4 }, // vextractf128 + vpor + vpmovmskb + cmp
5605     { ISD::OR,   MVT::v32i8,   4 }, // vextractf128 + vpor + vpmovmskb + cmp
5606   };
5607 
5608   static const CostTblEntry SSE2BoolReduction[] = {
5609     { ISD::AND,  MVT::v2i64,   2 }, // movmskpd + cmp
5610     { ISD::AND,  MVT::v4i32,   2 }, // movmskps + cmp
5611     { ISD::AND,  MVT::v8i16,   2 }, // pmovmskb + cmp
5612     { ISD::AND,  MVT::v16i8,   2 }, // pmovmskb + cmp
5613     { ISD::OR,   MVT::v2i64,   2 }, // movmskpd + cmp
5614     { ISD::OR,   MVT::v4i32,   2 }, // movmskps + cmp
5615     { ISD::OR,   MVT::v8i16,   2 }, // pmovmskb + cmp
5616     { ISD::OR,   MVT::v16i8,   2 }, // pmovmskb + cmp
5617   };
5618 
5619   // Handle bool allof/anyof patterns.
5620   if (ValVTy->getElementType()->isIntegerTy(1)) {
5621     InstructionCost ArithmeticCost = 0;
5622     if (LT.first != 1 && MTy.isVector() &&
5623         MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5624       // Type needs to be split. We need LT.first - 1 arithmetic ops.
5625       auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5626                                               MTy.getVectorNumElements());
5627       ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5628       ArithmeticCost *= LT.first - 1;
5629     }
5630 
5631     if (ST->hasAVX512())
5632       if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5633         return ArithmeticCost + Entry->Cost;
5634     if (ST->hasAVX2())
5635       if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5636         return ArithmeticCost + Entry->Cost;
5637     if (ST->hasAVX())
5638       if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5639         return ArithmeticCost + Entry->Cost;
5640     if (ST->hasSSE2())
5641       if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5642         return ArithmeticCost + Entry->Cost;
5643 
5644     return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5645   }
5646 
5647   unsigned NumVecElts = ValVTy->getNumElements();
5648   unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5649 
5650   // Special case power of 2 reductions where the scalar type isn't changed
5651   // by type legalization.
5652   if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5653     return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5654 
5655   InstructionCost ReductionCost = 0;
5656 
5657   auto *Ty = ValVTy;
5658   if (LT.first != 1 && MTy.isVector() &&
5659       MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5660     // Type needs to be split. We need LT.first - 1 arithmetic ops.
5661     Ty = FixedVectorType::get(ValVTy->getElementType(),
5662                               MTy.getVectorNumElements());
5663     ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5664     ReductionCost *= LT.first - 1;
5665     NumVecElts = MTy.getVectorNumElements();
5666   }
5667 
5668   // Now handle reduction with the legal type, taking into account size changes
5669   // at each level.
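  // Illustrative example (a sketch for a v16i32 add reduction on AVX512):
  //   512 -> 256 bits: extract_subvector + add on v8i32,
  //   256 -> 128 bits: extract_subvector + add on v4i32,
  //   128 ->  64 bits: v2i64-style permute + add,
  //    64 ->  32 bits: v4i32 shuffle + add,
  // followed by the final extractelement charged below.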
5670   while (NumVecElts > 1) {
5671     // Determine the size of the remaining vector we need to reduce.
5672     unsigned Size = NumVecElts * ScalarSize;
5673     NumVecElts /= 2;
5674     // If we're reducing from 256/512 bits, use an extract_subvector.
5675     if (Size > 128) {
5676       auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5677       ReductionCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {},
5678                                       CostKind, NumVecElts, SubTy);
5679       Ty = SubTy;
5680     } else if (Size == 128) {
5681       // Reducing from 128 bits is a permute of v2f64/v2i64.
5682       FixedVectorType *ShufTy;
5683       if (ValVTy->isFloatingPointTy())
5684         ShufTy =
5685             FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5686       else
5687         ShufTy =
5688             FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5689       ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
5690                                       CostKind, 0, nullptr);
5691     } else if (Size == 64) {
5692       // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5693       FixedVectorType *ShufTy;
5694       if (ValVTy->isFloatingPointTy())
5695         ShufTy =
5696             FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5697       else
5698         ShufTy =
5699             FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5700       ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
5701                                       CostKind, 0, nullptr);
5702     } else {
5703       // Reducing from smaller size is a shift by immediate.
5704       auto *ShiftTy = FixedVectorType::get(
5705           Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5706       ReductionCost += getArithmeticInstrCost(
5707           Instruction::LShr, ShiftTy, CostKind,
5708           {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5709           {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5710     }
5711 
5712     // Add the arithmetic op for this level.
5713     ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5714   }
5715 
5716   // Add the final extract element to the cost.
5717   return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5718                                             CostKind, 0, nullptr, nullptr);
5719 }
5720 
5721 InstructionCost X86TTIImpl::getMinMaxCost(Intrinsic::ID IID, Type *Ty,
5722                                           TTI::TargetCostKind CostKind,
5723                                           FastMathFlags FMF) {
5724   IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5725   return getIntrinsicInstrCost(ICA, CostKind);
5726 }
5727 
5728 InstructionCost
5729 X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
5730                                    FastMathFlags FMF,
5731                                    TTI::TargetCostKind CostKind) {
5732   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5733 
5734   MVT MTy = LT.second;
5735 
5736   int ISD;
5737   if (ValTy->isIntOrIntVectorTy()) {
5738     ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5739                                                              : ISD::SMIN;
5740   } else {
5741     assert(ValTy->isFPOrFPVectorTy() &&
5742            "Expected floating point or integer vector type.");
5743     ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5744               ? ISD::FMINNUM
5745               : ISD::FMINIMUM;
5746   }
5747 
5748   // We use the Intel Architecture Code Analyzer (IACA) to measure the
5749   // throughput and use that as the cost.
5750 
5751   static const CostTblEntry SSE2CostTbl[] = {
5752       {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
5753       {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
5754       {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
5755   };
5756 
5757   static const CostTblEntry SSE41CostTbl[] = {
5758       {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
5759       {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
5760       {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
5761       {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
5762       {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
5763       {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
5764       {ISD::SMIN, MVT::v2i8,  3}, // pminsb
5765       {ISD::SMIN, MVT::v4i8,  5}, // pminsb
5766       {ISD::SMIN, MVT::v8i8,  7}, // pminsb
5767       {ISD::SMIN, MVT::v16i8, 6},
5768       {ISD::UMIN, MVT::v2i8,  3}, // same as sse2
5769       {ISD::UMIN, MVT::v4i8,  5}, // same as sse2
5770       {ISD::UMIN, MVT::v8i8,  7}, // same as sse2
5771       {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
5772   };
5773 
5774   static const CostTblEntry AVX1CostTbl[] = {
5775       {ISD::SMIN, MVT::v16i16, 6},
5776       {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
5777       {ISD::SMIN, MVT::v32i8, 8},
5778       {ISD::UMIN, MVT::v32i8, 8},
5779   };
5780 
5781   static const CostTblEntry AVX512BWCostTbl[] = {
5782       {ISD::SMIN, MVT::v32i16, 8},
5783       {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
5784       {ISD::SMIN, MVT::v64i8, 10},
5785       {ISD::UMIN, MVT::v64i8, 10},
5786   };
5787 
5788   // Before legalizing the type, give a chance to look up illegal narrow types
5789   // in the table.
5790   // FIXME: Is there a better way to do this?
5791   EVT VT = TLI->getValueType(DL, ValTy);
5792   if (VT.isSimple()) {
5793     MVT MTy = VT.getSimpleVT();
5794     if (ST->hasBWI())
5795       if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5796         return Entry->Cost;
5797 
5798     if (ST->hasAVX())
5799       if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5800         return Entry->Cost;
5801 
5802     if (ST->hasSSE41())
5803       if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5804         return Entry->Cost;
5805 
5806     if (ST->hasSSE2())
5807       if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5808         return Entry->Cost;
5809   }
5810 
5811   auto *ValVTy = cast<FixedVectorType>(ValTy);
5812   unsigned NumVecElts = ValVTy->getNumElements();
5813 
5814   auto *Ty = ValVTy;
5815   InstructionCost MinMaxCost = 0;
5816   if (LT.first != 1 && MTy.isVector() &&
5817       MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5818     // Type needs to be split. We need LT.first - 1 operations.
5819     Ty = FixedVectorType::get(ValVTy->getElementType(),
5820                               MTy.getVectorNumElements());
5821     MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
5822     MinMaxCost *= LT.first - 1;
5823     NumVecElts = MTy.getVectorNumElements();
5824   }
5825 
5826   if (ST->hasBWI())
5827     if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5828       return MinMaxCost + Entry->Cost;
5829 
5830   if (ST->hasAVX())
5831     if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5832       return MinMaxCost + Entry->Cost;
5833 
5834   if (ST->hasSSE41())
5835     if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5836       return MinMaxCost + Entry->Cost;
5837 
5838   if (ST->hasSSE2())
5839     if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5840       return MinMaxCost + Entry->Cost;
5841 
5842   unsigned ScalarSize = ValTy->getScalarSizeInBits();
5843 
5844   // Special case power of 2 reductions where the scalar type isn't changed
5845   // by type legalization.
5846   if (!isPowerOf2_32(ValVTy->getNumElements()) ||
5847       ScalarSize != MTy.getScalarSizeInBits())
5848     return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
5849 
5850   // Now handle reduction with the legal type, taking into account size changes
5851   // at each level.
5852   while (NumVecElts > 1) {
5853     // Determine the size of the remaining vector we need to reduce.
5854     unsigned Size = NumVecElts * ScalarSize;
5855     NumVecElts /= 2;
5856     // If we're reducing from 256/512 bits, use an extract_subvector.
5857     if (Size > 128) {
5858       auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5859       MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
5860                                    NumVecElts, SubTy);
5861       Ty = SubTy;
5862     } else if (Size == 128) {
5863       // Reducing from 128 bits is a permute of v2f64/v2i64.
5864       VectorType *ShufTy;
5865       if (ValTy->isFloatingPointTy())
5866         ShufTy =
5867             FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
5868       else
5869         ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
5870       MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
5871                                    CostKind, 0, nullptr);
5872     } else if (Size == 64) {
5873       // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5874       FixedVectorType *ShufTy;
5875       if (ValTy->isFloatingPointTy())
5876         ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
5877       else
5878         ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
5879       MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
5880                                    CostKind, 0, nullptr);
5881     } else {
5882       // Reducing from smaller size is a shift by immediate.
5883       auto *ShiftTy = FixedVectorType::get(
5884           Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
5885       MinMaxCost += getArithmeticInstrCost(
5886           Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
5887           {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5888           {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5889     }
5890 
5891     // Add the arithmetic op for this level.
5892     MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
5893   }
5894 
5895   // Add the final extract element to the cost.
5896   return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5897                                          CostKind, 0, nullptr, nullptr);
5898 }
5899 
5900 /// Calculate the cost of materializing a 64-bit value. This helper
5901 /// method might only calculate a fraction of a larger immediate. Therefore it
5902 /// is valid to return a cost of ZERO.
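/// For example, 42 fits in a sign-extended imm32 and costs TCC_Basic, while
/// 0x1122334455667788 needs a MOVABSQ and costs 2 * TCC_Basic.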
5903 InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) {
5904   if (Val == 0)
5905     return TTI::TCC_Free;
5906 
5907   if (isInt<32>(Val))
5908     return TTI::TCC_Basic;
5909 
5910   return 2 * TTI::TCC_Basic;
5911 }
5912 
5913 InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
5914                                           TTI::TargetCostKind CostKind) {
5915   assert(Ty->isIntegerTy());
5916 
5917   unsigned BitSize = Ty->getPrimitiveSizeInBits();
5918   if (BitSize == 0)
5919     return ~0U;
5920 
5921   // Never hoist constants larger than 128 bits, because this might lead to
5922   // incorrect code generation or assertions in codegen.
5923   // FIXME: Create a cost model for types larger than i128 once the codegen
5924   // issues have been fixed.
5925   if (BitSize > 128)
5926     return TTI::TCC_Free;
5927 
5928   if (Imm == 0)
5929     return TTI::TCC_Free;
5930 
5931   // Sign-extend all constants to a multiple of 64 bits.
5932   APInt ImmVal = Imm;
5933   if (BitSize % 64 != 0)
5934     ImmVal = Imm.sext(alignTo(BitSize, 64));
5935 
5936   // Split the constant into 64-bit chunks and calculate the cost for each
5937   // chunk.
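  // For example, a 128-bit immediate splits into two 64-bit chunks; a zero
  // high chunk is free, so a positive value that already fits in the low 64
  // bits costs no more than the low chunk alone.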
5938   InstructionCost Cost = 0;
5939   for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
5940     APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
5941     int64_t Val = Tmp.getSExtValue();
5942     Cost += getIntImmCost(Val);
5943   }
5944   // We need at least one instruction to materialize the constant.
5945   return std::max<InstructionCost>(1, Cost);
5946 }
5947 
5948 InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
5949                                               const APInt &Imm, Type *Ty,
5950                                               TTI::TargetCostKind CostKind,
5951                                               Instruction *Inst) {
5952   assert(Ty->isIntegerTy());
5953 
5954   unsigned BitSize = Ty->getPrimitiveSizeInBits();
5955   unsigned ImmBitWidth = Imm.getBitWidth();
5956 
5957   // There is no cost model for constants with a bit size of 0. Return TCC_Free
5958   // here, so that constant hoisting will ignore this constant.
5959   if (BitSize == 0)
5960     return TTI::TCC_Free;
5961 
5962   unsigned ImmIdx = ~0U;
5963   switch (Opcode) {
5964   default:
5965     return TTI::TCC_Free;
5966   case Instruction::GetElementPtr:
5967     // Always hoist the base address of a GetElementPtr. This prevents the
5968     // creation of new constants for every base constant that gets constant
5969     // folded with the offset.
5970     if (Idx == 0)
5971       return 2 * TTI::TCC_Basic;
5972     return TTI::TCC_Free;
5973   case Instruction::Store:
5974     ImmIdx = 0;
5975     break;
5976   case Instruction::ICmp:
5977     // This is an imperfect hack to prevent constant hoisting of
5978     // compares that might be trying to check if a 64-bit value fits in
5979     // 32-bits. The backend can optimize these cases using a right shift by 32.
5980     // Ideally we would check the compare predicate here. There also other
5981     // Ideally we would check the compare predicate here. There are also other
5982     if (Idx == 1 && ImmBitWidth == 64) {
5983       uint64_t ImmVal = Imm.getZExtValue();
5984       if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
5985         return TTI::TCC_Free;
5986     }
5987     ImmIdx = 1;
5988     break;
5989   case Instruction::And:
5990     // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
5991     // by using a 32-bit operation with implicit zero extension. Detect such
5992     // immediates here as the normal path expects bit 31 to be sign extended.
5993     if (Idx == 1 && ImmBitWidth == 64 && Imm.isIntN(32))
5994       return TTI::TCC_Free;
5995     // If we have BMI then we can use BEXTR/BZHI to mask out upper i64 bits.
5996     if (Idx == 1 && ImmBitWidth == 64 && ST->is64Bit() && ST->hasBMI() &&
5997         Imm.isMask())
5998       return X86TTIImpl::getIntImmCost(ST->hasBMI2() ? 255 : 65535);
5999     ImmIdx = 1;
6000     break;
6001   case Instruction::Add:
6002   case Instruction::Sub:
6003     // For add/sub, we can use the opposite instruction for INT32_MIN.
6004     if (Idx == 1 && ImmBitWidth == 64 && Imm.getZExtValue() == 0x80000000)
6005       return TTI::TCC_Free;
6006     ImmIdx = 1;
6007     break;
6008   case Instruction::UDiv:
6009   case Instruction::SDiv:
6010   case Instruction::URem:
6011   case Instruction::SRem:
6012     // Division by constant is typically expanded later into a different
6013     // instruction sequence. This completely changes the constants.
6014     // Report them as "free" to stop ConstantHoist from marking them as opaque.
6015     return TTI::TCC_Free;
6016   case Instruction::Mul:
6017   case Instruction::Or:
6018   case Instruction::Xor:
6019     ImmIdx = 1;
6020     break;
6021   // Always return TCC_Free for the shift value of a shift instruction.
6022   case Instruction::Shl:
6023   case Instruction::LShr:
6024   case Instruction::AShr:
6025     if (Idx == 1)
6026       return TTI::TCC_Free;
6027     break;
6028   case Instruction::Trunc:
6029   case Instruction::ZExt:
6030   case Instruction::SExt:
6031   case Instruction::IntToPtr:
6032   case Instruction::PtrToInt:
6033   case Instruction::BitCast:
6034   case Instruction::PHI:
6035   case Instruction::Call:
6036   case Instruction::Select:
6037   case Instruction::Ret:
6038   case Instruction::Load:
6039     break;
6040   }
6041 
6042   if (Idx == ImmIdx) {
6043     uint64_t NumConstants = divideCeil(BitSize, 64);
6044     InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6045     return (Cost <= NumConstants * TTI::TCC_Basic)
6046                ? static_cast<int>(TTI::TCC_Free)
6047                : Cost;
6048   }
6049 
6050   return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6051 }
6052 
6053 InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
6054                                                 const APInt &Imm, Type *Ty,
6055                                                 TTI::TargetCostKind CostKind) {
6056   assert(Ty->isIntegerTy());
6057 
6058   unsigned BitSize = Ty->getPrimitiveSizeInBits();
6059   // There is no cost model for constants with a bit size of 0. Return TCC_Free
6060   // here, so that constant hoisting will ignore this constant.
6061   if (BitSize == 0)
6062     return TTI::TCC_Free;
6063 
6064   switch (IID) {
6065   default:
6066     return TTI::TCC_Free;
6067   case Intrinsic::sadd_with_overflow:
6068   case Intrinsic::uadd_with_overflow:
6069   case Intrinsic::ssub_with_overflow:
6070   case Intrinsic::usub_with_overflow:
6071   case Intrinsic::smul_with_overflow:
6072   case Intrinsic::umul_with_overflow:
6073     if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
6074       return TTI::TCC_Free;
6075     break;
6076   case Intrinsic::experimental_stackmap:
6077     if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6078       return TTI::TCC_Free;
6079     break;
6080   case Intrinsic::experimental_patchpoint_void:
6081   case Intrinsic::experimental_patchpoint:
6082     if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6083       return TTI::TCC_Free;
6084     break;
6085   }
6086   return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6087 }
6088 
6089 InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
6090                                            TTI::TargetCostKind CostKind,
6091                                            const Instruction *I) {
6092   if (CostKind != TTI::TCK_RecipThroughput)
6093     return Opcode == Instruction::PHI ? TTI::TCC_Free : TTI::TCC_Basic;
6094   // Branches are assumed to be predicted.
6095   return TTI::TCC_Free;
6096 }
6097 
6098 int X86TTIImpl::getGatherOverhead() const {
6099   // Some CPUs have more overhead for gather. The specified overhead is relative
6100   // to the Load operation. "2" is the number provided by Intel architects. This
6101   // parameter is used for cost estimation of the gather op and comparison
6102   // with other alternatives.
6103   // TODO: Remove the explicit hasAVX512()? That would mean we would only
6104   // enable gather with a -march.
6105   if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
6106     return 2;
6107 
6108   return 1024;
6109 }
6110 
6111 int X86TTIImpl::getScatterOverhead() const {
6112   if (ST->hasAVX512())
6113     return 2;
6114 
6115   return 1024;
6116 }
6117 
6118 // Return the average cost of a Gather / Scatter instruction; may be refined later.
6119 InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
6120                                             TTI::TargetCostKind CostKind,
6121                                             Type *SrcVTy, const Value *Ptr,
6122                                             Align Alignment,
6123                                             unsigned AddressSpace) {
6124 
6125   assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
6126   unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
6127 
6128   // Try to reduce the index size from 64 bits (the default for GEP) to 32
6129   // bits. This is essential for VF 16: if the index can't be reduced to 32,
6130   // the operation uses 16 x 64-bit indices, which do not fit in a zmm
6131   // register and need to be split. Also check that the base pointer is the
6132   // same for all lanes, and that there's at most one variable index.
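  // For example, a VF=16 gather with i64 indices needs two 512-bit index
  // vectors and a split gather, whereas 16 x i32 indices (or i64 indices
  // sign-extended from i32) fit in a single zmm register.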
6133   auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
6134     unsigned IndexSize = DL.getPointerSizeInBits();
6135     const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
6136     if (IndexSize < 64 || !GEP)
6137       return IndexSize;
6138 
6139     unsigned NumOfVarIndices = 0;
6140     const Value *Ptrs = GEP->getPointerOperand();
6141     if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
6142       return IndexSize;
6143     for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
6144       if (isa<Constant>(GEP->getOperand(I)))
6145         continue;
6146       Type *IndxTy = GEP->getOperand(I)->getType();
6147       if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
6148         IndxTy = IndexVTy->getElementType();
6149       if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
6150            !isa<SExtInst>(GEP->getOperand(I))) ||
6151           ++NumOfVarIndices > 1)
6152         return IndexSize; // 64
6153     }
6154     return (unsigned)32;
6155   };
6156 
6157   // Try to reduce IndexSize to 32 bits for VF >= 16.
6158   // By default, IndexSize is equal to the pointer size.
6159   unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
6160                            ? getIndexSizeInBits(Ptr, DL)
6161                            : DL.getPointerSizeInBits();
6162 
6163   auto *IndexVTy = FixedVectorType::get(
6164       IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
6165   std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
6166   std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
6167   InstructionCost::CostType SplitFactor =
6168       *std::max(IdxsLT.first, SrcLT.first).getValue();
6169   if (SplitFactor > 1) {
6170     // Handle splitting of a vector of pointers.
6171     auto *SplitSrcTy =
6172         FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
6173     return SplitFactor * getGSVectorCost(Opcode, CostKind, SplitSrcTy, Ptr,
6174                                          Alignment, AddressSpace);
6175   }
6176 
6177   // If we didn't split, this will be a single gather/scatter instruction.
6178   if (CostKind == TTI::TCK_CodeSize)
6179     return 1;
6180 
6181   // The gather / scatter cost is given by Intel architects. It is a rough
6182   // number since we are looking at one instruction at a time.
6183   const int GSOverhead = (Opcode == Instruction::Load) ? getGatherOverhead()
6184                                                        : getScatterOverhead();
6185   return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
6186                                            MaybeAlign(Alignment), AddressSpace,
6187                                            CostKind);
6188 }
6189 
6190 /// Calculate the cost of Gather / Scatter operation
6191 InstructionCost X86TTIImpl::getGatherScatterOpCost(
6192     unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
6193     Align Alignment, TTI::TargetCostKind CostKind,
6194     const Instruction *I = nullptr) {
6195   if ((Opcode == Instruction::Load &&
6196        (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
6197         forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
6198                                    Align(Alignment)))) ||
6199       (Opcode == Instruction::Store &&
6200        (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
6201         forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
6202                                     Align(Alignment)))))
6203     return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
6204                                          Alignment, CostKind, I);
6205 
6206   assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
6207   PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
6208   if (!PtrTy && Ptr->getType()->isVectorTy())
6209     PtrTy = dyn_cast<PointerType>(
6210         cast<VectorType>(Ptr->getType())->getElementType());
6211   assert(PtrTy && "Unexpected type for Ptr argument");
6212   unsigned AddressSpace = PtrTy->getAddressSpace();
6213   return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
6214                          AddressSpace);
6215 }
6216 
6217 bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
6218                                const TargetTransformInfo::LSRCost &C2) {
6219     // X86-specific here: instruction count has first priority.
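         // For example: a candidate with {Insns = 3, NumRegs = 10} is
         // preferred over one with {Insns = 4, NumRegs = 2}, because Insns
         // compares first.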
6220     return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
6221                     C1.NumIVMuls, C1.NumBaseAdds,
6222                     C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6223            std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
6224                     C2.NumIVMuls, C2.NumBaseAdds,
6225                     C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6226 }
6227 
6228 bool X86TTIImpl::canMacroFuseCmp() {
6229   return ST->hasMacroFusion() || ST->hasBranchFusion();
6230 }
6231 
6232 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
6233   Type *ScalarTy = DataTy->getScalarType();
6234 
6235   // The backend can't handle a single element vector w/o CFCMOV.
6236   if (isa<VectorType>(DataTy) && cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6237     return ST->hasCF() && hasConditionalLoadStoreForType(ScalarTy);
6238 
6239   if (!ST->hasAVX())
6240     return false;
6241 
6242   if (ScalarTy->isPointerTy())
6243     return true;
6244 
6245   if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6246     return true;
6247 
6248   if (ScalarTy->isHalfTy() && ST->hasBWI())
6249     return true;
6250 
6251   if (ScalarTy->isBFloatTy() && ST->hasBF16())
6252     return true;
6253 
6254   if (!ScalarTy->isIntegerTy())
6255     return false;
6256 
6257   unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6258   return IntWidth == 32 || IntWidth == 64 ||
6259          ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
6260 }
6261 
6262 bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
6263   return isLegalMaskedLoad(DataType, Alignment);
6264 }
6265 
6266 bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
6267   unsigned DataSize = DL.getTypeStoreSize(DataType);
6268   // The only supported nontemporal loads are for aligned vectors of 16 or 32
6269   // bytes.  Note that 32-byte nontemporal vector loads are supported by AVX2
6270   // (the equivalent stores only require AVX).
6271   if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
6272     return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
6273 
6274   return false;
6275 }
6276 
6277 bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
6278   unsigned DataSize = DL.getTypeStoreSize(DataType);
6279 
6280   // SSE4A supports nontemporal stores of float and double at arbitrary
6281   // alignment.
6282   if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
6283     return true;
6284 
6285   // Besides the SSE4A exception above, nontemporal stores are only
6286   // available with proper alignment on other subtargets, and only for
6287   // power-of-2 sizes of 4..32 bytes.
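       // For example: an aligned <8 x float> (32-byte) nontemporal store needs
       // AVX (vmovntps ymm), while an aligned <4 x float> (16-byte) one only
       // needs SSE1 (movntps xmm).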
6288   if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
6289       !isPowerOf2_32(DataSize))
6290     return false;
6291 
6292   // 32-byte vector nontemporal stores are supported by AVX (the equivalent
6293   // loads require AVX2).
6294   if (DataSize == 32)
6295     return ST->hasAVX();
6296   if (DataSize == 16)
6297     return ST->hasSSE1();
6298   return true;
6299 }
6300 
6301 bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
6302                                       ElementCount NumElements) const {
6303   // movddup
6304   return ST->hasSSE3() && !NumElements.isScalable() &&
6305          NumElements.getFixedValue() == 2 &&
6306          ElementTy == Type::getDoubleTy(ElementTy->getContext());
6307 }
6308 
6309 bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) {
6310   if (!isa<VectorType>(DataTy))
6311     return false;
6312 
6313   if (!ST->hasAVX512())
6314     return false;
6315 
6316   // The backend can't handle a single element vector.
6317   if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6318     return false;
6319 
6320   Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
6321 
6322   if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6323     return true;
6324 
6325   if (!ScalarTy->isIntegerTy())
6326     return false;
6327 
6328   unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6329   return IntWidth == 32 || IntWidth == 64 ||
6330          ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
6331 }
6332 
6333 bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
6334   return isLegalMaskedExpandLoad(DataTy, Alignment);
6335 }
6336 
6337 bool X86TTIImpl::supportsGather() const {
6338   // Some CPUs have better gather performance than others.
6339   // TODO: Remove the explicit ST->hasAVX512()? That would mean we would
6340   // only enable gather when targeting a fast-gather CPU via -march.
6341   return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
6342 }
6343 
6344 bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
6345   // Gather / scatter on 2-element vectors is not profitable on KNL / SKX.
6346   // A 4-element gather/scatter instruction does not exist on KNL. We could
6347   // extend it to 8 elements, but zeroing the upper bits of the mask vector
6348   // would add more instructions. Right now we give vector-4 the scalar cost
6349   // on KNL. TODO: check whether the gather/scatter instruction is better in
6350   // the VariableMask case.
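       // For example: on KNL (AVX512F without VLX) a v4i32 gather is forced to
       // scalarize here, while on SKX (which has VLX) it stays a gather.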
6351   unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
6352   return NumElts == 1 ||
6353          (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
6354 }
6355 
6356 bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy, Align Alignment) {
6357   Type *ScalarTy = DataTy->getScalarType();
6358   if (ScalarTy->isPointerTy())
6359     return true;
6360 
6361   if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6362     return true;
6363 
6364   if (!ScalarTy->isIntegerTy())
6365     return false;
6366 
6367   unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6368   return IntWidth == 32 || IntWidth == 64;
6369 }
6370 
6371 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
6372   if (!supportsGather() || !ST->preferGather())
6373     return false;
6374   return isLegalMaskedGatherScatter(DataTy, Alignment);
6375 }
6376 
6377 bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
6378                                  unsigned Opcode1,
6379                                  const SmallBitVector &OpcodeMask) const {
6380   // ADDSUBPS  4xf32 SSE3
6381   // VADDSUBPS 4xf32 AVX
6382   // VADDSUBPS 8xf32 AVX2
6383   // ADDSUBPD  2xf64 SSE3
6384   // VADDSUBPD 2xf64 AVX
6385   // VADDSUBPD 4xf64 AVX2
6386 
6387   unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6388   assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6389   if (!isPowerOf2_32(NumElements))
6390     return false;
6391   // Check the opcode pattern. We apply the mask on the opcode arguments and
6392   // then check if it is what we expect.
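       // For example: for <4 x float> with Opcode0 == FSub, Opcode1 == FAdd and
       // OpcodeMask 0b1010 (lanes 1 and 3 set), the lanes decode to
       // {FSub, FAdd, FSub, FAdd}, i.e. the (V)ADDSUBPS pattern.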
6393   for (int Lane : seq<int>(0, NumElements)) {
6394     unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6395     // We expect FSub for even lanes and FAdd for odd lanes.
6396     if (Lane % 2 == 0 && Opc != Instruction::FSub)
6397       return false;
6398     if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6399       return false;
6400   }
6401   // Now check that the pattern is supported by the target ISA.
6402   Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6403   if (ElemTy->isFloatTy())
6404     return ST->hasSSE3() && NumElements % 4 == 0;
6405   if (ElemTy->isDoubleTy())
6406     return ST->hasSSE3() && NumElements % 2 == 0;
6407   return false;
6408 }
6409 
6410 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
6411   // AVX2 doesn't support scatter
6412   if (!ST->hasAVX512() || !ST->preferScatter())
6413     return false;
6414   return isLegalMaskedGatherScatter(DataType, Alignment);
6415 }
6416 
6417 bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
6418   EVT VT = TLI->getValueType(DL, DataType);
6419   return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6420 }
6421 
6422 bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction* I) {
6423   // FDIV is always expensive, even if it has a very low uop count.
6424   // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
6425   if (I->getOpcode() == Instruction::FDiv)
6426     return true;
6427 
6428   return BaseT::isExpensiveToSpeculativelyExecute(I);
6429 }
6430 
6431 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
6432   return false;
6433 }
6434 
6435 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
6436                                      const Function *Callee) const {
6437   const TargetMachine &TM = getTLI()->getTargetMachine();
6438 
6439   // Work this as a subsetting of subtarget features.
6440   const FeatureBitset &CallerBits =
6441       TM.getSubtargetImpl(*Caller)->getFeatureBits();
6442   const FeatureBitset &CalleeBits =
6443       TM.getSubtargetImpl(*Callee)->getFeatureBits();
6444 
6445   // Check whether features are the same (apart from the ignore list).
6446   FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
6447   FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
6448   if (RealCallerBits == RealCalleeBits)
6449     return true;
6450 
6451   // If the features are a subset, we need to additionally check for calls
6452   // that may become ABI-incompatible as a result of inlining.
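       // For example: a caller built with +avx2 cannot inline a callee that
       // requires +avx512f, since the callee's features are not a subset of
       // the caller's.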
6453   if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
6454     return false;
6455 
6456   for (const Instruction &I : instructions(Callee)) {
6457     if (const auto *CB = dyn_cast<CallBase>(&I)) {
6458       // Having more target features is fine for inline ASM.
6459       if (CB->isInlineAsm())
6460         continue;
6461 
6462       SmallVector<Type *, 8> Types;
6463       for (Value *Arg : CB->args())
6464         Types.push_back(Arg->getType());
6465       if (!CB->getType()->isVoidTy())
6466         Types.push_back(CB->getType());
6467 
6468       // Simple types are always ABI compatible.
6469       auto IsSimpleTy = [](Type *Ty) {
6470         return !Ty->isVectorTy() && !Ty->isAggregateType();
6471       };
6472       if (all_of(Types, IsSimpleTy))
6473         continue;
6474 
6475       if (Function *NestedCallee = CB->getCalledFunction()) {
6476         // Assume that intrinsics are always ABI compatible.
6477         if (NestedCallee->isIntrinsic())
6478           continue;
6479 
6480         // Do a precise compatibility check.
6481         if (!areTypesABICompatible(Caller, NestedCallee, Types))
6482           return false;
6483       } else {
6484         // We don't know the target features of the callee,
6485         // assume it is incompatible.
6486         return false;
6487       }
6488     }
6489   }
6490   return true;
6491 }
6492 
6493 bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
6494                                        const Function *Callee,
6495                                        const ArrayRef<Type *> &Types) const {
6496   if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
6497     return false;
6498 
6499   // If we get here, we know the target features match. If one function
6500   // considers 512-bit vectors legal and the other does not, consider them
6501   // incompatible.
6502   const TargetMachine &TM = getTLI()->getTargetMachine();
6503 
6504   if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
6505       TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
6506     return true;
6507 
6508   // Consider the arguments compatible if they aren't vectors or aggregates.
6509   // FIXME: Look at the size of vectors.
6510   // FIXME: Look at the element types of aggregates to see if there are vectors.
6511   return llvm::none_of(Types,
6512       [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
6513 }
6514 
6515 X86TTIImpl::TTI::MemCmpExpansionOptions
6516 X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
6517   TTI::MemCmpExpansionOptions Options;
6518   Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
6519   Options.NumLoadsPerBlock = 2;
6520   // All GPR and vector loads can be unaligned.
6521   Options.AllowOverlappingLoads = true;
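       // For example: a 24-byte equality memcmp can then be lowered as two
       // 16-byte loads per buffer that overlap by 8 bytes.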
6522   if (IsZeroCmp) {
6523     // Only enable vector loads for equality comparison. Right now the vector
6524     // version is not as fast for three way compare (see #33329).
6525     const unsigned PreferredWidth = ST->getPreferVectorWidth();
6526     if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512())
6527       Options.LoadSizes.push_back(64);
6528     if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
6529     if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
6530   }
6531   if (ST->is64Bit()) {
6532     Options.LoadSizes.push_back(8);
6533   }
6534   Options.LoadSizes.push_back(4);
6535   Options.LoadSizes.push_back(2);
6536   Options.LoadSizes.push_back(1);
6537   return Options;
6538 }
6539 
6540 bool X86TTIImpl::prefersVectorizedAddressing() const {
6541   return supportsGather();
6542 }
6543 
6544 bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
6545   return false;
6546 }
6547 
6548 bool X86TTIImpl::enableInterleavedAccessVectorization() {
6549   // TODO: We expect this to be beneficial regardless of arch,
6550   // but there are currently some unexplained performance artifacts on Atom.
6551   // As a temporary solution, disable on Atom.
6552   return !(ST->isAtom());
6553 }
6554 
6555 // Get a cost estimate for interleaved load/store operations and strided loads.
6556 // \p Indices contains the indices for a strided load.
6557 // \p Factor is the interleaving factor.
6558 // AVX-512 provides 3-src shuffles that significantly reduce the cost.
6559 InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
6560     unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
6561     ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
6562     TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
6563   // VecTy for interleave memop is <VF*Factor x Elt>.
6564   // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6565   // VecTy = <12 x i32>.
6566 
6567   // Calculate the number of memory operations (NumOfMemOps), required
6568   // for load/store the VecTy.
6569   MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6570   unsigned VecTySize = DL.getTypeStoreSize(VecTy);
6571   unsigned LegalVTSize = LegalVT.getStoreSize();
6572   unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
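       // E.g. if VecTySize is 48 bytes and the legal type holds 32 bytes, then
       // ceil(48 / 32) == 2 memory operations are needed.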
6573 
6574   // Get the cost of one memory operation.
6575   auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
6576                                              LegalVT.getVectorNumElements());
6577   InstructionCost MemOpCost;
6578   bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
6579   if (UseMaskedMemOp)
6580     MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
6581                                       AddressSpace, CostKind);
6582   else
6583     MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
6584                                 AddressSpace, CostKind);
6585 
6586   unsigned VF = VecTy->getNumElements() / Factor;
6587   MVT VT =
6588       MVT::getVectorVT(TLI->getSimpleValueType(DL, VecTy->getScalarType()), VF);
6589 
6590   InstructionCost MaskCost;
6591   if (UseMaskedMemOp) {
6592     APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
6593     for (unsigned Index : Indices) {
6594       assert(Index < Factor && "Invalid index for interleaved memory op");
6595       for (unsigned Elm = 0; Elm < VF; Elm++)
6596         DemandedLoadStoreElts.setBit(Index + Elm * Factor);
6597     }
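         // E.g. Factor = 3, VF = 2, Indices = {0}: bits 0 and 3 are set,
         // marking member 0's elements within the interleaved vector.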
6598 
6599     Type *I1Type = Type::getInt1Ty(VecTy->getContext());
6600 
6601     MaskCost = getReplicationShuffleCost(
6602         I1Type, Factor, VF,
6603         UseMaskForGaps ? DemandedLoadStoreElts
6604                        : APInt::getAllOnes(VecTy->getNumElements()),
6605         CostKind);
6606 
6607     // The Gaps mask is invariant and created outside the loop, therefore the
6608     // cost of creating it is not accounted for here. However if we have both
6609     // a MaskForGaps and some other mask that guards the execution of the
6610     // memory access, we need to account for the cost of And-ing the two masks
6611     // inside the loop.
6612     if (UseMaskForGaps) {
6613       auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
6614       MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
6615     }
6616   }
6617 
6618   if (Opcode == Instruction::Load) {
6619     // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
6620     // contain the cost of the optimized shuffle sequence that the
6621     // X86InterleavedAccess pass will generate.
6622     // The cost of loads and stores are computed separately from the table.
6623 
6624     // X86InterleavedAccess supports only the following interleaved-access groups.
6625     static const CostTblEntry AVX512InterleavedLoadTbl[] = {
6626         {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
6627         {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
6628         {3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8
6629     };
6630 
6631     if (const auto *Entry =
6632             CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
6633       return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6634     // If an entry does not exist, fall back to the default implementation.
6635 
6636     // The kind of shuffle depends on the number of loaded values.
6637     // If we load the entire data in one register, we can use a 1-src shuffle.
6638     // Otherwise, we'll merge 2 sources in each operation.
6639     TTI::ShuffleKind ShuffleKind =
6640         (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
6641 
6642     InstructionCost ShuffleCost =
6643         getShuffleCost(ShuffleKind, SingleMemOpTy, {}, CostKind, 0, nullptr);
6644 
6645     unsigned NumOfLoadsInInterleaveGrp =
6646         Indices.size() ? Indices.size() : Factor;
6647     auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
6648                                           VecTy->getNumElements() / Factor);
6649     InstructionCost NumOfResults =
6650         getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;
6651 
6652     // About half of the loads may be folded into shuffles when we have only
6653     // one result. If we have more than one result, or the loads are masked,
6654     // we do not fold loads at all.
6655     unsigned NumOfUnfoldedLoads =
6656         UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
6657 
6658     // Get the number of shuffle operations per result.
6659     unsigned NumOfShufflesPerResult =
6660         std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
6661 
6662     // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
6663     // When we have more than one destination, we need additional instructions
6664     // to keep sources.
6665     InstructionCost NumOfMoves = 0;
6666     if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
6667       NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
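         // A hedged walk-through (not from the source): for VecTy = <32 x i8>,
         // Factor = 2, both indices requested and a 32-byte legal type, we get
         // NumOfMemOps == 1, NumOfResults == 2, one single-src shuffle per
         // result and no moves, i.e. Cost = 2 * ShuffleCost + MemOpCost.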
6668 
6669     InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
6670                            MaskCost + NumOfUnfoldedLoads * MemOpCost +
6671                            NumOfMoves;
6672 
6673     return Cost;
6674   }
6675 
6676   // Store.
6677   assert(Opcode == Instruction::Store &&
6678          "Expected Store Instruction at this  point");
6679   // X86InterleavedAccess supports only the following interleaved-access groups.
6680   static const CostTblEntry AVX512InterleavedStoreTbl[] = {
6681       {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
6682       {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
6683       {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)
6684 
6685       {4, MVT::v8i8, 10},  // interleave 4 x 8i8  into 32i8  (and store)
6686       {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8  (and store)
6687       {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
6688       {4, MVT::v64i8, 24}  // interleave 4 x 64i8 into 256i8 (and store)
6689   };
6690 
6691   if (const auto *Entry =
6692           CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
6693     return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6694   // If an entry does not exist, fall back to the default implementation.
6695 
6696   // There are no strided stores at the moment, and a store can't be folded
6697   // into a shuffle.
6698   unsigned NumOfSources = Factor; // The number of values to be merged.
6699   InstructionCost ShuffleCost = getShuffleCost(
6700       TTI::SK_PermuteTwoSrc, SingleMemOpTy, {}, CostKind, 0, nullptr);
6701   unsigned NumOfShufflesPerStore = NumOfSources - 1;
6702 
6703   // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
6704   // We need additional instructions to keep the sources intact.
6705   unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
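       // E.g. Factor = 3 and NumOfMemOps = 2: each store needs 2 two-src
       // shuffles and we add 2 * 2 / 2 == 2 moves, i.e.
       // Cost = MaskCost + 2 * (MemOpCost + 2 * ShuffleCost) + 2.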
6706   InstructionCost Cost =
6707       MaskCost +
6708       NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
6709       NumOfMoves;
6710   return Cost;
6711 }
6712 
6713 InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
6714     unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
6715     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
6716     bool UseMaskForCond, bool UseMaskForGaps) {
6717   auto *VecTy = cast<FixedVectorType>(BaseTy);
6718 
6719   auto isSupportedOnAVX512 = [&](Type *VecTy) {
6720     Type *EltTy = cast<VectorType>(VecTy)->getElementType();
6721     if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
6722         EltTy->isIntegerTy(32) || EltTy->isPointerTy())
6723       return true;
6724     if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
6725       return ST->hasBWI();
6726     if (EltTy->isBFloatTy())
6727       return ST->hasBF16();
6728     return false;
6729   };
6730   if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
6731     return getInterleavedMemoryOpCostAVX512(
6732         Opcode, VecTy, Factor, Indices, Alignment,
6733         AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
6734 
6735   if (UseMaskForCond || UseMaskForGaps)
6736     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6737                                              Alignment, AddressSpace, CostKind,
6738                                              UseMaskForCond, UseMaskForGaps);
6739 
6740   // Get estimation for interleaved load/store operations for SSE-AVX2.
6741   // As opposed to AVX-512, SSE/AVX2 targets do not have generic shuffles
6742   // that would allow computing the cost with a generic formula in terms of
6743   // generic shuffles. We therefore use a lookup table instead, filled
6744   // according to the instruction sequences that codegen currently generates.
6745 
6746   // VecTy for interleave memop is <VF*Factor x Elt>.
6747   // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6748   // VecTy = <12 x i32>.
6749   MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6750 
6751   // This function can be called with VecTy=<6xi128>, Factor=3, in which case
6752   // the VF=2, while v2i128 is an unsupported MVT vector type
6753   // (see MachineValueType.h::getVectorVT()).
6754   if (!LegalVT.isVector())
6755     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6756                                              Alignment, AddressSpace, CostKind);
6757 
6758   unsigned VF = VecTy->getNumElements() / Factor;
6759   Type *ScalarTy = VecTy->getElementType();
6760   // Deduplicate entries, model floats/pointers as appropriately-sized integers.
6761   if (!ScalarTy->isIntegerTy())
6762     ScalarTy =
6763         Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
6764 
6765   // Get the cost of all the memory operations.
6766   // FIXME: discount dead loads.
6767   InstructionCost MemOpCosts = getMemoryOpCost(
6768       Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
6769 
6770   auto *VT = FixedVectorType::get(ScalarTy, VF);
6771   EVT ETy = TLI->getValueType(DL, VT);
6772   if (!ETy.isSimple())
6773     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6774                                              Alignment, AddressSpace, CostKind);
6775 
6776   // TODO: Complete for other data-types and strides.
6777   // Each combination of Stride, element bit width and VF results in a different
6778   // sequence; the cost tables are therefore accessed with:
6779   // Factor (stride) and VectorType=VFxiN.
6780   // The Cost accounts only for the shuffle sequence;
6781   // the cost of the loads/stores is accounted for separately.
6782   //
6783   static const CostTblEntry AVX2InterleavedLoadTbl[] = {
6784       {2, MVT::v2i8, 2},  // (load 4i8 and) deinterleave into 2 x 2i8
6785       {2, MVT::v4i8, 2},  // (load 8i8 and) deinterleave into 2 x 4i8
6786       {2, MVT::v8i8, 2},  // (load 16i8 and) deinterleave into 2 x 8i8
6787       {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
6788       {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
6789 
6790       {2, MVT::v8i16, 6},   // (load 16i16 and) deinterleave into 2 x 8i16
6791       {2, MVT::v16i16, 9},  // (load 32i16 and) deinterleave into 2 x 16i16
6792       {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
6793 
6794       {2, MVT::v8i32, 4},   // (load 16i32 and) deinterleave into 2 x 8i32
6795       {2, MVT::v16i32, 8},  // (load 32i32 and) deinterleave into 2 x 16i32
6796       {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
6797 
6798       {2, MVT::v4i64, 4},   // (load 8i64 and) deinterleave into 2 x 4i64
6799       {2, MVT::v8i64, 8},   // (load 16i64 and) deinterleave into 2 x 8i64
6800       {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
6801       {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
6802 
6803       {3, MVT::v2i8, 3},   // (load 6i8 and) deinterleave into 3 x 2i8
6804       {3, MVT::v4i8, 3},   // (load 12i8 and) deinterleave into 3 x 4i8
6805       {3, MVT::v8i8, 6},   // (load 24i8 and) deinterleave into 3 x 8i8
6806       {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
6807       {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6808 
6809       {3, MVT::v2i16, 5},   // (load 6i16 and) deinterleave into 3 x 2i16
6810       {3, MVT::v4i16, 7},   // (load 12i16 and) deinterleave into 3 x 4i16
6811       {3, MVT::v8i16, 9},   // (load 24i16 and) deinterleave into 3 x 8i16
6812       {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
6813       {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
6814 
6815       {3, MVT::v2i32, 3},   // (load 6i32 and) deinterleave into 3 x 2i32
6816       {3, MVT::v4i32, 3},   // (load 12i32 and) deinterleave into 3 x 4i32
6817       {3, MVT::v8i32, 7},   // (load 24i32 and) deinterleave into 3 x 8i32
6818       {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
6819       {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
6820 
6821       {3, MVT::v2i64, 1},   // (load 6i64 and) deinterleave into 3 x 2i64
6822       {3, MVT::v4i64, 5},   // (load 12i64 and) deinterleave into 3 x 4i64
6823       {3, MVT::v8i64, 10},  // (load 24i64 and) deinterleave into 3 x 8i64
6824       {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
6825 
6826       {4, MVT::v2i8, 4},   // (load 8i8 and) deinterleave into 4 x 2i8
6827       {4, MVT::v4i8, 4},   // (load 16i8 and) deinterleave into 4 x 4i8
6828       {4, MVT::v8i8, 12},  // (load 32i8 and) deinterleave into 4 x 8i8
6829       {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
6830       {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
6831 
6832       {4, MVT::v2i16, 6},    // (load 8i16 and) deinterleave into 4 x 2i16
6833       {4, MVT::v4i16, 17},   // (load 16i16 and) deinterleave into 4 x 4i16
6834       {4, MVT::v8i16, 33},   // (load 32i16 and) deinterleave into 4 x 8i16
6835       {4, MVT::v16i16, 75},  // (load 64i16 and) deinterleave into 4 x 16i16
6836       {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
6837 
6838       {4, MVT::v2i32, 4},   // (load 8i32 and) deinterleave into 4 x 2i32
6839       {4, MVT::v4i32, 8},   // (load 16i32 and) deinterleave into 4 x 4i32
6840       {4, MVT::v8i32, 16},  // (load 32i32 and) deinterleave into 4 x 8i32
6841       {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
6842       {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
6843 
6844       {4, MVT::v2i64, 6},  // (load 8i64 and) deinterleave into 4 x 2i64
6845       {4, MVT::v4i64, 8},  // (load 16i64 and) deinterleave into 4 x 4i64
6846       {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
6847       {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
6848 
6849       {6, MVT::v2i8, 6},   // (load 12i8 and) deinterleave into 6 x 2i8
6850       {6, MVT::v4i8, 14},  // (load 24i8 and) deinterleave into 6 x 4i8
6851       {6, MVT::v8i8, 18},  // (load 48i8 and) deinterleave into 6 x 8i8
6852       {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
6853       {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
6854 
6855       {6, MVT::v2i16, 13},   // (load 12i16 and) deinterleave into 6 x 2i16
6856       {6, MVT::v4i16, 9},    // (load 24i16 and) deinterleave into 6 x 4i16
6857       {6, MVT::v8i16, 39},   // (load 48i16 and) deinterleave into 6 x 8i16
6858       {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
6859       {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
6860 
6861       {6, MVT::v2i32, 6},   // (load 12i32 and) deinterleave into 6 x 2i32
6862       {6, MVT::v4i32, 15},  // (load 24i32 and) deinterleave into 6 x 4i32
6863       {6, MVT::v8i32, 31},  // (load 48i32 and) deinterleave into 6 x 8i32
6864       {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
6865 
6866       {6, MVT::v2i64, 6},  // (load 12i64 and) deinterleave into 6 x 2i64
6867       {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
6868       {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
6869 
6870       {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
6871   };
6872 
6873   static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
6874       {2, MVT::v4i16, 2},   // (load 8i16 and) deinterleave into 2 x 4i16
6875   };
6876 
6877   static const CostTblEntry SSE2InterleavedLoadTbl[] = {
6878       {2, MVT::v2i16, 2},   // (load 4i16 and) deinterleave into 2 x 2i16
6879       {2, MVT::v4i16, 7},   // (load 8i16 and) deinterleave into 2 x 4i16
6880 
6881       {2, MVT::v2i32, 2},   // (load 4i32 and) deinterleave into 2 x 2i32
6882       {2, MVT::v4i32, 2},   // (load 8i32 and) deinterleave into 2 x 4i32
6883 
6884       {2, MVT::v2i64, 2},   // (load 4i64 and) deinterleave into 2 x 2i64
6885   };
6886 
6887   static const CostTblEntry AVX2InterleavedStoreTbl[] = {
6888       {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
6889       {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
6890 
6891       {2, MVT::v8i16, 3},  // interleave 2 x 8i16 into 16i16 (and store)
6892       {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
6893       {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
6894 
6895       {2, MVT::v4i32, 2},   // interleave 2 x 4i32 into 8i32 (and store)
6896       {2, MVT::v8i32, 4},   // interleave 2 x 8i32 into 16i32 (and store)
6897       {2, MVT::v16i32, 8},  // interleave 2 x 16i32 into 32i32 (and store)
6898       {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
6899 
6900       {2, MVT::v2i64, 2},   // interleave 2 x 2i64 into 4i64 (and store)
6901       {2, MVT::v4i64, 4},   // interleave 2 x 4i64 into 8i64 (and store)
6902       {2, MVT::v8i64, 8},   // interleave 2 x 8i64 into 16i64 (and store)
6903       {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
6904       {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
6905 
6906       {3, MVT::v2i8, 4},   // interleave 3 x 2i8 into 6i8 (and store)
6907       {3, MVT::v4i8, 4},   // interleave 3 x 4i8 into 12i8 (and store)
6908       {3, MVT::v8i8, 6},   // interleave 3 x 8i8 into 24i8 (and store)
6909       {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
6910       {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
6911 
6912       {3, MVT::v2i16, 4},   // interleave 3 x 2i16 into 6i16 (and store)
6913       {3, MVT::v4i16, 6},   // interleave 3 x 4i16 into 12i16 (and store)
6914       {3, MVT::v8i16, 12},  // interleave 3 x 8i16 into 24i16 (and store)
6915       {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
6916       {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
6917 
6918       {3, MVT::v2i32, 4},   // interleave 3 x 2i32 into 6i32 (and store)
6919       {3, MVT::v4i32, 5},   // interleave 3 x 4i32 into 12i32 (and store)
6920       {3, MVT::v8i32, 11},  // interleave 3 x 8i32 into 24i32 (and store)
6921       {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
6922       {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
6923 
6924       {3, MVT::v2i64, 4},   // interleave 3 x 2i64 into 6i64 (and store)
6925       {3, MVT::v4i64, 6},   // interleave 3 x 4i64 into 12i64 (and store)
6926       {3, MVT::v8i64, 12},  // interleave 3 x 8i64 into 24i64 (and store)
6927       {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
6928 
6929       {4, MVT::v2i8, 4},   // interleave 4 x 2i8 into 8i8 (and store)
6930       {4, MVT::v4i8, 4},   // interleave 4 x 4i8 into 16i8 (and store)
6931       {4, MVT::v8i8, 4},   // interleave 4 x 8i8 into 32i8 (and store)
6932       {4, MVT::v16i8, 8},  // interleave 4 x 16i8 into 64i8 (and store)
6933       {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
6934 
6935       {4, MVT::v2i16, 2},   // interleave 4 x 2i16 into 8i16 (and store)
6936       {4, MVT::v4i16, 6},   // interleave 4 x 4i16 into 16i16 (and store)
6937       {4, MVT::v8i16, 10},  // interleave 4 x 8i16 into 32i16 (and store)
6938       {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
6939       {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
6940 
6941       {4, MVT::v2i32, 5},   // interleave 4 x 2i32 into 8i32 (and store)
6942       {4, MVT::v4i32, 6},   // interleave 4 x 4i32 into 16i32 (and store)
6943       {4, MVT::v8i32, 16},  // interleave 4 x 8i32 into 32i32 (and store)
6944       {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
6945       {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
6946 
6947       {4, MVT::v2i64, 6},  // interleave 4 x 2i64 into 8i64 (and store)
6948       {4, MVT::v4i64, 8},  // interleave 4 x 4i64 into 16i64 (and store)
6949       {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
6950       {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
6951 
6952       {6, MVT::v2i8, 7},   // interleave 6 x 2i8 into 12i8 (and store)
6953       {6, MVT::v4i8, 9},   // interleave 6 x 4i8 into 24i8 (and store)
6954       {6, MVT::v8i8, 16},  // interleave 6 x 8i8 into 48i8 (and store)
6955       {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
6956       {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
6957 
6958       {6, MVT::v2i16, 10},  // interleave 6 x 2i16 into 12i16 (and store)
6959       {6, MVT::v4i16, 15},  // interleave 6 x 4i16 into 24i16 (and store)
6960       {6, MVT::v8i16, 21},  // interleave 6 x 8i16 into 48i16 (and store)
6961       {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
6962       {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
6963 
6964       {6, MVT::v2i32, 9},   // interleave 6 x 2i32 into 12i32 (and store)
6965       {6, MVT::v4i32, 12},  // interleave 6 x 4i32 into 24i32 (and store)
6966       {6, MVT::v8i32, 33},  // interleave 6 x 8i32 into 48i32 (and store)
6967       {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
6968 
6969       {6, MVT::v2i64, 8},  // interleave 6 x 2i64 into 12i64 (and store)
6970       {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
6971       {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
6972   };
6973 
6974   static const CostTblEntry SSE2InterleavedStoreTbl[] = {
6975       {2, MVT::v2i8, 1},   // interleave 2 x 2i8 into 4i8 (and store)
6976       {2, MVT::v4i8, 1},   // interleave 2 x 4i8 into 8i8 (and store)
6977       {2, MVT::v8i8, 1},   // interleave 2 x 8i8 into 16i8 (and store)
6978 
6979       {2, MVT::v2i16, 1},  // interleave 2 x 2i16 into 4i16 (and store)
6980       {2, MVT::v4i16, 1},  // interleave 2 x 4i16 into 8i16 (and store)
6981 
6982       {2, MVT::v2i32, 1},  // interleave 2 x 2i32 into 4i32 (and store)
6983   };
6984 
6985   if (Opcode == Instruction::Load) {
6986     auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
6987                               MemOpCosts](const CostTblEntry *Entry) {
6988       // NOTE: this is just an approximation!
6989       //       It can over- or under-estimate the cost!
6990       return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
6991     };
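         // E.g. Factor = 3 with only 2 members used and Entry->Cost == 12
         // gives MemOpCosts + divideCeil(2 * 12, 3) == MemOpCosts + 8.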
6992 
6993     if (ST->hasAVX2())
6994       if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
6995                                               ETy.getSimpleVT()))
6996         return GetDiscountedCost(Entry);
6997 
6998     if (ST->hasSSSE3())
6999       if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
7000                                               ETy.getSimpleVT()))
7001         return GetDiscountedCost(Entry);
7002 
7003     if (ST->hasSSE2())
7004       if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
7005                                               ETy.getSimpleVT()))
7006         return GetDiscountedCost(Entry);
7007   } else {
7008     assert(Opcode == Instruction::Store &&
7009            "Expected Store Instruction at this point");
7010     assert((!Indices.size() || Indices.size() == Factor) &&
7011            "Interleaved store only supports fully-interleaved groups.");
7012     if (ST->hasAVX2())
7013       if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
7014                                               ETy.getSimpleVT()))
7015         return MemOpCosts + Entry->Cost;
7016 
7017     if (ST->hasSSE2())
7018       if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
7019                                               ETy.getSimpleVT()))
7020         return MemOpCosts + Entry->Cost;
7021   }
7022 
7023   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
7024                                            Alignment, AddressSpace, CostKind,
7025                                            UseMaskForCond, UseMaskForGaps);
7026 }
7027 
7028 InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
7029                                                  StackOffset BaseOffset,
7030                                                  bool HasBaseReg, int64_t Scale,
7031                                                  unsigned AddrSpace) const {
7032   // Scaling factors are not free at all.
7033   // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
7034   // will take 2 allocations in the out-of-order engine instead of 1
7035   // for plain addressing mode, i.e. inst (reg1).
7036   // E.g.,
7037   // vaddps (%rsi,%rdx), %ymm0, %ymm1
7038   // Requires two allocations (one for the load, one for the computation)
7039   // whereas:
7040   // vaddps (%rsi), %ymm0, %ymm1
7041   // Requires just 1 allocation, i.e., freeing allocations for other operations
7042   // and having fewer micro-operations to execute.
7043   //
7044   // For some X86 architectures, this is even worse because for instance for
7045   // stores, the complex addressing mode forces the instruction to use the
7046   // "load" ports instead of the dedicated "store" port.
7047   // E.g., on Haswell:
7048   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
7049   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
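       //
       // Summarizing the return value (as computed below): 0 for a legal mode
       // with no scaled index register, 1 once a second (scaled) register is
       // used, and -1 for an illegal addressing mode.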
7050   TargetLoweringBase::AddrMode AM;
7051   AM.BaseGV = BaseGV;
7052   AM.BaseOffs = BaseOffset.getFixed();
7053   AM.HasBaseReg = HasBaseReg;
7054   AM.Scale = Scale;
7055   AM.ScalableOffset = BaseOffset.getScalable();
7056   if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
7057     // Scale represents reg2 * scale, thus account for 1
7058     // as soon as we use a second register.
7059     return AM.Scale != 0;
7060   return -1;
7061 }
7062 
7063 InstructionCost X86TTIImpl::getBranchMispredictPenalty() const {
7064   // TODO: Hook MispredictPenalty of SchedMachineModel into this.
7065   return 14;
7066 }
7067 
7068 bool X86TTIImpl::isVectorShiftByScalarCheap(Type *Ty) const {
7069   unsigned Bits = Ty->getScalarSizeInBits();
7070 
7071   // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
7072   // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
7073   if (ST->hasXOP() && (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
7074     return false;
7075 
7076   // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
7077   // shifts just as cheap as scalar ones.
7078   if (ST->hasAVX2() && (Bits == 32 || Bits == 64))
7079     return false;
7080 
7081   // AVX512BW has shifts such as vpsllvw.
7082   if (ST->hasBWI() && Bits == 16)
7083     return false;
7084 
7085   // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
7086   // fully general vector.
7087   return true;
7088 }
7089 
7090 unsigned X86TTIImpl::getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
7091                                        Type *ScalarValTy) const {
7092   if (ST->hasF16C() && ScalarMemTy->isHalfTy()) {
7093     return 4;
7094   }
7095   return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
7096 }
7097 
7098 bool X86TTIImpl::isProfitableToSinkOperands(Instruction *I,
7099                                             SmallVectorImpl<Use *> &Ops) const {
7100   using namespace llvm::PatternMatch;
7101 
7102   FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
7103   if (!VTy)
7104     return false;
7105 
7106   if (I->getOpcode() == Instruction::Mul &&
7107       VTy->getElementType()->isIntegerTy(64)) {
7108     for (auto &Op : I->operands()) {
7109       // Make sure we are not already sinking this operand
7110       if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
7111         continue;
7112 
7113       // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
7114       // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
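           // Illustrative IR (a sketch, not from the source):
           //   %t = shl <2 x i64> %x, <i64 32, i64 32>
           //   %s = ashr <2 x i64> %t, <i64 32, i64 32>  ; sext_inreg
           //   %z = and <2 x i64> %x, <i64 4294967295, i64 4294967295>  ; zext_inreg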
7115       if (ST->hasSSE41() &&
7116           match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
7117                                  m_SpecificInt(32)))) {
7118         Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
7119         Ops.push_back(&Op);
7120       } else if (ST->hasSSE2() &&
7121                  match(Op.get(),
7122                        m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
7123         Ops.push_back(&Op);
7124       }
7125     }
7126 
7127     return !Ops.empty();
7128   }
7129 
7130   // A uniform shift amount in a vector shift or funnel shift may be much
7131   // cheaper than a generic variable vector shift, so make that pattern visible
7132   // to SDAG by sinking the shuffle instruction next to the shift.
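       // Illustrative IR (a sketch): a splat shift amount such as
       //   %amt = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> zeroinitializer
       //   %shl = shl <4 x i32> %x, %amt
       // can then be selected as a shift by a scalar amount (e.g. PSLLD).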
7133   int ShiftAmountOpNum = -1;
7134   if (I->isShift())
7135     ShiftAmountOpNum = 1;
7136   else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
7137     if (II->getIntrinsicID() == Intrinsic::fshl ||
7138         II->getIntrinsicID() == Intrinsic::fshr)
7139       ShiftAmountOpNum = 2;
7140   }
7141 
7142   if (ShiftAmountOpNum == -1)
7143     return false;
7144 
7145   auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
7146   if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
7147       isVectorShiftByScalarCheap(I->getType())) {
7148     Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
7149     return true;
7150   }
7151 
7152   return false;
7153 }
7154