10b57cec5SDimitry Andric //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===// 20b57cec5SDimitry Andric // 30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 60b57cec5SDimitry Andric // 70b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 80b57cec5SDimitry Andric /// \file 90b57cec5SDimitry Andric /// This file implements a TargetTransformInfo analysis pass specific to the 100b57cec5SDimitry Andric /// X86 target machine. It uses the target's detailed information to provide 110b57cec5SDimitry Andric /// more precise answers to certain TTI queries, while letting the target 120b57cec5SDimitry Andric /// independent and default TTI implementations handle the rest. 130b57cec5SDimitry Andric /// 140b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 150b57cec5SDimitry Andric /// About Cost Model numbers used below it's necessary to say the following: 16bdd1243dSDimitry Andric /// the numbers correspond to some "generic" X86 CPU instead of usage of a 17bdd1243dSDimitry Andric /// specific CPU model. Usually the numbers correspond to the CPU where the 18bdd1243dSDimitry Andric /// feature first appeared. For example, if we do Subtarget.hasSSE42() in 190b57cec5SDimitry Andric /// the lookups below the cost is based on Nehalem as that was the first CPU 20bdd1243dSDimitry Andric /// to support that feature level and thus has most likely the worst case cost, 21bdd1243dSDimitry Andric /// although we may discard an outlying worst cost from one CPU (e.g. Atom). 
///
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem / Silvermont
///   AVX     - Sandy Bridge / Jaguar / Bulldozer
///   AVX2    - Haswell / Ryzen
///   AVX-512 - Xeon Phi / Skylake
///
/// And some examples of instruction target dependent costs (latency)
///                   divss     sqrtss          rsqrtss
///   AMD K7          11-16     19              3
///   Piledriver      9-24      13-15           5
///   Jaguar          14        16              2
///   Pentium II,III  18        30              2
///   Nehalem         7-14      7-18            3
///   Haswell         10-13     11              5
///
/// Interpreting the 4 TargetCostKind types:
/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
/// values reported by the CPU scheduler models (and llvm-mca).
/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
/// actual encoding size of the instruction.
/// TCK_SizeAndLatency should match the worst case micro-op counts reported by
/// the CPU scheduler models (and llvm-mca), to ensure that they are
/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values which
/// are often used as the cost thresholds where TCK_SizeAndLatency is requested.
490b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 500b57cec5SDimitry Andric 510b57cec5SDimitry Andric #include "X86TargetTransformInfo.h" 520b57cec5SDimitry Andric #include "llvm/Analysis/TargetTransformInfo.h" 530b57cec5SDimitry Andric #include "llvm/CodeGen/BasicTTIImpl.h" 540b57cec5SDimitry Andric #include "llvm/CodeGen/CostTable.h" 550b57cec5SDimitry Andric #include "llvm/CodeGen/TargetLowering.h" 5604eeddc0SDimitry Andric #include "llvm/IR/InstIterator.h" 570b57cec5SDimitry Andric #include "llvm/IR/IntrinsicInst.h" 580b57cec5SDimitry Andric #include "llvm/Support/Debug.h" 59bdd1243dSDimitry Andric #include <optional> 600b57cec5SDimitry Andric 610b57cec5SDimitry Andric using namespace llvm; 620b57cec5SDimitry Andric 630b57cec5SDimitry Andric #define DEBUG_TYPE "x86tti" 640b57cec5SDimitry Andric 650b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 660b57cec5SDimitry Andric // 670b57cec5SDimitry Andric // X86 cost model. 680b57cec5SDimitry Andric // 690b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 700b57cec5SDimitry Andric 71bdd1243dSDimitry Andric // Helper struct to store/access costs for each cost kind. 72bdd1243dSDimitry Andric // TODO: Move this to allow other targets to use it? 
73bdd1243dSDimitry Andric struct CostKindCosts { 74bdd1243dSDimitry Andric unsigned RecipThroughputCost = ~0U; 75bdd1243dSDimitry Andric unsigned LatencyCost = ~0U; 76bdd1243dSDimitry Andric unsigned CodeSizeCost = ~0U; 77bdd1243dSDimitry Andric unsigned SizeAndLatencyCost = ~0U; 78bdd1243dSDimitry Andric 79bdd1243dSDimitry Andric std::optional<unsigned> 80bdd1243dSDimitry Andric operator[](TargetTransformInfo::TargetCostKind Kind) const { 81bdd1243dSDimitry Andric unsigned Cost = ~0U; 82bdd1243dSDimitry Andric switch (Kind) { 83bdd1243dSDimitry Andric case TargetTransformInfo::TCK_RecipThroughput: 84bdd1243dSDimitry Andric Cost = RecipThroughputCost; 85bdd1243dSDimitry Andric break; 86bdd1243dSDimitry Andric case TargetTransformInfo::TCK_Latency: 87bdd1243dSDimitry Andric Cost = LatencyCost; 88bdd1243dSDimitry Andric break; 89bdd1243dSDimitry Andric case TargetTransformInfo::TCK_CodeSize: 90bdd1243dSDimitry Andric Cost = CodeSizeCost; 91bdd1243dSDimitry Andric break; 92bdd1243dSDimitry Andric case TargetTransformInfo::TCK_SizeAndLatency: 93bdd1243dSDimitry Andric Cost = SizeAndLatencyCost; 94bdd1243dSDimitry Andric break; 95bdd1243dSDimitry Andric } 96bdd1243dSDimitry Andric if (Cost == ~0U) 97bdd1243dSDimitry Andric return std::nullopt; 98bdd1243dSDimitry Andric return Cost; 99bdd1243dSDimitry Andric } 100bdd1243dSDimitry Andric }; 101bdd1243dSDimitry Andric using CostKindTblEntry = CostTblEntryT<CostKindCosts>; 102*0fca6ea1SDimitry Andric using TypeConversionCostKindTblEntry = TypeConversionCostTblEntryT<CostKindCosts>; 103bdd1243dSDimitry Andric 1040b57cec5SDimitry Andric TargetTransformInfo::PopcntSupportKind 1050b57cec5SDimitry Andric X86TTIImpl::getPopcntSupport(unsigned TyWidth) { 1060b57cec5SDimitry Andric assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); 1070b57cec5SDimitry Andric // TODO: Currently the __builtin_popcount() implementation using SSE3 1080b57cec5SDimitry Andric // instructions is inefficient. 
Once the problem is fixed, we should 1090b57cec5SDimitry Andric // call ST->hasSSE3() instead of ST->hasPOPCNT(). 1100b57cec5SDimitry Andric return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software; 1110b57cec5SDimitry Andric } 1120b57cec5SDimitry Andric 113bdd1243dSDimitry Andric std::optional<unsigned> X86TTIImpl::getCacheSize( 1140b57cec5SDimitry Andric TargetTransformInfo::CacheLevel Level) const { 1150b57cec5SDimitry Andric switch (Level) { 1160b57cec5SDimitry Andric case TargetTransformInfo::CacheLevel::L1D: 1170b57cec5SDimitry Andric // - Penryn 1180b57cec5SDimitry Andric // - Nehalem 1190b57cec5SDimitry Andric // - Westmere 1200b57cec5SDimitry Andric // - Sandy Bridge 1210b57cec5SDimitry Andric // - Ivy Bridge 1220b57cec5SDimitry Andric // - Haswell 1230b57cec5SDimitry Andric // - Broadwell 1240b57cec5SDimitry Andric // - Skylake 1250b57cec5SDimitry Andric // - Kabylake 1260b57cec5SDimitry Andric return 32 * 1024; // 32 KByte 1270b57cec5SDimitry Andric case TargetTransformInfo::CacheLevel::L2D: 1280b57cec5SDimitry Andric // - Penryn 1290b57cec5SDimitry Andric // - Nehalem 1300b57cec5SDimitry Andric // - Westmere 1310b57cec5SDimitry Andric // - Sandy Bridge 1320b57cec5SDimitry Andric // - Ivy Bridge 1330b57cec5SDimitry Andric // - Haswell 1340b57cec5SDimitry Andric // - Broadwell 1350b57cec5SDimitry Andric // - Skylake 1360b57cec5SDimitry Andric // - Kabylake 1370b57cec5SDimitry Andric return 256 * 1024; // 256 KByte 1380b57cec5SDimitry Andric } 1390b57cec5SDimitry Andric 1400b57cec5SDimitry Andric llvm_unreachable("Unknown TargetTransformInfo::CacheLevel"); 1410b57cec5SDimitry Andric } 1420b57cec5SDimitry Andric 143bdd1243dSDimitry Andric std::optional<unsigned> X86TTIImpl::getCacheAssociativity( 1440b57cec5SDimitry Andric TargetTransformInfo::CacheLevel Level) const { 1450b57cec5SDimitry Andric // - Penryn 1460b57cec5SDimitry Andric // - Nehalem 1470b57cec5SDimitry Andric // - Westmere 1480b57cec5SDimitry Andric // - Sandy Bridge 
1490b57cec5SDimitry Andric // - Ivy Bridge 1500b57cec5SDimitry Andric // - Haswell 1510b57cec5SDimitry Andric // - Broadwell 1520b57cec5SDimitry Andric // - Skylake 1530b57cec5SDimitry Andric // - Kabylake 1540b57cec5SDimitry Andric switch (Level) { 1550b57cec5SDimitry Andric case TargetTransformInfo::CacheLevel::L1D: 156bdd1243dSDimitry Andric [[fallthrough]]; 1570b57cec5SDimitry Andric case TargetTransformInfo::CacheLevel::L2D: 1580b57cec5SDimitry Andric return 8; 1590b57cec5SDimitry Andric } 1600b57cec5SDimitry Andric 1610b57cec5SDimitry Andric llvm_unreachable("Unknown TargetTransformInfo::CacheLevel"); 1620b57cec5SDimitry Andric } 1630b57cec5SDimitry Andric 1648bcb0991SDimitry Andric unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const { 1658bcb0991SDimitry Andric bool Vector = (ClassID == 1); 1660b57cec5SDimitry Andric if (Vector && !ST->hasSSE1()) 1670b57cec5SDimitry Andric return 0; 1680b57cec5SDimitry Andric 1690b57cec5SDimitry Andric if (ST->is64Bit()) { 1700b57cec5SDimitry Andric if (Vector && ST->hasAVX512()) 1710b57cec5SDimitry Andric return 32; 172*0fca6ea1SDimitry Andric if (!Vector && ST->hasEGPR()) 173*0fca6ea1SDimitry Andric return 32; 1740b57cec5SDimitry Andric return 16; 1750b57cec5SDimitry Andric } 1760b57cec5SDimitry Andric return 8; 1770b57cec5SDimitry Andric } 1780b57cec5SDimitry Andric 179*0fca6ea1SDimitry Andric bool X86TTIImpl::hasConditionalLoadStoreForType(Type *Ty) const { 180*0fca6ea1SDimitry Andric if (!ST->hasCF()) 181*0fca6ea1SDimitry Andric return false; 182*0fca6ea1SDimitry Andric if (!Ty) 183*0fca6ea1SDimitry Andric return true; 184*0fca6ea1SDimitry Andric // Conditional faulting is supported by CFCMOV, which only accepts 185*0fca6ea1SDimitry Andric // 16/32/64-bit operands. 186*0fca6ea1SDimitry Andric // TODO: Support f32/f64 with VMOVSS/VMOVSD with zero mask when it's 187*0fca6ea1SDimitry Andric // profitable. 
188*0fca6ea1SDimitry Andric auto *VTy = dyn_cast<FixedVectorType>(Ty); 189*0fca6ea1SDimitry Andric if (!Ty->isIntegerTy() && (!VTy || VTy->getNumElements() != 1)) 190*0fca6ea1SDimitry Andric return false; 191*0fca6ea1SDimitry Andric auto *ScalarTy = Ty->getScalarType(); 192*0fca6ea1SDimitry Andric switch (cast<IntegerType>(ScalarTy)->getBitWidth()) { 193*0fca6ea1SDimitry Andric default: 194*0fca6ea1SDimitry Andric return false; 195*0fca6ea1SDimitry Andric case 16: 196*0fca6ea1SDimitry Andric case 32: 197*0fca6ea1SDimitry Andric case 64: 198*0fca6ea1SDimitry Andric return true; 199*0fca6ea1SDimitry Andric } 200*0fca6ea1SDimitry Andric } 201*0fca6ea1SDimitry Andric 202fe6060f1SDimitry Andric TypeSize 203fe6060f1SDimitry Andric X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { 2040b57cec5SDimitry Andric unsigned PreferVectorWidth = ST->getPreferVectorWidth(); 205fe6060f1SDimitry Andric switch (K) { 206fe6060f1SDimitry Andric case TargetTransformInfo::RGK_Scalar: 207fe6060f1SDimitry Andric return TypeSize::getFixed(ST->is64Bit() ? 
64 : 32); 208fe6060f1SDimitry Andric case TargetTransformInfo::RGK_FixedWidthVector: 2095f757f3fSDimitry Andric if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512) 210fe6060f1SDimitry Andric return TypeSize::getFixed(512); 2110b57cec5SDimitry Andric if (ST->hasAVX() && PreferVectorWidth >= 256) 212fe6060f1SDimitry Andric return TypeSize::getFixed(256); 2130b57cec5SDimitry Andric if (ST->hasSSE1() && PreferVectorWidth >= 128) 214fe6060f1SDimitry Andric return TypeSize::getFixed(128); 215fe6060f1SDimitry Andric return TypeSize::getFixed(0); 216fe6060f1SDimitry Andric case TargetTransformInfo::RGK_ScalableVector: 217fe6060f1SDimitry Andric return TypeSize::getScalable(0); 2180b57cec5SDimitry Andric } 2190b57cec5SDimitry Andric 220fe6060f1SDimitry Andric llvm_unreachable("Unsupported register kind"); 2210b57cec5SDimitry Andric } 2220b57cec5SDimitry Andric 2230b57cec5SDimitry Andric unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const { 224fe6060f1SDimitry Andric return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 225bdd1243dSDimitry Andric .getFixedValue(); 2260b57cec5SDimitry Andric } 2270b57cec5SDimitry Andric 22806c3fb27SDimitry Andric unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) { 2290b57cec5SDimitry Andric // If the loop will not be vectorized, don't interleave the loop. 2300b57cec5SDimitry Andric // Let regular unroll to unroll the loop, which saves the overflow 2310b57cec5SDimitry Andric // check and memory check cost. 23206c3fb27SDimitry Andric if (VF.isScalar()) 2330b57cec5SDimitry Andric return 1; 2340b57cec5SDimitry Andric 2350b57cec5SDimitry Andric if (ST->isAtom()) 2360b57cec5SDimitry Andric return 1; 2370b57cec5SDimitry Andric 2380b57cec5SDimitry Andric // Sandybridge and Haswell have multiple execution ports and pipelined 2390b57cec5SDimitry Andric // vector units. 
2400b57cec5SDimitry Andric if (ST->hasAVX()) 2410b57cec5SDimitry Andric return 4; 2420b57cec5SDimitry Andric 2430b57cec5SDimitry Andric return 2; 2440b57cec5SDimitry Andric } 2450b57cec5SDimitry Andric 246fe6060f1SDimitry Andric InstructionCost X86TTIImpl::getArithmeticInstrCost( 247fe6060f1SDimitry Andric unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, 248bdd1243dSDimitry Andric TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, 249bdd1243dSDimitry Andric ArrayRef<const Value *> Args, 250480093f4SDimitry Andric const Instruction *CxtI) { 251fe6060f1SDimitry Andric 252fe6060f1SDimitry Andric // vXi8 multiplications are always promoted to vXi16. 25306c3fb27SDimitry Andric // Sub-128-bit types can be extended/packed more efficiently. 254fe6060f1SDimitry Andric if (Opcode == Instruction::Mul && Ty->isVectorTy() && 25506c3fb27SDimitry Andric Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) { 256fe6060f1SDimitry Andric Type *WideVecTy = 257fe6060f1SDimitry Andric VectorType::getExtendedElementVectorType(cast<VectorType>(Ty)); 258fe6060f1SDimitry Andric return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty, 259fe6060f1SDimitry Andric TargetTransformInfo::CastContextHint::None, 260fe6060f1SDimitry Andric CostKind) + 261fe6060f1SDimitry Andric getCastInstrCost(Instruction::Trunc, Ty, WideVecTy, 262fe6060f1SDimitry Andric TargetTransformInfo::CastContextHint::None, 263fe6060f1SDimitry Andric CostKind) + 264bdd1243dSDimitry Andric getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info); 265fe6060f1SDimitry Andric } 266fe6060f1SDimitry Andric 2670b57cec5SDimitry Andric // Legalize the type. 
268bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); 2690b57cec5SDimitry Andric 2700b57cec5SDimitry Andric int ISD = TLI->InstructionOpcodeToISD(Opcode); 2710b57cec5SDimitry Andric assert(ISD && "Invalid opcode"); 2720b57cec5SDimitry Andric 273349cc55cSDimitry Andric if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() && 27406c3fb27SDimitry Andric (LT.second.getScalarType() == MVT::i32 || 27506c3fb27SDimitry Andric LT.second.getScalarType() == MVT::i64)) { 276349cc55cSDimitry Andric // Check if the operands can be represented as a smaller datatype. 277349cc55cSDimitry Andric bool Op1Signed = false, Op2Signed = false; 278349cc55cSDimitry Andric unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed); 279349cc55cSDimitry Andric unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed); 280349cc55cSDimitry Andric unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize); 281bdd1243dSDimitry Andric bool SignedMode = Op1Signed || Op2Signed; 282349cc55cSDimitry Andric 28306c3fb27SDimitry Andric // If both vXi32 are representable as i15 and at least one is constant, 284349cc55cSDimitry Andric // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we 285349cc55cSDimitry Andric // can treat this as PMADDWD which has the same costs as a vXi16 multiply. 
28606c3fb27SDimitry Andric if (OpMinSize <= 15 && !ST->isPMADDWDSlow() && 28706c3fb27SDimitry Andric LT.second.getScalarType() == MVT::i32) { 288349cc55cSDimitry Andric bool Op1Constant = 289349cc55cSDimitry Andric isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]); 290349cc55cSDimitry Andric bool Op2Constant = 291349cc55cSDimitry Andric isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]); 292349cc55cSDimitry Andric bool Op1Sext = isa<SExtInst>(Args[0]) && 293349cc55cSDimitry Andric (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41())); 294349cc55cSDimitry Andric bool Op2Sext = isa<SExtInst>(Args[1]) && 295349cc55cSDimitry Andric (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41())); 296349cc55cSDimitry Andric 297349cc55cSDimitry Andric bool IsZeroExtended = !Op1Signed || !Op2Signed; 298349cc55cSDimitry Andric bool IsConstant = Op1Constant || Op2Constant; 299349cc55cSDimitry Andric bool IsSext = Op1Sext || Op2Sext; 300349cc55cSDimitry Andric if (IsConstant || IsZeroExtended || IsSext) 301349cc55cSDimitry Andric LT.second = 302349cc55cSDimitry Andric MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements()); 303349cc55cSDimitry Andric } 304349cc55cSDimitry Andric 305bdd1243dSDimitry Andric // Check if the vXi32 operands can be shrunk into a smaller datatype. 306bdd1243dSDimitry Andric // This should match the codegen from reduceVMULWidth. 307bdd1243dSDimitry Andric // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()). 
308bdd1243dSDimitry Andric if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) { 3090b57cec5SDimitry Andric if (OpMinSize <= 7) 3100b57cec5SDimitry Andric return LT.first * 3; // pmullw/sext 311e8d8bef9SDimitry Andric if (!SignedMode && OpMinSize <= 8) 3120b57cec5SDimitry Andric return LT.first * 3; // pmullw/zext 3130b57cec5SDimitry Andric if (OpMinSize <= 15) 3140b57cec5SDimitry Andric return LT.first * 5; // pmullw/pmulhw/pshuf 315e8d8bef9SDimitry Andric if (!SignedMode && OpMinSize <= 16) 3160b57cec5SDimitry Andric return LT.first * 5; // pmullw/pmulhw/pshuf 3170b57cec5SDimitry Andric } 31806c3fb27SDimitry Andric 31906c3fb27SDimitry Andric // If both vXi64 are representable as (unsigned) i32, then we can perform 32006c3fb27SDimitry Andric // the multiple with a single PMULUDQ instruction. 32106c3fb27SDimitry Andric // TODO: Add (SSE41+) PMULDQ handling for signed extensions. 32206c3fb27SDimitry Andric if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64) 32306c3fb27SDimitry Andric ISD = X86ISD::PMULUDQ; 3240b57cec5SDimitry Andric } 3250b57cec5SDimitry Andric 326bdd1243dSDimitry Andric // Vector multiply by pow2 will be simplified to shifts. 327bdd1243dSDimitry Andric // Vector multiply by -pow2 will be simplified to shifts/negates. 
328bdd1243dSDimitry Andric if (ISD == ISD::MUL && Op2Info.isConstant() && 329bdd1243dSDimitry Andric (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) { 330bdd1243dSDimitry Andric InstructionCost Cost = 331bdd1243dSDimitry Andric getArithmeticInstrCost(Instruction::Shl, Ty, CostKind, 332bdd1243dSDimitry Andric Op1Info.getNoProps(), Op2Info.getNoProps()); 333bdd1243dSDimitry Andric if (Op2Info.isNegatedPowerOf2()) 334bdd1243dSDimitry Andric Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind); 335bdd1243dSDimitry Andric return Cost; 336bdd1243dSDimitry Andric } 337bdd1243dSDimitry Andric 338bdd1243dSDimitry Andric // On X86, vector signed division by constants power-of-two are 339bdd1243dSDimitry Andric // normally expanded to the sequence SRA + SRL + ADD + SRA. 340bdd1243dSDimitry Andric // The OperandValue properties may not be the same as that of the previous 341bdd1243dSDimitry Andric // operation; conservatively assume OP_None. 342bdd1243dSDimitry Andric if ((ISD == ISD::SDIV || ISD == ISD::SREM) && 343bdd1243dSDimitry Andric Op2Info.isConstant() && Op2Info.isPowerOf2()) { 344bdd1243dSDimitry Andric InstructionCost Cost = 345bdd1243dSDimitry Andric 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, 346bdd1243dSDimitry Andric Op1Info.getNoProps(), Op2Info.getNoProps()); 347bdd1243dSDimitry Andric Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, 348bdd1243dSDimitry Andric Op1Info.getNoProps(), Op2Info.getNoProps()); 349bdd1243dSDimitry Andric Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, 350bdd1243dSDimitry Andric Op1Info.getNoProps(), Op2Info.getNoProps()); 351bdd1243dSDimitry Andric 352bdd1243dSDimitry Andric if (ISD == ISD::SREM) { 353bdd1243dSDimitry Andric // For SREM: (X % C) is the equivalent of (X - (X/C)*C) 354bdd1243dSDimitry Andric Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), 355bdd1243dSDimitry Andric Op2Info.getNoProps()); 356bdd1243dSDimitry Andric Cost 
+= getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info.getNoProps(), 357bdd1243dSDimitry Andric Op2Info.getNoProps()); 358bdd1243dSDimitry Andric } 359bdd1243dSDimitry Andric 360bdd1243dSDimitry Andric return Cost; 361bdd1243dSDimitry Andric } 362bdd1243dSDimitry Andric 363bdd1243dSDimitry Andric // Vector unsigned division/remainder will be simplified to shifts/masks. 364bdd1243dSDimitry Andric if ((ISD == ISD::UDIV || ISD == ISD::UREM) && 365bdd1243dSDimitry Andric Op2Info.isConstant() && Op2Info.isPowerOf2()) { 366bdd1243dSDimitry Andric if (ISD == ISD::UDIV) 367bdd1243dSDimitry Andric return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, 368bdd1243dSDimitry Andric Op1Info.getNoProps(), Op2Info.getNoProps()); 369bdd1243dSDimitry Andric // UREM 370bdd1243dSDimitry Andric return getArithmeticInstrCost(Instruction::And, Ty, CostKind, 371bdd1243dSDimitry Andric Op1Info.getNoProps(), Op2Info.getNoProps()); 372bdd1243dSDimitry Andric } 373bdd1243dSDimitry Andric 374*0fca6ea1SDimitry Andric static const CostKindTblEntry GFNIUniformConstCostTable[] = { 375*0fca6ea1SDimitry Andric { ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb 376*0fca6ea1SDimitry Andric { ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb 377*0fca6ea1SDimitry Andric { ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb 378*0fca6ea1SDimitry Andric { ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb 379*0fca6ea1SDimitry Andric { ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb 380*0fca6ea1SDimitry Andric { ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb 381*0fca6ea1SDimitry Andric { ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb 382*0fca6ea1SDimitry Andric { ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb 383*0fca6ea1SDimitry Andric { ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb 384*0fca6ea1SDimitry Andric }; 385*0fca6ea1SDimitry Andric 386*0fca6ea1SDimitry Andric if (Op2Info.isUniform() && 
Op2Info.isConstant() && ST->hasGFNI()) 387*0fca6ea1SDimitry Andric if (const auto *Entry = 388*0fca6ea1SDimitry Andric CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second)) 389*0fca6ea1SDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 390*0fca6ea1SDimitry Andric return LT.first * *KindCost; 391*0fca6ea1SDimitry Andric 392bdd1243dSDimitry Andric static const CostKindTblEntry AVX512BWUniformConstCostTable[] = { 393bdd1243dSDimitry Andric { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand. 394bdd1243dSDimitry Andric { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand. 395bdd1243dSDimitry Andric { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb. 396bdd1243dSDimitry Andric { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand. 397bdd1243dSDimitry Andric { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand. 398bdd1243dSDimitry Andric { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb. 399bdd1243dSDimitry Andric { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand. 400bdd1243dSDimitry Andric { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand. 401bdd1243dSDimitry Andric { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb. 
402bdd1243dSDimitry Andric 403bdd1243dSDimitry Andric { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw 404bdd1243dSDimitry Andric { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw 405bdd1243dSDimitry Andric { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw 406bdd1243dSDimitry Andric { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw 407bdd1243dSDimitry Andric { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw 408bdd1243dSDimitry Andric { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw 4090b57cec5SDimitry Andric }; 4100b57cec5SDimitry Andric 411bdd1243dSDimitry Andric if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI()) 412bdd1243dSDimitry Andric if (const auto *Entry = 413bdd1243dSDimitry Andric CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second)) 414bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 415bdd1243dSDimitry Andric return LT.first * *KindCost; 4160b57cec5SDimitry Andric 417bdd1243dSDimitry Andric static const CostKindTblEntry AVX512UniformConstCostTable[] = { 418bdd1243dSDimitry Andric { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand. 419bdd1243dSDimitry Andric { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand. 420bdd1243dSDimitry Andric { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb. 4215ffd83dbSDimitry Andric 422bdd1243dSDimitry Andric { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split. 423bdd1243dSDimitry Andric { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split. 424bdd1243dSDimitry Andric { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split. 
425e8d8bef9SDimitry Andric 426bdd1243dSDimitry Andric { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld 427bdd1243dSDimitry Andric { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld 428bdd1243dSDimitry Andric { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad 429bdd1243dSDimitry Andric { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld 430bdd1243dSDimitry Andric { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld 431bdd1243dSDimitry Andric { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad 432bdd1243dSDimitry Andric 433bdd1243dSDimitry Andric { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq 434bdd1243dSDimitry Andric { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq 435bdd1243dSDimitry Andric { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq 436bdd1243dSDimitry Andric { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq 437bdd1243dSDimitry Andric { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq 438bdd1243dSDimitry Andric { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq 439bdd1243dSDimitry Andric { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq 440bdd1243dSDimitry Andric 441bdd1243dSDimitry Andric { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence 442bdd1243dSDimitry Andric { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence 443bdd1243dSDimitry Andric { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence 444bdd1243dSDimitry Andric { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence 4450b57cec5SDimitry Andric }; 4460b57cec5SDimitry Andric 447bdd1243dSDimitry Andric if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512()) 448bdd1243dSDimitry Andric if (const auto *Entry = 449bdd1243dSDimitry Andric CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second)) 450bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 451bdd1243dSDimitry Andric return LT.first * *KindCost; 4520b57cec5SDimitry Andric 453bdd1243dSDimitry Andric static const CostKindTblEntry AVX2UniformConstCostTable[] = { 454bdd1243dSDimitry 
Andric { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand. 455bdd1243dSDimitry Andric { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand. 456bdd1243dSDimitry Andric { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb. 457bdd1243dSDimitry Andric { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand. 458bdd1243dSDimitry Andric { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand. 459bdd1243dSDimitry Andric { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb. 4600b57cec5SDimitry Andric 461bdd1243dSDimitry Andric { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw 462bdd1243dSDimitry Andric { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw 463bdd1243dSDimitry Andric { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw 464bdd1243dSDimitry Andric { ISD::SHL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psllw 465bdd1243dSDimitry Andric { ISD::SRL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psrlw 466bdd1243dSDimitry Andric { ISD::SRA, MVT::v16i16,{ 2, 2, 1, 2 } }, // psraw 467e8d8bef9SDimitry Andric 468bdd1243dSDimitry Andric { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld 469bdd1243dSDimitry Andric { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld 470bdd1243dSDimitry Andric { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad 471bdd1243dSDimitry Andric { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld 472bdd1243dSDimitry Andric { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld 473bdd1243dSDimitry Andric { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad 474bdd1243dSDimitry Andric 475bdd1243dSDimitry Andric { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq 476bdd1243dSDimitry Andric { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq 477bdd1243dSDimitry Andric { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle. 
478bdd1243dSDimitry Andric { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq 479bdd1243dSDimitry Andric { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq 480bdd1243dSDimitry Andric { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split. 481bdd1243dSDimitry Andric 482bdd1243dSDimitry Andric { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence 483bdd1243dSDimitry Andric { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence 484bdd1243dSDimitry Andric { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence 485bdd1243dSDimitry Andric { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence 4860b57cec5SDimitry Andric }; 4870b57cec5SDimitry Andric 488bdd1243dSDimitry Andric if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2()) 489bdd1243dSDimitry Andric if (const auto *Entry = 490bdd1243dSDimitry Andric CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second)) 491bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 492bdd1243dSDimitry Andric return LT.first * *KindCost; 4930b57cec5SDimitry Andric 494bdd1243dSDimitry Andric static const CostKindTblEntry AVXUniformConstCostTable[] = { 495bdd1243dSDimitry Andric { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand. 496bdd1243dSDimitry Andric { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand. 497bdd1243dSDimitry Andric { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb. 498bdd1243dSDimitry Andric { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split. 499bdd1243dSDimitry Andric { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split. 500bdd1243dSDimitry Andric { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split. 5010b57cec5SDimitry Andric 502bdd1243dSDimitry Andric { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw. 503bdd1243dSDimitry Andric { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw. 504bdd1243dSDimitry Andric { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw. 
505bdd1243dSDimitry Andric { ISD::SHL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psllw + split. 506bdd1243dSDimitry Andric { ISD::SRL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psrlw + split. 507bdd1243dSDimitry Andric { ISD::SRA, MVT::v16i16,{ 3, 6, 4, 5 } }, // psraw + split. 508e8d8bef9SDimitry Andric 509bdd1243dSDimitry Andric { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld. 510bdd1243dSDimitry Andric { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld. 511bdd1243dSDimitry Andric { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad. 512bdd1243dSDimitry Andric { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split. 513bdd1243dSDimitry Andric { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split. 514bdd1243dSDimitry Andric { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split. 515bdd1243dSDimitry Andric 516bdd1243dSDimitry Andric { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq. 517bdd1243dSDimitry Andric { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq. 518bdd1243dSDimitry Andric { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle. 519bdd1243dSDimitry Andric { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split. 520bdd1243dSDimitry Andric { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split. 521bdd1243dSDimitry Andric { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split. 522bdd1243dSDimitry Andric 523bdd1243dSDimitry Andric { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split. 524bdd1243dSDimitry Andric { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split. 525bdd1243dSDimitry Andric { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split. 526bdd1243dSDimitry Andric { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split. 5270b57cec5SDimitry Andric }; 5280b57cec5SDimitry Andric 5290b57cec5SDimitry Andric // XOP has faster vXi8 shifts. 
530bdd1243dSDimitry Andric if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() && 531bdd1243dSDimitry Andric (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) 532bdd1243dSDimitry Andric if (const auto *Entry = 533bdd1243dSDimitry Andric CostTableLookup(AVXUniformConstCostTable, ISD, LT.second)) 534bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 535bdd1243dSDimitry Andric return LT.first * *KindCost; 536bdd1243dSDimitry Andric 537bdd1243dSDimitry Andric static const CostKindTblEntry SSE2UniformConstCostTable[] = { 538bdd1243dSDimitry Andric { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand. 539bdd1243dSDimitry Andric { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand. 540bdd1243dSDimitry Andric { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb. 541bdd1243dSDimitry Andric 542bdd1243dSDimitry Andric { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw. 543bdd1243dSDimitry Andric { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw. 544bdd1243dSDimitry Andric { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw. 545bdd1243dSDimitry Andric 546bdd1243dSDimitry Andric { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld 547bdd1243dSDimitry Andric { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld. 548bdd1243dSDimitry Andric { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad. 549bdd1243dSDimitry Andric 550bdd1243dSDimitry Andric { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq. 551bdd1243dSDimitry Andric { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq. 552bdd1243dSDimitry Andric { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle. 
553bdd1243dSDimitry Andric 554bdd1243dSDimitry Andric { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence 555bdd1243dSDimitry Andric { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence 556bdd1243dSDimitry Andric { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence 557bdd1243dSDimitry Andric { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence 558bdd1243dSDimitry Andric }; 559bdd1243dSDimitry Andric 560bdd1243dSDimitry Andric // XOP has faster vXi8 shifts. 561bdd1243dSDimitry Andric if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() && 562bdd1243dSDimitry Andric (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) 5630b57cec5SDimitry Andric if (const auto *Entry = 5640b57cec5SDimitry Andric CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second)) 565bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 566bdd1243dSDimitry Andric return LT.first * *KindCost; 5670b57cec5SDimitry Andric 568bdd1243dSDimitry Andric static const CostKindTblEntry AVX512BWConstCostTable[] = { 569bdd1243dSDimitry Andric { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence 570bdd1243dSDimitry Andric { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence 571bdd1243dSDimitry Andric { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence 572bdd1243dSDimitry Andric { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence 573bdd1243dSDimitry Andric 574bdd1243dSDimitry Andric { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence 575bdd1243dSDimitry Andric { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence 576bdd1243dSDimitry Andric { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence 577bdd1243dSDimitry Andric { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence 5780b57cec5SDimitry Andric }; 5790b57cec5SDimitry Andric 580bdd1243dSDimitry Andric if (Op2Info.isConstant() && ST->hasBWI()) 5810b57cec5SDimitry Andric if (const auto *Entry = 5820b57cec5SDimitry Andric 
CostTableLookup(AVX512BWConstCostTable, ISD, LT.second)) 583bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 584bdd1243dSDimitry Andric return LT.first * *KindCost; 5850b57cec5SDimitry Andric 586bdd1243dSDimitry Andric static const CostKindTblEntry AVX512ConstCostTable[] = { 587bdd1243dSDimitry Andric { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence 588bdd1243dSDimitry Andric { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence 589bdd1243dSDimitry Andric { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence 590bdd1243dSDimitry Andric { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence 591bdd1243dSDimitry Andric 592bdd1243dSDimitry Andric { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence 593bdd1243dSDimitry Andric { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence 594bdd1243dSDimitry Andric { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence 595bdd1243dSDimitry Andric { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence 596bdd1243dSDimitry Andric 597bdd1243dSDimitry Andric { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence 598bdd1243dSDimitry Andric { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence 599bdd1243dSDimitry Andric { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence 600bdd1243dSDimitry Andric { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence 6010b57cec5SDimitry Andric }; 6020b57cec5SDimitry Andric 603bdd1243dSDimitry Andric if (Op2Info.isConstant() && ST->hasAVX512()) 6040b57cec5SDimitry Andric if (const auto *Entry = 6050b57cec5SDimitry Andric CostTableLookup(AVX512ConstCostTable, ISD, LT.second)) 606bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 607bdd1243dSDimitry Andric return LT.first * *KindCost; 6080b57cec5SDimitry Andric 609bdd1243dSDimitry Andric static const CostKindTblEntry AVX2ConstCostTable[] = { 610bdd1243dSDimitry Andric { ISD::SDIV, MVT::v32i8, { 14 } }, // 
2*ext+2*pmulhw sequence 611bdd1243dSDimitry Andric { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence 612bdd1243dSDimitry Andric { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence 613bdd1243dSDimitry Andric { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence 614bdd1243dSDimitry Andric 615bdd1243dSDimitry Andric { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence 616bdd1243dSDimitry Andric { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence 617bdd1243dSDimitry Andric { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence 618bdd1243dSDimitry Andric { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence 619bdd1243dSDimitry Andric 620bdd1243dSDimitry Andric { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence 621bdd1243dSDimitry Andric { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence 622bdd1243dSDimitry Andric { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence 623bdd1243dSDimitry Andric { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence 6240b57cec5SDimitry Andric }; 6250b57cec5SDimitry Andric 626bdd1243dSDimitry Andric if (Op2Info.isConstant() && ST->hasAVX2()) 6270b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second)) 628bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 629bdd1243dSDimitry Andric return LT.first * *KindCost; 6300b57cec5SDimitry Andric 631bdd1243dSDimitry Andric static const CostKindTblEntry AVXConstCostTable[] = { 632bdd1243dSDimitry Andric { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split. 633bdd1243dSDimitry Andric { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split. 634bdd1243dSDimitry Andric { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split. 635bdd1243dSDimitry Andric { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split. 
636bdd1243dSDimitry Andric 637bdd1243dSDimitry Andric { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split. 638bdd1243dSDimitry Andric { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split. 639bdd1243dSDimitry Andric { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split. 640bdd1243dSDimitry Andric { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split. 641bdd1243dSDimitry Andric 642bdd1243dSDimitry Andric { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence 643bdd1243dSDimitry Andric { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence 644bdd1243dSDimitry Andric { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split. 645bdd1243dSDimitry Andric { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split. 6460b57cec5SDimitry Andric }; 6470b57cec5SDimitry Andric 648bdd1243dSDimitry Andric if (Op2Info.isConstant() && ST->hasAVX()) 649bdd1243dSDimitry Andric if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second)) 650bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 651bdd1243dSDimitry Andric return LT.first * *KindCost; 6520b57cec5SDimitry Andric 653bdd1243dSDimitry Andric static const CostKindTblEntry SSE41ConstCostTable[] = { 654bdd1243dSDimitry Andric { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence 655bdd1243dSDimitry Andric { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence 656bdd1243dSDimitry Andric }; 657bdd1243dSDimitry Andric 658bdd1243dSDimitry Andric if (Op2Info.isConstant() && ST->hasSSE41()) 659bdd1243dSDimitry Andric if (const auto *Entry = 660bdd1243dSDimitry Andric CostTableLookup(SSE41ConstCostTable, ISD, LT.second)) 661bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 662bdd1243dSDimitry Andric return LT.first * *KindCost; 663bdd1243dSDimitry Andric 664bdd1243dSDimitry Andric static const CostKindTblEntry SSE2ConstCostTable[] = { 665bdd1243dSDimitry Andric { ISD::SDIV, MVT::v16i8, { 
14 } }, // 2*ext+2*pmulhw sequence 666bdd1243dSDimitry Andric { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence 667bdd1243dSDimitry Andric { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence 668bdd1243dSDimitry Andric { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence 669bdd1243dSDimitry Andric 670bdd1243dSDimitry Andric { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence 671bdd1243dSDimitry Andric { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence 672bdd1243dSDimitry Andric { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence 673bdd1243dSDimitry Andric { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence 674bdd1243dSDimitry Andric 675bdd1243dSDimitry Andric { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence 676bdd1243dSDimitry Andric { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence 677bdd1243dSDimitry Andric { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence 678bdd1243dSDimitry Andric { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence 679bdd1243dSDimitry Andric }; 680bdd1243dSDimitry Andric 681bdd1243dSDimitry Andric if (Op2Info.isConstant() && ST->hasSSE2()) 6820b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second)) 683bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 684bdd1243dSDimitry Andric return LT.first * *KindCost; 6850b57cec5SDimitry Andric 686bdd1243dSDimitry Andric static const CostKindTblEntry AVX512BWUniformCostTable[] = { 687bdd1243dSDimitry Andric { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand. 688bdd1243dSDimitry Andric { ISD::SRL, MVT::v16i8, { 3,10, 5, 8 } }, // psrlw + pand. 689bdd1243dSDimitry Andric { ISD::SRA, MVT::v16i8, { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb. 690bdd1243dSDimitry Andric { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand. 691bdd1243dSDimitry Andric { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand. 
692bdd1243dSDimitry Andric { ISD::SRA, MVT::v32i8, { 5,10,10,13 } }, // psrlw, pand, pxor, psubb. 693bdd1243dSDimitry Andric { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand. 694bdd1243dSDimitry Andric { ISD::SRL, MVT::v64i8, { 4, 8, 7,10 } }, // psrlw + pand. 695bdd1243dSDimitry Andric { ISD::SRA, MVT::v64i8, { 5,10,10,15 } }, // psrlw, pand, pxor, psubb. 696fe6060f1SDimitry Andric 697bdd1243dSDimitry Andric { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw 698bdd1243dSDimitry Andric { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw 699bdd1243dSDimitry Andric { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psrqw 7005ffd83dbSDimitry Andric }; 7015ffd83dbSDimitry Andric 702bdd1243dSDimitry Andric if (ST->hasBWI() && Op2Info.isUniform()) 703bdd1243dSDimitry Andric if (const auto *Entry = 704bdd1243dSDimitry Andric CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second)) 705bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 706bdd1243dSDimitry Andric return LT.first * *KindCost; 7075ffd83dbSDimitry Andric 708bdd1243dSDimitry Andric static const CostKindTblEntry AVX512UniformCostTable[] = { 709bdd1243dSDimitry Andric { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split. 710bdd1243dSDimitry Andric { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split. 711bdd1243dSDimitry Andric { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split. 
712bdd1243dSDimitry Andric 713bdd1243dSDimitry Andric { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld 714bdd1243dSDimitry Andric { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld 715bdd1243dSDimitry Andric { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad 716bdd1243dSDimitry Andric 717bdd1243dSDimitry Andric { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq 718bdd1243dSDimitry Andric { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq 719bdd1243dSDimitry Andric { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq 720bdd1243dSDimitry Andric { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq 721bdd1243dSDimitry Andric { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq 722bdd1243dSDimitry Andric { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq 723bdd1243dSDimitry Andric { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq 724bdd1243dSDimitry Andric }; 725bdd1243dSDimitry Andric 726bdd1243dSDimitry Andric if (ST->hasAVX512() && Op2Info.isUniform()) 727bdd1243dSDimitry Andric if (const auto *Entry = 728bdd1243dSDimitry Andric CostTableLookup(AVX512UniformCostTable, ISD, LT.second)) 729bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 730bdd1243dSDimitry Andric return LT.first * *KindCost; 731bdd1243dSDimitry Andric 732bdd1243dSDimitry Andric static const CostKindTblEntry AVX2UniformCostTable[] = { 7330b57cec5SDimitry Andric // Uniform splats are cheaper for the following instructions. 734bdd1243dSDimitry Andric { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand. 735bdd1243dSDimitry Andric { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand. 736bdd1243dSDimitry Andric { ISD::SRA, MVT::v16i8, { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb. 737bdd1243dSDimitry Andric { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand. 738bdd1243dSDimitry Andric { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand. 739bdd1243dSDimitry Andric { ISD::SRA, MVT::v32i8, { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb. 
740fe6060f1SDimitry Andric 741bdd1243dSDimitry Andric { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw. 742bdd1243dSDimitry Andric { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw. 743bdd1243dSDimitry Andric { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw. 744bdd1243dSDimitry Andric { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw. 745bdd1243dSDimitry Andric { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw. 746bdd1243dSDimitry Andric { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw. 747bdd1243dSDimitry Andric 748bdd1243dSDimitry Andric { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld 749bdd1243dSDimitry Andric { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld 750bdd1243dSDimitry Andric { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad 751bdd1243dSDimitry Andric { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld 752bdd1243dSDimitry Andric { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld 753bdd1243dSDimitry Andric { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad 754bdd1243dSDimitry Andric 755bdd1243dSDimitry Andric { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq 756bdd1243dSDimitry Andric { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq 757bdd1243dSDimitry Andric { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle. 758bdd1243dSDimitry Andric { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq 759bdd1243dSDimitry Andric { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq 760bdd1243dSDimitry Andric { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle. 
7610b57cec5SDimitry Andric }; 7620b57cec5SDimitry Andric 763bdd1243dSDimitry Andric if (ST->hasAVX2() && Op2Info.isUniform()) 7640b57cec5SDimitry Andric if (const auto *Entry = 7650b57cec5SDimitry Andric CostTableLookup(AVX2UniformCostTable, ISD, LT.second)) 766bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 767bdd1243dSDimitry Andric return LT.first * *KindCost; 7680b57cec5SDimitry Andric 769bdd1243dSDimitry Andric static const CostKindTblEntry AVXUniformCostTable[] = { 770bdd1243dSDimitry Andric { ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand. 771bdd1243dSDimitry Andric { ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand. 772bdd1243dSDimitry Andric { ISD::SRA, MVT::v16i8, { 6, 6, 9,13 } }, // psrlw, pand, pxor, psubb. 773bdd1243dSDimitry Andric { ISD::SHL, MVT::v32i8, { 7, 8,11,14 } }, // psllw + pand + split. 774bdd1243dSDimitry Andric { ISD::SRL, MVT::v32i8, { 7, 9,10,14 } }, // psrlw + pand + split. 775bdd1243dSDimitry Andric { ISD::SRA, MVT::v32i8, { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split. 7760b57cec5SDimitry Andric 777bdd1243dSDimitry Andric { ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw. 778bdd1243dSDimitry Andric { ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw. 779bdd1243dSDimitry Andric { ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw. 780bdd1243dSDimitry Andric { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split. 781bdd1243dSDimitry Andric { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split. 782bdd1243dSDimitry Andric { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split. 7830b57cec5SDimitry Andric 784bdd1243dSDimitry Andric { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld. 785bdd1243dSDimitry Andric { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld. 786bdd1243dSDimitry Andric { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad. 787bdd1243dSDimitry Andric { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split. 
788bdd1243dSDimitry Andric { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split. 789bdd1243dSDimitry Andric { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split. 790bdd1243dSDimitry Andric 791bdd1243dSDimitry Andric { ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq. 792bdd1243dSDimitry Andric { ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq. 793bdd1243dSDimitry Andric { ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle. 794bdd1243dSDimitry Andric { ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split. 795bdd1243dSDimitry Andric { ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split. 796bdd1243dSDimitry Andric { ISD::SRA, MVT::v4i64, { 6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split. 7970b57cec5SDimitry Andric }; 7980b57cec5SDimitry Andric 799bdd1243dSDimitry Andric // XOP has faster vXi8 shifts. 800bdd1243dSDimitry Andric if (ST->hasAVX() && Op2Info.isUniform() && 801bdd1243dSDimitry Andric (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) 802bdd1243dSDimitry Andric if (const auto *Entry = 803bdd1243dSDimitry Andric CostTableLookup(AVXUniformCostTable, ISD, LT.second)) 804bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 805bdd1243dSDimitry Andric return LT.first * *KindCost; 806bdd1243dSDimitry Andric 807bdd1243dSDimitry Andric static const CostKindTblEntry SSE2UniformCostTable[] = { 808bdd1243dSDimitry Andric // Uniform splats are cheaper for the following instructions. 809bdd1243dSDimitry Andric { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand. 810bdd1243dSDimitry Andric { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand. 811bdd1243dSDimitry Andric { ISD::SRA, MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence. 812bdd1243dSDimitry Andric 813bdd1243dSDimitry Andric { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw. 814bdd1243dSDimitry Andric { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw. 815bdd1243dSDimitry Andric { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw. 
816bdd1243dSDimitry Andric 817bdd1243dSDimitry Andric { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld 818bdd1243dSDimitry Andric { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld. 819bdd1243dSDimitry Andric { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad. 820bdd1243dSDimitry Andric 821bdd1243dSDimitry Andric { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq. 822bdd1243dSDimitry Andric { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq. 823bdd1243dSDimitry Andric { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub. 824bdd1243dSDimitry Andric }; 825bdd1243dSDimitry Andric 826bdd1243dSDimitry Andric if (ST->hasSSE2() && Op2Info.isUniform() && 827bdd1243dSDimitry Andric (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) 8280b57cec5SDimitry Andric if (const auto *Entry = 8290b57cec5SDimitry Andric CostTableLookup(SSE2UniformCostTable, ISD, LT.second)) 830bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 831bdd1243dSDimitry Andric return LT.first * *KindCost; 8320b57cec5SDimitry Andric 833bdd1243dSDimitry Andric static const CostKindTblEntry AVX512DQCostTable[] = { 834bdd1243dSDimitry Andric { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq 835bdd1243dSDimitry Andric { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq 836bdd1243dSDimitry Andric { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } } // pmullq 8370b57cec5SDimitry Andric }; 8380b57cec5SDimitry Andric 8390b57cec5SDimitry Andric // Look for AVX512DQ lowering tricks for custom cases. 8400b57cec5SDimitry Andric if (ST->hasDQI()) 8410b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second)) 842bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 843bdd1243dSDimitry Andric return LT.first * *KindCost; 8440b57cec5SDimitry Andric 845bdd1243dSDimitry Andric static const CostKindTblEntry AVX512BWCostTable[] = { 846bdd1243dSDimitry Andric { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence. 
847bdd1243dSDimitry Andric { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence. 848bdd1243dSDimitry Andric { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence. 849bdd1243dSDimitry Andric { ISD::SHL, MVT::v32i8, { 4, 23,11,16 } }, // extend/vpsllvw/pack sequence. 850bdd1243dSDimitry Andric { ISD::SRL, MVT::v32i8, { 4, 30,12,18 } }, // extend/vpsrlvw/pack sequence. 851bdd1243dSDimitry Andric { ISD::SRA, MVT::v32i8, { 6, 13,24,30 } }, // extend/vpsravw/pack sequence. 852bdd1243dSDimitry Andric { ISD::SHL, MVT::v64i8, { 6, 19,13,15 } }, // extend/vpsllvw/pack sequence. 853bdd1243dSDimitry Andric { ISD::SRL, MVT::v64i8, { 7, 27,15,18 } }, // extend/vpsrlvw/pack sequence. 854bdd1243dSDimitry Andric { ISD::SRA, MVT::v64i8, { 15, 15,30,30 } }, // extend/vpsravw/pack sequence. 855bdd1243dSDimitry Andric 856bdd1243dSDimitry Andric { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw 857bdd1243dSDimitry Andric { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw 858bdd1243dSDimitry Andric { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw 859bdd1243dSDimitry Andric { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw 860bdd1243dSDimitry Andric { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw 861bdd1243dSDimitry Andric { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw 862bdd1243dSDimitry Andric { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw 863bdd1243dSDimitry Andric { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw 864bdd1243dSDimitry Andric { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw 865bdd1243dSDimitry Andric 866bdd1243dSDimitry Andric { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb 867bdd1243dSDimitry Andric { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw 868bdd1243dSDimitry Andric 869bdd1243dSDimitry Andric { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb 870bdd1243dSDimitry Andric { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw 871bdd1243dSDimitry Andric { ISD::ADD, MVT::v8i32, { 1, 
1, 1, 1 } }, // paddd 872bdd1243dSDimitry Andric { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq 873bdd1243dSDimitry Andric 874bdd1243dSDimitry Andric { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb 875bdd1243dSDimitry Andric { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw 876bdd1243dSDimitry Andric 877*0fca6ea1SDimitry Andric { ISD::MUL, MVT::v16i8, { 4, 12, 4, 5 } }, // extend/pmullw/trunc 878*0fca6ea1SDimitry Andric { ISD::MUL, MVT::v32i8, { 3, 10, 7,10 } }, // pmaddubsw 879*0fca6ea1SDimitry Andric { ISD::MUL, MVT::v64i8, { 3, 11, 7,10 } }, // pmaddubsw 880bdd1243dSDimitry Andric { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw 881bdd1243dSDimitry Andric 882bdd1243dSDimitry Andric { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb 883bdd1243dSDimitry Andric { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw 884bdd1243dSDimitry Andric { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd 885bdd1243dSDimitry Andric { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq 8860b57cec5SDimitry Andric }; 8870b57cec5SDimitry Andric 8880b57cec5SDimitry Andric // Look for AVX512BW lowering tricks for custom cases. 8890b57cec5SDimitry Andric if (ST->hasBWI()) 8900b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second)) 891bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 892bdd1243dSDimitry Andric return LT.first * *KindCost; 8930b57cec5SDimitry Andric 894bdd1243dSDimitry Andric static const CostKindTblEntry AVX512CostTable[] = { 895bdd1243dSDimitry Andric { ISD::SHL, MVT::v64i8, { 15, 19,27,33 } }, // vpblendv+split sequence. 896bdd1243dSDimitry Andric { ISD::SRL, MVT::v64i8, { 15, 19,30,36 } }, // vpblendv+split sequence. 897bdd1243dSDimitry Andric { ISD::SRA, MVT::v64i8, { 37, 37,51,63 } }, // vpblendv+split sequence. 8980b57cec5SDimitry Andric 899bdd1243dSDimitry Andric { ISD::SHL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence. 
900bdd1243dSDimitry Andric { ISD::SRL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence. 901bdd1243dSDimitry Andric { ISD::SRA, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence. 9020b57cec5SDimitry Andric 903bdd1243dSDimitry Andric { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, 904bdd1243dSDimitry Andric { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, 905bdd1243dSDimitry Andric { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, 906bdd1243dSDimitry Andric { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, 907bdd1243dSDimitry Andric { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, 908bdd1243dSDimitry Andric { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, 909bdd1243dSDimitry Andric { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, 910bdd1243dSDimitry Andric { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, 911bdd1243dSDimitry Andric { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, 9120b57cec5SDimitry Andric 913bdd1243dSDimitry Andric { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, 914bdd1243dSDimitry Andric { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, 915bdd1243dSDimitry Andric { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, 916bdd1243dSDimitry Andric { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, 917bdd1243dSDimitry Andric { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, 918bdd1243dSDimitry Andric { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, 919bdd1243dSDimitry Andric { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, 920bdd1243dSDimitry Andric { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, 921bdd1243dSDimitry Andric { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, 9220b57cec5SDimitry Andric 923bdd1243dSDimitry Andric { ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split 924bdd1243dSDimitry Andric { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split 9250b57cec5SDimitry Andric 926bdd1243dSDimitry Andric { ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split 927bdd1243dSDimitry Andric { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split 928bdd1243dSDimitry Andric 929bdd1243dSDimitry Andric { ISD::AND, MVT::v32i8, 
{ 1, 1, 1, 1 } }, 930bdd1243dSDimitry Andric { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } }, 931bdd1243dSDimitry Andric { ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } }, 932bdd1243dSDimitry Andric { ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } }, 933bdd1243dSDimitry Andric 934bdd1243dSDimitry Andric { ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } }, 935bdd1243dSDimitry Andric { ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } }, 936bdd1243dSDimitry Andric { ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } }, 937bdd1243dSDimitry Andric { ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } }, 938bdd1243dSDimitry Andric 939bdd1243dSDimitry Andric { ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } }, 940bdd1243dSDimitry Andric { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } }, 941bdd1243dSDimitry Andric { ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } }, 942bdd1243dSDimitry Andric { ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } }, 943bdd1243dSDimitry Andric 944bdd1243dSDimitry Andric { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org) 945bdd1243dSDimitry Andric { ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org) 946bdd1243dSDimitry Andric { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org) 947bdd1243dSDimitry Andric { ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add 948bdd1243dSDimitry Andric { ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/ 949bdd1243dSDimitry Andric 95006c3fb27SDimitry Andric { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } }, 95106c3fb27SDimitry Andric 952bdd1243dSDimitry Andric { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/ 953bdd1243dSDimitry Andric { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 954bdd1243dSDimitry Andric { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 955bdd1243dSDimitry Andric { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 956bdd1243dSDimitry Andric { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // 
Skylake from http://www.agner.org/ 957bdd1243dSDimitry Andric { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 958bdd1243dSDimitry Andric { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 959bdd1243dSDimitry Andric { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 960bdd1243dSDimitry Andric { ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 961bdd1243dSDimitry Andric 962bdd1243dSDimitry Andric { ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/ 963bdd1243dSDimitry Andric { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/ 964bdd1243dSDimitry Andric { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/ 965bdd1243dSDimitry Andric { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/ 966bdd1243dSDimitry Andric 967bdd1243dSDimitry Andric { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/ 968bdd1243dSDimitry Andric { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 969bdd1243dSDimitry Andric { ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 970bdd1243dSDimitry Andric { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 971bdd1243dSDimitry Andric { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 972bdd1243dSDimitry Andric { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 973bdd1243dSDimitry Andric { ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 974bdd1243dSDimitry Andric { ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 975bdd1243dSDimitry Andric { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 976bdd1243dSDimitry Andric 977bdd1243dSDimitry Andric { ISD::FDIV, 
MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/ 978bdd1243dSDimitry Andric { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/ 979bdd1243dSDimitry Andric { ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/ 980bdd1243dSDimitry Andric { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/ 9810b57cec5SDimitry Andric }; 9820b57cec5SDimitry Andric 9830b57cec5SDimitry Andric if (ST->hasAVX512()) 9840b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second)) 985bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 986bdd1243dSDimitry Andric return LT.first * *KindCost; 9870b57cec5SDimitry Andric 988bdd1243dSDimitry Andric static const CostKindTblEntry AVX2ShiftCostTable[] = { 989fe6060f1SDimitry Andric // Shifts on vXi64/vXi32 on AVX2 are legal even though we declare to 9900b57cec5SDimitry Andric // customize them to detect the cases where shift amount is a scalar one. 
991bdd1243dSDimitry Andric { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org) 992bdd1243dSDimitry Andric { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org) 993bdd1243dSDimitry Andric { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org) 994bdd1243dSDimitry Andric { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org) 995bdd1243dSDimitry Andric { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org) 996bdd1243dSDimitry Andric { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org) 997bdd1243dSDimitry Andric { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org) 998bdd1243dSDimitry Andric { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org) 999bdd1243dSDimitry Andric { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org) 1000bdd1243dSDimitry Andric { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org) 10010b57cec5SDimitry Andric }; 10020b57cec5SDimitry Andric 10035ffd83dbSDimitry Andric if (ST->hasAVX512()) { 1004bdd1243dSDimitry Andric if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant()) 10055ffd83dbSDimitry Andric // On AVX512, a packed v32i16 shift left by a constant build_vector 10065ffd83dbSDimitry Andric // is lowered into a vector multiply (vpmullw). 10075ffd83dbSDimitry Andric return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, 1008bdd1243dSDimitry Andric Op1Info.getNoProps(), Op2Info.getNoProps()); 10095ffd83dbSDimitry Andric } 10105ffd83dbSDimitry Andric 1011fe6060f1SDimitry Andric // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts). 
1012fe6060f1SDimitry Andric if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) { 10130b57cec5SDimitry Andric if (ISD == ISD::SHL && LT.second == MVT::v16i16 && 1014bdd1243dSDimitry Andric Op2Info.isConstant()) 10150b57cec5SDimitry Andric // On AVX2, a packed v16i16 shift left by a constant build_vector 10160b57cec5SDimitry Andric // is lowered into a vector multiply (vpmullw). 10175ffd83dbSDimitry Andric return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, 1018bdd1243dSDimitry Andric Op1Info.getNoProps(), Op2Info.getNoProps()); 10190b57cec5SDimitry Andric 10200b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second)) 1021bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 1022bdd1243dSDimitry Andric return LT.first * *KindCost; 10230b57cec5SDimitry Andric } 10240b57cec5SDimitry Andric 1025bdd1243dSDimitry Andric static const CostKindTblEntry XOPShiftCostTable[] = { 10260b57cec5SDimitry Andric // 128bit shifts take 1cy, but right shifts require negation beforehand. 1027bdd1243dSDimitry Andric { ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } }, 1028bdd1243dSDimitry Andric { ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } }, 1029bdd1243dSDimitry Andric { ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } }, 1030bdd1243dSDimitry Andric { ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } }, 1031bdd1243dSDimitry Andric { ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } }, 1032bdd1243dSDimitry Andric { ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } }, 1033bdd1243dSDimitry Andric { ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } }, 1034bdd1243dSDimitry Andric { ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } }, 1035bdd1243dSDimitry Andric { ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } }, 1036bdd1243dSDimitry Andric { ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } }, 1037bdd1243dSDimitry Andric { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, 1038bdd1243dSDimitry Andric { ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } }, 10390b57cec5SDimitry Andric // 256bit shifts require splitting if AVX2 didn't catch them above. 
1040bdd1243dSDimitry Andric { ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } }, 1041bdd1243dSDimitry Andric { ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } }, 1042bdd1243dSDimitry Andric { ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } }, 1043bdd1243dSDimitry Andric { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } }, 1044bdd1243dSDimitry Andric { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } }, 1045bdd1243dSDimitry Andric { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } }, 1046bdd1243dSDimitry Andric { ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } }, 1047bdd1243dSDimitry Andric { ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } }, 1048bdd1243dSDimitry Andric { ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } }, 1049bdd1243dSDimitry Andric { ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } }, 1050bdd1243dSDimitry Andric { ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } }, 1051bdd1243dSDimitry Andric { ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } }, 10520b57cec5SDimitry Andric }; 10530b57cec5SDimitry Andric 10540b57cec5SDimitry Andric // Look for XOP lowering tricks. 10550b57cec5SDimitry Andric if (ST->hasXOP()) { 10560b57cec5SDimitry Andric // If the right shift is constant then we'll fold the negation so 10570b57cec5SDimitry Andric // it's as cheap as a left shift. 10580b57cec5SDimitry Andric int ShiftISD = ISD; 1059bdd1243dSDimitry Andric if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant()) 10600b57cec5SDimitry Andric ShiftISD = ISD::SHL; 10610b57cec5SDimitry Andric if (const auto *Entry = 10620b57cec5SDimitry Andric CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second)) 1063bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 1064bdd1243dSDimitry Andric return LT.first * *KindCost; 10650b57cec5SDimitry Andric } 10660b57cec5SDimitry Andric 1067bdd1243dSDimitry Andric if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) { 10680b57cec5SDimitry Andric MVT VT = LT.second; 10690b57cec5SDimitry Andric // Vector shift left by non uniform constant can be lowered 10700b57cec5SDimitry Andric // into vector multiply. 
10710b57cec5SDimitry Andric if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) || 10720b57cec5SDimitry Andric ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX())) 10730b57cec5SDimitry Andric ISD = ISD::MUL; 10740b57cec5SDimitry Andric } 10750b57cec5SDimitry Andric 1076bdd1243dSDimitry Andric static const CostKindTblEntry GLMCostTable[] = { 1077bdd1243dSDimitry Andric { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss 1078bdd1243dSDimitry Andric { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps 1079bdd1243dSDimitry Andric { ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd 1080bdd1243dSDimitry Andric { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd 1081bdd1243dSDimitry Andric }; 10820b57cec5SDimitry Andric 1083bdd1243dSDimitry Andric if (ST->useGLMDivSqrtCosts()) 1084bdd1243dSDimitry Andric if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second)) 1085bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 1086bdd1243dSDimitry Andric return LT.first * *KindCost; 10870b57cec5SDimitry Andric 1088bdd1243dSDimitry Andric static const CostKindTblEntry SLMCostTable[] = { 1089bdd1243dSDimitry Andric { ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld 1090bdd1243dSDimitry Andric { ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw 1091bdd1243dSDimitry Andric { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd 1092bdd1243dSDimitry Andric { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss 1093bdd1243dSDimitry Andric { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd 1094bdd1243dSDimitry Andric { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps 1095bdd1243dSDimitry Andric { ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss 1096bdd1243dSDimitry Andric { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps 1097bdd1243dSDimitry Andric { ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd 1098bdd1243dSDimitry Andric { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd 1099bdd1243dSDimitry Andric { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 
} }, // addpd 1100bdd1243dSDimitry Andric { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd 1101bdd1243dSDimitry Andric // v2i64/v4i64 mul is custom lowered as a series of long: 1102bdd1243dSDimitry Andric // multiplies(3), shifts(3) and adds(2) 1103bdd1243dSDimitry Andric // slm muldq version throughput is 2 and addq throughput 4 1104bdd1243dSDimitry Andric // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) + 1105bdd1243dSDimitry Andric // 2X4 (addq throughput) = 17 1106bdd1243dSDimitry Andric { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } }, 1107bdd1243dSDimitry Andric // slm addq\subq throughput is 4 1108bdd1243dSDimitry Andric { ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } }, 1109bdd1243dSDimitry Andric { ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } }, 1110bdd1243dSDimitry Andric }; 11110b57cec5SDimitry Andric 1112bdd1243dSDimitry Andric if (ST->useSLMArithCosts()) 1113bdd1243dSDimitry Andric if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second)) 1114bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 1115bdd1243dSDimitry Andric return LT.first * *KindCost; 11160b57cec5SDimitry Andric 1117bdd1243dSDimitry Andric static const CostKindTblEntry AVX2CostTable[] = { 1118bdd1243dSDimitry Andric { ISD::SHL, MVT::v16i8, { 6, 21,11,16 } }, // vpblendvb sequence. 1119bdd1243dSDimitry Andric { ISD::SHL, MVT::v32i8, { 6, 23,11,22 } }, // vpblendvb sequence. 1120bdd1243dSDimitry Andric { ISD::SHL, MVT::v8i16, { 5, 18, 5,10 } }, // extend/vpsrlvd/pack sequence. 1121bdd1243dSDimitry Andric { ISD::SHL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence. 11220b57cec5SDimitry Andric 1123bdd1243dSDimitry Andric { ISD::SRL, MVT::v16i8, { 6, 27,12,18 } }, // vpblendvb sequence. 1124bdd1243dSDimitry Andric { ISD::SRL, MVT::v32i8, { 8, 30,12,24 } }, // vpblendvb sequence. 1125bdd1243dSDimitry Andric { ISD::SRL, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence. 
1126bdd1243dSDimitry Andric { ISD::SRL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence. 11270b57cec5SDimitry Andric 1128bdd1243dSDimitry Andric { ISD::SRA, MVT::v16i8, { 17, 17,24,30 } }, // vpblendvb sequence. 1129bdd1243dSDimitry Andric { ISD::SRA, MVT::v32i8, { 18, 20,24,43 } }, // vpblendvb sequence. 1130bdd1243dSDimitry Andric { ISD::SRA, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsravd/pack sequence. 1131bdd1243dSDimitry Andric { ISD::SRA, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsravd/pack sequence. 1132bdd1243dSDimitry Andric { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence. 1133bdd1243dSDimitry Andric { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence. 1134bdd1243dSDimitry Andric 1135bdd1243dSDimitry Andric { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb 1136bdd1243dSDimitry Andric { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb 1137bdd1243dSDimitry Andric { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw 1138bdd1243dSDimitry Andric { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw 1139bdd1243dSDimitry Andric { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd 1140bdd1243dSDimitry Andric { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd 1141bdd1243dSDimitry Andric { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq 1142bdd1243dSDimitry Andric { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq 1143bdd1243dSDimitry Andric 114406c3fb27SDimitry Andric { ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack 1145*0fca6ea1SDimitry Andric { ISD::MUL, MVT::v32i8, { 4, 8, 8,16 } }, // pmaddubsw 114606c3fb27SDimitry Andric { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw 1147bdd1243dSDimitry Andric { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld 1148bdd1243dSDimitry Andric { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld 1149bdd1243dSDimitry Andric { ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add 1150bdd1243dSDimitry Andric { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } 
}, // 3*pmuludq/3*shift/2*add 1151bdd1243dSDimitry Andric 115206c3fb27SDimitry Andric { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } }, 115306c3fb27SDimitry Andric 1154bdd1243dSDimitry Andric { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd 1155bdd1243dSDimitry Andric { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps 1156bdd1243dSDimitry Andric 1157bdd1243dSDimitry Andric { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd 1158bdd1243dSDimitry Andric { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss 1159bdd1243dSDimitry Andric { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd 1160bdd1243dSDimitry Andric { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps 1161bdd1243dSDimitry Andric { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd 1162bdd1243dSDimitry Andric { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps 1163bdd1243dSDimitry Andric 1164bdd1243dSDimitry Andric { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd 1165bdd1243dSDimitry Andric { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss 1166bdd1243dSDimitry Andric { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd 1167bdd1243dSDimitry Andric { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps 1168bdd1243dSDimitry Andric { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd 1169bdd1243dSDimitry Andric { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps 1170bdd1243dSDimitry Andric 1171bdd1243dSDimitry Andric { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd 1172bdd1243dSDimitry Andric { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss 1173bdd1243dSDimitry Andric { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd 1174bdd1243dSDimitry Andric { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps 1175bdd1243dSDimitry Andric { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd 1176bdd1243dSDimitry Andric { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps 1177bdd1243dSDimitry Andric 1178bdd1243dSDimitry Andric { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss 1179bdd1243dSDimitry Andric { 
ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps 1180bdd1243dSDimitry Andric { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps 1181bdd1243dSDimitry Andric { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd 1182bdd1243dSDimitry Andric { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd 1183bdd1243dSDimitry Andric { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd 11840b57cec5SDimitry Andric }; 11850b57cec5SDimitry Andric 11860b57cec5SDimitry Andric // Look for AVX2 lowering tricks for custom cases. 11870b57cec5SDimitry Andric if (ST->hasAVX2()) 11880b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second)) 1189bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 1190bdd1243dSDimitry Andric return LT.first * *KindCost; 11910b57cec5SDimitry Andric 1192bdd1243dSDimitry Andric static const CostKindTblEntry AVX1CostTable[] = { 11930b57cec5SDimitry Andric // We don't have to scalarize unsupported ops. We can issue two half-sized 11940b57cec5SDimitry Andric // operations and we only need to extract the upper YMM half. 11950b57cec5SDimitry Andric // Two ops + 1 extract + 1 insert = 4. 
1196*0fca6ea1SDimitry Andric { ISD::MUL, MVT::v32i8, { 10, 11, 18, 19 } }, // pmaddubsw + split 1197*0fca6ea1SDimitry Andric { ISD::MUL, MVT::v16i8, { 5, 6, 8, 12 } }, // 2*pmaddubsw/3*and/psllw/or 1198bdd1243dSDimitry Andric { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split 1199bdd1243dSDimitry Andric { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split 1200bdd1243dSDimitry Andric { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld 1201bdd1243dSDimitry Andric { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } }, 1202fe6060f1SDimitry Andric 1203bdd1243dSDimitry Andric { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps 1204bdd1243dSDimitry Andric { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps 1205bdd1243dSDimitry Andric { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps 1206bdd1243dSDimitry Andric { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps 12070b57cec5SDimitry Andric 1208bdd1243dSDimitry Andric { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps 1209bdd1243dSDimitry Andric { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps 1210bdd1243dSDimitry Andric { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps 1211bdd1243dSDimitry Andric { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps 12120b57cec5SDimitry Andric 1213bdd1243dSDimitry Andric { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps 1214bdd1243dSDimitry Andric { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps 1215bdd1243dSDimitry Andric { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps 1216bdd1243dSDimitry Andric { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps 1217fe6060f1SDimitry Andric 1218bdd1243dSDimitry Andric { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split 1219bdd1243dSDimitry Andric { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split 1220bdd1243dSDimitry Andric { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split 1221bdd1243dSDimitry Andric { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split 1222bdd1243dSDimitry Andric { ISD::SUB, MVT::v8i32, 
{ 4, 2, 5, 6 } }, // psubd + split 1223bdd1243dSDimitry Andric { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split 1224bdd1243dSDimitry Andric { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split 1225bdd1243dSDimitry Andric { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split 1226bdd1243dSDimitry Andric { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq 1227bdd1243dSDimitry Andric { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq 1228fe6060f1SDimitry Andric 1229bdd1243dSDimitry Andric { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence. 1230bdd1243dSDimitry Andric { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split. 1231bdd1243dSDimitry Andric { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence. 1232bdd1243dSDimitry Andric { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split. 1233bdd1243dSDimitry Andric { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld 1234bdd1243dSDimitry Andric { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split 1235bdd1243dSDimitry Andric { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend. 1236bdd1243dSDimitry Andric { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split. 1237fe6060f1SDimitry Andric 1238bdd1243dSDimitry Andric { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence. 1239bdd1243dSDimitry Andric { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split. 1240bdd1243dSDimitry Andric { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence. 1241bdd1243dSDimitry Andric { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split. 1242bdd1243dSDimitry Andric { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend. 1243bdd1243dSDimitry Andric { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split. 
1244bdd1243dSDimitry Andric { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend. 1245bdd1243dSDimitry Andric { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split. 12460b57cec5SDimitry Andric 1247bdd1243dSDimitry Andric { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence. 1248bdd1243dSDimitry Andric { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split. 1249bdd1243dSDimitry Andric { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence. 1250bdd1243dSDimitry Andric { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split. 1251bdd1243dSDimitry Andric { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend. 1252bdd1243dSDimitry Andric { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split. 1253bdd1243dSDimitry Andric { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend. 1254bdd1243dSDimitry Andric { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split. 
1255bdd1243dSDimitry Andric 1256bdd1243dSDimitry Andric { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/ 1257bdd1243dSDimitry Andric { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/ 1258bdd1243dSDimitry Andric 1259bdd1243dSDimitry Andric { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ 1260bdd1243dSDimitry Andric { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ 1261bdd1243dSDimitry Andric { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ 1262bdd1243dSDimitry Andric { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ 1263bdd1243dSDimitry Andric { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/ 1264bdd1243dSDimitry Andric { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/ 1265bdd1243dSDimitry Andric 1266bdd1243dSDimitry Andric { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ 1267bdd1243dSDimitry Andric { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ 1268bdd1243dSDimitry Andric { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ 1269bdd1243dSDimitry Andric { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ 1270bdd1243dSDimitry Andric { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/ 1271bdd1243dSDimitry Andric { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/ 1272bdd1243dSDimitry Andric 1273bdd1243dSDimitry Andric { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/ 1274bdd1243dSDimitry Andric { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/ 1275bdd1243dSDimitry Andric { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/ 1276bdd1243dSDimitry Andric { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 
1 } }, // BTVER2 from http://www.agner.org/ 1277bdd1243dSDimitry Andric { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/ 1278bdd1243dSDimitry Andric { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/ 1279bdd1243dSDimitry Andric 1280bdd1243dSDimitry Andric { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/ 1281bdd1243dSDimitry Andric { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/ 1282bdd1243dSDimitry Andric { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/ 1283bdd1243dSDimitry Andric { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/ 1284bdd1243dSDimitry Andric { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/ 1285bdd1243dSDimitry Andric { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/ 12860b57cec5SDimitry Andric }; 12870b57cec5SDimitry Andric 12880b57cec5SDimitry Andric if (ST->hasAVX()) 12890b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second)) 1290bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 1291bdd1243dSDimitry Andric return LT.first * *KindCost; 12920b57cec5SDimitry Andric 1293bdd1243dSDimitry Andric static const CostKindTblEntry SSE42CostTable[] = { 1294bdd1243dSDimitry Andric { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ 1295bdd1243dSDimitry Andric { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ 1296bdd1243dSDimitry Andric { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ 1297bdd1243dSDimitry Andric { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ 12980b57cec5SDimitry Andric 1299bdd1243dSDimitry Andric { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ 1300bdd1243dSDimitry Andric { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } 
}, // Nehalem from http://www.agner.org/ 1301bdd1243dSDimitry Andric { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ 1302bdd1243dSDimitry Andric { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ 13030b57cec5SDimitry Andric 1304bdd1243dSDimitry Andric { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/ 1305bdd1243dSDimitry Andric { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/ 1306bdd1243dSDimitry Andric { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/ 1307bdd1243dSDimitry Andric { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/ 13080b57cec5SDimitry Andric 1309bdd1243dSDimitry Andric { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/ 1310bdd1243dSDimitry Andric { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/ 1311bdd1243dSDimitry Andric { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/ 1312bdd1243dSDimitry Andric { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/ 1313fe6060f1SDimitry Andric 1314bdd1243dSDimitry Andric { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add 13150b57cec5SDimitry Andric }; 13160b57cec5SDimitry Andric 13170b57cec5SDimitry Andric if (ST->hasSSE42()) 13180b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second)) 1319bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 1320bdd1243dSDimitry Andric return LT.first * *KindCost; 13210b57cec5SDimitry Andric 1322bdd1243dSDimitry Andric static const CostKindTblEntry SSE41CostTable[] = { 1323bdd1243dSDimitry Andric { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence. 1324bdd1243dSDimitry Andric { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence. 
1325bdd1243dSDimitry Andric { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld 13260b57cec5SDimitry Andric 1327bdd1243dSDimitry Andric { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence. 1328bdd1243dSDimitry Andric { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence. 1329bdd1243dSDimitry Andric { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend. 1330bdd1243dSDimitry Andric { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence. 13310b57cec5SDimitry Andric 1332bdd1243dSDimitry Andric { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence. 1333bdd1243dSDimitry Andric { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence. 1334bdd1243dSDimitry Andric { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend. 1335bdd1243dSDimitry Andric { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence. 13360b57cec5SDimitry Andric 1337bdd1243dSDimitry Andric { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org) 13380b57cec5SDimitry Andric }; 13390b57cec5SDimitry Andric 13400b57cec5SDimitry Andric if (ST->hasSSE41()) 13410b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second)) 1342bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 1343bdd1243dSDimitry Andric return LT.first * *KindCost; 13440b57cec5SDimitry Andric 1345*0fca6ea1SDimitry Andric static const CostKindTblEntry SSSE3CostTable[] = { 1346*0fca6ea1SDimitry Andric { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or 1347*0fca6ea1SDimitry Andric }; 1348*0fca6ea1SDimitry Andric 1349*0fca6ea1SDimitry Andric if (ST->hasSSSE3()) 1350*0fca6ea1SDimitry Andric if (const auto *Entry = CostTableLookup(SSSE3CostTable, ISD, LT.second)) 1351*0fca6ea1SDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 1352*0fca6ea1SDimitry Andric return LT.first * *KindCost; 1353*0fca6ea1SDimitry Andric 
1354bdd1243dSDimitry Andric static const CostKindTblEntry SSE2CostTable[] = { 13550b57cec5SDimitry Andric // We don't correctly identify costs of casts because they are marked as 13560b57cec5SDimitry Andric // custom. 1357bdd1243dSDimitry Andric { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence. 1358bdd1243dSDimitry Andric { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence. 1359bdd1243dSDimitry Andric { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq. 1360bdd1243dSDimitry Andric { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence. 13610b57cec5SDimitry Andric 1362bdd1243dSDimitry Andric { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence. 1363bdd1243dSDimitry Andric { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence. 1364bdd1243dSDimitry Andric { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend. 1365bdd1243dSDimitry Andric { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence. 13660b57cec5SDimitry Andric 1367bdd1243dSDimitry Andric { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence. 1368bdd1243dSDimitry Andric { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence. 1369bdd1243dSDimitry Andric { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend. 1370bdd1243dSDimitry Andric { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence. 
13710b57cec5SDimitry Andric 1372bdd1243dSDimitry Andric { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand 1373bdd1243dSDimitry Andric { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand 1374bdd1243dSDimitry Andric { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand 1375bdd1243dSDimitry Andric { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand 13760b57cec5SDimitry Andric 1377bdd1243dSDimitry Andric { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por 1378bdd1243dSDimitry Andric { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por 1379bdd1243dSDimitry Andric { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por 1380bdd1243dSDimitry Andric { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por 13810b57cec5SDimitry Andric 1382bdd1243dSDimitry Andric { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor 1383bdd1243dSDimitry Andric { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor 1384bdd1243dSDimitry Andric { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor 1385bdd1243dSDimitry Andric { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor 1386fe6060f1SDimitry Andric 1387bdd1243dSDimitry Andric { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq 1388bdd1243dSDimitry Andric { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq 13890b57cec5SDimitry Andric 1390*0fca6ea1SDimitry Andric { ISD::MUL, MVT::v16i8, { 6, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack 1391bdd1243dSDimitry Andric { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw 1392bdd1243dSDimitry Andric { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle 139306c3fb27SDimitry Andric { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add 139406c3fb27SDimitry Andric 139506c3fb27SDimitry Andric { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } }, 1396bdd1243dSDimitry Andric 1397bdd1243dSDimitry Andric { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1398bdd1243dSDimitry Andric { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1399bdd1243dSDimitry Andric { 
ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1400bdd1243dSDimitry Andric { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1401bdd1243dSDimitry Andric 1402bdd1243dSDimitry Andric { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1403bdd1243dSDimitry Andric { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1404bdd1243dSDimitry Andric { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1405bdd1243dSDimitry Andric { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1406bdd1243dSDimitry Andric 1407bdd1243dSDimitry Andric { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1408bdd1243dSDimitry Andric { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1409bdd1243dSDimitry Andric { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1410bdd1243dSDimitry Andric 1411bdd1243dSDimitry Andric { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1412bdd1243dSDimitry Andric { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1413bdd1243dSDimitry Andric { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1414bdd1243dSDimitry Andric 1415bdd1243dSDimitry Andric { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1416bdd1243dSDimitry Andric { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/ 14170b57cec5SDimitry Andric }; 14180b57cec5SDimitry Andric 14190b57cec5SDimitry Andric if (ST->hasSSE2()) 14200b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second)) 1421bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 1422bdd1243dSDimitry Andric return LT.first * *KindCost; 14230b57cec5SDimitry Andric 
1424bdd1243dSDimitry Andric static const CostKindTblEntry SSE1CostTable[] = { 1425bdd1243dSDimitry Andric { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/ 1426bdd1243dSDimitry Andric { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/ 14270b57cec5SDimitry Andric 1428bdd1243dSDimitry Andric { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/ 1429bdd1243dSDimitry Andric { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/ 1430fe6060f1SDimitry Andric 1431bdd1243dSDimitry Andric { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/ 1432bdd1243dSDimitry Andric { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/ 14330b57cec5SDimitry Andric 1434bdd1243dSDimitry Andric { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/ 1435bdd1243dSDimitry Andric { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/ 1436bdd1243dSDimitry Andric 1437bdd1243dSDimitry Andric { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/ 1438bdd1243dSDimitry Andric { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/ 1439fe6060f1SDimitry Andric }; 14400b57cec5SDimitry Andric 1441fe6060f1SDimitry Andric if (ST->hasSSE1()) 1442fe6060f1SDimitry Andric if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second)) 1443bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 1444bdd1243dSDimitry Andric return LT.first * *KindCost; 1445fe6060f1SDimitry Andric 1446bdd1243dSDimitry Andric static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets 1447bdd1243dSDimitry Andric { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/ 1448bdd1243dSDimitry Andric { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/ 144906c3fb27SDimitry 
Andric { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } }, 1450fe6060f1SDimitry Andric }; 1451fe6060f1SDimitry Andric 1452fe6060f1SDimitry Andric if (ST->is64Bit()) 1453fe6060f1SDimitry Andric if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second)) 1454bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 1455bdd1243dSDimitry Andric return LT.first * *KindCost; 1456fe6060f1SDimitry Andric 1457bdd1243dSDimitry Andric static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets 1458bdd1243dSDimitry Andric { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/ 1459bdd1243dSDimitry Andric { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/ 1460bdd1243dSDimitry Andric { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/ 14610b57cec5SDimitry Andric 1462bdd1243dSDimitry Andric { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/ 1463bdd1243dSDimitry Andric { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/ 1464bdd1243dSDimitry Andric { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/ 1465bdd1243dSDimitry Andric 146606c3fb27SDimitry Andric { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } }, 146706c3fb27SDimitry Andric { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } }, 146806c3fb27SDimitry Andric { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } }, 146906c3fb27SDimitry Andric 1470bdd1243dSDimitry Andric { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87) 1471bdd1243dSDimitry Andric { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87) 1472bdd1243dSDimitry Andric { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87) 1473bdd1243dSDimitry Andric { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87) 1474bdd1243dSDimitry Andric { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87) 14750b57cec5SDimitry Andric }; 14760b57cec5SDimitry Andric 1477fe6060f1SDimitry Andric if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second)) 1478bdd1243dSDimitry Andric if (auto KindCost = 
Entry->Cost[CostKind]) 1479bdd1243dSDimitry Andric return LT.first * *KindCost; 14800b57cec5SDimitry Andric 14810b57cec5SDimitry Andric // It is not a good idea to vectorize division. We have to scalarize it and 14820b57cec5SDimitry Andric // in the process we will often end up having to spilling regular 14830b57cec5SDimitry Andric // registers. The overhead of division is going to dominate most kernels 14840b57cec5SDimitry Andric // anyways so try hard to prevent vectorization of division - it is 14850b57cec5SDimitry Andric // generally a bad idea. Assume somewhat arbitrarily that we have to be able 14860b57cec5SDimitry Andric // to hide "20 cycles" for each lane. 1487bdd1243dSDimitry Andric if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() && 1488bdd1243dSDimitry Andric (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV || 1489bdd1243dSDimitry Andric ISD == ISD::UREM)) { 1490bdd1243dSDimitry Andric InstructionCost ScalarCost = 1491bdd1243dSDimitry Andric getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind, 1492bdd1243dSDimitry Andric Op1Info.getNoProps(), Op2Info.getNoProps()); 14930b57cec5SDimitry Andric return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost; 14940b57cec5SDimitry Andric } 14950b57cec5SDimitry Andric 1496bdd1243dSDimitry Andric // Handle some basic single instruction code size cases. 
1497bdd1243dSDimitry Andric if (CostKind == TTI::TCK_CodeSize) { 1498bdd1243dSDimitry Andric switch (ISD) { 1499bdd1243dSDimitry Andric case ISD::FADD: 1500bdd1243dSDimitry Andric case ISD::FSUB: 1501bdd1243dSDimitry Andric case ISD::FMUL: 1502bdd1243dSDimitry Andric case ISD::FDIV: 1503bdd1243dSDimitry Andric case ISD::FNEG: 1504bdd1243dSDimitry Andric case ISD::AND: 1505bdd1243dSDimitry Andric case ISD::OR: 1506bdd1243dSDimitry Andric case ISD::XOR: 1507bdd1243dSDimitry Andric return LT.first; 1508bdd1243dSDimitry Andric break; 1509bdd1243dSDimitry Andric } 1510bdd1243dSDimitry Andric } 1511bdd1243dSDimitry Andric 15120b57cec5SDimitry Andric // Fallback to the default implementation. 1513bdd1243dSDimitry Andric return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info, 1514bdd1243dSDimitry Andric Args, CxtI); 15150b57cec5SDimitry Andric } 15160b57cec5SDimitry Andric 1517647cbc5dSDimitry Andric InstructionCost 1518647cbc5dSDimitry Andric X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0, 1519647cbc5dSDimitry Andric unsigned Opcode1, const SmallBitVector &OpcodeMask, 1520647cbc5dSDimitry Andric TTI::TargetCostKind CostKind) const { 1521647cbc5dSDimitry Andric if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) 1522647cbc5dSDimitry Andric return TTI::TCC_Basic; 1523647cbc5dSDimitry Andric return InstructionCost::getInvalid(); 1524647cbc5dSDimitry Andric } 1525647cbc5dSDimitry Andric 1526*0fca6ea1SDimitry Andric InstructionCost X86TTIImpl::getShuffleCost( 1527*0fca6ea1SDimitry Andric TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef<int> Mask, 1528*0fca6ea1SDimitry Andric TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, 1529*0fca6ea1SDimitry Andric ArrayRef<const Value *> Args, const Instruction *CxtI) { 15300b57cec5SDimitry Andric // 64-bit packed float vectors (v2f32) are widened to type v4f32. 15318bcb0991SDimitry Andric // 64-bit packed integer vectors (v2i32) are widened to type v4i32. 
1532bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp); 15330b57cec5SDimitry Andric 15345f757f3fSDimitry Andric Kind = improveShuffleKindFromMask(Kind, Mask, BaseTp, Index, SubTp); 1535bdd1243dSDimitry Andric 1536*0fca6ea1SDimitry Andric // Recognize a basic concat_vector shuffle. 1537*0fca6ea1SDimitry Andric if (Kind == TTI::SK_PermuteTwoSrc && 1538*0fca6ea1SDimitry Andric Mask.size() == (2 * BaseTp->getElementCount().getKnownMinValue()) && 1539*0fca6ea1SDimitry Andric ShuffleVectorInst::isIdentityMask(Mask, Mask.size())) 1540*0fca6ea1SDimitry Andric return getShuffleCost(TTI::SK_InsertSubvector, 1541*0fca6ea1SDimitry Andric VectorType::getDoubleElementsVectorType(BaseTp), Mask, 1542*0fca6ea1SDimitry Andric CostKind, Mask.size() / 2, BaseTp); 1543*0fca6ea1SDimitry Andric 15440b57cec5SDimitry Andric // Treat Transpose as 2-op shuffles - there's no difference in lowering. 15450b57cec5SDimitry Andric if (Kind == TTI::SK_Transpose) 15460b57cec5SDimitry Andric Kind = TTI::SK_PermuteTwoSrc; 15470b57cec5SDimitry Andric 1548*0fca6ea1SDimitry Andric if (Kind == TTI::SK_Broadcast) { 15490b57cec5SDimitry Andric // For Broadcasts we are splatting the first element from the first input 15500b57cec5SDimitry Andric // register, so only need to reference that input and all the output 15510b57cec5SDimitry Andric // registers are the same. 15520b57cec5SDimitry Andric LT.first = 1; 15530b57cec5SDimitry Andric 1554*0fca6ea1SDimitry Andric // If we're broadcasting a load then AVX/AVX2 can do this for free. 
1555*0fca6ea1SDimitry Andric using namespace PatternMatch; 1556*0fca6ea1SDimitry Andric if (!Args.empty() && match(Args[0], m_OneUse(m_Load(m_Value()))) && 1557*0fca6ea1SDimitry Andric (ST->hasAVX2() || 1558*0fca6ea1SDimitry Andric (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32))) 1559*0fca6ea1SDimitry Andric return TTI::TCC_Free; 1560*0fca6ea1SDimitry Andric } 1561*0fca6ea1SDimitry Andric 15625f757f3fSDimitry Andric // Treat <X x bfloat> shuffles as <X x half>. 15635f757f3fSDimitry Andric if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16) 15645f757f3fSDimitry Andric LT.second = LT.second.changeVectorElementType(MVT::f16); 15655f757f3fSDimitry Andric 15660b57cec5SDimitry Andric // Subvector extractions are free if they start at the beginning of a 15670b57cec5SDimitry Andric // vector and cheap if the subvectors are aligned. 15680b57cec5SDimitry Andric if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) { 15690b57cec5SDimitry Andric int NumElts = LT.second.getVectorNumElements(); 15700b57cec5SDimitry Andric if ((Index % NumElts) == 0) 15710b57cec5SDimitry Andric return 0; 1572bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp); 15730b57cec5SDimitry Andric if (SubLT.second.isVector()) { 15740b57cec5SDimitry Andric int NumSubElts = SubLT.second.getVectorNumElements(); 15750b57cec5SDimitry Andric if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) 15760b57cec5SDimitry Andric return SubLT.first; 15778bcb0991SDimitry Andric // Handle some cases for widening legalization. For now we only handle 15788bcb0991SDimitry Andric // cases where the original subvector was naturally aligned and evenly 15798bcb0991SDimitry Andric // fit in its legalized subvector type. 15808bcb0991SDimitry Andric // FIXME: Remove some of the alignment restrictions. 15818bcb0991SDimitry Andric // FIXME: We can use permq for 64-bit or larger extracts from 256-bit 15828bcb0991SDimitry Andric // vectors. 
15835ffd83dbSDimitry Andric int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements(); 15845ffd83dbSDimitry Andric if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 && 15855ffd83dbSDimitry Andric (NumSubElts % OrigSubElts) == 0 && 15868bcb0991SDimitry Andric LT.second.getVectorElementType() == 15878bcb0991SDimitry Andric SubLT.second.getVectorElementType() && 15888bcb0991SDimitry Andric LT.second.getVectorElementType().getSizeInBits() == 15895ffd83dbSDimitry Andric BaseTp->getElementType()->getPrimitiveSizeInBits()) { 15908bcb0991SDimitry Andric assert(NumElts >= NumSubElts && NumElts > OrigSubElts && 15918bcb0991SDimitry Andric "Unexpected number of elements!"); 15925ffd83dbSDimitry Andric auto *VecTy = FixedVectorType::get(BaseTp->getElementType(), 15938bcb0991SDimitry Andric LT.second.getVectorNumElements()); 15945ffd83dbSDimitry Andric auto *SubTy = FixedVectorType::get(BaseTp->getElementType(), 15958bcb0991SDimitry Andric SubLT.second.getVectorNumElements()); 15968bcb0991SDimitry Andric int ExtractIndex = alignDown((Index % NumElts), NumSubElts); 1597bdd1243dSDimitry Andric InstructionCost ExtractCost = 1598bdd1243dSDimitry Andric getShuffleCost(TTI::SK_ExtractSubvector, VecTy, std::nullopt, 1599bdd1243dSDimitry Andric CostKind, ExtractIndex, SubTy); 16008bcb0991SDimitry Andric 16018bcb0991SDimitry Andric // If the original size is 32-bits or more, we can use pshufd. Otherwise 16028bcb0991SDimitry Andric // if we have SSSE3 we can use pshufb. 
16038bcb0991SDimitry Andric if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3()) 16048bcb0991SDimitry Andric return ExtractCost + 1; // pshufd or pshufb 16058bcb0991SDimitry Andric 16068bcb0991SDimitry Andric assert(SubTp->getPrimitiveSizeInBits() == 16 && 16078bcb0991SDimitry Andric "Unexpected vector size"); 16088bcb0991SDimitry Andric 16098bcb0991SDimitry Andric return ExtractCost + 2; // worst case pshufhw + pshufd 16108bcb0991SDimitry Andric } 16110b57cec5SDimitry Andric } 1612*0fca6ea1SDimitry Andric // If the extract subvector is not optimal, treat it as single op shuffle. 1613*0fca6ea1SDimitry Andric Kind = TTI::SK_PermuteSingleSrc; 16140b57cec5SDimitry Andric } 16150b57cec5SDimitry Andric 1616fe6060f1SDimitry Andric // Subvector insertions are cheap if the subvectors are aligned. 1617fe6060f1SDimitry Andric // Note that in general, the insertion starting at the beginning of a vector 1618fe6060f1SDimitry Andric // isn't free, because we need to preserve the rest of the wide vector. 1619fe6060f1SDimitry Andric if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) { 1620fe6060f1SDimitry Andric int NumElts = LT.second.getVectorNumElements(); 1621bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp); 1622fe6060f1SDimitry Andric if (SubLT.second.isVector()) { 1623fe6060f1SDimitry Andric int NumSubElts = SubLT.second.getVectorNumElements(); 1624fe6060f1SDimitry Andric if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) 1625fe6060f1SDimitry Andric return SubLT.first; 1626fe6060f1SDimitry Andric } 1627349cc55cSDimitry Andric 1628349cc55cSDimitry Andric // If the insertion isn't aligned, treat it like a 2-op shuffle. 
1629349cc55cSDimitry Andric Kind = TTI::SK_PermuteTwoSrc; 1630fe6060f1SDimitry Andric } 1631fe6060f1SDimitry Andric 16325ffd83dbSDimitry Andric // Handle some common (illegal) sub-vector types as they are often very cheap 16335ffd83dbSDimitry Andric // to shuffle even on targets without PSHUFB. 16345ffd83dbSDimitry Andric EVT VT = TLI->getValueType(DL, BaseTp); 16355ffd83dbSDimitry Andric if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 && 16365ffd83dbSDimitry Andric !ST->hasSSSE3()) { 16375ffd83dbSDimitry Andric static const CostTblEntry SSE2SubVectorShuffleTbl[] = { 16385ffd83dbSDimitry Andric {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw 16395ffd83dbSDimitry Andric {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw 16405ffd83dbSDimitry Andric {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw 16415ffd83dbSDimitry Andric {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw 16425ffd83dbSDimitry Andric {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck 16435ffd83dbSDimitry Andric 16445ffd83dbSDimitry Andric {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw 16455ffd83dbSDimitry Andric {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw 16465ffd83dbSDimitry Andric {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus 16475ffd83dbSDimitry Andric {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck 16485ffd83dbSDimitry Andric 1649bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v4i16, 2}, // punpck+psrldq 1650bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v2i16, 2}, // punpck+psrldq 1651bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v4i8, 2}, // punpck+psrldq 1652bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v2i8, 2}, // punpck+psrldq 1653bdd1243dSDimitry Andric 16545ffd83dbSDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw 16555ffd83dbSDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw 16565ffd83dbSDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw 16575ffd83dbSDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // 
punpck/pshuflw 16585ffd83dbSDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck 16595ffd83dbSDimitry Andric 16605ffd83dbSDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw 16615ffd83dbSDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw 16625ffd83dbSDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw 16635ffd83dbSDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw 16645ffd83dbSDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck 16655ffd83dbSDimitry Andric }; 16665ffd83dbSDimitry Andric 16675ffd83dbSDimitry Andric if (ST->hasSSE2()) 16685ffd83dbSDimitry Andric if (const auto *Entry = 16695ffd83dbSDimitry Andric CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT())) 16705ffd83dbSDimitry Andric return Entry->Cost; 16715ffd83dbSDimitry Andric } 16725ffd83dbSDimitry Andric 16730b57cec5SDimitry Andric // We are going to permute multiple sources and the result will be in multiple 16740b57cec5SDimitry Andric // destinations. Providing an accurate cost only for splits where the element 16750b57cec5SDimitry Andric // type remains the same. 
16760b57cec5SDimitry Andric if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) { 16770b57cec5SDimitry Andric MVT LegalVT = LT.second; 16780b57cec5SDimitry Andric if (LegalVT.isVector() && 16790b57cec5SDimitry Andric LegalVT.getVectorElementType().getSizeInBits() == 16805ffd83dbSDimitry Andric BaseTp->getElementType()->getPrimitiveSizeInBits() && 16815ffd83dbSDimitry Andric LegalVT.getVectorNumElements() < 16825ffd83dbSDimitry Andric cast<FixedVectorType>(BaseTp)->getNumElements()) { 16835ffd83dbSDimitry Andric unsigned VecTySize = DL.getTypeStoreSize(BaseTp); 16840b57cec5SDimitry Andric unsigned LegalVTSize = LegalVT.getStoreSize(); 16850b57cec5SDimitry Andric // Number of source vectors after legalization: 16860b57cec5SDimitry Andric unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize; 16870b57cec5SDimitry Andric // Number of destination vectors after legalization: 1688fe6060f1SDimitry Andric InstructionCost NumOfDests = LT.first; 16890b57cec5SDimitry Andric 16905ffd83dbSDimitry Andric auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(), 16910b57cec5SDimitry Andric LegalVT.getVectorNumElements()); 16920b57cec5SDimitry Andric 169381ad6265SDimitry Andric if (!Mask.empty() && NumOfDests.isValid()) { 169481ad6265SDimitry Andric // Try to perform better estimation of the permutation. 169581ad6265SDimitry Andric // 1. Split the source/destination vectors into real registers. 169681ad6265SDimitry Andric // 2. Do the mask analysis to identify which real registers are 169781ad6265SDimitry Andric // permuted. If more than 1 source registers are used for the 169881ad6265SDimitry Andric // destination register building, the cost for this destination register 169981ad6265SDimitry Andric // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one 170081ad6265SDimitry Andric // source register is used, build mask and calculate the cost as a cost 170181ad6265SDimitry Andric // of PermuteSingleSrc. 
170281ad6265SDimitry Andric // Also, for the single register permute we try to identify if the 170381ad6265SDimitry Andric // destination register is just a copy of the source register or the 170481ad6265SDimitry Andric // copy of the previous destination register (the cost is 170581ad6265SDimitry Andric // TTI::TCC_Basic). If the source register is just reused, the cost for 170681ad6265SDimitry Andric // this operation is 0. 17075f757f3fSDimitry Andric NumOfDests = 17085f757f3fSDimitry Andric getTypeLegalizationCost( 17095f757f3fSDimitry Andric FixedVectorType::get(BaseTp->getElementType(), Mask.size())) 17105f757f3fSDimitry Andric .first; 171181ad6265SDimitry Andric unsigned E = *NumOfDests.getValue(); 171281ad6265SDimitry Andric unsigned NormalizedVF = 171381ad6265SDimitry Andric LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E); 171481ad6265SDimitry Andric unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements(); 171581ad6265SDimitry Andric unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements(); 171606c3fb27SDimitry Andric SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem); 171781ad6265SDimitry Andric copy(Mask, NormalizedMask.begin()); 171881ad6265SDimitry Andric unsigned PrevSrcReg = 0; 171981ad6265SDimitry Andric ArrayRef<int> PrevRegMask; 172081ad6265SDimitry Andric InstructionCost Cost = 0; 172181ad6265SDimitry Andric processShuffleMasks( 172281ad6265SDimitry Andric NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {}, 1723bdd1243dSDimitry Andric [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask, 172481ad6265SDimitry Andric &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) { 17255f757f3fSDimitry Andric if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) { 172681ad6265SDimitry Andric // Check if the previous register can be just copied to the next 172781ad6265SDimitry Andric // one. 
172881ad6265SDimitry Andric if (PrevRegMask.empty() || PrevSrcReg != SrcReg || 172981ad6265SDimitry Andric PrevRegMask != RegMask) 173081ad6265SDimitry Andric Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy, 1731bdd1243dSDimitry Andric RegMask, CostKind, 0, nullptr); 173281ad6265SDimitry Andric else 173381ad6265SDimitry Andric // Just a copy of previous destination register. 173481ad6265SDimitry Andric Cost += TTI::TCC_Basic; 173581ad6265SDimitry Andric return; 173681ad6265SDimitry Andric } 173781ad6265SDimitry Andric if (SrcReg != DestReg && 173806c3fb27SDimitry Andric any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) { 173981ad6265SDimitry Andric // Just a copy of the source register. 174081ad6265SDimitry Andric Cost += TTI::TCC_Basic; 174181ad6265SDimitry Andric } 174281ad6265SDimitry Andric PrevSrcReg = SrcReg; 174381ad6265SDimitry Andric PrevRegMask = RegMask; 174481ad6265SDimitry Andric }, 1745bdd1243dSDimitry Andric [this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask, 174681ad6265SDimitry Andric unsigned /*Unused*/, 174781ad6265SDimitry Andric unsigned /*Unused*/) { 174881ad6265SDimitry Andric Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask, 1749bdd1243dSDimitry Andric CostKind, 0, nullptr); 175081ad6265SDimitry Andric }); 175181ad6265SDimitry Andric return Cost; 175281ad6265SDimitry Andric } 175381ad6265SDimitry Andric 1754fe6060f1SDimitry Andric InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; 1755fe6060f1SDimitry Andric return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 1756bdd1243dSDimitry Andric std::nullopt, CostKind, 0, nullptr); 17570b57cec5SDimitry Andric } 17580b57cec5SDimitry Andric 1759bdd1243dSDimitry Andric return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp); 17600b57cec5SDimitry Andric } 17610b57cec5SDimitry Andric 17620b57cec5SDimitry Andric // For 2-input shuffles, we must account for splitting the 2 inputs into many. 
17630b57cec5SDimitry Andric if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) { 17640b57cec5SDimitry Andric // We assume that source and destination have the same vector type. 1765fe6060f1SDimitry Andric InstructionCost NumOfDests = LT.first; 1766fe6060f1SDimitry Andric InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1; 17670b57cec5SDimitry Andric LT.first = NumOfDests * NumOfShufflesPerDest; 17680b57cec5SDimitry Andric } 17690b57cec5SDimitry Andric 17700b57cec5SDimitry Andric static const CostTblEntry AVX512VBMIShuffleTbl[] = { 17710b57cec5SDimitry Andric {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb 17720b57cec5SDimitry Andric {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb 17730b57cec5SDimitry Andric 17740b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb 17750b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb 17760b57cec5SDimitry Andric 17775ffd83dbSDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b 17785ffd83dbSDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b 17795ffd83dbSDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b 17800b57cec5SDimitry Andric }; 17810b57cec5SDimitry Andric 17820b57cec5SDimitry Andric if (ST->hasVBMI()) 17830b57cec5SDimitry Andric if (const auto *Entry = 17840b57cec5SDimitry Andric CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second)) 17850b57cec5SDimitry Andric return LT.first * Entry->Cost; 17860b57cec5SDimitry Andric 17870b57cec5SDimitry Andric static const CostTblEntry AVX512BWShuffleTbl[] = { 17880b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw 1789fcaf7f86SDimitry Andric {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw 17900b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb 17910b57cec5SDimitry Andric 17925ffd83dbSDimitry Andric {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw 1793fcaf7f86SDimitry Andric {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw 17945ffd83dbSDimitry Andric 
{TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw 17950b57cec5SDimitry Andric {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2 17960b57cec5SDimitry Andric 17975ffd83dbSDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw 1798fcaf7f86SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw 17995ffd83dbSDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw 1800fcaf7f86SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw 18010b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16 18020b57cec5SDimitry Andric 18035ffd83dbSDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w 1804fcaf7f86SDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w 18055ffd83dbSDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w 18065ffd83dbSDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w 18070b57cec5SDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1 1808e8d8bef9SDimitry Andric 1809e8d8bef9SDimitry Andric {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw 1810e8d8bef9SDimitry Andric {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb 1811bdd1243dSDimitry Andric 1812bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr 1813bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr 1814bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v64i8, 2}, // vshufi64x2 + palignr 18150b57cec5SDimitry Andric }; 18160b57cec5SDimitry Andric 18170b57cec5SDimitry Andric if (ST->hasBWI()) 18180b57cec5SDimitry Andric if (const auto *Entry = 18190b57cec5SDimitry Andric CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second)) 18200b57cec5SDimitry Andric return LT.first * Entry->Cost; 18210b57cec5SDimitry Andric 1822bdd1243dSDimitry Andric static const CostKindTblEntry AVX512ShuffleTbl[] = { 1823bdd1243dSDimitry Andric {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd 1824bdd1243dSDimitry 
Andric {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss 1825bdd1243dSDimitry Andric {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq 1826bdd1243dSDimitry Andric {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd 1827bdd1243dSDimitry Andric {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw 1828bdd1243dSDimitry Andric {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw 1829bdd1243dSDimitry Andric {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb 18300b57cec5SDimitry Andric 1831bdd1243dSDimitry Andric {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd 1832bdd1243dSDimitry Andric {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps 1833bdd1243dSDimitry Andric {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq 1834bdd1243dSDimitry Andric {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd 1835bdd1243dSDimitry Andric {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca 1836bdd1243dSDimitry Andric {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca 1837bdd1243dSDimitry Andric {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca 18380b57cec5SDimitry Andric 1839bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd 1840bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd 1841bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd 1842bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd 1843bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd 1844bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd 1845bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd 1846bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd 1847bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + 
palignr 1848bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr 1849bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr 18500b57cec5SDimitry Andric 1851bdd1243dSDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd 1852bdd1243dSDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd 1853bdd1243dSDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd 1854bdd1243dSDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps 1855bdd1243dSDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps 1856bdd1243dSDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps 1857bdd1243dSDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq 1858bdd1243dSDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq 1859bdd1243dSDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq 1860bdd1243dSDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd 1861bdd1243dSDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd 1862bdd1243dSDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd 1863bdd1243dSDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb 1864bdd1243dSDimitry Andric 1865bdd1243dSDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd 1866bdd1243dSDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps 1867bdd1243dSDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q 1868bdd1243dSDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d 1869bdd1243dSDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd 1870bdd1243dSDimitry Andric 
{TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps 1871bdd1243dSDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q 1872bdd1243dSDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d 1873bdd1243dSDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd 1874bdd1243dSDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps 1875bdd1243dSDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q 1876bdd1243dSDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d 18775ffd83dbSDimitry Andric 18785ffd83dbSDimitry Andric // FIXME: This just applies the type legalization cost rules above 18795ffd83dbSDimitry Andric // assuming these completely split. 1880bdd1243dSDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } }, 1881bdd1243dSDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } }, 1882bdd1243dSDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } }, 1883bdd1243dSDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } }, 1884bdd1243dSDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } }, 1885bdd1243dSDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } }, 1886e8d8bef9SDimitry Andric 1887bdd1243dSDimitry Andric {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq 1888bdd1243dSDimitry Andric {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq 1889bdd1243dSDimitry Andric {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq 1890bdd1243dSDimitry Andric {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd 1891bdd1243dSDimitry Andric {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps 1892bdd1243dSDimitry Andric {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq 1893bdd1243dSDimitry Andric {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd 
18940b57cec5SDimitry Andric }; 18950b57cec5SDimitry Andric 18960b57cec5SDimitry Andric if (ST->hasAVX512()) 18970b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second)) 1898bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 1899bdd1243dSDimitry Andric return LT.first * *KindCost; 19000b57cec5SDimitry Andric 19010b57cec5SDimitry Andric static const CostTblEntry AVX2ShuffleTbl[] = { 19020b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd 19030b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps 19040b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq 19050b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd 19060b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw 1907fcaf7f86SDimitry Andric {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw 19080b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb 19090b57cec5SDimitry Andric 19100b57cec5SDimitry Andric {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd 19110b57cec5SDimitry Andric {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps 19120b57cec5SDimitry Andric {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq 19130b57cec5SDimitry Andric {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd 19140b57cec5SDimitry Andric {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb 1915fcaf7f86SDimitry Andric {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb 19160b57cec5SDimitry Andric {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb 19170b57cec5SDimitry Andric 19180b57cec5SDimitry Andric {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb 1919fcaf7f86SDimitry Andric {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb 19200b57cec5SDimitry Andric {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb 19210b57cec5SDimitry Andric 1922bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v8i32, 2}, // vperm2i128 + vpalignr 1923bdd1243dSDimitry Andric {TTI::SK_Splice, 
MVT::v8f32, 2}, // vperm2i128 + vpalignr 1924bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v16i16, 2}, // vperm2i128 + vpalignr 1925bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v16f16, 2}, // vperm2i128 + vpalignr 1926bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v32i8, 2}, // vperm2i128 + vpalignr 1927bdd1243dSDimitry Andric 19280b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd 19290b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps 19300b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq 19310b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd 19320b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb 19330b57cec5SDimitry Andric // + vpblendvb 1934fcaf7f86SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb 1935fcaf7f86SDimitry Andric // + vpblendvb 19360b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb 19370b57cec5SDimitry Andric // + vpblendvb 19380b57cec5SDimitry Andric 19390b57cec5SDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd 19400b57cec5SDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps 19410b57cec5SDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd 19420b57cec5SDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd 19430b57cec5SDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb 19440b57cec5SDimitry Andric // + vpblendvb 1945fcaf7f86SDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb 1946fcaf7f86SDimitry Andric // + vpblendvb 19470b57cec5SDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb 19480b57cec5SDimitry Andric // + vpblendvb 19490b57cec5SDimitry Andric }; 19500b57cec5SDimitry Andric 19510b57cec5SDimitry Andric if (ST->hasAVX2()) 
19520b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second)) 19530b57cec5SDimitry Andric return LT.first * Entry->Cost; 19540b57cec5SDimitry Andric 19550b57cec5SDimitry Andric static const CostTblEntry XOPShuffleTbl[] = { 19560b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd 19570b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps 19580b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd 19590b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps 19600b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm 19610b57cec5SDimitry Andric // + vinsertf128 19620b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm 19630b57cec5SDimitry Andric // + vinsertf128 19640b57cec5SDimitry Andric 19650b57cec5SDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm 19660b57cec5SDimitry Andric // + vinsertf128 19670b57cec5SDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm 19680b57cec5SDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm 19690b57cec5SDimitry Andric // + vinsertf128 19700b57cec5SDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm 19710b57cec5SDimitry Andric }; 19720b57cec5SDimitry Andric 19730b57cec5SDimitry Andric if (ST->hasXOP()) 19740b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second)) 19750b57cec5SDimitry Andric return LT.first * Entry->Cost; 19760b57cec5SDimitry Andric 19770b57cec5SDimitry Andric static const CostTblEntry AVX1ShuffleTbl[] = { 19780b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd 19790b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps 19800b57cec5SDimitry Andric 
{TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd 19810b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps 19820b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128 1983fcaf7f86SDimitry Andric {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128 19840b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128 19850b57cec5SDimitry Andric 19860b57cec5SDimitry Andric {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd 19870b57cec5SDimitry Andric {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps 19880b57cec5SDimitry Andric {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd 19890b57cec5SDimitry Andric {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps 19900b57cec5SDimitry Andric {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb 19910b57cec5SDimitry Andric // + vinsertf128 1992fcaf7f86SDimitry Andric {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb 1993fcaf7f86SDimitry Andric // + vinsertf128 19940b57cec5SDimitry Andric {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb 19950b57cec5SDimitry Andric // + vinsertf128 19960b57cec5SDimitry Andric 19970b57cec5SDimitry Andric {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd 19980b57cec5SDimitry Andric {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd 19990b57cec5SDimitry Andric {TTI::SK_Select, MVT::v8i32, 1}, // vblendps 20000b57cec5SDimitry Andric {TTI::SK_Select, MVT::v8f32, 1}, // vblendps 20010b57cec5SDimitry Andric {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor 2002fcaf7f86SDimitry Andric {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor 20030b57cec5SDimitry Andric {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor 20040b57cec5SDimitry Andric 2005bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v4i64, 2}, // vperm2f128 + shufpd 2006bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v4f64, 2}, // vperm2f128 + 
shufpd 2007bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps 2008bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps 2009bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v16i16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128 2010bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v16f16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128 2011bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v32i8, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128 2012bdd1243dSDimitry Andric 20130b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd 20140b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd 20150b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps 20160b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps 20170b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb 20180b57cec5SDimitry Andric // + 2*por + vinsertf128 2019fcaf7f86SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb 2020fcaf7f86SDimitry Andric // + 2*por + vinsertf128 20210b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb 20220b57cec5SDimitry Andric // + 2*por + vinsertf128 20230b57cec5SDimitry Andric 20240b57cec5SDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd 20250b57cec5SDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd 20260b57cec5SDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps 20270b57cec5SDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps 20280b57cec5SDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb 20290b57cec5SDimitry Andric // + 4*por + vinsertf128 2030fcaf7f86SDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, 
// 2*vextractf128 + 8*pshufb 2031fcaf7f86SDimitry Andric // + 4*por + vinsertf128 20320b57cec5SDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb 20330b57cec5SDimitry Andric // + 4*por + vinsertf128 20340b57cec5SDimitry Andric }; 20350b57cec5SDimitry Andric 20360b57cec5SDimitry Andric if (ST->hasAVX()) 20370b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second)) 20380b57cec5SDimitry Andric return LT.first * Entry->Cost; 20390b57cec5SDimitry Andric 20400b57cec5SDimitry Andric static const CostTblEntry SSE41ShuffleTbl[] = { 20410b57cec5SDimitry Andric {TTI::SK_Select, MVT::v2i64, 1}, // pblendw 20420b57cec5SDimitry Andric {TTI::SK_Select, MVT::v2f64, 1}, // movsd 20430b57cec5SDimitry Andric {TTI::SK_Select, MVT::v4i32, 1}, // pblendw 20440b57cec5SDimitry Andric {TTI::SK_Select, MVT::v4f32, 1}, // blendps 20450b57cec5SDimitry Andric {TTI::SK_Select, MVT::v8i16, 1}, // pblendw 2046fcaf7f86SDimitry Andric {TTI::SK_Select, MVT::v8f16, 1}, // pblendw 20470b57cec5SDimitry Andric {TTI::SK_Select, MVT::v16i8, 1} // pblendvb 20480b57cec5SDimitry Andric }; 20490b57cec5SDimitry Andric 20500b57cec5SDimitry Andric if (ST->hasSSE41()) 20510b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second)) 20520b57cec5SDimitry Andric return LT.first * Entry->Cost; 20530b57cec5SDimitry Andric 20540b57cec5SDimitry Andric static const CostTblEntry SSSE3ShuffleTbl[] = { 20550b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb 2056fcaf7f86SDimitry Andric {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb 20570b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb 20580b57cec5SDimitry Andric 20590b57cec5SDimitry Andric {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb 2060fcaf7f86SDimitry Andric {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb 20610b57cec5SDimitry Andric {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb 20620b57cec5SDimitry Andric 20630b57cec5SDimitry Andric 
{TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por 2064fcaf7f86SDimitry Andric {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por 20650b57cec5SDimitry Andric {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por 20660b57cec5SDimitry Andric 2067bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v4i32, 1}, // palignr 2068bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v4f32, 1}, // palignr 2069bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v8i16, 1}, // palignr 2070bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v8f16, 1}, // palignr 2071bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v16i8, 1}, // palignr 2072bdd1243dSDimitry Andric 20730b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb 2074fcaf7f86SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb 20750b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb 20760b57cec5SDimitry Andric 20770b57cec5SDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por 2078fcaf7f86SDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por 20790b57cec5SDimitry Andric {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por 20800b57cec5SDimitry Andric }; 20810b57cec5SDimitry Andric 20820b57cec5SDimitry Andric if (ST->hasSSSE3()) 20830b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second)) 20840b57cec5SDimitry Andric return LT.first * Entry->Cost; 20850b57cec5SDimitry Andric 20860b57cec5SDimitry Andric static const CostTblEntry SSE2ShuffleTbl[] = { 20870b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd 20880b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd 20890b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd 20900b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd 2091fcaf7f86SDimitry Andric {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd 20920b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw 
+ pshufd 20930b57cec5SDimitry Andric 20940b57cec5SDimitry Andric {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd 20950b57cec5SDimitry Andric {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd 20960b57cec5SDimitry Andric {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd 20970b57cec5SDimitry Andric {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd 2098fcaf7f86SDimitry Andric {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd 20990b57cec5SDimitry Andric {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw 21000b57cec5SDimitry Andric // + 2*pshufd + 2*unpck + packus 21010b57cec5SDimitry Andric 21020b57cec5SDimitry Andric {TTI::SK_Select, MVT::v2i64, 1}, // movsd 21030b57cec5SDimitry Andric {TTI::SK_Select, MVT::v2f64, 1}, // movsd 21040b57cec5SDimitry Andric {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps 21050b57cec5SDimitry Andric {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por 2106fcaf7f86SDimitry Andric {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por 21070b57cec5SDimitry Andric {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por 21080b57cec5SDimitry Andric 2109bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v2i64, 1}, // shufpd 2110bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v2f64, 1}, // shufpd 2111bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd} 2112bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v8i16, 3}, // psrldq + psrlldq + por 2113bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v8f16, 3}, // psrldq + psrlldq + por 2114bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v16i8, 3}, // psrldq + psrlldq + por 2115bdd1243dSDimitry Andric 21160b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd 21170b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd 21180b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd 21190b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw 21200b57cec5SDimitry Andric // + 
pshufd/unpck 2121fcaf7f86SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw 2122fcaf7f86SDimitry Andric // + pshufd/unpck 21230b57cec5SDimitry Andric { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw 21240b57cec5SDimitry Andric // + 2*pshufd + 2*unpck + 2*packus 21250b57cec5SDimitry Andric 21260b57cec5SDimitry Andric { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd 21270b57cec5SDimitry Andric { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd 21280b57cec5SDimitry Andric { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd} 21290b57cec5SDimitry Andric { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute 2130fcaf7f86SDimitry Andric { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 }, // blend+permute 21310b57cec5SDimitry Andric { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute 21320b57cec5SDimitry Andric }; 21330b57cec5SDimitry Andric 213481ad6265SDimitry Andric static const CostTblEntry SSE3BroadcastLoadTbl[] = { 213581ad6265SDimitry Andric {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup 213681ad6265SDimitry Andric }; 213781ad6265SDimitry Andric 213881ad6265SDimitry Andric if (ST->hasSSE2()) { 213981ad6265SDimitry Andric bool IsLoad = 214081ad6265SDimitry Andric llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); }); 214181ad6265SDimitry Andric if (ST->hasSSE3() && IsLoad) 214281ad6265SDimitry Andric if (const auto *Entry = 214381ad6265SDimitry Andric CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) { 214481ad6265SDimitry Andric assert(isLegalBroadcastLoad(BaseTp->getElementType(), 214581ad6265SDimitry Andric LT.second.getVectorElementCount()) && 214681ad6265SDimitry Andric "Table entry missing from isLegalBroadcastLoad()"); 214781ad6265SDimitry Andric return LT.first * Entry->Cost; 214881ad6265SDimitry Andric } 214981ad6265SDimitry Andric 21500b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second)) 
21510b57cec5SDimitry Andric return LT.first * Entry->Cost; 215281ad6265SDimitry Andric } 21530b57cec5SDimitry Andric 21540b57cec5SDimitry Andric static const CostTblEntry SSE1ShuffleTbl[] = { 21550b57cec5SDimitry Andric { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps 21560b57cec5SDimitry Andric { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps 21570b57cec5SDimitry Andric { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps 2158bdd1243dSDimitry Andric { TTI::SK_Splice, MVT::v4f32, 2 }, // 2*shufps 21590b57cec5SDimitry Andric { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps 21600b57cec5SDimitry Andric { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps 21610b57cec5SDimitry Andric }; 21620b57cec5SDimitry Andric 21630b57cec5SDimitry Andric if (ST->hasSSE1()) 21640b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second)) 21650b57cec5SDimitry Andric return LT.first * Entry->Cost; 21660b57cec5SDimitry Andric 2167bdd1243dSDimitry Andric return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp); 21680b57cec5SDimitry Andric } 21690b57cec5SDimitry Andric 2170fe6060f1SDimitry Andric InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, 2171fe6060f1SDimitry Andric Type *Src, 2172e8d8bef9SDimitry Andric TTI::CastContextHint CCH, 21735ffd83dbSDimitry Andric TTI::TargetCostKind CostKind, 21740b57cec5SDimitry Andric const Instruction *I) { 21750b57cec5SDimitry Andric int ISD = TLI->InstructionOpcodeToISD(Opcode); 21760b57cec5SDimitry Andric assert(ISD && "Invalid opcode"); 21770b57cec5SDimitry Andric 2178fe6060f1SDimitry Andric // The cost tables include both specific, custom (non-legal) src/dst type 2179fe6060f1SDimitry Andric // conversions and generic, legalized types. We test for customs first, before 2180fe6060f1SDimitry Andric // falling back to legalization. 
21810b57cec5SDimitry Andric // FIXME: Need a better design of the cost table to handle non-simple types of 21820b57cec5SDimitry Andric // potential massive combinations (elem_num x src_type x dst_type). 2183*0fca6ea1SDimitry Andric static const TypeConversionCostKindTblEntry AVX512BWConversionTbl[]{ 2184*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } }, 2185*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } }, 21860b57cec5SDimitry Andric 21870b57cec5SDimitry Andric // Mask sign extend has an instruction. 2188*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } }, 2189*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } }, 2190*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } }, 2191*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } }, 2192*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } }, 2193*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } }, 2194*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } }, 2195*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } }, 2196*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } }, 2197*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } }, 2198*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } }, 2199*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } }, 2200*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } }, 2201*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } }, 2202*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, { 1, 1, 1, 1 } }, 2203*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v64i8, 
MVT::v64i1, { 1, 1, 1, 1 } }, 2204*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, { 1, 1, 1, 1 } }, 22050b57cec5SDimitry Andric 22065ffd83dbSDimitry Andric // Mask zero extend is a sext + shift. 2207*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } }, 2208*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } }, 2209*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } }, 2210*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } }, 2211*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } }, 2212*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } }, 2213*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } }, 2214*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } }, 2215*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } }, 2216*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } }, 2217*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } }, 2218*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } }, 2219*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } }, 2220*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } }, 2221*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, { 2, 1, 1, 1 } }, 2222*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, { 2, 1, 1, 1 } }, 2223*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, { 2, 1, 1, 1 } }, 22244824e7fdSDimitry Andric 2225*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } }, 2226*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } }, 2227*0fca6ea1SDimitry Andric { ISD::TRUNCATE, 
MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } }, 2228*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } }, 2229*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } }, 2230*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } }, 2231*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } }, 2232*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } }, 2233*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } }, 2234*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } }, 2235*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } }, 2236*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } }, 2237*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } }, 2238*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } }, 2239*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, { 2, 1, 1, 1 } }, 2240*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, { 2, 1, 1, 1 } }, 2241*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, { 2, 1, 1, 1 } }, 22425ffd83dbSDimitry Andric 2243*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 2, 1, 1, 1 } }, 2244*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // widen to zmm 2245*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, { 2, 1, 1, 1 } }, // vpmovwb 2246*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, { 2, 1, 1, 1 } }, // vpmovwb 2247*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, { 2, 1, 1, 1 } }, // vpmovwb 22480b57cec5SDimitry Andric }; 22490b57cec5SDimitry Andric 2250*0fca6ea1SDimitry Andric static const TypeConversionCostKindTblEntry AVX512DQConversionTbl[] = { 22514824e7fdSDimitry Andric // Mask sign extend has an 
instruction. 2252*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, 2253*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } }, 2254*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, 2255*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, 2256*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, 2257*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, { 1, 1, 1, 1 } }, 2258*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } }, 2259*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } }, 22604824e7fdSDimitry Andric 22614824e7fdSDimitry Andric // Mask zero extend is a sext + shift. 2262*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1, } }, 2263*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1, } }, 2264*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1, } }, 2265*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1, } }, 2266*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1, } }, 2267*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, { 2, 1, 1, 1, } }, 2268*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1, } }, 2269*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1, } }, 22704824e7fdSDimitry Andric 2271*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, 2272*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } }, 2273*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, 2274*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, 2275*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i1, 
MVT::v8i32, { 2, 1, 1, 1 } }, 2276*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } }, 2277*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } }, 2278*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, { 2, 1, 1, 1 } }, 22794824e7fdSDimitry Andric 2280*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } }, 2281*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } }, 22820b57cec5SDimitry Andric 2283*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } }, 2284*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } }, 22850b57cec5SDimitry Andric 2286*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } }, 2287*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } }, 22880b57cec5SDimitry Andric 2289*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } }, 2290*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } }, 22910b57cec5SDimitry Andric }; 22920b57cec5SDimitry Andric 22930b57cec5SDimitry Andric // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and 22940b57cec5SDimitry Andric // 256-bit wide vectors. 
22950b57cec5SDimitry Andric 2296*0fca6ea1SDimitry Andric static const TypeConversionCostKindTblEntry AVX512FConversionTbl[] = { 2297*0fca6ea1SDimitry Andric { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 1, 1, 1, 1 } }, 2298*0fca6ea1SDimitry Andric { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, { 3, 1, 1, 1 } }, 2299*0fca6ea1SDimitry Andric { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, { 4, 1, 1, 1 } }, // 2*vcvtps2pd+vextractf64x4 2300*0fca6ea1SDimitry Andric { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 1, 1, 1, 1 } }, 23010b57cec5SDimitry Andric 2302*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd 2303*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd 2304*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd 2305*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd 2306*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq 2307*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq 2308*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq 2309*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd 2310*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd 2311*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd 2312*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd 2313*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd 2314*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq 
2315*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq 2316*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq 2317*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, { 2, 1, 1, 1 } }, // vpmovdb 2318*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, { 2, 1, 1, 1 } }, // vpmovdb 2319*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb 2320*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb 2321*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb 2322*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw 2323*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw 2324*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, { 2, 1, 1, 1 } }, // vpmovqb 2325*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, { 1, 1, 1, 1 } }, // vpshufb 2326*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb 2327*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb 2328*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb 2329*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb 2330*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw 2331*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw 2332*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw 2333*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, { 1, 1, 1, 1 } }, // vpmovqd 2334*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // zmm 
vpmovqd 2335*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, { 5, 1, 1, 1 } },// 2*vpmovqd+concat+vpmovdb 23360b57cec5SDimitry Andric 2337*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } }, // extend to v16i32 2338*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 8, 1, 1, 1 } }, 2339*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, { 8, 1, 1, 1 } }, 23405ffd83dbSDimitry Andric 23415ffd83dbSDimitry Andric // Sign extend is zmm vpternlogd+vptruncdb. 23425ffd83dbSDimitry Andric // Zero extend is zmm broadcast load+vptruncdw. 2343*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 3, 1, 1, 1 } }, 2344*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 4, 1, 1, 1 } }, 2345*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 3, 1, 1, 1 } }, 2346*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 4, 1, 1, 1 } }, 2347*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 3, 1, 1, 1 } }, 2348*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 4, 1, 1, 1 } }, 2349*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 3, 1, 1, 1 } }, 2350*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 4, 1, 1, 1 } }, 23515ffd83dbSDimitry Andric 23525ffd83dbSDimitry Andric // Sign extend is zmm vpternlogd+vptruncdw. 23535ffd83dbSDimitry Andric // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw. 
2354*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 3, 1, 1, 1 } }, 2355*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } }, 2356*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 3, 1, 1, 1 } }, 2357*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } }, 2358*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 3, 1, 1, 1 } }, 2359*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } }, 2360*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 3, 1, 1, 1 } }, 2361*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } }, 23625ffd83dbSDimitry Andric 2363*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogd 2364*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld 2365*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogd 2366*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld 2367*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // zmm vpternlogd 2368*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld 2369*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogq 2370*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq 2371*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogq 2372*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq 23735ffd83dbSDimitry Andric 2374*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd 
2375*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld 2376*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogq 2377*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq 23785ffd83dbSDimitry Andric 2379*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } }, 2380*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } }, 2381*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } }, 2382*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } }, 2383*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } }, 2384*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } }, 2385*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } }, 2386*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } }, 2387*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } }, 2388*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } }, 23890b57cec5SDimitry Andric 2390*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right 2391*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right 23925ffd83dbSDimitry Andric 2393*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } }, 2394*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } }, 2395*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } }, 2396*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } }, 2397*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 
1, 1, 1 } }, 2398*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } }, 2399*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } }, 2400*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } }, 24010b57cec5SDimitry Andric 2402*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } }, 2403*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } }, 2404*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } }, 2405*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } }, 2406*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } }, 2407*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } }, 2408*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } }, 2409*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } }, 2410*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, {26, 1, 1, 1 } }, 2411*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 5, 1, 1, 1 } }, 24125ffd83dbSDimitry Andric 2413*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } }, 2414*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, { 7, 1, 1, 1 } }, 2415*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64, {15, 1, 1, 1 } }, 2416*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32, {11, 1, 1, 1 } }, 2417*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64, {31, 1, 1, 1 } }, 2418*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } }, 2419*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, { 7, 1, 1, 1 } }, 2420*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, { 5, 1, 1, 1 } }, 2421*0fca6ea1SDimitry 
Andric { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64, {15, 1, 1, 1 } }, 2422*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } }, 2423*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, { 3, 1, 1, 1 } }, 24245ffd83dbSDimitry Andric 2425*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } }, 2426*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } }, 2427*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, { 3, 1, 1, 1 } }, 2428*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, { 1, 1, 1, 1 } }, 2429*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, { 3, 1, 1, 1 } }, 2430*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, { 3, 1, 1, 1 } }, 24315ffd83dbSDimitry Andric }; 24325ffd83dbSDimitry Andric 2433*0fca6ea1SDimitry Andric static const TypeConversionCostKindTblEntry AVX512BWVLConversionTbl[] { 24345ffd83dbSDimitry Andric // Mask sign extend has an instruction. 
2435*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } }, 2436*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } }, 2437*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } }, 2438*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } }, 2439*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } }, 2440*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } }, 2441*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } }, 2442*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } }, 2443*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } }, 2444*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } }, 2445*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } }, 2446*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } }, 2447*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } }, 2448*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } }, 2449*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, { 1, 1, 1, 1 } }, 2450*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, { 1, 1, 1, 1 } }, 2451*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, { 1, 1, 1, 1 } }, 24525ffd83dbSDimitry Andric 24535ffd83dbSDimitry Andric // Mask zero extend is a sext + shift. 
2454*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } }, 2455*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } }, 2456*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } }, 2457*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } }, 2458*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } }, 2459*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } }, 2460*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } }, 2461*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } }, 2462*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } }, 2463*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } }, 2464*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } }, 2465*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } }, 2466*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } }, 2467*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } }, 2468*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, { 2, 1, 1, 1 } }, 2469*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, { 2, 1, 1, 1 } }, 2470*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, { 2, 1, 1, 1 } }, 24714824e7fdSDimitry Andric 2472*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } }, 2473*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } }, 2474*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } }, 2475*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } }, 2476*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } }, 
2477*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } }, 2478*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } }, 2479*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } }, 2480*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } }, 2481*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } }, 2482*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } }, 2483*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } }, 2484*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } }, 2485*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } }, 2486*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, { 2, 1, 1, 1 } }, 2487*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, { 2, 1, 1, 1 } }, 2488*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, { 2, 1, 1, 1 } }, 24895ffd83dbSDimitry Andric 2490*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, 24915ffd83dbSDimitry Andric }; 24925ffd83dbSDimitry Andric 2493*0fca6ea1SDimitry Andric static const TypeConversionCostKindTblEntry AVX512DQVLConversionTbl[] = { 24944824e7fdSDimitry Andric // Mask sign extend has an instruction. 
2495*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, 2496*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } }, 2497*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, 2498*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, { 1, 1, 1, 1 } }, 2499*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, 2500*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, { 1, 1, 1, 1 } }, 2501*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } }, 2502*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, 25034824e7fdSDimitry Andric 25044824e7fdSDimitry Andric // Mask zero extend is a sext + shift. 2505*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, 2506*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1 } }, 2507*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, 2508*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, { 2, 1, 1, 1 } }, 2509*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, 2510*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, { 2, 1, 1, 1 } }, 2511*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } }, 2512*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, 25134824e7fdSDimitry Andric 2514*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, { 2, 1, 1, 1 } }, 2515*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } }, 2516*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, 2517*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } }, 2518*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 
} }, 2519*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, 2520*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, { 2, 1, 1, 1 } }, 2521*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, 25224824e7fdSDimitry Andric 2523*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } }, 2524*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } }, 2525*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } }, 2526*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } }, 25275ffd83dbSDimitry Andric 2528*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } }, 2529*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } }, 2530*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } }, 2531*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } }, 25325ffd83dbSDimitry Andric 2533*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } }, 2534*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } }, 2535*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } }, 2536*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } }, 25375ffd83dbSDimitry Andric 2538*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } }, 2539*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } }, 2540*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } }, 2541*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } }, 25425ffd83dbSDimitry Andric }; 25435ffd83dbSDimitry Andric 2544*0fca6ea1SDimitry Andric static const TypeConversionCostKindTblEntry AVX512VLConversionTbl[] = { 2545*0fca6ea1SDimitry 
Andric { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd 2546*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd 2547*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd 2548*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 8, 1, 1, 1 } }, // split+2*v8i8 2549*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq 2550*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq 2551*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq 2552*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 8, 1, 1, 1 } }, // split+2*v8i16 2553*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd 2554*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd 2555*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd 2556*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd 2557*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq 2558*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq 2559*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // vpmovqd 2560*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqb 2561*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqw 2562*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, { 2, 1, 1, 1 } }, // vpmovwb 25635ffd83dbSDimitry Andric 25645ffd83dbSDimitry Andric // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb 
25655ffd83dbSDimitry Andric // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb 2566*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 5, 1, 1, 1 } }, 2567*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 6, 1, 1, 1 } }, 2568*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 5, 1, 1, 1 } }, 2569*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 6, 1, 1, 1 } }, 2570*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 5, 1, 1, 1 } }, 2571*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 6, 1, 1, 1 } }, 2572*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, {10, 1, 1, 1 } }, 2573*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, {12, 1, 1, 1 } }, 25745ffd83dbSDimitry Andric 25755ffd83dbSDimitry Andric // sign extend is vpcmpeq+maskedmove+vpmovdw 25765ffd83dbSDimitry Andric // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw 2577*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } }, 2578*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 5, 1, 1, 1 } }, 2579*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } }, 2580*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 5, 1, 1, 1 } }, 2581*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } }, 2582*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 5, 1, 1, 1 } }, 2583*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, {10, 1, 1, 1 } }, 2584*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, {12, 1, 1, 1 } }, 25855ffd83dbSDimitry Andric 2586*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogd 2587*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld 2588*0fca6ea1SDimitry Andric { 
ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogd 2589*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld 2590*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogd 2591*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld 2592*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd 2593*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld 2594bdd1243dSDimitry Andric 2595*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogq 2596*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq 2597*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogq 2598*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq 25995ffd83dbSDimitry Andric 2600*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } }, 2601*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } }, 2602*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } }, 2603*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } }, 2604*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } }, 2605*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } }, 2606*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } }, 2607*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } }, 2608*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } }, 2609*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 
1, 1 } }, 2610*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } }, 2611*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } }, 2612fe6060f1SDimitry Andric 2613*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } }, 2614*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } }, 2615*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } }, 2616*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } }, 2617fe6060f1SDimitry Andric 2618*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } }, 2619*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } }, 2620*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } }, 2621*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } }, 2622*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } }, 2623*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } }, 2624*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 1, 1, 1, 1 } }, 2625*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } }, 2626*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } }, 2627*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } }, 2628*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 5, 1, 1, 1 } }, 2629*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } }, 2630*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 5, 1, 1, 1 } }, 26310b57cec5SDimitry Andric 2632*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } }, 2633*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } }, 
2634*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, { 5, 1, 1, 1 } }, 26355ffd83dbSDimitry Andric 2636*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } }, 2637*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } }, 2638*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } }, 2639*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } }, 2640*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } }, 2641*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } }, 2642*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } }, 26430b57cec5SDimitry Andric }; 26440b57cec5SDimitry Andric 2645*0fca6ea1SDimitry Andric static const TypeConversionCostKindTblEntry AVX2ConversionTbl[] = { 2646*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } }, 2647*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } }, 2648*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } }, 2649*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } }, 2650*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } }, 2651*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } }, 2652fe6060f1SDimitry Andric 2653*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } }, 2654*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } }, 2655*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } }, 2656*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } }, 2657*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } }, 2658*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 
{ 2, 1, 1, 1 } }, 2659*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } }, 2660*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } }, 2661*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } }, 2662*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } }, 2663*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } }, 2664*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } }, 2665*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } }, 2666*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } }, 26675ffd83dbSDimitry Andric 2668*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, 26690b57cec5SDimitry Andric 2670*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 4, 1, 1, 1 } }, 2671*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 4, 1, 1, 1 } }, 2672*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 1, 1, 1, 1 } }, 2673*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 1, 1, 1, 1 } }, 2674*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 1, 1, 1, 1 } }, 2675*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 4, 1, 1, 1 } }, 2676*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 4, 1, 1, 1 } }, 2677*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 1, 1, 1, 1 } }, 2678*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 1, 1, 1, 1 } }, 2679*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 5, 1, 1, 1 } }, 2680*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, 2681*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 2, 1, 1, 1 } }, 26820b57cec5SDimitry Andric 
2683*0fca6ea1SDimitry Andric { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 3, 1, 1, 1 } }, 2684*0fca6ea1SDimitry Andric { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 3, 1, 1, 1 } }, 26850b57cec5SDimitry Andric 2686*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } }, 2687*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } }, 2688*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } }, 2689*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 3, 1, 1, 1 } }, 2690fe6060f1SDimitry Andric 2691*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 3, 1, 1, 1 } }, 2692*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 3, 1, 1, 1 } }, 2693*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } }, 2694*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } }, 2695*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } }, 2696*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 4, 1, 1, 1 } }, 2697*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 3, 1, 1, 1 } }, 2698*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 4, 1, 1, 1 } }, 2699fe6060f1SDimitry Andric 2700*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } }, 2701*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } }, 2702*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } }, 2703*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } }, 2704*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } }, 2705*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } }, 2706*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 3, 1, 1, 1 } }, 2707fe6060f1SDimitry Andric 
2708*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } }, 2709*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } }, 2710*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } }, 2711*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } }, 2712*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 2, 1, 1, 1 } }, 2713*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 1, 1, 1, 1 } }, 2714*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 2, 1, 1, 1 } }, 2715*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } }, 2716*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } }, 2717*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } }, 27180b57cec5SDimitry Andric }; 27190b57cec5SDimitry Andric 2720*0fca6ea1SDimitry Andric static const TypeConversionCostKindTblEntry AVXConversionTbl[] = { 2721*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } }, 2722*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } }, 2723*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } }, 2724*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } }, 2725*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } }, 2726*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } }, 2727fe6060f1SDimitry Andric 2728*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } }, 2729*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } }, 2730*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } }, 2731*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } }, 
2732*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } }, 2733*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } }, 2734*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } }, 2735*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } }, 2736*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } }, 2737*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } }, 2738*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } }, 2739*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } }, 27400b57cec5SDimitry Andric 2741*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 4, 1, 1, 1 } }, 2742*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 5, 1, 1, 1 } }, 2743*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 4, 1, 1, 1 } }, 2744*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 9, 1, 1, 1 } }, 2745*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, {11, 1, 1, 1 } }, 27465ffd83dbSDimitry Andric 2747*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 6, 1, 1, 1 } }, 2748*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 6, 1, 1, 1 } }, 2749*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // and+extract+packuswb 2750*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 5, 1, 1, 1 } }, 2751*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } }, 2752*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 5, 1, 1, 1 } }, 2753*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 3, 1, 1, 1 } }, // and+extract+2*packusdw 2754*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 2, 1, 1, 1 } }, 
27550b57cec5SDimitry Andric 2756*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, { 3, 1, 1, 1 } }, 2757*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, { 3, 1, 1, 1 } }, 2758*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, { 8, 1, 1, 1 } }, 2759*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } }, 2760*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } }, 2761*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } }, 2762*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } }, 2763*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } }, 2764*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } }, 2765*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } }, 2766*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 5, 1, 1, 1 } }, 2767*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 8, 1, 1, 1 } }, 27680b57cec5SDimitry Andric 2769*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, { 7, 1, 1, 1 } }, 2770*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, { 7, 1, 1, 1 } }, 2771*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, { 6, 1, 1, 1 } }, 2772*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } }, 2773*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } }, 2774*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } }, 2775*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } }, 2776*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 4, 1, 1, 1 } }, 2777*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 4, 1, 1, 1 } }, 2778*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f32, 
MVT::v4i32, { 5, 1, 1, 1 } }, 2779*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 6, 1, 1, 1 } }, 2780*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 8, 1, 1, 1 } }, 2781*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, {10, 1, 1, 1 } }, 2782*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, {10, 1, 1, 1 } }, 2783*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {18, 1, 1, 1 } }, 2784*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } }, 2785*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, {10, 1, 1, 1 } }, 27860b57cec5SDimitry Andric 2787*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } }, 2788*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } }, 2789*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } }, 2790*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } }, 2791*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } }, 2792*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } }, 2793*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } }, 2794*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } }, 2795*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 2, 1, 1, 1 } }, 2796*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 2, 1, 1, 1 } }, 2797*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 5, 1, 1, 1 } }, 27985ffd83dbSDimitry Andric 2799*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } }, 2800*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } }, 2801*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } }, 2802*0fca6ea1SDimitry 
Andric { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } }, 2803*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } }, 2804*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } }, 2805*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } }, 2806*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } }, 2807*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } }, 2808*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } }, 2809*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 6, 1, 1, 1 } }, 2810*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 7, 1, 1, 1 } }, 2811*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 7, 1, 1, 1 } }, 28120b57cec5SDimitry Andric 2813*0fca6ea1SDimitry Andric { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, { 1, 1, 1, 1 } }, 2814*0fca6ea1SDimitry Andric { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, { 1, 1, 1, 1 } }, 28150b57cec5SDimitry Andric }; 28160b57cec5SDimitry Andric 2817*0fca6ea1SDimitry Andric static const TypeConversionCostKindTblEntry SSE41ConversionTbl[] = { 2818*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } }, 2819*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } }, 2820*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } }, 2821*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } }, 2822*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } }, 2823*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } }, 2824*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } }, 2825*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } }, 2826*0fca6ea1SDimitry Andric 
{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } }, 2827*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } }, 2828*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } }, 2829*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } }, 28300b57cec5SDimitry Andric 28315ffd83dbSDimitry Andric // These truncates end up widening elements. 2832*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 1, 1, 1, 1 } }, // PMOVXZBQ 2833*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 1, 1, 1, 1 } }, // PMOVXZWQ 2834*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 1, 1, 1, 1 } }, // PMOVXZBD 28355ffd83dbSDimitry Andric 2836*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 2, 1, 1, 1 } }, 2837*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 2, 1, 1, 1 } }, 2838*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 2, 1, 1, 1 } }, 28390b57cec5SDimitry Andric 2840*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } }, 2841*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } }, 2842*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } }, 2843*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } }, 2844*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } }, 2845*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } }, 2846*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } }, 2847*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } }, 2848*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } }, 2849*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 1, 1, 1, 1 } }, 2850*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f64, 
MVT::v4i32, { 2, 1, 1, 1 } }, 2851fe6060f1SDimitry Andric 2852*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } }, 2853*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } }, 2854*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 4, 1, 1, 1 } }, 2855*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 4, 1, 1, 1 } }, 2856*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } }, 2857*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } }, 2858*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } }, 2859*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } }, 2860*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 3, 1, 1, 1 } }, 2861*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } }, 2862*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 2, 1, 1, 1 } }, 2863*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {12, 1, 1, 1 } }, 2864*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {22, 1, 1, 1 } }, 2865*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 4, 1, 1, 1 } }, 28665ffd83dbSDimitry Andric 2867*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } }, 2868*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } }, 2869*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } }, 2870*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } }, 2871*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } }, 2872*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } }, 2873*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } }, 2874*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, 
MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } }, 2875*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } }, 2876*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } }, 28775ffd83dbSDimitry Andric 2878*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } }, 2879*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } }, 2880*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } }, 2881*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } }, 2882*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } }, 2883*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } }, 2884*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } }, 2885*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } }, 2886*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } }, 2887*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } }, 28880b57cec5SDimitry Andric }; 28890b57cec5SDimitry Andric 2890*0fca6ea1SDimitry Andric static const TypeConversionCostKindTblEntry SSE2ConversionTbl[] = { 2891fe6060f1SDimitry Andric // These are somewhat magic numbers justified by comparing the 2892fe6060f1SDimitry Andric // output of llvm-mca for our various supported scheduler models 2893fe6060f1SDimitry Andric // and basing it off the worst case scenario. 
2894*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } }, 2895*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } }, 2896*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 3, 1, 1, 1 } }, 2897*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 3, 1, 1, 1 } }, 2898*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 3, 1, 1, 1 } }, 2899*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } }, 2900*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 3, 1, 1, 1 } }, 2901*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } }, 2902*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } }, 2903*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 4, 1, 1, 1 } }, 2904*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 8, 1, 1, 1 } }, 2905*0fca6ea1SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 8, 1, 1, 1 } }, 29060b57cec5SDimitry Andric 2907*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } }, 2908*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } }, 2909*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 8, 1, 1, 1 } }, 2910*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 9, 1, 1, 1 } }, 2911*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } }, 2912*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 4, 1, 1, 1 } }, 2913*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 4, 1, 1, 1 } }, 2914*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } }, 2915*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 7, 1, 1, 1 } }, 2916*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 7, 1, 1, 1 } }, 2917*0fca6ea1SDimitry 
Andric { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } }, 2918*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, {15, 1, 1, 1 } }, 2919*0fca6ea1SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {18, 1, 1, 1 } }, 29200b57cec5SDimitry Andric 2921*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } }, 2922*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } }, 2923*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } }, 2924*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } }, 2925*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } }, 2926*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } }, 2927*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } }, 2928*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } }, 2929*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } }, 2930*0fca6ea1SDimitry Andric { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } }, 2931480093f4SDimitry Andric 2932*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } }, 2933*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } }, 2934*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } }, 2935*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::i64, MVT::f64, {15, 1, 1, 1 } }, 2936*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } }, 2937*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } }, 2938*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } }, 2939*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } }, 2940*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 8, 1, 1, 1 } }, 
2941*0fca6ea1SDimitry Andric { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 8, 1, 1, 1 } }, 29420b57cec5SDimitry Andric 2943*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } }, 2944*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } }, 2945*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 2, 1, 1, 1 } }, 2946*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 3, 1, 1, 1 } }, 2947*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } }, 2948*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 2, 1, 1, 1 } }, 2949*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 2, 1, 1, 1 } }, 2950*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 3, 1, 1, 1 } }, 2951*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } }, 2952*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 2, 1, 1, 1 } }, 2953*0fca6ea1SDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } }, 2954*0fca6ea1SDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 2, 1, 1, 1 } }, 29550b57cec5SDimitry Andric 29565ffd83dbSDimitry Andric // These truncates are really widening elements. 
2957*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 1, 1, 1, 1 } }, // PSHUFD 2958*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } }, // PUNPCKLWD+DQ 2959*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // PUNPCKLBW+WD+PSHUFD 2960*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 1, 1, 1, 1 } }, // PUNPCKLWD 2961*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } }, // PUNPCKLBW+WD 2962*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 1, 1, 1, 1 } }, // PUNPCKLBW 29635ffd83dbSDimitry Andric 2964*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 2, 1, 1, 1 } }, // PAND+PACKUSWB 2965*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } }, 2966*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 3, 1, 1, 1 } }, // PAND+2*PACKUSWB 2967*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 7, 1, 1, 1 } }, 2968*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, { 1, 1, 1, 1 } }, 2969*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 3, 1, 1, 1 } }, 2970*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } }, 2971*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, {10, 1, 1, 1 } }, 2972*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 4, 1, 1, 1 } }, // PAND+3*PACKUSWB 2973*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 2, 1, 1, 1 } }, // PSHUFD+PSHUFLW 2974*0fca6ea1SDimitry Andric { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, { 1, 1, 1, 1 } }, // PSHUFD 29750b57cec5SDimitry Andric }; 29760b57cec5SDimitry Andric 2977fe6060f1SDimitry Andric // Attempt to map directly to (simple) MVT types to let us match custom entries. 
29780b57cec5SDimitry Andric EVT SrcTy = TLI->getValueType(DL, Src); 29790b57cec5SDimitry Andric EVT DstTy = TLI->getValueType(DL, Dst); 29800b57cec5SDimitry Andric 29810b57cec5SDimitry Andric // The function getSimpleVT only handles simple value types. 2982fe6060f1SDimitry Andric if (SrcTy.isSimple() && DstTy.isSimple()) { 29830b57cec5SDimitry Andric MVT SimpleSrcTy = SrcTy.getSimpleVT(); 29840b57cec5SDimitry Andric MVT SimpleDstTy = DstTy.getSimpleVT(); 29850b57cec5SDimitry Andric 29865ffd83dbSDimitry Andric if (ST->useAVX512Regs()) { 29870b57cec5SDimitry Andric if (ST->hasBWI()) 2988fe6060f1SDimitry Andric if (const auto *Entry = ConvertCostTableLookup( 2989fe6060f1SDimitry Andric AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) 2990*0fca6ea1SDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 2991*0fca6ea1SDimitry Andric return *KindCost; 29920b57cec5SDimitry Andric 29930b57cec5SDimitry Andric if (ST->hasDQI()) 2994fe6060f1SDimitry Andric if (const auto *Entry = ConvertCostTableLookup( 2995fe6060f1SDimitry Andric AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) 2996*0fca6ea1SDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 2997*0fca6ea1SDimitry Andric return *KindCost; 29980b57cec5SDimitry Andric 29990b57cec5SDimitry Andric if (ST->hasAVX512()) 3000fe6060f1SDimitry Andric if (const auto *Entry = ConvertCostTableLookup( 3001fe6060f1SDimitry Andric AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) 3002*0fca6ea1SDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 3003*0fca6ea1SDimitry Andric return *KindCost; 30040b57cec5SDimitry Andric } 30050b57cec5SDimitry Andric 30065ffd83dbSDimitry Andric if (ST->hasBWI()) 3007fe6060f1SDimitry Andric if (const auto *Entry = ConvertCostTableLookup( 3008fe6060f1SDimitry Andric AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) 3009*0fca6ea1SDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 3010*0fca6ea1SDimitry Andric return *KindCost; 30115ffd83dbSDimitry Andric 
30125ffd83dbSDimitry Andric if (ST->hasDQI()) 3013fe6060f1SDimitry Andric if (const auto *Entry = ConvertCostTableLookup( 3014fe6060f1SDimitry Andric AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) 3015*0fca6ea1SDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 3016*0fca6ea1SDimitry Andric return *KindCost; 30175ffd83dbSDimitry Andric 30185ffd83dbSDimitry Andric if (ST->hasAVX512()) 30195ffd83dbSDimitry Andric if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD, 30205ffd83dbSDimitry Andric SimpleDstTy, SimpleSrcTy)) 3021*0fca6ea1SDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 3022*0fca6ea1SDimitry Andric return *KindCost; 30235ffd83dbSDimitry Andric 30240b57cec5SDimitry Andric if (ST->hasAVX2()) { 30250b57cec5SDimitry Andric if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, 30260b57cec5SDimitry Andric SimpleDstTy, SimpleSrcTy)) 3027*0fca6ea1SDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 3028*0fca6ea1SDimitry Andric return *KindCost; 30290b57cec5SDimitry Andric } 30300b57cec5SDimitry Andric 30310b57cec5SDimitry Andric if (ST->hasAVX()) { 30320b57cec5SDimitry Andric if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, 30330b57cec5SDimitry Andric SimpleDstTy, SimpleSrcTy)) 3034*0fca6ea1SDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 3035*0fca6ea1SDimitry Andric return *KindCost; 30360b57cec5SDimitry Andric } 30370b57cec5SDimitry Andric 30380b57cec5SDimitry Andric if (ST->hasSSE41()) { 30390b57cec5SDimitry Andric if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, 30400b57cec5SDimitry Andric SimpleDstTy, SimpleSrcTy)) 3041*0fca6ea1SDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 3042*0fca6ea1SDimitry Andric return *KindCost; 30430b57cec5SDimitry Andric } 30440b57cec5SDimitry Andric 30450b57cec5SDimitry Andric if (ST->hasSSE2()) { 30460b57cec5SDimitry Andric if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, 
30470b57cec5SDimitry Andric SimpleDstTy, SimpleSrcTy)) 3048*0fca6ea1SDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 3049*0fca6ea1SDimitry Andric return *KindCost; 30500b57cec5SDimitry Andric } 3051fe6060f1SDimitry Andric } 3052fe6060f1SDimitry Andric 3053fe6060f1SDimitry Andric // Fall back to legalized types. 3054bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src); 3055bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst); 3056fe6060f1SDimitry Andric 305781ad6265SDimitry Andric // If we're truncating to the same legalized type - just assume it's free. 305881ad6265SDimitry Andric if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second) 305981ad6265SDimitry Andric return TTI::TCC_Free; 306081ad6265SDimitry Andric 3061fe6060f1SDimitry Andric if (ST->useAVX512Regs()) { 3062fe6060f1SDimitry Andric if (ST->hasBWI()) 3063fe6060f1SDimitry Andric if (const auto *Entry = ConvertCostTableLookup( 3064fe6060f1SDimitry Andric AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second)) 3065*0fca6ea1SDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 3066*0fca6ea1SDimitry Andric return std::max(LTSrc.first, LTDest.first) * *KindCost; 3067fe6060f1SDimitry Andric 3068fe6060f1SDimitry Andric if (ST->hasDQI()) 3069fe6060f1SDimitry Andric if (const auto *Entry = ConvertCostTableLookup( 3070fe6060f1SDimitry Andric AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second)) 3071*0fca6ea1SDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 3072*0fca6ea1SDimitry Andric return std::max(LTSrc.first, LTDest.first) * *KindCost; 3073fe6060f1SDimitry Andric 3074fe6060f1SDimitry Andric if (ST->hasAVX512()) 3075fe6060f1SDimitry Andric if (const auto *Entry = ConvertCostTableLookup( 3076fe6060f1SDimitry Andric AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second)) 3077*0fca6ea1SDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 3078*0fca6ea1SDimitry Andric return std::max(LTSrc.first, 
LTDest.first) * *KindCost; 3079fe6060f1SDimitry Andric } 3080fe6060f1SDimitry Andric 3081fe6060f1SDimitry Andric if (ST->hasBWI()) 3082fe6060f1SDimitry Andric if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD, 3083fe6060f1SDimitry Andric LTDest.second, LTSrc.second)) 3084*0fca6ea1SDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 3085*0fca6ea1SDimitry Andric return std::max(LTSrc.first, LTDest.first) * *KindCost; 3086fe6060f1SDimitry Andric 3087fe6060f1SDimitry Andric if (ST->hasDQI()) 3088fe6060f1SDimitry Andric if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD, 3089fe6060f1SDimitry Andric LTDest.second, LTSrc.second)) 3090*0fca6ea1SDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 3091*0fca6ea1SDimitry Andric return std::max(LTSrc.first, LTDest.first) * *KindCost; 3092fe6060f1SDimitry Andric 3093fe6060f1SDimitry Andric if (ST->hasAVX512()) 3094fe6060f1SDimitry Andric if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD, 3095fe6060f1SDimitry Andric LTDest.second, LTSrc.second)) 3096*0fca6ea1SDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 3097*0fca6ea1SDimitry Andric return std::max(LTSrc.first, LTDest.first) * *KindCost; 3098fe6060f1SDimitry Andric 3099fe6060f1SDimitry Andric if (ST->hasAVX2()) 3100fe6060f1SDimitry Andric if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, 3101fe6060f1SDimitry Andric LTDest.second, LTSrc.second)) 3102*0fca6ea1SDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 3103*0fca6ea1SDimitry Andric return std::max(LTSrc.first, LTDest.first) * *KindCost; 3104fe6060f1SDimitry Andric 3105fe6060f1SDimitry Andric if (ST->hasAVX()) 3106fe6060f1SDimitry Andric if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, 3107fe6060f1SDimitry Andric LTDest.second, LTSrc.second)) 3108*0fca6ea1SDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 3109*0fca6ea1SDimitry Andric return std::max(LTSrc.first, 
LTDest.first) * *KindCost; 3110fe6060f1SDimitry Andric 3111fe6060f1SDimitry Andric if (ST->hasSSE41()) 3112fe6060f1SDimitry Andric if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, 3113fe6060f1SDimitry Andric LTDest.second, LTSrc.second)) 3114*0fca6ea1SDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 3115*0fca6ea1SDimitry Andric return std::max(LTSrc.first, LTDest.first) * *KindCost; 3116fe6060f1SDimitry Andric 3117fe6060f1SDimitry Andric if (ST->hasSSE2()) 3118fe6060f1SDimitry Andric if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, 3119fe6060f1SDimitry Andric LTDest.second, LTSrc.second)) 3120*0fca6ea1SDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 3121*0fca6ea1SDimitry Andric return std::max(LTSrc.first, LTDest.first) * *KindCost; 3122fe6060f1SDimitry Andric 3123fe6060f1SDimitry Andric // Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for 3124fe6060f1SDimitry Andric // sitofp. 3125fe6060f1SDimitry Andric if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) && 3126fe6060f1SDimitry Andric 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) { 3127fe6060f1SDimitry Andric Type *ExtSrc = Src->getWithNewBitWidth(32); 3128fe6060f1SDimitry Andric unsigned ExtOpc = 3129fe6060f1SDimitry Andric (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt; 3130fe6060f1SDimitry Andric 3131fe6060f1SDimitry Andric // For scalar loads the extend would be free. 
3132fe6060f1SDimitry Andric InstructionCost ExtCost = 0; 3133fe6060f1SDimitry Andric if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0)))) 3134fe6060f1SDimitry Andric ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind); 3135fe6060f1SDimitry Andric 3136fe6060f1SDimitry Andric return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc, 3137fe6060f1SDimitry Andric TTI::CastContextHint::None, CostKind); 3138fe6060f1SDimitry Andric } 3139fe6060f1SDimitry Andric 3140fe6060f1SDimitry Andric // Fallback for fptosi/fptoui i8/i16 cases we need to truncate from fptosi 3141fe6060f1SDimitry Andric // i32. 3142fe6060f1SDimitry Andric if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) && 3143fe6060f1SDimitry Andric 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) { 3144fe6060f1SDimitry Andric Type *TruncDst = Dst->getWithNewBitWidth(32); 3145fe6060f1SDimitry Andric return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) + 3146fe6060f1SDimitry Andric getCastInstrCost(Instruction::Trunc, Dst, TruncDst, 3147fe6060f1SDimitry Andric TTI::CastContextHint::None, CostKind); 3148fe6060f1SDimitry Andric } 31490b57cec5SDimitry Andric 3150*0fca6ea1SDimitry Andric // TODO: Allow non-throughput costs that aren't binary. 3151*0fca6ea1SDimitry Andric auto AdjustCost = [&CostKind](InstructionCost Cost, 3152*0fca6ea1SDimitry Andric InstructionCost N = 1) -> InstructionCost { 3153*0fca6ea1SDimitry Andric if (CostKind != TTI::TCK_RecipThroughput) 3154*0fca6ea1SDimitry Andric return Cost == 0 ? 
0 : N;
    return Cost * N;
  };
  return AdjustCost(
      BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
}

/// Compute the X86 cost of a compare (ICmp/FCmp) or select on \p ValTy.
///
/// Value types that are neither integer nor floating point (scalar or vector)
/// are forwarded to BaseT. Otherwise the type is legalized and the legalized
/// MVT is looked up in a series of per-feature cost tables, tried in the order
/// SLM, AVX512BW, AVX512, AVX2, XOP, AVX, SSE4.2, SSE4.1, SSE2, SSE1. The first
/// table (whose subtarget feature is enabled) that has an entry for
/// (ISD, MTy) with a cost for \p CostKind decides the result:
///   LT.first * (ExtraCost + table cost).
/// ExtraCost is non-zero only for vector compares whose predicate needs
/// additional instructions on the current subtarget.
///
/// \param Opcode   IR opcode; translated to an ISD opcode via TLI.
/// \param ValTy    Type of the compared / selected values.
/// \param CondTy   Condition type; only read by the FCMP_ONE/FCMP_UEQ
///                 expansion below and by the BaseT fallbacks. May be null.
/// \param VecPred  Compare predicate; if it is one of the BAD_*_PREDICATE
///                 values the predicate of \p I (when given) is used instead.
/// \param CostKind Which cost kind to report.
/// \param I        Optional instruction providing the predicate and operands.
/// \returns the table-driven cost, 3 for the latency of a floating point
///          select with no table entry, or the BaseT cost otherwise.
InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                               Type *CondTy,
                                               CmpInst::Predicate VecPred,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  // Early out if this type isn't scalar/vector integer/float.
  if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  // Legalize the type.
  // LT.first is used below as a multiplier on the per-legal-type cost.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

  MVT MTy = LT.second;

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // Additional instructions needed on top of the base table cost; only ever
  // set for vector compares in the block below.
  InstructionCost ExtraCost = 0;
  if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
    // Some vector comparison predicates cost extra instructions.
    // TODO: Adjust ExtraCost based on CostKind?
    // TODO: Should we invert this and assume worst case cmp costs
    // and reduce for particular predicates?
    // The extra cost is skipped entirely when any of these holds:
    //  - XOP is available and either AVX2 is absent or the type is 128-bit,
    //  - AVX512 is available and elements are at least 32 bits wide,
    //  - BWI is available.
    if (MTy.isVector() &&
        !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
          (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
          ST->hasBWI())) {
      // Fallback to I if a specific predicate wasn't specified.
      CmpInst::Predicate Pred = VecPred;
      if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
                Pred == CmpInst::BAD_FCMP_PREDICATE))
        Pred = cast<CmpInst>(I)->getPredicate();

      // True when I is a compare whose second operand is a constant; several
      // cases below charge one instruction fewer in that situation.
      bool CmpWithConstant = false;
      if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
        CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));

      switch (Pred) {
      case CmpInst::Predicate::ICMP_NE:
        // xor(cmpeq(x,y),-1)
        ExtraCost = CmpWithConstant ? 0 : 1;
        break;
      case CmpInst::Predicate::ICMP_SGE:
      case CmpInst::Predicate::ICMP_SLE:
        // xor(cmpgt(x,y),-1)
        ExtraCost = CmpWithConstant ? 0 : 1;
        break;
      case CmpInst::Predicate::ICMP_ULT:
      case CmpInst::Predicate::ICMP_UGT:
        // cmpgt(xor(x,signbit),xor(y,signbit))
        // xor(cmpeq(pmaxu(x,y),x),-1)
        ExtraCost = CmpWithConstant ? 1 : 2;
        break;
      case CmpInst::Predicate::ICMP_ULE:
      case CmpInst::Predicate::ICMP_UGE:
        if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
            (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
          // cmpeq(psubus(x,y),0)
          // cmpeq(pminu(x,y),x)
          ExtraCost = 1;
        } else {
          // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
          ExtraCost = CmpWithConstant ? 2 : 3;
        }
        break;
      case CmpInst::Predicate::FCMP_ONE:
      case CmpInst::Predicate::FCMP_UEQ:
        // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
        // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
        // The expansion is costed as: FCMP_UNO compare + FCMP_OEQ compare +
        // an Or on the condition type. This returns directly and bypasses
        // the table lookups below.
        if (CondTy && !ST->hasAVX())
          return getCmpSelInstrCost(Opcode, ValTy, CondTy,
                                    CmpInst::Predicate::FCMP_UNO, CostKind) +
                 getCmpSelInstrCost(Opcode, ValTy, CondTy,
                                    CmpInst::Predicate::FCMP_OEQ, CostKind) +
                 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);

        break;
      case CmpInst::Predicate::BAD_ICMP_PREDICATE:
      case CmpInst::Predicate::BAD_FCMP_PREDICATE:
        // Assume worst case scenario and add the maximum extra cost.
        ExtraCost = 3;
        break;
      default:
        break;
      }
    }
  }

  // Cost tables, one per subtarget feature level. Each entry maps an
  // (ISD opcode, legal MVT) pair to a set of costs that is indexed by
  // CostKind in the lookups at the end of this function.
  // NOTE(review): the four values presumably follow the TargetCostKind enum
  // order (throughput first) - confirm against the CostKindTblEntry
  // definition.

  static const CostKindTblEntry SLMCostTbl[] = {
    // slm pcmpeq/pcmpgt throughput is 2
    { ISD::SETCC,   MVT::v2i64,   { 2, 5, 1, 2 } },
    // slm pblendvb/blendvpd/blendvps throughput is 4
    { ISD::SELECT,  MVT::v2f64,   { 4, 4, 1, 3 } }, // vblendvpd
    { ISD::SELECT,  MVT::v4f32,   { 4, 4, 1, 3 } }, // vblendvps
    { ISD::SELECT,  MVT::v2i64,   { 4, 4, 1, 3 } }, // pblendvb
    // NOTE(review): every other entry in this table is a 128-bit type; this
    // v8i32 entry may have been intended as v4i32 - confirm before changing.
    { ISD::SELECT,  MVT::v8i32,   { 4, 4, 1, 3 } }, // pblendvb
    { ISD::SELECT,  MVT::v8i16,   { 4, 4, 1, 3 } }, // pblendvb
    { ISD::SELECT,  MVT::v16i8,   { 4, 4, 1, 3 } }, // pblendvb
  };

  static const CostKindTblEntry AVX512BWCostTbl[] = {
    { ISD::SETCC,   MVT::v32i16,  { 1, 1, 1, 1 } },
    { ISD::SETCC,   MVT::v16i16,  { 1, 1, 1, 1 } },
    { ISD::SETCC,   MVT::v64i8,   { 1, 1, 1, 1 } },
    { ISD::SETCC,   MVT::v32i8,   { 1, 1, 1, 1 } },

    { ISD::SELECT,  MVT::v32i16,  { 1, 1, 1, 1 } },
    { ISD::SELECT,  MVT::v64i8,   { 1, 1, 1, 1 } },
  };

  static const CostKindTblEntry AVX512CostTbl[] = {
    { ISD::SETCC,   MVT::v8f64,   { 1, 4, 1, 1 } },
    { ISD::SETCC,   MVT::v4f64,   { 1, 4, 1, 1 } },
    { ISD::SETCC,   MVT::v16f32,  { 1, 4, 1, 1 } },
    { ISD::SETCC,   MVT::v8f32,   { 1, 4, 1, 1 } },

    { ISD::SETCC,   MVT::v8i64,   { 1, 1, 1, 1 } },
    { ISD::SETCC,   MVT::v4i64,   { 1, 1, 1, 1 } },
    { ISD::SETCC,   MVT::v2i64,   { 1, 1, 1, 1 } },
    { ISD::SETCC,   MVT::v16i32,  { 1, 1, 1, 1 } },
    { ISD::SETCC,   MVT::v8i32,   { 1, 1, 1, 1 } },
    { ISD::SETCC,   MVT::v32i16,  { 3, 7, 5, 5 } },
    { ISD::SETCC,   MVT::v64i8,   { 3, 7, 5, 5 } },

    { ISD::SELECT,  MVT::v8i64,   { 1, 1, 1, 1 } },
    { ISD::SELECT,  MVT::v4i64,   { 1, 1, 1, 1 } },
    { ISD::SELECT,  MVT::v2i64,   { 1, 1, 1, 1 } },
    { ISD::SELECT,  MVT::v16i32,  { 1, 1, 1, 1 } },
    { ISD::SELECT,  MVT::v8i32,   { 1, 1, 1, 1 } },
    { ISD::SELECT,  MVT::v4i32,   { 1, 1, 1, 1 } },
    { ISD::SELECT,  MVT::v8f64,   { 1, 1, 1, 1 } },
    { ISD::SELECT,  MVT::v4f64,   { 1, 1, 1, 1 } },
    { ISD::SELECT,  MVT::v2f64,   { 1, 1, 1, 1 } },
    { ISD::SELECT,  MVT::f64,     { 1, 1, 1, 1 } },
    { ISD::SELECT,  MVT::v16f32,  { 1, 1, 1, 1 } },
    { ISD::SELECT,  MVT::v8f32 ,  { 1, 1, 1, 1 } },
    { ISD::SELECT,  MVT::v4f32,   { 1, 1, 1, 1 } },
    { ISD::SELECT,  MVT::f32  ,   { 1, 1, 1, 1 } },

    { ISD::SELECT,  MVT::v32i16,  { 2, 2, 4, 4 } },
    { ISD::SELECT,  MVT::v16i16,  { 1, 1, 1, 1 } },
    { ISD::SELECT,  MVT::v8i16,   { 1, 1, 1, 1 } },
    { ISD::SELECT,  MVT::v64i8,   { 2, 2, 4, 4 } },
    { ISD::SELECT,  MVT::v32i8,   { 1, 1, 1, 1 } },
    { ISD::SELECT,  MVT::v16i8,   { 1, 1, 1, 1 } },
  };

  static const CostKindTblEntry AVX2CostTbl[] = {
    { ISD::SETCC,   MVT::v4f64,   { 1, 4, 1, 2 } },
    { ISD::SETCC,   MVT::v2f64,   { 1, 4, 1, 1 } },
    { ISD::SETCC,   MVT::f64,     { 1, 4, 1, 1 } },
    { ISD::SETCC,   MVT::v8f32,   { 1, 4, 1, 2 } },
    { ISD::SETCC,   MVT::v4f32,   { 1, 4, 1, 1 } },
    { ISD::SETCC,   MVT::f32,     { 1, 4, 1, 1 } },

    { ISD::SETCC,   MVT::v4i64,   { 1, 1, 1, 2 } },
    { ISD::SETCC,   MVT::v8i32,   { 1, 1, 1, 2 } },
    { ISD::SETCC,   MVT::v16i16,  { 1, 1, 1, 2 } },
    { ISD::SETCC,   MVT::v32i8,   { 1, 1, 1, 2 } },

    { ISD::SELECT,  MVT::v4f64,   { 2, 2, 1, 2 } }, // vblendvpd
    { ISD::SELECT,  MVT::v8f32,   { 2, 2, 1, 2 } }, // vblendvps
    { ISD::SELECT,  MVT::v4i64,   { 2, 2, 1, 2 } }, // pblendvb
    { ISD::SELECT,  MVT::v8i32,   { 2, 2, 1, 2 } }, // pblendvb
    { ISD::SELECT,  MVT::v16i16,  { 2, 2, 1, 2 } }, // pblendvb
    { ISD::SELECT,  MVT::v32i8,   { 2, 2, 1, 2 } }, // pblendvb
  };

  static const CostKindTblEntry XOPCostTbl[] = {
    { ISD::SETCC,   MVT::v4i64,   { 4, 2, 5, 6 } },
    { ISD::SETCC,   MVT::v2i64,   { 1, 1, 1, 1 } },
  };

  static const CostKindTblEntry AVX1CostTbl[] = {
    { ISD::SETCC,   MVT::v4f64,   { 2, 3, 1, 2 } },
    { ISD::SETCC,   MVT::v2f64,   { 1, 3, 1, 1 } },
    { ISD::SETCC,   MVT::f64,     { 1, 3, 1, 1 } },
    { ISD::SETCC,   MVT::v8f32,   { 2, 3, 1, 2 } },
    { ISD::SETCC,   MVT::v4f32,   { 1, 3, 1, 1 } },
    { ISD::SETCC,   MVT::f32,     { 1, 3, 1, 1 } },

    // AVX1 does not support 8-wide integer compare.
    { ISD::SETCC,   MVT::v4i64,   { 4, 2, 5, 6 } },
    { ISD::SETCC,   MVT::v8i32,   { 4, 2, 5, 6 } },
    { ISD::SETCC,   MVT::v16i16,  { 4, 2, 5, 6 } },
    { ISD::SETCC,   MVT::v32i8,   { 4, 2, 5, 6 } },

    { ISD::SELECT,  MVT::v4f64,   { 3, 3, 1, 2 } }, // vblendvpd
    { ISD::SELECT,  MVT::v8f32,   { 3, 3, 1, 2 } }, // vblendvps
    { ISD::SELECT,  MVT::v4i64,   { 3, 3, 1, 2 } }, // vblendvpd
    { ISD::SELECT,  MVT::v8i32,   { 3, 3, 1, 2 } }, // vblendvps
    { ISD::SELECT,  MVT::v16i16,  { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
    { ISD::SELECT,  MVT::v32i8,   { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
  };

  static const CostKindTblEntry SSE42CostTbl[] = {
    { ISD::SETCC,   MVT::v2i64,   { 1, 2, 1, 2 } },
  };

  static const CostKindTblEntry SSE41CostTbl[] = {
    { ISD::SETCC,   MVT::v2f64,   { 1, 5, 1, 1 } },
    { ISD::SETCC,   MVT::v4f32,   { 1, 5, 1, 1 } },

    { ISD::SELECT,  MVT::v2f64,   { 2, 2, 1, 2 } }, // blendvpd
    { ISD::SELECT,  MVT::f64,     { 2, 2, 1, 2 } }, // blendvpd
    { ISD::SELECT,  MVT::v4f32,   { 2, 2, 1, 2 } }, // blendvps
    { ISD::SELECT,  MVT::f32  ,   { 2, 2, 1, 2 } }, // blendvps
    { ISD::SELECT,  MVT::v2i64,   { 2, 2, 1, 2 } }, // pblendvb
    { ISD::SELECT,  MVT::v4i32,   { 2, 2, 1, 2 } }, // pblendvb
    { ISD::SELECT,  MVT::v8i16,   { 2, 2, 1, 2 } }, // pblendvb
    { ISD::SELECT,  MVT::v16i8,   { 2, 2, 1, 2 } }, // pblendvb
  };

  static const CostKindTblEntry SSE2CostTbl[] = {
    { ISD::SETCC,   MVT::v2f64,   { 2, 5, 1, 1 } },
    { ISD::SETCC,   MVT::f64,     { 1, 5, 1, 1 } },

    { ISD::SETCC,   MVT::v2i64,   { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
    { ISD::SETCC,   MVT::v4i32,   { 1, 1, 1, 1 } },
    { ISD::SETCC,   MVT::v8i16,   { 1, 1, 1, 1 } },
    { ISD::SETCC,   MVT::v16i8,   { 1, 1, 1, 1 } },

    { ISD::SELECT,  MVT::v2f64,   { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
    { ISD::SELECT,  MVT::f64,     { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
    { ISD::SELECT,  MVT::v2i64,   { 2, 2, 3, 3 } }, // pand + pandn + por
    { ISD::SELECT,  MVT::v4i32,   { 2, 2, 3, 3 } }, // pand + pandn + por
    { ISD::SELECT,  MVT::v8i16,   { 2, 2, 3, 3 } }, // pand + pandn + por
    { ISD::SELECT,  MVT::v16i8,   { 2, 2, 3, 3 } }, // pand + pandn + por
  };

  static const CostKindTblEntry SSE1CostTbl[] = {
    { ISD::SETCC,   MVT::v4f32,   { 2, 5, 1, 1 } },
    { ISD::SETCC,   MVT::f32,     { 1, 5, 1, 1 } },

    { ISD::SELECT,  MVT::v4f32,   { 2, 2, 3, 3 } }, // andps + andnps + orps
    { ISD::SELECT,  MVT::f32,     { 2, 2, 3, 3 } }, // andps + andnps + orps
  };

  // Table lookups. The order of these checks is significant: the first
  // enabled feature whose table has an entry for (ISD, MTy) *and* a cost for
  // the requested CostKind produces the result. An entry without a cost for
  // CostKind does not stop the search; the next table is tried.

  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasXOP())
    if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  // Assume a 3cy latency for fp select ops.
  // Only reached when none of the tables above produced a cost.
  if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
    if (ValTy->getScalarType()->isFloatingPointTy())
      return 3;

  // Nothing X86-specific applies; note that ExtraCost is not added here.
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}

// Reports 16 as the maximum element size for atomic memory intrinsics on X86.
// NOTE(review): the unit is not visible here - presumably bytes; confirm
// against the TTI interface documentation.
unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }

InstructionCost
X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) {
  // Costs should match the codegen from:
  // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
  // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
  // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
  // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
  // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll

  // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
  // specialized in these tables yet.
3472bdd1243dSDimitry Andric static const CostKindTblEntry AVX512VBMI2CostTbl[] = { 3473bdd1243dSDimitry Andric { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } }, 3474bdd1243dSDimitry Andric { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } }, 3475bdd1243dSDimitry Andric { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } }, 3476bdd1243dSDimitry Andric { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } }, 3477bdd1243dSDimitry Andric { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } }, 3478bdd1243dSDimitry Andric { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } }, 3479bdd1243dSDimitry Andric { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } }, 3480bdd1243dSDimitry Andric { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } }, 3481bdd1243dSDimitry Andric { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } }, 3482bdd1243dSDimitry Andric { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } }, 3483bdd1243dSDimitry Andric { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } }, 3484bdd1243dSDimitry Andric { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } }, 3485bdd1243dSDimitry Andric { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } }, 3486bdd1243dSDimitry Andric { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } }, 3487bdd1243dSDimitry Andric { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } }, 3488*0fca6ea1SDimitry Andric { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } }, 3489*0fca6ea1SDimitry Andric { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } }, 3490*0fca6ea1SDimitry Andric { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } }, 3491349cc55cSDimitry Andric }; 3492bdd1243dSDimitry Andric static const CostKindTblEntry AVX512BITALGCostTbl[] = { 3493bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } }, 3494bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } }, 3495bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } }, 3496bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } }, 3497bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } }, 3498bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } }, 3499349cc55cSDimitry Andric }; 3500bdd1243dSDimitry Andric 
static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = { 3501bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } }, 3502bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } }, 3503bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } }, 3504bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } }, 3505bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } }, 3506bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } }, 35070b57cec5SDimitry Andric }; 3508bdd1243dSDimitry Andric static const CostKindTblEntry AVX512CDCostTbl[] = { 3509bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } }, 3510bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } }, 3511bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } }, 3512bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } }, 3513bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } }, 3514bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } }, 3515bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } }, 3516bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } }, 3517bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } }, 3518bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } }, 3519bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } }, 3520bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } }, 3521bdd1243dSDimitry Andric 3522bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } }, 3523bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } }, 3524bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } }, 3525bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } }, 3526bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } }, 3527bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } }, 35280b57cec5SDimitry Andric }; 3529bdd1243dSDimitry Andric static const 
CostKindTblEntry AVX512BWCostTbl[] = { 3530bdd1243dSDimitry Andric { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } }, 3531bdd1243dSDimitry Andric { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } }, 353206c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } }, 353306c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } }, 353406c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } }, 353506c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } }, 353606c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } }, 353706c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } }, 353806c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } }, 353906c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } }, 354006c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } }, 354106c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } }, 354206c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } }, 354306c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } }, 354406c3fb27SDimitry Andric { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } }, 354506c3fb27SDimitry Andric { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } }, 354606c3fb27SDimitry Andric { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } }, 354706c3fb27SDimitry Andric { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } }, 354806c3fb27SDimitry Andric { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } }, 354906c3fb27SDimitry Andric { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } }, 355006c3fb27SDimitry Andric { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } }, 355106c3fb27SDimitry Andric { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } }, 355206c3fb27SDimitry Andric { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } }, 3553bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } }, 3554bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } }, 3555bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v32i16, { 4, 15, 
15, 16 } }, 3556bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } }, 3557bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } }, 3558bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } }, 3559bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } }, 3560bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } }, 3561bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } }, 3562bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } }, 3563bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } }, 3564bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } }, 3565bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } }, 3566bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } }, 3567bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } }, 3568bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } }, 3569bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } }, 3570bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } }, 3571bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } }, 3572bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } }, 3573bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } }, 3574bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } }, 3575bdd1243dSDimitry Andric { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } }, 3576bdd1243dSDimitry Andric { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } }, 3577bdd1243dSDimitry Andric { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } }, 3578bdd1243dSDimitry Andric { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } }, 3579bdd1243dSDimitry Andric { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } }, 3580bdd1243dSDimitry Andric { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } }, 3581bdd1243dSDimitry Andric { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } }, 3582bdd1243dSDimitry Andric { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } }, 
3583bdd1243dSDimitry Andric { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } }, 3584bdd1243dSDimitry Andric { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } }, 3585bdd1243dSDimitry Andric { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } }, 3586bdd1243dSDimitry Andric { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } }, 3587*0fca6ea1SDimitry Andric { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } }, 3588*0fca6ea1SDimitry Andric { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } }, 3589*0fca6ea1SDimitry Andric { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } }, 3590*0fca6ea1SDimitry Andric { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } }, 3591*0fca6ea1SDimitry Andric { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } }, 3592*0fca6ea1SDimitry Andric { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } }, 3593bdd1243dSDimitry Andric { ISD::SADDSAT, MVT::v32i16, { 1 } }, 3594bdd1243dSDimitry Andric { ISD::SADDSAT, MVT::v64i8, { 1 } }, 3595bdd1243dSDimitry Andric { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } }, 3596bdd1243dSDimitry Andric { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } }, 3597bdd1243dSDimitry Andric { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } }, 3598bdd1243dSDimitry Andric { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } }, 3599bdd1243dSDimitry Andric { ISD::SSUBSAT, MVT::v32i16, { 1 } }, 3600bdd1243dSDimitry Andric { ISD::SSUBSAT, MVT::v64i8, { 1 } }, 3601bdd1243dSDimitry Andric { ISD::UADDSAT, MVT::v32i16, { 1 } }, 3602bdd1243dSDimitry Andric { ISD::UADDSAT, MVT::v64i8, { 1 } }, 3603bdd1243dSDimitry Andric { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } }, 3604bdd1243dSDimitry Andric { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } }, 3605bdd1243dSDimitry Andric { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } }, 3606bdd1243dSDimitry Andric { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } }, 3607bdd1243dSDimitry Andric { ISD::USUBSAT, MVT::v32i16, { 1 } }, 3608bdd1243dSDimitry Andric { ISD::USUBSAT, MVT::v64i8, { 1 } }, 36090b57cec5SDimitry Andric }; 3610bdd1243dSDimitry Andric static const CostKindTblEntry AVX512CostTbl[] = { 3611bdd1243dSDimitry Andric { ISD::ABS, 
MVT::v8i64, { 1, 1, 1, 1 } }, 3612bdd1243dSDimitry Andric { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } }, 3613bdd1243dSDimitry Andric { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } }, 3614bdd1243dSDimitry Andric { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } }, 3615bdd1243dSDimitry Andric { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } }, 3616bdd1243dSDimitry Andric { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } }, 3617bdd1243dSDimitry Andric { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } }, 3618bdd1243dSDimitry Andric { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } }, 3619bdd1243dSDimitry Andric { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } }, 362006c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } }, 362106c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } }, 362206c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } }, 362306c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } }, 362406c3fb27SDimitry Andric { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } }, 362506c3fb27SDimitry Andric { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } }, 362606c3fb27SDimitry Andric { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } }, 3627bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } }, 3628bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } }, 3629bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } }, 3630bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } }, 3631bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } }, 3632bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } }, 3633bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } }, 3634bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } }, 3635bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } }, 3636bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } }, 3637bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } }, 3638bdd1243dSDimitry Andric { ISD::CTTZ, 
MVT::v64i8, { 6, 13, 21, 21 } }, 3639bdd1243dSDimitry Andric { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } }, 3640bdd1243dSDimitry Andric { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } }, 3641bdd1243dSDimitry Andric { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } }, 3642bdd1243dSDimitry Andric { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } }, 3643bdd1243dSDimitry Andric { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } }, 3644bdd1243dSDimitry Andric { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } }, 3645bdd1243dSDimitry Andric { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } }, 3646bdd1243dSDimitry Andric { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } }, 3647bdd1243dSDimitry Andric { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } }, 3648bdd1243dSDimitry Andric { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } }, 3649bdd1243dSDimitry Andric { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } }, 3650bdd1243dSDimitry Andric { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } }, 3651*0fca6ea1SDimitry Andric { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } }, 3652*0fca6ea1SDimitry Andric { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } }, 3653*0fca6ea1SDimitry Andric { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } }, 3654*0fca6ea1SDimitry Andric { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } }, 3655*0fca6ea1SDimitry Andric { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } }, 3656*0fca6ea1SDimitry Andric { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } }, 3657bdd1243dSDimitry Andric { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } }, 3658bdd1243dSDimitry Andric { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } }, 3659bdd1243dSDimitry Andric { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } }, 3660bdd1243dSDimitry Andric { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } }, 3661bdd1243dSDimitry Andric { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } }, 3662bdd1243dSDimitry Andric { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } }, 3663bdd1243dSDimitry Andric { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } }, 3664bdd1243dSDimitry Andric { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } }, 3665bdd1243dSDimitry Andric { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } }, 
3666bdd1243dSDimitry Andric { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } }, 3667bdd1243dSDimitry Andric { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } }, 3668bdd1243dSDimitry Andric { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } }, 3669bdd1243dSDimitry Andric { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } }, 3670bdd1243dSDimitry Andric { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } }, 3671bdd1243dSDimitry Andric { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } }, 3672bdd1243dSDimitry Andric { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } }, 3673bdd1243dSDimitry Andric { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } }, 3674bdd1243dSDimitry Andric { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } }, 3675bdd1243dSDimitry Andric { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } }, 3676bdd1243dSDimitry Andric { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } }, 3677bdd1243dSDimitry Andric { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } }, 3678bdd1243dSDimitry Andric { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } }, 3679bdd1243dSDimitry Andric { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } }, 3680bdd1243dSDimitry Andric { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } }, 3681bdd1243dSDimitry Andric { ISD::USUBSAT, MVT::v16i32, { 2 } }, // pmaxud + psubd 3682bdd1243dSDimitry Andric { ISD::USUBSAT, MVT::v2i64, { 2 } }, // pmaxuq + psubq 3683bdd1243dSDimitry Andric { ISD::USUBSAT, MVT::v4i64, { 2 } }, // pmaxuq + psubq 3684bdd1243dSDimitry Andric { ISD::USUBSAT, MVT::v8i64, { 2 } }, // pmaxuq + psubq 3685bdd1243dSDimitry Andric { ISD::UADDSAT, MVT::v16i32, { 3 } }, // not + pminud + paddd 3686bdd1243dSDimitry Andric { ISD::UADDSAT, MVT::v2i64, { 3 } }, // not + pminuq + paddq 3687bdd1243dSDimitry Andric { ISD::UADDSAT, MVT::v4i64, { 3 } }, // not + pminuq + paddq 3688bdd1243dSDimitry Andric { ISD::UADDSAT, MVT::v8i64, { 3 } }, // not + pminuq + paddq 3689bdd1243dSDimitry Andric { ISD::SADDSAT, MVT::v32i16, { 2 } }, 3690bdd1243dSDimitry Andric { ISD::SADDSAT, MVT::v64i8, { 2 } }, 3691bdd1243dSDimitry Andric { ISD::SSUBSAT, MVT::v32i16, { 2 } }, 3692bdd1243dSDimitry Andric { ISD::SSUBSAT, 
MVT::v64i8, { 2 } }, 3693bdd1243dSDimitry Andric { ISD::UADDSAT, MVT::v32i16, { 2 } }, 3694bdd1243dSDimitry Andric { ISD::UADDSAT, MVT::v64i8, { 2 } }, 3695bdd1243dSDimitry Andric { ISD::USUBSAT, MVT::v32i16, { 2 } }, 3696bdd1243dSDimitry Andric { ISD::USUBSAT, MVT::v64i8, { 2 } }, 369706c3fb27SDimitry Andric { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } }, 369806c3fb27SDimitry Andric { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } }, 369906c3fb27SDimitry Andric { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } }, 370006c3fb27SDimitry Andric { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } }, 370106c3fb27SDimitry Andric { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } }, 370206c3fb27SDimitry Andric { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } }, 370306c3fb27SDimitry Andric { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } }, 370406c3fb27SDimitry Andric { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } }, 3705bdd1243dSDimitry Andric { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/ 3706bdd1243dSDimitry Andric { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/ 3707bdd1243dSDimitry Andric { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/ 3708bdd1243dSDimitry Andric { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/ 3709bdd1243dSDimitry Andric { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/ 3710bdd1243dSDimitry Andric { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/ 3711bdd1243dSDimitry Andric { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/ 3712bdd1243dSDimitry Andric { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/ 37130b57cec5SDimitry Andric }; 3714bdd1243dSDimitry Andric static const CostKindTblEntry XOPCostTbl[] = { 371506c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } }, 371606c3fb27SDimitry Andric { ISD::BITREVERSE, 
MVT::v8i32, { 3, 6, 5, 6 } }, 371706c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } }, 371806c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } }, 371906c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } }, 372006c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } }, 372106c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } }, 372206c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } }, 372306c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } }, 372406c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } }, 372506c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } }, 372606c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } }, 3727bdd1243dSDimitry Andric // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y)) 3728bdd1243dSDimitry Andric { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } }, 3729bdd1243dSDimitry Andric { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } }, 3730bdd1243dSDimitry Andric { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } }, 3731bdd1243dSDimitry Andric { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } }, 3732bdd1243dSDimitry Andric { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } }, 3733bdd1243dSDimitry Andric { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } }, 3734bdd1243dSDimitry Andric { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } }, 3735bdd1243dSDimitry Andric { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } }, 3736bdd1243dSDimitry Andric { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } }, 3737bdd1243dSDimitry Andric { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } }, 3738bdd1243dSDimitry Andric { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } }, 3739bdd1243dSDimitry Andric { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } }, 3740bdd1243dSDimitry Andric { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } }, 3741bdd1243dSDimitry Andric { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } }, 3742bdd1243dSDimitry Andric { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } }, 3743*0fca6ea1SDimitry Andric { ISD::ROTR, MVT::v16i8, { 1, 
3, 3, 3 } }, 3744*0fca6ea1SDimitry Andric { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } }, 3745*0fca6ea1SDimitry Andric { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } }, 3746*0fca6ea1SDimitry Andric { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } }, 3747*0fca6ea1SDimitry Andric { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } }, 3748*0fca6ea1SDimitry Andric { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } }, 3749*0fca6ea1SDimitry Andric { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } }, 3750*0fca6ea1SDimitry Andric { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } }, 3751*0fca6ea1SDimitry Andric { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } }, 37520b57cec5SDimitry Andric }; 3753bdd1243dSDimitry Andric static const CostKindTblEntry AVX2CostTbl[] = { 3754bdd1243dSDimitry Andric { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X) 3755bdd1243dSDimitry Andric { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X) 3756bdd1243dSDimitry Andric { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } }, 3757bdd1243dSDimitry Andric { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } }, 3758bdd1243dSDimitry Andric { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } }, 3759bdd1243dSDimitry Andric { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } }, 3760bdd1243dSDimitry Andric { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } }, 3761bdd1243dSDimitry Andric { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } }, 376206c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } }, 376306c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } }, 376406c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } }, 376506c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } }, 376606c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } }, 376706c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } }, 376806c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } }, 376906c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } }, 
377006c3fb27SDimitry Andric { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } }, 377106c3fb27SDimitry Andric { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } }, 377206c3fb27SDimitry Andric { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } }, 377306c3fb27SDimitry Andric { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } }, 377406c3fb27SDimitry Andric { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } }, 377506c3fb27SDimitry Andric { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } }, 3776bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } }, 3777bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } }, 3778bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } }, 3779bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } }, 3780bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } }, 3781bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } }, 3782bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } }, 3783bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } }, 3784bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } }, 3785bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } }, 3786bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } }, 3787bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } }, 3788bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } }, 3789bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } }, 3790bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } }, 3791bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } }, 3792bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } }, 3793bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } }, 3794bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } }, 3795bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } }, 3796bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } }, 
3797bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } }, 3798bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } }, 3799bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } }, 3800bdd1243dSDimitry Andric { ISD::SADDSAT, MVT::v16i16, { 1 } }, 3801bdd1243dSDimitry Andric { ISD::SADDSAT, MVT::v32i8, { 1 } }, 3802bdd1243dSDimitry Andric { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } }, 3803bdd1243dSDimitry Andric { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } }, 3804bdd1243dSDimitry Andric { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } }, 3805bdd1243dSDimitry Andric { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } }, 3806bdd1243dSDimitry Andric { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } }, 3807bdd1243dSDimitry Andric { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } }, 3808bdd1243dSDimitry Andric { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } }, 3809bdd1243dSDimitry Andric { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } }, 3810bdd1243dSDimitry Andric { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } }, 3811bdd1243dSDimitry Andric { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } }, 3812bdd1243dSDimitry Andric { ISD::SSUBSAT, MVT::v16i16, { 1 } }, 3813bdd1243dSDimitry Andric { ISD::SSUBSAT, MVT::v32i8, { 1 } }, 3814bdd1243dSDimitry Andric { ISD::UADDSAT, MVT::v16i16, { 1 } }, 3815bdd1243dSDimitry Andric { ISD::UADDSAT, MVT::v32i8, { 1 } }, 3816bdd1243dSDimitry Andric { ISD::UADDSAT, MVT::v8i32, { 3 } }, // not + pminud + paddd 3817bdd1243dSDimitry Andric { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } }, 3818bdd1243dSDimitry Andric { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } }, 3819bdd1243dSDimitry Andric { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } }, 3820bdd1243dSDimitry Andric { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } }, 3821bdd1243dSDimitry Andric { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } }, 3822bdd1243dSDimitry Andric { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } }, 3823bdd1243dSDimitry Andric { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } }, 3824bdd1243dSDimitry Andric { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } }, 
3825bdd1243dSDimitry Andric { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } }, 3826bdd1243dSDimitry Andric { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } }, 3827bdd1243dSDimitry Andric { ISD::USUBSAT, MVT::v16i16, { 1 } }, 3828bdd1243dSDimitry Andric { ISD::USUBSAT, MVT::v32i8, { 1 } }, 3829bdd1243dSDimitry Andric { ISD::USUBSAT, MVT::v8i32, { 2 } }, // pmaxud + psubd 383006c3fb27SDimitry Andric { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS 383106c3fb27SDimitry Andric { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS 383206c3fb27SDimitry Andric { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS 383306c3fb27SDimitry Andric { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD 383406c3fb27SDimitry Andric { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD 383506c3fb27SDimitry Andric { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD 3836bdd1243dSDimitry Andric { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss 3837bdd1243dSDimitry Andric { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps 3838bdd1243dSDimitry Andric { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps 3839bdd1243dSDimitry Andric { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd 3840bdd1243dSDimitry Andric { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd 3841bdd1243dSDimitry Andric { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd 38420b57cec5SDimitry Andric }; 3843bdd1243dSDimitry Andric static const CostKindTblEntry AVX1CostTbl[] = { 3844bdd1243dSDimitry Andric { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X) 3845bdd1243dSDimitry Andric { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } }, 3846bdd1243dSDimitry Andric { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } }, 3847bdd1243dSDimitry Andric { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } }, 384806c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, 
// 2 x 128-bit Op + extract/insert 384906c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } }, 385006c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert 385106c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } }, 385206c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert 385306c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } }, 385406c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert 385506c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } }, 3856647cbc5dSDimitry Andric { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } }, 3857647cbc5dSDimitry Andric { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } }, 3858647cbc5dSDimitry Andric { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } }, 3859647cbc5dSDimitry Andric { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } }, 386006c3fb27SDimitry Andric { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } }, 386106c3fb27SDimitry Andric { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } }, 3862bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert 3863bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } }, 3864bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert 3865bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } }, 3866bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert 3867bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } }, 3868bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert 3869bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } }, 3870bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert 
3871bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } }, 3872bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert 3873bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } }, 3874bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert 3875bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } }, 3876bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert 3877bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } }, 3878bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert 3879bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } }, 3880bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert 3881bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } }, 3882bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert 3883bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } }, 3884bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert 3885bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } }, 3886bdd1243dSDimitry Andric { ISD::SADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert 3887bdd1243dSDimitry Andric { ISD::SADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert 3888bdd1243dSDimitry Andric { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert 3889bdd1243dSDimitry Andric { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } }, 3890bdd1243dSDimitry Andric { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert 3891bdd1243dSDimitry Andric { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert 3892bdd1243dSDimitry Andric { 
ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert 3893bdd1243dSDimitry Andric { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert 3894bdd1243dSDimitry Andric { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } }, 3895bdd1243dSDimitry Andric { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert 3896bdd1243dSDimitry Andric { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert 3897bdd1243dSDimitry Andric { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert 3898bdd1243dSDimitry Andric { ISD::SSUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert 3899bdd1243dSDimitry Andric { ISD::SSUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert 3900bdd1243dSDimitry Andric { ISD::UADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert 3901bdd1243dSDimitry Andric { ISD::UADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert 3902bdd1243dSDimitry Andric { ISD::UADDSAT, MVT::v8i32, { 8 } }, // 2 x 128-bit Op + extract/insert 3903bdd1243dSDimitry Andric { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert 3904bdd1243dSDimitry Andric { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } }, 3905bdd1243dSDimitry Andric { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert 3906bdd1243dSDimitry Andric { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert 3907bdd1243dSDimitry Andric { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert 3908bdd1243dSDimitry Andric { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert 3909bdd1243dSDimitry Andric { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } }, 3910bdd1243dSDimitry Andric { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert 3911bdd1243dSDimitry Andric { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert 3912bdd1243dSDimitry Andric { 
ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert 3913bdd1243dSDimitry Andric { ISD::USUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert 3914bdd1243dSDimitry Andric { ISD::USUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert 3915bdd1243dSDimitry Andric { ISD::USUBSAT, MVT::v8i32, { 6 } }, // 2 x 128-bit Op + extract/insert 391606c3fb27SDimitry Andric { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS 391706c3fb27SDimitry Andric { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS 391806c3fb27SDimitry Andric { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS 391906c3fb27SDimitry Andric { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD 392006c3fb27SDimitry Andric { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD 392106c3fb27SDimitry Andric { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD 3922bdd1243dSDimitry Andric { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss 3923bdd1243dSDimitry Andric { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps 3924bdd1243dSDimitry Andric { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps 3925bdd1243dSDimitry Andric { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd 3926bdd1243dSDimitry Andric { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd 3927bdd1243dSDimitry Andric { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd 39280b57cec5SDimitry Andric }; 3929*0fca6ea1SDimitry Andric static const CostKindTblEntry GFNICostTbl[] = { 3930*0fca6ea1SDimitry Andric { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb 3931*0fca6ea1SDimitry Andric { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb 3932*0fca6ea1SDimitry Andric { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb 3933*0fca6ea1SDimitry Andric { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // 
gf2p8affineqb 3934*0fca6ea1SDimitry Andric { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb 3935*0fca6ea1SDimitry Andric { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb 3936*0fca6ea1SDimitry Andric { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb 3937*0fca6ea1SDimitry Andric { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb 3938*0fca6ea1SDimitry Andric { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb 3939*0fca6ea1SDimitry Andric { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb 3940*0fca6ea1SDimitry Andric { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb 3941*0fca6ea1SDimitry Andric { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb 3942*0fca6ea1SDimitry Andric { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb 3943*0fca6ea1SDimitry Andric { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb 3944*0fca6ea1SDimitry Andric { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb 3945*0fca6ea1SDimitry Andric { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb 3946*0fca6ea1SDimitry Andric { X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb 3947*0fca6ea1SDimitry Andric { X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb 3948*0fca6ea1SDimitry Andric { X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb 3949*0fca6ea1SDimitry Andric }; 3950bdd1243dSDimitry Andric static const CostKindTblEntry GLMCostTbl[] = { 3951bdd1243dSDimitry Andric { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss 3952bdd1243dSDimitry Andric { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps 3953bdd1243dSDimitry Andric { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd 3954bdd1243dSDimitry Andric { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd 39550b57cec5SDimitry Andric }; 3956bdd1243dSDimitry Andric static const CostKindTblEntry SLMCostTbl[] = { 
3957647cbc5dSDimitry Andric { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } }, 3958647cbc5dSDimitry Andric { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } }, 3959647cbc5dSDimitry Andric { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } }, 3960bdd1243dSDimitry Andric { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss 3961bdd1243dSDimitry Andric { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps 3962bdd1243dSDimitry Andric { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd 3963bdd1243dSDimitry Andric { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd 39640b57cec5SDimitry Andric }; 3965bdd1243dSDimitry Andric static const CostKindTblEntry SSE42CostTbl[] = { 3966bdd1243dSDimitry Andric { ISD::USUBSAT, MVT::v4i32, { 2 } }, // pmaxud + psubd 3967bdd1243dSDimitry Andric { ISD::UADDSAT, MVT::v4i32, { 3 } }, // not + pminud + paddd 396806c3fb27SDimitry Andric { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS 396906c3fb27SDimitry Andric { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS 397006c3fb27SDimitry Andric { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD 397106c3fb27SDimitry Andric { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD 3972bdd1243dSDimitry Andric { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/ 3973bdd1243dSDimitry Andric { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/ 3974e8d8bef9SDimitry Andric }; 3975bdd1243dSDimitry Andric static const CostKindTblEntry SSE41CostTbl[] = { 3976bdd1243dSDimitry Andric { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X) 3977bdd1243dSDimitry Andric { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } }, 3978bdd1243dSDimitry Andric { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } }, 3979bdd1243dSDimitry Andric { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } }, 3980bdd1243dSDimitry Andric { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } }, 3981bdd1243dSDimitry Andric { 
ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } }, 3982bdd1243dSDimitry Andric { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } }, 3983bdd1243dSDimitry Andric { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } }, 3984bdd1243dSDimitry Andric { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } }, 3985bdd1243dSDimitry Andric { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } }, 3986bdd1243dSDimitry Andric { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } }, 3987bdd1243dSDimitry Andric { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } }, 3988bdd1243dSDimitry Andric { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } }, 39890b57cec5SDimitry Andric }; 3990bdd1243dSDimitry Andric static const CostKindTblEntry SSSE3CostTbl[] = { 3991bdd1243dSDimitry Andric { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } }, 3992bdd1243dSDimitry Andric { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } }, 3993bdd1243dSDimitry Andric { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } }, 399406c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } }, 399506c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } }, 399606c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } }, 399706c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } }, 3998647cbc5dSDimitry Andric { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } }, 3999647cbc5dSDimitry Andric { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } }, 4000647cbc5dSDimitry Andric { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } }, 4001bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } }, 4002bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } }, 4003bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } }, 4004bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } }, 4005bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } }, 4006bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } }, 4007bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } }, 4008bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } 
}, 4009bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } }, 4010bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } }, 4011bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } }, 4012bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } } 40130b57cec5SDimitry Andric }; 4014bdd1243dSDimitry Andric static const CostKindTblEntry SSE2CostTbl[] = { 4015bdd1243dSDimitry Andric { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } }, 4016bdd1243dSDimitry Andric { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } }, 4017bdd1243dSDimitry Andric { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } }, 4018bdd1243dSDimitry Andric { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } }, 401906c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } }, 402006c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } }, 402106c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } }, 402206c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } }, 402306c3fb27SDimitry Andric { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } }, 402406c3fb27SDimitry Andric { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } }, 402506c3fb27SDimitry Andric { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } }, 4026bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } }, 4027bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } }, 4028bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } }, 4029bdd1243dSDimitry Andric { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } }, 4030bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } }, 4031bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } }, 4032bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } }, 4033bdd1243dSDimitry Andric { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } }, 4034bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } }, 4035bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } }, 
4036bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } }, 4037bdd1243dSDimitry Andric { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } }, 4038bdd1243dSDimitry Andric { ISD::SADDSAT, MVT::v8i16, { 1 } }, 4039bdd1243dSDimitry Andric { ISD::SADDSAT, MVT::v16i8, { 1 } }, 4040bdd1243dSDimitry Andric { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } }, 4041bdd1243dSDimitry Andric { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } }, 4042bdd1243dSDimitry Andric { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } }, 4043bdd1243dSDimitry Andric { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } }, 4044bdd1243dSDimitry Andric { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } }, 4045bdd1243dSDimitry Andric { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } }, 4046bdd1243dSDimitry Andric { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } }, 4047bdd1243dSDimitry Andric { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } }, 4048bdd1243dSDimitry Andric { ISD::SSUBSAT, MVT::v8i16, { 1 } }, 4049bdd1243dSDimitry Andric { ISD::SSUBSAT, MVT::v16i8, { 1 } }, 4050bdd1243dSDimitry Andric { ISD::UADDSAT, MVT::v8i16, { 1 } }, 4051bdd1243dSDimitry Andric { ISD::UADDSAT, MVT::v16i8, { 1 } }, 4052bdd1243dSDimitry Andric { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } }, 4053bdd1243dSDimitry Andric { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } }, 4054bdd1243dSDimitry Andric { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } }, 4055bdd1243dSDimitry Andric { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } }, 4056bdd1243dSDimitry Andric { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } }, 4057bdd1243dSDimitry Andric { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } }, 4058bdd1243dSDimitry Andric { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } }, 4059bdd1243dSDimitry Andric { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } }, 4060bdd1243dSDimitry Andric { ISD::USUBSAT, MVT::v8i16, { 1 } }, 4061bdd1243dSDimitry Andric { ISD::USUBSAT, MVT::v16i8, { 1 } }, 406206c3fb27SDimitry Andric { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, 406306c3fb27SDimitry Andric { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } }, 4064bdd1243dSDimitry Andric { 
ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/ 4065bdd1243dSDimitry Andric { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/ 40660b57cec5SDimitry Andric }; 4067bdd1243dSDimitry Andric static const CostKindTblEntry SSE1CostTbl[] = { 406806c3fb27SDimitry Andric { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, 406906c3fb27SDimitry Andric { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } }, 4070bdd1243dSDimitry Andric { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/ 4071bdd1243dSDimitry Andric { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/ 40725ffd83dbSDimitry Andric }; 4073bdd1243dSDimitry Andric static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets 4074bdd1243dSDimitry Andric { ISD::CTTZ, MVT::i64, { 1 } }, 40755ffd83dbSDimitry Andric }; 4076bdd1243dSDimitry Andric static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets 4077bdd1243dSDimitry Andric { ISD::CTTZ, MVT::i32, { 1 } }, 4078bdd1243dSDimitry Andric { ISD::CTTZ, MVT::i16, { 1 } }, 4079bdd1243dSDimitry Andric { ISD::CTTZ, MVT::i8, { 1 } }, 40808bcb0991SDimitry Andric }; 4081bdd1243dSDimitry Andric static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets 4082bdd1243dSDimitry Andric { ISD::CTLZ, MVT::i64, { 1 } }, 40838bcb0991SDimitry Andric }; 4084bdd1243dSDimitry Andric static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets 4085bdd1243dSDimitry Andric { ISD::CTLZ, MVT::i32, { 1 } }, 4086bdd1243dSDimitry Andric { ISD::CTLZ, MVT::i16, { 2 } }, 4087bdd1243dSDimitry Andric { ISD::CTLZ, MVT::i8, { 2 } }, 40888bcb0991SDimitry Andric }; 4089bdd1243dSDimitry Andric static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets 4090bdd1243dSDimitry Andric { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt 40918bcb0991SDimitry Andric }; 4092bdd1243dSDimitry Andric static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 
or 64-bit targets 4093bdd1243dSDimitry Andric { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt 4094bdd1243dSDimitry Andric { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext()) 4095bdd1243dSDimitry Andric { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext()) 40960b57cec5SDimitry Andric }; 4097bdd1243dSDimitry Andric static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets 4098*0fca6ea1SDimitry Andric { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV 409906c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } }, 410006c3fb27SDimitry Andric { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } }, 4101bdd1243dSDimitry Andric { ISD::CTLZ, MVT::i64, { 4 } }, // BSR+XOR or BSR+XOR+CMOV 4102bdd1243dSDimitry Andric { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSR+XOR 4103bdd1243dSDimitry Andric { ISD::CTTZ, MVT::i64, { 3 } }, // TEST+BSF+CMOV/BRANCH 4104bdd1243dSDimitry Andric { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSR 4105bdd1243dSDimitry Andric { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } }, 4106bdd1243dSDimitry Andric { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } }, 4107bdd1243dSDimitry Andric { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } }, 41085f757f3fSDimitry Andric { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } }, 4109bdd1243dSDimitry Andric { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } }, 4110bdd1243dSDimitry Andric { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } }, 4111bdd1243dSDimitry Andric { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } }, 4112bdd1243dSDimitry Andric { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } }, 4113bdd1243dSDimitry Andric { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } }, 4114bdd1243dSDimitry Andric { ISD::SADDO, MVT::i64, { 1 } }, 4115bdd1243dSDimitry Andric { ISD::UADDO, MVT::i64, { 1 } }, 4116bdd1243dSDimitry Andric { ISD::UMULO, MVT::i64, { 2 } }, // mulq + seto 4117bdd1243dSDimitry Andric }; 4118bdd1243dSDimitry Andric static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets 4119*0fca6ea1SDimitry Andric { ISD::ABS, MVT::i32, { 1, 2, 3, 3 } 
}, // SUB+XOR+SRA or SUB+CMOV 4120*0fca6ea1SDimitry Andric { ISD::ABS, MVT::i16, { 2, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV 4121*0fca6ea1SDimitry Andric { ISD::ABS, MVT::i8, { 2, 4, 4, 3 } }, // SUB+XOR+SRA 412206c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } }, 412306c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } }, 412406c3fb27SDimitry Andric { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } }, 412506c3fb27SDimitry Andric { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } }, 412606c3fb27SDimitry Andric { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL 4127bdd1243dSDimitry Andric { ISD::CTLZ, MVT::i32, { 4 } }, // BSR+XOR or BSR+XOR+CMOV 4128bdd1243dSDimitry Andric { ISD::CTLZ, MVT::i16, { 4 } }, // BSR+XOR or BSR+XOR+CMOV 4129bdd1243dSDimitry Andric { ISD::CTLZ, MVT::i8, { 4 } }, // BSR+XOR or BSR+XOR+CMOV 4130bdd1243dSDimitry Andric { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSR+XOR 4131bdd1243dSDimitry Andric { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 3, 3 } }, // BSR+XOR 4132bdd1243dSDimitry Andric { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR 4133bdd1243dSDimitry Andric { ISD::CTTZ, MVT::i32, { 3 } }, // TEST+BSF+CMOV/BRANCH 4134bdd1243dSDimitry Andric { ISD::CTTZ, MVT::i16, { 3 } }, // TEST+BSF+CMOV/BRANCH 4135bdd1243dSDimitry Andric { ISD::CTTZ, MVT::i8, { 3 } }, // TEST+BSF+CMOV/BRANCH 4136bdd1243dSDimitry Andric { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSF 4137bdd1243dSDimitry Andric { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 1 } }, // BSF 4138bdd1243dSDimitry Andric { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 1 } }, // BSF 4139bdd1243dSDimitry Andric { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } }, 4140bdd1243dSDimitry Andric { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } }, 4141*0fca6ea1SDimitry Andric { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } }, 4142bdd1243dSDimitry Andric { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } }, 4143bdd1243dSDimitry Andric { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } }, 4144bdd1243dSDimitry 
Andric { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } }, 4145bdd1243dSDimitry Andric { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } }, 4146bdd1243dSDimitry Andric { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } }, 4147bdd1243dSDimitry Andric { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } }, 41485f757f3fSDimitry Andric { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } }, 41495f757f3fSDimitry Andric { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } }, 41505f757f3fSDimitry Andric { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } }, 4151bdd1243dSDimitry Andric { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } }, 4152bdd1243dSDimitry Andric { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } }, 4153bdd1243dSDimitry Andric { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } }, 4154bdd1243dSDimitry Andric { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } }, 4155bdd1243dSDimitry Andric { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } }, 4156bdd1243dSDimitry Andric { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } }, 4157bdd1243dSDimitry Andric { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } }, 4158bdd1243dSDimitry Andric { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } }, 4159bdd1243dSDimitry Andric { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } }, 4160bdd1243dSDimitry Andric { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } }, 4161bdd1243dSDimitry Andric { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } }, 4162bdd1243dSDimitry Andric { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } }, 4163bdd1243dSDimitry Andric { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } }, 4164bdd1243dSDimitry Andric { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } }, 4165bdd1243dSDimitry Andric { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } }, 4166bdd1243dSDimitry Andric { ISD::SADDO, MVT::i32, { 1 } }, 4167bdd1243dSDimitry Andric { ISD::SADDO, MVT::i16, { 1 } }, 4168bdd1243dSDimitry Andric { ISD::SADDO, MVT::i8, { 1 } }, 4169bdd1243dSDimitry Andric { ISD::UADDO, MVT::i32, { 1 } }, 4170bdd1243dSDimitry Andric { ISD::UADDO, MVT::i16, { 1 } }, 4171bdd1243dSDimitry Andric { ISD::UADDO, MVT::i8, { 1 } }, 4172bdd1243dSDimitry Andric { ISD::UMULO, MVT::i32, { 2 } }, // mul + seto 4173bdd1243dSDimitry Andric { ISD::UMULO, MVT::i16, { 2 
} }, 4174bdd1243dSDimitry Andric { ISD::UMULO, MVT::i8, { 2 } }, 41750b57cec5SDimitry Andric }; 41760b57cec5SDimitry Andric 41775ffd83dbSDimitry Andric Type *RetTy = ICA.getReturnType(); 41780b57cec5SDimitry Andric Type *OpTy = RetTy; 41795ffd83dbSDimitry Andric Intrinsic::ID IID = ICA.getID(); 41800b57cec5SDimitry Andric unsigned ISD = ISD::DELETED_NODE; 41810b57cec5SDimitry Andric switch (IID) { 41820b57cec5SDimitry Andric default: 41830b57cec5SDimitry Andric break; 4184e8d8bef9SDimitry Andric case Intrinsic::abs: 4185e8d8bef9SDimitry Andric ISD = ISD::ABS; 4186e8d8bef9SDimitry Andric break; 41870b57cec5SDimitry Andric case Intrinsic::bitreverse: 41880b57cec5SDimitry Andric ISD = ISD::BITREVERSE; 41890b57cec5SDimitry Andric break; 41900b57cec5SDimitry Andric case Intrinsic::bswap: 41910b57cec5SDimitry Andric ISD = ISD::BSWAP; 41920b57cec5SDimitry Andric break; 41930b57cec5SDimitry Andric case Intrinsic::ctlz: 41940b57cec5SDimitry Andric ISD = ISD::CTLZ; 41950b57cec5SDimitry Andric break; 41960b57cec5SDimitry Andric case Intrinsic::ctpop: 41970b57cec5SDimitry Andric ISD = ISD::CTPOP; 41980b57cec5SDimitry Andric break; 41990b57cec5SDimitry Andric case Intrinsic::cttz: 42000b57cec5SDimitry Andric ISD = ISD::CTTZ; 42010b57cec5SDimitry Andric break; 4202bdd1243dSDimitry Andric case Intrinsic::fshl: 4203bdd1243dSDimitry Andric ISD = ISD::FSHL; 4204bdd1243dSDimitry Andric if (!ICA.isTypeBasedOnly()) { 4205bdd1243dSDimitry Andric const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); 42065f757f3fSDimitry Andric if (Args[0] == Args[1]) { 4207bdd1243dSDimitry Andric ISD = ISD::ROTL; 4208*0fca6ea1SDimitry Andric // Handle uniform constant rotation amounts. 4209*0fca6ea1SDimitry Andric // TODO: Handle funnel-shift cases. 
4210*0fca6ea1SDimitry Andric const APInt *Amt; 4211*0fca6ea1SDimitry Andric if (Args[2] && 4212*0fca6ea1SDimitry Andric PatternMatch::match(Args[2], PatternMatch::m_APIntAllowPoison(Amt))) 42135f757f3fSDimitry Andric ISD = X86ISD::VROTLI; 42145f757f3fSDimitry Andric } 4215bdd1243dSDimitry Andric } 4216bdd1243dSDimitry Andric break; 4217bdd1243dSDimitry Andric case Intrinsic::fshr: 4218bdd1243dSDimitry Andric // FSHR has same costs so don't duplicate. 4219bdd1243dSDimitry Andric ISD = ISD::FSHL; 4220bdd1243dSDimitry Andric if (!ICA.isTypeBasedOnly()) { 4221bdd1243dSDimitry Andric const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); 42225f757f3fSDimitry Andric if (Args[0] == Args[1]) { 4223bdd1243dSDimitry Andric ISD = ISD::ROTR; 4224*0fca6ea1SDimitry Andric // Handle uniform constant rotation amount. 4225*0fca6ea1SDimitry Andric // TODO: Handle funnel-shift cases. 4226*0fca6ea1SDimitry Andric const APInt *Amt; 4227*0fca6ea1SDimitry Andric if (Args[2] && 4228*0fca6ea1SDimitry Andric PatternMatch::match(Args[2], PatternMatch::m_APIntAllowPoison(Amt))) 42295f757f3fSDimitry Andric ISD = X86ISD::VROTLI; 42305f757f3fSDimitry Andric } 4231bdd1243dSDimitry Andric } 4232bdd1243dSDimitry Andric break; 4233*0fca6ea1SDimitry Andric case Intrinsic::lrint: 4234*0fca6ea1SDimitry Andric case Intrinsic::llrint: 4235*0fca6ea1SDimitry Andric // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which 4236*0fca6ea1SDimitry Andric // have the same costs as the CVTTP2SI (fptosi) instructions 4237*0fca6ea1SDimitry Andric if (!ICA.isTypeBasedOnly()) { 4238*0fca6ea1SDimitry Andric const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes(); 4239*0fca6ea1SDimitry Andric return getCastInstrCost(Instruction::FPToSI, RetTy, ArgTys[0], 4240*0fca6ea1SDimitry Andric TTI::CastContextHint::None, CostKind); 4241*0fca6ea1SDimitry Andric } 4242*0fca6ea1SDimitry Andric break; 42435ffd83dbSDimitry Andric case Intrinsic::maxnum: 42445ffd83dbSDimitry Andric case Intrinsic::minnum: 
42455ffd83dbSDimitry Andric // FMINNUM has same costs so don't duplicate. 42465ffd83dbSDimitry Andric ISD = ISD::FMAXNUM; 42475ffd83dbSDimitry Andric break; 42480b57cec5SDimitry Andric case Intrinsic::sadd_sat: 42490b57cec5SDimitry Andric ISD = ISD::SADDSAT; 42500b57cec5SDimitry Andric break; 4251e8d8bef9SDimitry Andric case Intrinsic::smax: 4252e8d8bef9SDimitry Andric ISD = ISD::SMAX; 4253e8d8bef9SDimitry Andric break; 4254e8d8bef9SDimitry Andric case Intrinsic::smin: 4255e8d8bef9SDimitry Andric ISD = ISD::SMIN; 4256e8d8bef9SDimitry Andric break; 42570b57cec5SDimitry Andric case Intrinsic::ssub_sat: 42580b57cec5SDimitry Andric ISD = ISD::SSUBSAT; 42590b57cec5SDimitry Andric break; 42600b57cec5SDimitry Andric case Intrinsic::uadd_sat: 42610b57cec5SDimitry Andric ISD = ISD::UADDSAT; 42620b57cec5SDimitry Andric break; 4263e8d8bef9SDimitry Andric case Intrinsic::umax: 4264e8d8bef9SDimitry Andric ISD = ISD::UMAX; 4265e8d8bef9SDimitry Andric break; 4266e8d8bef9SDimitry Andric case Intrinsic::umin: 4267e8d8bef9SDimitry Andric ISD = ISD::UMIN; 4268e8d8bef9SDimitry Andric break; 42690b57cec5SDimitry Andric case Intrinsic::usub_sat: 42700b57cec5SDimitry Andric ISD = ISD::USUBSAT; 42710b57cec5SDimitry Andric break; 42720b57cec5SDimitry Andric case Intrinsic::sqrt: 42730b57cec5SDimitry Andric ISD = ISD::FSQRT; 42740b57cec5SDimitry Andric break; 42750b57cec5SDimitry Andric case Intrinsic::sadd_with_overflow: 42760b57cec5SDimitry Andric case Intrinsic::ssub_with_overflow: 42770b57cec5SDimitry Andric // SSUBO has same costs so don't duplicate. 42780b57cec5SDimitry Andric ISD = ISD::SADDO; 42790b57cec5SDimitry Andric OpTy = RetTy->getContainedType(0); 42800b57cec5SDimitry Andric break; 42810b57cec5SDimitry Andric case Intrinsic::uadd_with_overflow: 42820b57cec5SDimitry Andric case Intrinsic::usub_with_overflow: 42830b57cec5SDimitry Andric // USUBO has same costs so don't duplicate. 
42840b57cec5SDimitry Andric ISD = ISD::UADDO; 42850b57cec5SDimitry Andric OpTy = RetTy->getContainedType(0); 42860b57cec5SDimitry Andric break; 4287e8d8bef9SDimitry Andric case Intrinsic::umul_with_overflow: 4288e8d8bef9SDimitry Andric case Intrinsic::smul_with_overflow: 4289e8d8bef9SDimitry Andric // SMULO has same costs so don't duplicate. 4290e8d8bef9SDimitry Andric ISD = ISD::UMULO; 4291e8d8bef9SDimitry Andric OpTy = RetTy->getContainedType(0); 4292e8d8bef9SDimitry Andric break; 42930b57cec5SDimitry Andric } 42940b57cec5SDimitry Andric 42950b57cec5SDimitry Andric if (ISD != ISD::DELETED_NODE) { 4296*0fca6ea1SDimitry Andric auto adjustTableCost = [&](int ISD, unsigned Cost, 4297*0fca6ea1SDimitry Andric std::pair<InstructionCost, MVT> LT, 4298*0fca6ea1SDimitry Andric FastMathFlags FMF) -> InstructionCost { 4299*0fca6ea1SDimitry Andric InstructionCost LegalizationCost = LT.first; 4300*0fca6ea1SDimitry Andric MVT MTy = LT.second; 4301*0fca6ea1SDimitry Andric 4302*0fca6ea1SDimitry Andric // If there are no NANs to deal with, then these are reduced to a 4303*0fca6ea1SDimitry Andric // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we 4304*0fca6ea1SDimitry Andric // assume is used in the non-fast case. 4305*0fca6ea1SDimitry Andric if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) { 4306*0fca6ea1SDimitry Andric if (FMF.noNaNs()) 4307*0fca6ea1SDimitry Andric return LegalizationCost * 1; 4308*0fca6ea1SDimitry Andric } 4309*0fca6ea1SDimitry Andric 4310*0fca6ea1SDimitry Andric // For cases where some ops can be folded into a load/store, assume free. 
4311*0fca6ea1SDimitry Andric if (MTy.isScalarInteger()) { 4312*0fca6ea1SDimitry Andric if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) { 4313*0fca6ea1SDimitry Andric if (const Instruction *II = ICA.getInst()) { 4314*0fca6ea1SDimitry Andric if (II->hasOneUse() && isa<StoreInst>(II->user_back())) 4315*0fca6ea1SDimitry Andric return TTI::TCC_Free; 4316*0fca6ea1SDimitry Andric if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) { 4317*0fca6ea1SDimitry Andric if (LI->hasOneUse()) 4318*0fca6ea1SDimitry Andric return TTI::TCC_Free; 4319*0fca6ea1SDimitry Andric } 4320*0fca6ea1SDimitry Andric } 4321*0fca6ea1SDimitry Andric } 4322*0fca6ea1SDimitry Andric } 4323*0fca6ea1SDimitry Andric 4324*0fca6ea1SDimitry Andric return LegalizationCost * (int)Cost; 4325*0fca6ea1SDimitry Andric }; 4326*0fca6ea1SDimitry Andric 43270b57cec5SDimitry Andric // Legalize the type. 4328bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy); 43290b57cec5SDimitry Andric MVT MTy = LT.second; 43300b57cec5SDimitry Andric 4331bdd1243dSDimitry Andric // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost. 4332bdd1243dSDimitry Andric if (((ISD == ISD::CTTZ && !ST->hasBMI()) || 4333bdd1243dSDimitry Andric (ISD == ISD::CTLZ && !ST->hasLZCNT())) && 4334bdd1243dSDimitry Andric !MTy.isVector() && !ICA.isTypeBasedOnly()) { 4335bdd1243dSDimitry Andric const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); 4336bdd1243dSDimitry Andric if (auto *Cst = dyn_cast<ConstantInt>(Args[1])) 4337bdd1243dSDimitry Andric if (Cst->isAllOnesValue()) 4338bdd1243dSDimitry Andric ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF; 4339bdd1243dSDimitry Andric } 4340bdd1243dSDimitry Andric 4341bdd1243dSDimitry Andric // FSQRT is a single instruction. 
4342bdd1243dSDimitry Andric if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize) 4343bdd1243dSDimitry Andric return LT.first; 4344bdd1243dSDimitry Andric 4345480093f4SDimitry Andric if (ST->useGLMDivSqrtCosts()) 43460b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy)) 4347bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 4348*0fca6ea1SDimitry Andric return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); 43490b57cec5SDimitry Andric 4350349cc55cSDimitry Andric if (ST->useSLMArithCosts()) 43510b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy)) 4352bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 4353*0fca6ea1SDimitry Andric return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); 4354bdd1243dSDimitry Andric 4355bdd1243dSDimitry Andric if (ST->hasVBMI2()) 4356bdd1243dSDimitry Andric if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy)) 4357bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 4358*0fca6ea1SDimitry Andric return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); 43590b57cec5SDimitry Andric 4360349cc55cSDimitry Andric if (ST->hasBITALG()) 4361349cc55cSDimitry Andric if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy)) 4362bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 4363*0fca6ea1SDimitry Andric return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); 4364349cc55cSDimitry Andric 4365349cc55cSDimitry Andric if (ST->hasVPOPCNTDQ()) 4366349cc55cSDimitry Andric if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy)) 4367bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 4368*0fca6ea1SDimitry Andric return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); 4369*0fca6ea1SDimitry Andric 4370*0fca6ea1SDimitry Andric if (ST->hasGFNI()) 4371*0fca6ea1SDimitry Andric if (const auto *Entry = 
CostTableLookup(GFNICostTbl, ISD, MTy)) 4372*0fca6ea1SDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 4373*0fca6ea1SDimitry Andric return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); 4374349cc55cSDimitry Andric 43750b57cec5SDimitry Andric if (ST->hasCDI()) 43760b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy)) 4377bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 4378*0fca6ea1SDimitry Andric return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); 43790b57cec5SDimitry Andric 43800b57cec5SDimitry Andric if (ST->hasBWI()) 43810b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) 4382bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 4383*0fca6ea1SDimitry Andric return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); 43840b57cec5SDimitry Andric 43850b57cec5SDimitry Andric if (ST->hasAVX512()) 43860b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) 4387bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 4388*0fca6ea1SDimitry Andric return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); 43890b57cec5SDimitry Andric 43900b57cec5SDimitry Andric if (ST->hasXOP()) 43910b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) 4392bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 4393*0fca6ea1SDimitry Andric return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); 43940b57cec5SDimitry Andric 43950b57cec5SDimitry Andric if (ST->hasAVX2()) 43960b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) 4397bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 4398*0fca6ea1SDimitry Andric return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); 43990b57cec5SDimitry Andric 44000b57cec5SDimitry Andric if (ST->hasAVX()) 44010b57cec5SDimitry Andric if (const 
auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) 4402bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 4403*0fca6ea1SDimitry Andric return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); 44040b57cec5SDimitry Andric 44050b57cec5SDimitry Andric if (ST->hasSSE42()) 44060b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) 4407bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 4408*0fca6ea1SDimitry Andric return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); 4409e8d8bef9SDimitry Andric 4410e8d8bef9SDimitry Andric if (ST->hasSSE41()) 4411e8d8bef9SDimitry Andric if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) 4412bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 4413*0fca6ea1SDimitry Andric return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); 44140b57cec5SDimitry Andric 44150b57cec5SDimitry Andric if (ST->hasSSSE3()) 44160b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy)) 4417bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 4418*0fca6ea1SDimitry Andric return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); 44190b57cec5SDimitry Andric 44200b57cec5SDimitry Andric if (ST->hasSSE2()) 44210b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) 4422bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 4423*0fca6ea1SDimitry Andric return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); 44240b57cec5SDimitry Andric 44250b57cec5SDimitry Andric if (ST->hasSSE1()) 44260b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) 4427bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 4428*0fca6ea1SDimitry Andric return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); 44290b57cec5SDimitry Andric 44305ffd83dbSDimitry Andric if (ST->hasBMI()) { 44315ffd83dbSDimitry 
Andric if (ST->is64Bit()) 44325ffd83dbSDimitry Andric if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy)) 4433bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 4434*0fca6ea1SDimitry Andric return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); 44355ffd83dbSDimitry Andric 44365ffd83dbSDimitry Andric if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy)) 4437bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 4438*0fca6ea1SDimitry Andric return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); 44395ffd83dbSDimitry Andric } 44405ffd83dbSDimitry Andric 44418bcb0991SDimitry Andric if (ST->hasLZCNT()) { 44428bcb0991SDimitry Andric if (ST->is64Bit()) 44438bcb0991SDimitry Andric if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy)) 4444bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 4445*0fca6ea1SDimitry Andric return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); 44468bcb0991SDimitry Andric 44478bcb0991SDimitry Andric if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy)) 4448bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 4449*0fca6ea1SDimitry Andric return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); 44508bcb0991SDimitry Andric } 44518bcb0991SDimitry Andric 44528bcb0991SDimitry Andric if (ST->hasPOPCNT()) { 44538bcb0991SDimitry Andric if (ST->is64Bit()) 44548bcb0991SDimitry Andric if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy)) 4455bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 4456*0fca6ea1SDimitry Andric return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); 44578bcb0991SDimitry Andric 44588bcb0991SDimitry Andric if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy)) 4459bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 4460*0fca6ea1SDimitry Andric return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); 
4461fe6060f1SDimitry Andric } 4462fe6060f1SDimitry Andric 44630b57cec5SDimitry Andric if (ST->is64Bit()) 44640b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) 4465bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 4466*0fca6ea1SDimitry Andric return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); 44670b57cec5SDimitry Andric 44680b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy)) 4469bdd1243dSDimitry Andric if (auto KindCost = Entry->Cost[CostKind]) 4470*0fca6ea1SDimitry Andric return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); 44710b57cec5SDimitry Andric } 44720b57cec5SDimitry Andric 44735ffd83dbSDimitry Andric return BaseT::getIntrinsicInstrCost(ICA, CostKind); 44740b57cec5SDimitry Andric } 44750b57cec5SDimitry Andric 4476fe6060f1SDimitry Andric InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, 4477bdd1243dSDimitry Andric TTI::TargetCostKind CostKind, 4478bdd1243dSDimitry Andric unsigned Index, Value *Op0, 4479bdd1243dSDimitry Andric Value *Op1) { 4480480093f4SDimitry Andric static const CostTblEntry SLMCostTbl[] = { 4481480093f4SDimitry Andric { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 }, 4482480093f4SDimitry Andric { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 }, 4483480093f4SDimitry Andric { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 }, 4484480093f4SDimitry Andric { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 } 4485480093f4SDimitry Andric }; 4486480093f4SDimitry Andric 44870b57cec5SDimitry Andric assert(Val->isVectorTy() && "This must be a vector type"); 44880b57cec5SDimitry Andric Type *ScalarType = Val->getScalarType(); 448981ad6265SDimitry Andric InstructionCost RegisterFileMoveCost = 0; 44900b57cec5SDimitry Andric 4491fe6060f1SDimitry Andric // Non-immediate extraction/insertion can be handled as a sequence of 4492fe6060f1SDimitry Andric // aliased loads+stores via the stack. 
4493fe6060f1SDimitry Andric if (Index == -1U && (Opcode == Instruction::ExtractElement || 4494fe6060f1SDimitry Andric Opcode == Instruction::InsertElement)) { 4495fe6060f1SDimitry Andric // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns: 4496fe6060f1SDimitry Andric // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0. 4497fe6060f1SDimitry Andric 4498fe6060f1SDimitry Andric // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling. 4499fe6060f1SDimitry Andric assert(isa<FixedVectorType>(Val) && "Fixed vector type expected"); 4500fe6060f1SDimitry Andric Align VecAlign = DL.getPrefTypeAlign(Val); 4501fe6060f1SDimitry Andric Align SclAlign = DL.getPrefTypeAlign(ScalarType); 4502fe6060f1SDimitry Andric 4503fe6060f1SDimitry Andric // Extract - store vector to stack, load scalar. 4504fe6060f1SDimitry Andric if (Opcode == Instruction::ExtractElement) { 4505bdd1243dSDimitry Andric return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) + 4506fe6060f1SDimitry Andric getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0, 4507bdd1243dSDimitry Andric CostKind); 4508fe6060f1SDimitry Andric } 4509fe6060f1SDimitry Andric // Insert - store vector to stack, store scalar, load vector. 
4510fe6060f1SDimitry Andric if (Opcode == Instruction::InsertElement) { 4511bdd1243dSDimitry Andric return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) + 4512fe6060f1SDimitry Andric getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0, 4513bdd1243dSDimitry Andric CostKind) + 4514bdd1243dSDimitry Andric getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind); 4515fe6060f1SDimitry Andric } 4516fe6060f1SDimitry Andric } 4517fe6060f1SDimitry Andric 45185ffd83dbSDimitry Andric if (Index != -1U && (Opcode == Instruction::ExtractElement || 45195ffd83dbSDimitry Andric Opcode == Instruction::InsertElement)) { 452081ad6265SDimitry Andric // Extraction of vXi1 elements are now efficiently handled by MOVMSK. 452181ad6265SDimitry Andric if (Opcode == Instruction::ExtractElement && 452281ad6265SDimitry Andric ScalarType->getScalarSizeInBits() == 1 && 452381ad6265SDimitry Andric cast<FixedVectorType>(Val)->getNumElements() > 1) 452481ad6265SDimitry Andric return 1; 452581ad6265SDimitry Andric 45260b57cec5SDimitry Andric // Legalize the type. 4527bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val); 45280b57cec5SDimitry Andric 45290b57cec5SDimitry Andric // This type is legalized to a scalar type. 45300b57cec5SDimitry Andric if (!LT.second.isVector()) 45310b57cec5SDimitry Andric return 0; 45320b57cec5SDimitry Andric 45330b57cec5SDimitry Andric // The type may be split. Normalize the index to the new type. 453481ad6265SDimitry Andric unsigned SizeInBits = LT.second.getSizeInBits(); 45355ffd83dbSDimitry Andric unsigned NumElts = LT.second.getVectorNumElements(); 45365ffd83dbSDimitry Andric unsigned SubNumElts = NumElts; 45375ffd83dbSDimitry Andric Index = Index % NumElts; 45385ffd83dbSDimitry Andric 45395ffd83dbSDimitry Andric // For >128-bit vectors, we need to extract higher 128-bit subvectors. 45405ffd83dbSDimitry Andric // For inserts, we also need to insert the subvector back. 
454181ad6265SDimitry Andric if (SizeInBits > 128) { 454281ad6265SDimitry Andric assert((SizeInBits % 128) == 0 && "Illegal vector"); 454381ad6265SDimitry Andric unsigned NumSubVecs = SizeInBits / 128; 45445ffd83dbSDimitry Andric SubNumElts = NumElts / NumSubVecs; 45455ffd83dbSDimitry Andric if (SubNumElts <= Index) { 45465ffd83dbSDimitry Andric RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1); 45475ffd83dbSDimitry Andric Index %= SubNumElts; 45485ffd83dbSDimitry Andric } 45495ffd83dbSDimitry Andric } 45500b57cec5SDimitry Andric 4551bdd1243dSDimitry Andric MVT MScalarTy = LT.second.getScalarType(); 4552bdd1243dSDimitry Andric auto IsCheapPInsrPExtrInsertPS = [&]() { 4553bdd1243dSDimitry Andric // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets. 4554bdd1243dSDimitry Andric // Also, assume insertps is relatively cheap on all >= SSE41 targets. 4555bdd1243dSDimitry Andric return (MScalarTy == MVT::i16 && ST->hasSSE2()) || 4556bdd1243dSDimitry Andric (MScalarTy.isInteger() && ST->hasSSE41()) || 4557bdd1243dSDimitry Andric (MScalarTy == MVT::f32 && ST->hasSSE41() && 4558bdd1243dSDimitry Andric Opcode == Instruction::InsertElement); 4559bdd1243dSDimitry Andric }; 4560bdd1243dSDimitry Andric 4561480093f4SDimitry Andric if (Index == 0) { 45620b57cec5SDimitry Andric // Floating point scalars are already located in index #0. 45635ffd83dbSDimitry Andric // Many insertions to #0 can fold away for scalar fp-ops, so let's assume 45645ffd83dbSDimitry Andric // true for all. 
456506c3fb27SDimitry Andric if (ScalarType->isFloatingPointTy() && 456606c3fb27SDimitry Andric (Opcode != Instruction::InsertElement || !Op0 || 456706c3fb27SDimitry Andric isa<UndefValue>(Op0))) 45685ffd83dbSDimitry Andric return RegisterFileMoveCost; 4569480093f4SDimitry Andric 4570bdd1243dSDimitry Andric if (Opcode == Instruction::InsertElement && 4571bdd1243dSDimitry Andric isa_and_nonnull<UndefValue>(Op0)) { 4572bdd1243dSDimitry Andric // Consider the gather cost to be cheap. 4573bdd1243dSDimitry Andric if (isa_and_nonnull<LoadInst>(Op1)) 4574bdd1243dSDimitry Andric return RegisterFileMoveCost; 4575bdd1243dSDimitry Andric if (!IsCheapPInsrPExtrInsertPS()) { 4576bdd1243dSDimitry Andric // mov constant-to-GPR + movd/movq GPR -> XMM. 4577bdd1243dSDimitry Andric if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy()) 4578bdd1243dSDimitry Andric return 2 + RegisterFileMoveCost; 4579bdd1243dSDimitry Andric // Assume movd/movq GPR -> XMM is relatively cheap on all targets. 4580bdd1243dSDimitry Andric return 1 + RegisterFileMoveCost; 4581bdd1243dSDimitry Andric } 4582bdd1243dSDimitry Andric } 4583bdd1243dSDimitry Andric 45845ffd83dbSDimitry Andric // Assume movd/movq XMM -> GPR is relatively cheap on all targets. 45855ffd83dbSDimitry Andric if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement) 45865ffd83dbSDimitry Andric return 1 + RegisterFileMoveCost; 4587480093f4SDimitry Andric } 4588480093f4SDimitry Andric 4589480093f4SDimitry Andric int ISD = TLI->InstructionOpcodeToISD(Opcode); 4590480093f4SDimitry Andric assert(ISD && "Unexpected vector opcode"); 4591349cc55cSDimitry Andric if (ST->useSLMArithCosts()) 4592480093f4SDimitry Andric if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy)) 45935ffd83dbSDimitry Andric return Entry->Cost + RegisterFileMoveCost; 45945ffd83dbSDimitry Andric 4595bdd1243dSDimitry Andric // Consider cheap cases. 
4596bdd1243dSDimitry Andric if (IsCheapPInsrPExtrInsertPS()) 45975ffd83dbSDimitry Andric return 1 + RegisterFileMoveCost; 45985ffd83dbSDimitry Andric 45995ffd83dbSDimitry Andric // For extractions we just need to shuffle the element to index 0, which 46005ffd83dbSDimitry Andric // should be very cheap (assume cost = 1). For insertions we need to shuffle 46015ffd83dbSDimitry Andric // the elements to its destination. In both cases we must handle the 46025ffd83dbSDimitry Andric // subvector move(s). 46035ffd83dbSDimitry Andric // If the vector type is already less than 128-bits then don't reduce it. 46045ffd83dbSDimitry Andric // TODO: Under what circumstances should we shuffle using the full width? 4605fe6060f1SDimitry Andric InstructionCost ShuffleCost = 1; 46065ffd83dbSDimitry Andric if (Opcode == Instruction::InsertElement) { 46075ffd83dbSDimitry Andric auto *SubTy = cast<VectorType>(Val); 46085ffd83dbSDimitry Andric EVT VT = TLI->getValueType(DL, Val); 46095ffd83dbSDimitry Andric if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128) 46105ffd83dbSDimitry Andric SubTy = FixedVectorType::get(ScalarType, SubNumElts); 4611bdd1243dSDimitry Andric ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, std::nullopt, 4612bdd1243dSDimitry Andric CostKind, 0, SubTy); 46135ffd83dbSDimitry Andric } 46145ffd83dbSDimitry Andric int IntOrFpCost = ScalarType->isFloatingPointTy() ? 
0 : 1; 46155ffd83dbSDimitry Andric return ShuffleCost + IntOrFpCost + RegisterFileMoveCost; 46160b57cec5SDimitry Andric } 46170b57cec5SDimitry Andric 4618bdd1243dSDimitry Andric return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) + 4619bdd1243dSDimitry Andric RegisterFileMoveCost; 46200b57cec5SDimitry Andric } 46210b57cec5SDimitry Andric 4622bdd1243dSDimitry Andric InstructionCost 4623bdd1243dSDimitry Andric X86TTIImpl::getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, 4624bdd1243dSDimitry Andric bool Insert, bool Extract, 4625bdd1243dSDimitry Andric TTI::TargetCostKind CostKind) { 462681ad6265SDimitry Andric assert(DemandedElts.getBitWidth() == 462781ad6265SDimitry Andric cast<FixedVectorType>(Ty)->getNumElements() && 462881ad6265SDimitry Andric "Vector size mismatch"); 462981ad6265SDimitry Andric 4630bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); 463181ad6265SDimitry Andric MVT MScalarTy = LT.second.getScalarType(); 4632bdd1243dSDimitry Andric unsigned LegalVectorBitWidth = LT.second.getSizeInBits(); 4633fe6060f1SDimitry Andric InstructionCost Cost = 0; 46345ffd83dbSDimitry Andric 4635bdd1243dSDimitry Andric constexpr unsigned LaneBitWidth = 128; 4636bdd1243dSDimitry Andric assert((LegalVectorBitWidth < LaneBitWidth || 4637bdd1243dSDimitry Andric (LegalVectorBitWidth % LaneBitWidth) == 0) && 4638bdd1243dSDimitry Andric "Illegal vector"); 4639bdd1243dSDimitry Andric 4640bdd1243dSDimitry Andric const int NumLegalVectors = *LT.first.getValue(); 4641bdd1243dSDimitry Andric assert(NumLegalVectors >= 0 && "Negative cost!"); 4642bdd1243dSDimitry Andric 46435ffd83dbSDimitry Andric // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much 46445ffd83dbSDimitry Andric // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT. 
46455ffd83dbSDimitry Andric if (Insert) { 46465ffd83dbSDimitry Andric if ((MScalarTy == MVT::i16 && ST->hasSSE2()) || 46475ffd83dbSDimitry Andric (MScalarTy.isInteger() && ST->hasSSE41()) || 46485ffd83dbSDimitry Andric (MScalarTy == MVT::f32 && ST->hasSSE41())) { 46495ffd83dbSDimitry Andric // For types we can insert directly, insertion into 128-bit sub vectors is 46505ffd83dbSDimitry Andric // cheap, followed by a cheap chain of concatenations. 4651bdd1243dSDimitry Andric if (LegalVectorBitWidth <= LaneBitWidth) { 4652bdd1243dSDimitry Andric Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, 4653bdd1243dSDimitry Andric /*Extract*/ false, CostKind); 46545ffd83dbSDimitry Andric } else { 4655e8d8bef9SDimitry Andric // In each 128-lane, if at least one index is demanded but not all 4656e8d8bef9SDimitry Andric // indices are demanded and this 128-lane is not the first 128-lane of 4657e8d8bef9SDimitry Andric // the legalized-vector, then this 128-lane needs a extracti128; If in 4658e8d8bef9SDimitry Andric // each 128-lane, there is at least one demanded index, this 128-lane 4659e8d8bef9SDimitry Andric // needs a inserti128. 4660e8d8bef9SDimitry Andric 4661e8d8bef9SDimitry Andric // The following cases will help you build a better understanding: 4662e8d8bef9SDimitry Andric // Assume we insert several elements into a v8i32 vector in avx2, 4663e8d8bef9SDimitry Andric // Case#1: inserting into 1th index needs vpinsrd + inserti128. 4664e8d8bef9SDimitry Andric // Case#2: inserting into 5th index needs extracti128 + vpinsrd + 4665e8d8bef9SDimitry Andric // inserti128. 4666e8d8bef9SDimitry Andric // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128. 
4667bdd1243dSDimitry Andric assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector"); 4668bdd1243dSDimitry Andric unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth; 4669bdd1243dSDimitry Andric unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors; 4670bdd1243dSDimitry Andric unsigned NumLegalElts = 4671bdd1243dSDimitry Andric LT.second.getVectorNumElements() * NumLegalVectors; 4672bdd1243dSDimitry Andric assert(NumLegalElts >= DemandedElts.getBitWidth() && 4673bdd1243dSDimitry Andric "Vector has been legalized to smaller element count"); 4674bdd1243dSDimitry Andric assert((NumLegalElts % NumLanesTotal) == 0 && 4675bdd1243dSDimitry Andric "Unexpected elts per lane"); 4676bdd1243dSDimitry Andric unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal; 46775ffd83dbSDimitry Andric 4678bdd1243dSDimitry Andric APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts); 4679bdd1243dSDimitry Andric auto *LaneTy = 4680bdd1243dSDimitry Andric FixedVectorType::get(Ty->getElementType(), NumEltsPerLane); 4681bdd1243dSDimitry Andric 4682bdd1243dSDimitry Andric for (unsigned I = 0; I != NumLanesTotal; ++I) { 4683bdd1243dSDimitry Andric APInt LaneEltMask = WidenedDemandedElts.extractBits( 4684bdd1243dSDimitry Andric NumEltsPerLane, NumEltsPerLane * I); 468506c3fb27SDimitry Andric if (LaneEltMask.isZero()) 4686bdd1243dSDimitry Andric continue; 4687bdd1243dSDimitry Andric // FIXME: we don't need to extract if all non-demanded elements 4688bdd1243dSDimitry Andric // are legalization-inserted padding. 
4689bdd1243dSDimitry Andric if (!LaneEltMask.isAllOnes()) 4690bdd1243dSDimitry Andric Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, 4691bdd1243dSDimitry Andric CostKind, I * NumEltsPerLane, LaneTy); 4692bdd1243dSDimitry Andric Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert, 4693bdd1243dSDimitry Andric /*Extract*/ false, CostKind); 4694bdd1243dSDimitry Andric } 4695bdd1243dSDimitry Andric 4696bdd1243dSDimitry Andric APInt AffectedLanes = 4697bdd1243dSDimitry Andric APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal); 4698bdd1243dSDimitry Andric APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask( 4699bdd1243dSDimitry Andric AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true); 4700bdd1243dSDimitry Andric for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) { 4701bdd1243dSDimitry Andric for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) { 4702bdd1243dSDimitry Andric unsigned I = NumLegalLanes * LegalVec + Lane; 4703bdd1243dSDimitry Andric // No need to insert unaffected lane; or lane 0 of each legal vector 4704bdd1243dSDimitry Andric // iff ALL lanes of that vector were affected and will be inserted. 
4705bdd1243dSDimitry Andric if (!AffectedLanes[I] || 4706bdd1243dSDimitry Andric (Lane == 0 && FullyAffectedLegalVectors[LegalVec])) 4707bdd1243dSDimitry Andric continue; 4708bdd1243dSDimitry Andric Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, std::nullopt, 4709bdd1243dSDimitry Andric CostKind, I * NumEltsPerLane, LaneTy); 4710bdd1243dSDimitry Andric } 4711bdd1243dSDimitry Andric } 47125ffd83dbSDimitry Andric } 47135ffd83dbSDimitry Andric } else if (LT.second.isVector()) { 47145ffd83dbSDimitry Andric // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded 47155ffd83dbSDimitry Andric // integer element as a SCALAR_TO_VECTOR, then we build the vector as a 47165ffd83dbSDimitry Andric // series of UNPCK followed by CONCAT_VECTORS - all of these can be 47175ffd83dbSDimitry Andric // considered cheap. 47185ffd83dbSDimitry Andric if (Ty->isIntOrIntVectorTy()) 471906c3fb27SDimitry Andric Cost += DemandedElts.popcount(); 47205ffd83dbSDimitry Andric 47215ffd83dbSDimitry Andric // Get the smaller of the legalized or original pow2-extended number of 47225ffd83dbSDimitry Andric // vector elements, which represents the number of unpacks we'll end up 47235ffd83dbSDimitry Andric // performing. 47245ffd83dbSDimitry Andric unsigned NumElts = LT.second.getVectorNumElements(); 47255ffd83dbSDimitry Andric unsigned Pow2Elts = 47265ffd83dbSDimitry Andric PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements()); 47275ffd83dbSDimitry Andric Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first; 47285ffd83dbSDimitry Andric } 47295ffd83dbSDimitry Andric } 47305ffd83dbSDimitry Andric 473181ad6265SDimitry Andric if (Extract) { 473281ad6265SDimitry Andric // vXi1 can be efficiently extracted with MOVMSK. 473381ad6265SDimitry Andric // TODO: AVX512 predicate mask handling. 473481ad6265SDimitry Andric // NOTE: This doesn't work well for roundtrip scalarization. 
473581ad6265SDimitry Andric if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) { 473681ad6265SDimitry Andric unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements(); 473781ad6265SDimitry Andric unsigned MaxElts = ST->hasAVX2() ? 32 : 16; 473881ad6265SDimitry Andric unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts; 473981ad6265SDimitry Andric return MOVMSKCost; 474081ad6265SDimitry Andric } 474181ad6265SDimitry Andric 474281ad6265SDimitry Andric if (LT.second.isVector()) { 4743bdd1243dSDimitry Andric unsigned NumLegalElts = 4744bdd1243dSDimitry Andric LT.second.getVectorNumElements() * NumLegalVectors; 4745bdd1243dSDimitry Andric assert(NumLegalElts >= DemandedElts.getBitWidth() && 474681ad6265SDimitry Andric "Vector has been legalized to smaller element count"); 474781ad6265SDimitry Andric 4748bdd1243dSDimitry Andric // If we're extracting elements from a 128-bit subvector lane, 4749bdd1243dSDimitry Andric // we only need to extract each lane once, not for every element. 4750bdd1243dSDimitry Andric if (LegalVectorBitWidth > LaneBitWidth) { 4751bdd1243dSDimitry Andric unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth; 4752bdd1243dSDimitry Andric unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors; 4753bdd1243dSDimitry Andric assert((NumLegalElts % NumLanesTotal) == 0 && 4754bdd1243dSDimitry Andric "Unexpected elts per lane"); 4755bdd1243dSDimitry Andric unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal; 475681ad6265SDimitry Andric 475781ad6265SDimitry Andric // Add cost for each demanded 128-bit subvector extraction. 475881ad6265SDimitry Andric // Luckily this is a lot easier than for insertion. 
4759bdd1243dSDimitry Andric APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts); 4760bdd1243dSDimitry Andric auto *LaneTy = 4761bdd1243dSDimitry Andric FixedVectorType::get(Ty->getElementType(), NumEltsPerLane); 476281ad6265SDimitry Andric 4763bdd1243dSDimitry Andric for (unsigned I = 0; I != NumLanesTotal; ++I) { 4764bdd1243dSDimitry Andric APInt LaneEltMask = WidenedDemandedElts.extractBits( 4765bdd1243dSDimitry Andric NumEltsPerLane, I * NumEltsPerLane); 476606c3fb27SDimitry Andric if (LaneEltMask.isZero()) 4767bdd1243dSDimitry Andric continue; 4768bdd1243dSDimitry Andric Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, 4769bdd1243dSDimitry Andric CostKind, I * NumEltsPerLane, LaneTy); 4770bdd1243dSDimitry Andric Cost += BaseT::getScalarizationOverhead( 4771bdd1243dSDimitry Andric LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind); 477281ad6265SDimitry Andric } 477381ad6265SDimitry Andric 477481ad6265SDimitry Andric return Cost; 477581ad6265SDimitry Andric } 477681ad6265SDimitry Andric } 477781ad6265SDimitry Andric 477881ad6265SDimitry Andric // Fallback to default extraction. 4779bdd1243dSDimitry Andric Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false, 4780bdd1243dSDimitry Andric Extract, CostKind); 478181ad6265SDimitry Andric } 47825ffd83dbSDimitry Andric 47835ffd83dbSDimitry Andric return Cost; 47845ffd83dbSDimitry Andric } 47855ffd83dbSDimitry Andric 4786349cc55cSDimitry Andric InstructionCost 4787349cc55cSDimitry Andric X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, 4788349cc55cSDimitry Andric int VF, const APInt &DemandedDstElts, 4789349cc55cSDimitry Andric TTI::TargetCostKind CostKind) { 4790349cc55cSDimitry Andric const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy); 4791349cc55cSDimitry Andric // We don't differentiate element types here, only element bit width. 
4792349cc55cSDimitry Andric EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits); 4793349cc55cSDimitry Andric 4794349cc55cSDimitry Andric auto bailout = [&]() { 4795349cc55cSDimitry Andric return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF, 4796349cc55cSDimitry Andric DemandedDstElts, CostKind); 4797349cc55cSDimitry Andric }; 4798349cc55cSDimitry Andric 4799349cc55cSDimitry Andric // For now, only deal with AVX512 cases. 4800349cc55cSDimitry Andric if (!ST->hasAVX512()) 4801349cc55cSDimitry Andric return bailout(); 4802349cc55cSDimitry Andric 4803349cc55cSDimitry Andric // Do we have a native shuffle for this element type, or should we promote? 4804349cc55cSDimitry Andric unsigned PromEltTyBits = EltTyBits; 4805349cc55cSDimitry Andric switch (EltTyBits) { 4806349cc55cSDimitry Andric case 32: 4807349cc55cSDimitry Andric case 64: 4808349cc55cSDimitry Andric break; // AVX512F. 4809349cc55cSDimitry Andric case 16: 4810349cc55cSDimitry Andric if (!ST->hasBWI()) 4811349cc55cSDimitry Andric PromEltTyBits = 32; // promote to i32, AVX512F. 4812349cc55cSDimitry Andric break; // AVX512BW 4813349cc55cSDimitry Andric case 8: 4814349cc55cSDimitry Andric if (!ST->hasVBMI()) 4815349cc55cSDimitry Andric PromEltTyBits = 32; // promote to i32, AVX512F. 4816349cc55cSDimitry Andric break; // AVX512VBMI 4817349cc55cSDimitry Andric case 1: 4818349cc55cSDimitry Andric // There is no support for shuffling i1 elements. We *must* promote. 4819349cc55cSDimitry Andric if (ST->hasBWI()) { 4820349cc55cSDimitry Andric if (ST->hasVBMI()) 4821349cc55cSDimitry Andric PromEltTyBits = 8; // promote to i8, AVX512VBMI. 4822349cc55cSDimitry Andric else 4823349cc55cSDimitry Andric PromEltTyBits = 16; // promote to i16, AVX512BW. 4824349cc55cSDimitry Andric break; 4825349cc55cSDimitry Andric } 48264824e7fdSDimitry Andric PromEltTyBits = 32; // promote to i32, AVX512F. 
48274824e7fdSDimitry Andric break; 4828349cc55cSDimitry Andric default: 4829349cc55cSDimitry Andric return bailout(); 4830349cc55cSDimitry Andric } 4831349cc55cSDimitry Andric auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits); 4832349cc55cSDimitry Andric 4833349cc55cSDimitry Andric auto *SrcVecTy = FixedVectorType::get(EltTy, VF); 4834349cc55cSDimitry Andric auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF); 4835349cc55cSDimitry Andric 4836349cc55cSDimitry Andric int NumDstElements = VF * ReplicationFactor; 4837349cc55cSDimitry Andric auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements); 4838349cc55cSDimitry Andric auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements); 4839349cc55cSDimitry Andric 4840349cc55cSDimitry Andric // Legalize the types. 4841bdd1243dSDimitry Andric MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second; 4842bdd1243dSDimitry Andric MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second; 4843bdd1243dSDimitry Andric MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second; 4844bdd1243dSDimitry Andric MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second; 4845349cc55cSDimitry Andric // They should have legalized into vector types. 4846349cc55cSDimitry Andric if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() || 4847349cc55cSDimitry Andric !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector()) 4848349cc55cSDimitry Andric return bailout(); 4849349cc55cSDimitry Andric 4850349cc55cSDimitry Andric if (PromEltTyBits != EltTyBits) { 4851349cc55cSDimitry Andric // If we have to perform the shuffle with wider elt type than our data type, 4852349cc55cSDimitry Andric // then we will first need to anyext (we don't care about the new bits) 4853349cc55cSDimitry Andric // the source elements, and then truncate Dst elements. 
4854349cc55cSDimitry Andric InstructionCost PromotionCost; 4855349cc55cSDimitry Andric PromotionCost += getCastInstrCost( 4856349cc55cSDimitry Andric Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy, 4857349cc55cSDimitry Andric TargetTransformInfo::CastContextHint::None, CostKind); 4858349cc55cSDimitry Andric PromotionCost += 4859349cc55cSDimitry Andric getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy, 4860349cc55cSDimitry Andric /*Src=*/PromDstVecTy, 4861349cc55cSDimitry Andric TargetTransformInfo::CastContextHint::None, CostKind); 4862349cc55cSDimitry Andric return PromotionCost + getReplicationShuffleCost(PromEltTy, 4863349cc55cSDimitry Andric ReplicationFactor, VF, 4864349cc55cSDimitry Andric DemandedDstElts, CostKind); 4865349cc55cSDimitry Andric } 4866349cc55cSDimitry Andric 4867349cc55cSDimitry Andric assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits && 4868349cc55cSDimitry Andric LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() && 4869349cc55cSDimitry Andric "We expect that the legalization doesn't affect the element width, " 4870349cc55cSDimitry Andric "doesn't coalesce/split elements."); 4871349cc55cSDimitry Andric 4872349cc55cSDimitry Andric unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements(); 4873349cc55cSDimitry Andric unsigned NumDstVectors = 4874349cc55cSDimitry Andric divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec); 4875349cc55cSDimitry Andric 4876349cc55cSDimitry Andric auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec); 4877349cc55cSDimitry Andric 4878349cc55cSDimitry Andric // Not all the produced Dst elements may be demanded. In our case, 4879349cc55cSDimitry Andric // given that a single Dst vector is formed by a single shuffle, 4880349cc55cSDimitry Andric // if all elements that will form a single Dst vector aren't demanded, 4881349cc55cSDimitry Andric // then we won't need to do that shuffle, so adjust the cost accordingly. 
4882349cc55cSDimitry Andric APInt DemandedDstVectors = APIntOps::ScaleBitMask( 488381ad6265SDimitry Andric DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors); 488406c3fb27SDimitry Andric unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount(); 4885349cc55cSDimitry Andric 4886bdd1243dSDimitry Andric InstructionCost SingleShuffleCost = getShuffleCost( 4887bdd1243dSDimitry Andric TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/std::nullopt, CostKind, 4888bdd1243dSDimitry Andric /*Index=*/0, /*SubTp=*/nullptr); 4889349cc55cSDimitry Andric return NumDstVectorsDemanded * SingleShuffleCost; 4890349cc55cSDimitry Andric } 4891349cc55cSDimitry Andric 4892fe6060f1SDimitry Andric InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, 4893fe6060f1SDimitry Andric MaybeAlign Alignment, 4894fe6060f1SDimitry Andric unsigned AddressSpace, 48955ffd83dbSDimitry Andric TTI::TargetCostKind CostKind, 4896bdd1243dSDimitry Andric TTI::OperandValueInfo OpInfo, 4897480093f4SDimitry Andric const Instruction *I) { 48985ffd83dbSDimitry Andric // TODO: Handle other cost kinds. 48995ffd83dbSDimitry Andric if (CostKind != TTI::TCK_RecipThroughput) { 4900e8d8bef9SDimitry Andric if (auto *SI = dyn_cast_or_null<StoreInst>(I)) { 49015ffd83dbSDimitry Andric // Store instruction with index and scale costs 2 Uops. 49025ffd83dbSDimitry Andric // Check the preceding GEP to identify non-const indices. 
4903e8d8bef9SDimitry Andric if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) { 49045ffd83dbSDimitry Andric if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); })) 49055ffd83dbSDimitry Andric return TTI::TCC_Basic * 2; 49065ffd83dbSDimitry Andric } 49075ffd83dbSDimitry Andric } 49085ffd83dbSDimitry Andric return TTI::TCC_Basic; 49095ffd83dbSDimitry Andric } 49105ffd83dbSDimitry Andric 4911fe6060f1SDimitry Andric assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && 4912fe6060f1SDimitry Andric "Invalid Opcode"); 49135ffd83dbSDimitry Andric // Type legalization can't handle structs 49145ffd83dbSDimitry Andric if (TLI->getValueType(DL, Src, true) == MVT::Other) 49155ffd83dbSDimitry Andric return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 49165ffd83dbSDimitry Andric CostKind); 49175ffd83dbSDimitry Andric 49180b57cec5SDimitry Andric // Legalize the type. 4919bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src); 49200b57cec5SDimitry Andric 4921fe6060f1SDimitry Andric auto *VTy = dyn_cast<FixedVectorType>(Src); 4922fe6060f1SDimitry Andric 4923bdd1243dSDimitry Andric InstructionCost Cost = 0; 4924bdd1243dSDimitry Andric 4925bdd1243dSDimitry Andric // Add a cost for constant load to vector. 4926bdd1243dSDimitry Andric if (Opcode == Instruction::Store && OpInfo.isConstant()) 4927bdd1243dSDimitry Andric Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src), 4928bdd1243dSDimitry Andric /*AddressSpace=*/0, CostKind); 4929bdd1243dSDimitry Andric 4930fe6060f1SDimitry Andric // Handle the simple case of non-vectors. 4931fe6060f1SDimitry Andric // NOTE: this assumes that legalization never creates vector from scalars! 4932bdd1243dSDimitry Andric if (!VTy || !LT.second.isVector()) { 49330b57cec5SDimitry Andric // Each load/store unit costs 1. 4934bdd1243dSDimitry Andric return (LT.second.isFloatingPoint() ? 
Cost : 0) + LT.first * 1; 4935bdd1243dSDimitry Andric } 49360b57cec5SDimitry Andric 4937fe6060f1SDimitry Andric bool IsLoad = Opcode == Instruction::Load; 4938fe6060f1SDimitry Andric 4939fe6060f1SDimitry Andric Type *EltTy = VTy->getElementType(); 4940fe6060f1SDimitry Andric 4941fe6060f1SDimitry Andric const int EltTyBits = DL.getTypeSizeInBits(EltTy); 4942fe6060f1SDimitry Andric 4943fe6060f1SDimitry Andric // Source of truth: how many elements were there in the original IR vector? 4944fe6060f1SDimitry Andric const unsigned SrcNumElt = VTy->getNumElements(); 4945fe6060f1SDimitry Andric 4946fe6060f1SDimitry Andric // How far have we gotten? 4947fe6060f1SDimitry Andric int NumEltRemaining = SrcNumElt; 4948fe6060f1SDimitry Andric // Note that we intentionally capture by-reference, NumEltRemaining changes. 4949fe6060f1SDimitry Andric auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; }; 4950fe6060f1SDimitry Andric 4951fe6060f1SDimitry Andric const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8); 4952fe6060f1SDimitry Andric 4953fe6060f1SDimitry Andric // Note that even if we can store 64 bits of an XMM, we still operate on XMM. 4954fe6060f1SDimitry Andric const unsigned XMMBits = 128; 4955fe6060f1SDimitry Andric if (XMMBits % EltTyBits != 0) 4956fe6060f1SDimitry Andric // Vector size must be a multiple of the element size. I.e. no padding. 4957fe6060f1SDimitry Andric return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 4958fe6060f1SDimitry Andric CostKind); 4959fe6060f1SDimitry Andric const int NumEltPerXMM = XMMBits / EltTyBits; 4960fe6060f1SDimitry Andric 4961fe6060f1SDimitry Andric auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM); 4962fe6060f1SDimitry Andric 4963fe6060f1SDimitry Andric for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0; 4964fe6060f1SDimitry Andric NumEltRemaining > 0; CurrOpSizeBytes /= 2) { 4965fe6060f1SDimitry Andric // How many elements would a single op deal with at once? 
4966fe6060f1SDimitry Andric if ((8 * CurrOpSizeBytes) % EltTyBits != 0) 4967fe6060f1SDimitry Andric // Vector size must be a multiple of the element size. I.e. no padding. 4968fe6060f1SDimitry Andric return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 4969fe6060f1SDimitry Andric CostKind); 4970fe6060f1SDimitry Andric int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits; 4971fe6060f1SDimitry Andric 4972fe6060f1SDimitry Andric assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?"); 4973fe6060f1SDimitry Andric assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) || 4974fe6060f1SDimitry Andric (CurrOpSizeBytes == MaxLegalOpSizeBytes)) && 4975fe6060f1SDimitry Andric "Unless we haven't halved the op size yet, " 4976fe6060f1SDimitry Andric "we have less than two op's sized units of work left."); 4977fe6060f1SDimitry Andric 4978fe6060f1SDimitry Andric auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM 4979fe6060f1SDimitry Andric ? FixedVectorType::get(EltTy, CurrNumEltPerOp) 4980fe6060f1SDimitry Andric : XMMVecTy; 4981fe6060f1SDimitry Andric 4982fe6060f1SDimitry Andric assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 && 4983fe6060f1SDimitry Andric "After halving sizes, the vector elt count is no longer a multiple " 4984fe6060f1SDimitry Andric "of number of elements per operation?"); 4985fe6060f1SDimitry Andric auto *CoalescedVecTy = 4986fe6060f1SDimitry Andric CurrNumEltPerOp == 1 4987fe6060f1SDimitry Andric ? 
CurrVecTy 4988fe6060f1SDimitry Andric : FixedVectorType::get( 4989fe6060f1SDimitry Andric IntegerType::get(Src->getContext(), 4990fe6060f1SDimitry Andric EltTyBits * CurrNumEltPerOp), 4991fe6060f1SDimitry Andric CurrVecTy->getNumElements() / CurrNumEltPerOp); 4992fe6060f1SDimitry Andric assert(DL.getTypeSizeInBits(CoalescedVecTy) == 4993fe6060f1SDimitry Andric DL.getTypeSizeInBits(CurrVecTy) && 4994fe6060f1SDimitry Andric "coalesciing elements doesn't change vector width."); 4995fe6060f1SDimitry Andric 4996fe6060f1SDimitry Andric while (NumEltRemaining > 0) { 4997fe6060f1SDimitry Andric assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumtion?"); 4998fe6060f1SDimitry Andric 4999fe6060f1SDimitry Andric // Can we use this vector size, as per the remaining element count? 5000fe6060f1SDimitry Andric // Iff the vector is naturally aligned, we can do a wide load regardless. 5001fe6060f1SDimitry Andric if (NumEltRemaining < CurrNumEltPerOp && 5002fe6060f1SDimitry Andric (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) && 5003fe6060f1SDimitry Andric CurrOpSizeBytes != 1) 5004fe6060f1SDimitry Andric break; // Try smalled vector size. 5005fe6060f1SDimitry Andric 5006fe6060f1SDimitry Andric bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0; 5007fe6060f1SDimitry Andric 5008fe6060f1SDimitry Andric // If we have fully processed the previous reg, we need to replenish it. 5009fe6060f1SDimitry Andric if (SubVecEltsLeft == 0) { 5010fe6060f1SDimitry Andric SubVecEltsLeft += CurrVecTy->getNumElements(); 5011fe6060f1SDimitry Andric // And that's free only for the 0'th subvector of a legalized vector. 5012fe6060f1SDimitry Andric if (!Is0thSubVec) 5013fe6060f1SDimitry Andric Cost += getShuffleCost(IsLoad ? 
TTI::ShuffleKind::SK_InsertSubvector 5014fe6060f1SDimitry Andric : TTI::ShuffleKind::SK_ExtractSubvector, 5015bdd1243dSDimitry Andric VTy, std::nullopt, CostKind, NumEltDone(), 5016bdd1243dSDimitry Andric CurrVecTy); 5017fe6060f1SDimitry Andric } 5018fe6060f1SDimitry Andric 5019fe6060f1SDimitry Andric // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM, 5020fe6060f1SDimitry Andric // for smaller widths (32/16/8) we have to insert/extract them separately. 5021fe6060f1SDimitry Andric // Again, it's free for the 0'th subreg (if op is 32/64 bit wide, 5022fe6060f1SDimitry Andric // but let's pretend that it is also true for 16/8 bit wide ops...) 5023fe6060f1SDimitry Andric if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) { 5024fe6060f1SDimitry Andric int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM; 5025fe6060f1SDimitry Andric assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && ""); 5026fe6060f1SDimitry Andric int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp; 5027fe6060f1SDimitry Andric APInt DemandedElts = 5028fe6060f1SDimitry Andric APInt::getBitsSet(CoalescedVecTy->getNumElements(), 5029fe6060f1SDimitry Andric CoalescedVecEltIdx, CoalescedVecEltIdx + 1); 503006c3fb27SDimitry Andric assert(DemandedElts.popcount() == 1 && "Inserting single value"); 5031fe6060f1SDimitry Andric Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad, 5032bdd1243dSDimitry Andric !IsLoad, CostKind); 5033fe6060f1SDimitry Andric } 5034fe6060f1SDimitry Andric 5035fe6060f1SDimitry Andric // This isn't exactly right. We're using slow unaligned 32-byte accesses 5036fe6060f1SDimitry Andric // as a proxy for a double-pumped AVX memory interface such as on 5037fe6060f1SDimitry Andric // Sandybridge. 503806c3fb27SDimitry Andric // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or 503906c3fb27SDimitry Andric // will be scalarized. 
5040fe6060f1SDimitry Andric if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow()) 5041fe6060f1SDimitry Andric Cost += 2; 504206c3fb27SDimitry Andric else if (CurrOpSizeBytes < 4) 504306c3fb27SDimitry Andric Cost += 2; 5044fe6060f1SDimitry Andric else 5045fe6060f1SDimitry Andric Cost += 1; 5046fe6060f1SDimitry Andric 5047fe6060f1SDimitry Andric SubVecEltsLeft -= CurrNumEltPerOp; 5048fe6060f1SDimitry Andric NumEltRemaining -= CurrNumEltPerOp; 5049fe6060f1SDimitry Andric Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes); 5050fe6060f1SDimitry Andric } 5051fe6060f1SDimitry Andric } 5052fe6060f1SDimitry Andric 5053fe6060f1SDimitry Andric assert(NumEltRemaining <= 0 && "Should have processed all the elements."); 50540b57cec5SDimitry Andric 50550b57cec5SDimitry Andric return Cost; 50560b57cec5SDimitry Andric } 50570b57cec5SDimitry Andric 5058fe6060f1SDimitry Andric InstructionCost 5059fe6060f1SDimitry Andric X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment, 5060fe6060f1SDimitry Andric unsigned AddressSpace, 50615ffd83dbSDimitry Andric TTI::TargetCostKind CostKind) { 50620b57cec5SDimitry Andric bool IsLoad = (Instruction::Load == Opcode); 50630b57cec5SDimitry Andric bool IsStore = (Instruction::Store == Opcode); 50640b57cec5SDimitry Andric 50655ffd83dbSDimitry Andric auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy); 50660b57cec5SDimitry Andric if (!SrcVTy) 50670b57cec5SDimitry Andric // To calculate scalar take the regular cost, without mask 50685ffd83dbSDimitry Andric return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind); 50690b57cec5SDimitry Andric 50705ffd83dbSDimitry Andric unsigned NumElem = SrcVTy->getNumElements(); 50715ffd83dbSDimitry Andric auto *MaskTy = 50725ffd83dbSDimitry Andric FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem); 50735ffd83dbSDimitry Andric if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) || 5074fe6060f1SDimitry Andric (IsStore && 
!isLegalMaskedStore(SrcVTy, Alignment))) { 50750b57cec5SDimitry Andric // Scalarization 5076349cc55cSDimitry Andric APInt DemandedElts = APInt::getAllOnes(NumElem); 5077bdd1243dSDimitry Andric InstructionCost MaskSplitCost = getScalarizationOverhead( 5078bdd1243dSDimitry Andric MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind); 5079fe6060f1SDimitry Andric InstructionCost ScalarCompareCost = getCmpSelInstrCost( 50805ffd83dbSDimitry Andric Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr, 5081e8d8bef9SDimitry Andric CmpInst::BAD_ICMP_PREDICATE, CostKind); 5082fe6060f1SDimitry Andric InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind); 5083fe6060f1SDimitry Andric InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); 5084bdd1243dSDimitry Andric InstructionCost ValueSplitCost = getScalarizationOverhead( 5085bdd1243dSDimitry Andric SrcVTy, DemandedElts, IsLoad, IsStore, CostKind); 5086fe6060f1SDimitry Andric InstructionCost MemopCost = 50870b57cec5SDimitry Andric NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(), 50885ffd83dbSDimitry Andric Alignment, AddressSpace, CostKind); 50890b57cec5SDimitry Andric return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost; 50900b57cec5SDimitry Andric } 50910b57cec5SDimitry Andric 50920b57cec5SDimitry Andric // Legalize the type. 5093bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy); 50940b57cec5SDimitry Andric auto VT = TLI->getValueType(DL, SrcVTy); 5095fe6060f1SDimitry Andric InstructionCost Cost = 0; 5096*0fca6ea1SDimitry Andric MVT Ty = LT.second; 5097*0fca6ea1SDimitry Andric if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64) 5098*0fca6ea1SDimitry Andric // APX masked load/store for scalar is cheap. 
5099*0fca6ea1SDimitry Andric return Cost + LT.first; 5100*0fca6ea1SDimitry Andric 5101*0fca6ea1SDimitry Andric if (VT.isSimple() && Ty != VT.getSimpleVT() && 51020b57cec5SDimitry Andric LT.second.getVectorNumElements() == NumElem) 5103fe6060f1SDimitry Andric // Promotion requires extend/truncate for data and a shuffle for mask. 5104bdd1243dSDimitry Andric Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, std::nullopt, 5105bdd1243dSDimitry Andric CostKind, 0, nullptr) + 5106bdd1243dSDimitry Andric getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, std::nullopt, 5107bdd1243dSDimitry Andric CostKind, 0, nullptr); 51080b57cec5SDimitry Andric 5109*0fca6ea1SDimitry Andric else if (LT.first * Ty.getVectorNumElements() > NumElem) { 51105ffd83dbSDimitry Andric auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(), 5111*0fca6ea1SDimitry Andric Ty.getVectorNumElements()); 51120b57cec5SDimitry Andric // Expanding requires fill mask with zeroes 5113bdd1243dSDimitry Andric Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, std::nullopt, 5114bdd1243dSDimitry Andric CostKind, 0, MaskTy); 51150b57cec5SDimitry Andric } 51160b57cec5SDimitry Andric 51170b57cec5SDimitry Andric // Pre-AVX512 - each maskmov load costs 2 + store costs ~8. 51180b57cec5SDimitry Andric if (!ST->hasAVX512()) 51190b57cec5SDimitry Andric return Cost + LT.first * (IsLoad ? 
2 : 8); 51200b57cec5SDimitry Andric 5121bdd1243dSDimitry Andric // AVX-512 masked load/store is cheaper 51220b57cec5SDimitry Andric return Cost + LT.first; 51230b57cec5SDimitry Andric } 51240b57cec5SDimitry Andric 512506c3fb27SDimitry Andric InstructionCost 512606c3fb27SDimitry Andric X86TTIImpl::getPointersChainCost(ArrayRef<const Value *> Ptrs, 512706c3fb27SDimitry Andric const Value *Base, 512806c3fb27SDimitry Andric const TTI::PointersChainInfo &Info, 512906c3fb27SDimitry Andric Type *AccessTy, TTI::TargetCostKind CostKind) { 513006c3fb27SDimitry Andric if (Info.isSameBase() && Info.isKnownStride()) { 513106c3fb27SDimitry Andric // If all the pointers have known stride all the differences are translated 513206c3fb27SDimitry Andric // into constants. X86 memory addressing allows encoding it into 513306c3fb27SDimitry Andric // displacement. So we just need to take the base GEP cost. 513406c3fb27SDimitry Andric if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) { 513506c3fb27SDimitry Andric SmallVector<const Value *> Indices(BaseGEP->indices()); 513606c3fb27SDimitry Andric return getGEPCost(BaseGEP->getSourceElementType(), 513706c3fb27SDimitry Andric BaseGEP->getPointerOperand(), Indices, nullptr, 513806c3fb27SDimitry Andric CostKind); 513906c3fb27SDimitry Andric } 514006c3fb27SDimitry Andric return TTI::TCC_Free; 514106c3fb27SDimitry Andric } 514206c3fb27SDimitry Andric return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind); 514306c3fb27SDimitry Andric } 514406c3fb27SDimitry Andric 5145fe6060f1SDimitry Andric InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty, 5146fe6060f1SDimitry Andric ScalarEvolution *SE, 51470b57cec5SDimitry Andric const SCEV *Ptr) { 51480b57cec5SDimitry Andric // Address computations in vectorized code with non-consecutive addresses will 51490b57cec5SDimitry Andric // likely result in more instructions compared to scalar code where the 51500b57cec5SDimitry Andric // computation can more often be merged 
into the index mode. The resulting 51510b57cec5SDimitry Andric // extra micro-ops can significantly decrease throughput. 51520b57cec5SDimitry Andric const unsigned NumVectorInstToHideOverhead = 10; 51530b57cec5SDimitry Andric 51540b57cec5SDimitry Andric // Cost modeling of Strided Access Computation is hidden by the indexing 51550b57cec5SDimitry Andric // modes of X86 regardless of the stride value. We dont believe that there 51560b57cec5SDimitry Andric // is a difference between constant strided access in gerenal and constant 51570b57cec5SDimitry Andric // strided value which is less than or equal to 64. 51580b57cec5SDimitry Andric // Even in the case of (loop invariant) stride whose value is not known at 51590b57cec5SDimitry Andric // compile time, the address computation will not incur more than one extra 51600b57cec5SDimitry Andric // ADD instruction. 51614824e7fdSDimitry Andric if (Ty->isVectorTy() && SE && !ST->hasAVX2()) { 51624824e7fdSDimitry Andric // TODO: AVX2 is the current cut-off because we don't have correct 51634824e7fdSDimitry Andric // interleaving costs for prior ISA's. 
51640b57cec5SDimitry Andric if (!BaseT::isStridedAccess(Ptr)) 51650b57cec5SDimitry Andric return NumVectorInstToHideOverhead; 51660b57cec5SDimitry Andric if (!BaseT::getConstantStrideStep(SE, Ptr)) 51670b57cec5SDimitry Andric return 1; 51680b57cec5SDimitry Andric } 51690b57cec5SDimitry Andric 51700b57cec5SDimitry Andric return BaseT::getAddressComputationCost(Ty, SE, Ptr); 51710b57cec5SDimitry Andric } 51720b57cec5SDimitry Andric 5173fe6060f1SDimitry Andric InstructionCost 5174fe6060f1SDimitry Andric X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, 5175bdd1243dSDimitry Andric std::optional<FastMathFlags> FMF, 51765ffd83dbSDimitry Andric TTI::TargetCostKind CostKind) { 5177fe6060f1SDimitry Andric if (TTI::requiresOrderedReduction(FMF)) 5178fe6060f1SDimitry Andric return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); 51795ffd83dbSDimitry Andric 51808bcb0991SDimitry Andric // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput 51818bcb0991SDimitry Andric // and make it as the cost. 51820b57cec5SDimitry Andric 518306c3fb27SDimitry Andric static const CostTblEntry SLMCostTbl[] = { 5184480093f4SDimitry Andric { ISD::FADD, MVT::v2f64, 3 }, 5185480093f4SDimitry Andric { ISD::ADD, MVT::v2i64, 5 }, 5186480093f4SDimitry Andric }; 5187480093f4SDimitry Andric 518806c3fb27SDimitry Andric static const CostTblEntry SSE2CostTbl[] = { 51898bcb0991SDimitry Andric { ISD::FADD, MVT::v2f64, 2 }, 5190fe6060f1SDimitry Andric { ISD::FADD, MVT::v2f32, 2 }, 51918bcb0991SDimitry Andric { ISD::FADD, MVT::v4f32, 4 }, 51928bcb0991SDimitry Andric { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". 51938bcb0991SDimitry Andric { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32 51948bcb0991SDimitry Andric { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3". 51958bcb0991SDimitry Andric { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3". 
51968bcb0991SDimitry Andric { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3". 51978bcb0991SDimitry Andric { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3". 51988bcb0991SDimitry Andric { ISD::ADD, MVT::v2i8, 2 }, 51998bcb0991SDimitry Andric { ISD::ADD, MVT::v4i8, 2 }, 52008bcb0991SDimitry Andric { ISD::ADD, MVT::v8i8, 2 }, 52018bcb0991SDimitry Andric { ISD::ADD, MVT::v16i8, 3 }, 52028bcb0991SDimitry Andric }; 52038bcb0991SDimitry Andric 520406c3fb27SDimitry Andric static const CostTblEntry AVX1CostTbl[] = { 52058bcb0991SDimitry Andric { ISD::FADD, MVT::v4f64, 3 }, 52068bcb0991SDimitry Andric { ISD::FADD, MVT::v4f32, 3 }, 52078bcb0991SDimitry Andric { ISD::FADD, MVT::v8f32, 4 }, 52088bcb0991SDimitry Andric { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5". 52098bcb0991SDimitry Andric { ISD::ADD, MVT::v4i64, 3 }, 52108bcb0991SDimitry Andric { ISD::ADD, MVT::v8i32, 5 }, 52118bcb0991SDimitry Andric { ISD::ADD, MVT::v16i16, 5 }, 52128bcb0991SDimitry Andric { ISD::ADD, MVT::v32i8, 4 }, 52138bcb0991SDimitry Andric }; 52140b57cec5SDimitry Andric 52150b57cec5SDimitry Andric int ISD = TLI->InstructionOpcodeToISD(Opcode); 52160b57cec5SDimitry Andric assert(ISD && "Invalid opcode"); 52170b57cec5SDimitry Andric 52188bcb0991SDimitry Andric // Before legalizing the type, give a chance to look up illegal narrow types 52198bcb0991SDimitry Andric // in the table. 52208bcb0991SDimitry Andric // FIXME: Is there a better way to do this? 
52218bcb0991SDimitry Andric EVT VT = TLI->getValueType(DL, ValTy); 52228bcb0991SDimitry Andric if (VT.isSimple()) { 52238bcb0991SDimitry Andric MVT MTy = VT.getSimpleVT(); 5224349cc55cSDimitry Andric if (ST->useSLMArithCosts()) 522506c3fb27SDimitry Andric if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy)) 5226480093f4SDimitry Andric return Entry->Cost; 5227480093f4SDimitry Andric 52288bcb0991SDimitry Andric if (ST->hasAVX()) 522906c3fb27SDimitry Andric if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) 52308bcb0991SDimitry Andric return Entry->Cost; 52310b57cec5SDimitry Andric 52328bcb0991SDimitry Andric if (ST->hasSSE2()) 523306c3fb27SDimitry Andric if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) 52348bcb0991SDimitry Andric return Entry->Cost; 52358bcb0991SDimitry Andric } 52360b57cec5SDimitry Andric 5237bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); 52380b57cec5SDimitry Andric 52398bcb0991SDimitry Andric MVT MTy = LT.second; 52400b57cec5SDimitry Andric 52415ffd83dbSDimitry Andric auto *ValVTy = cast<FixedVectorType>(ValTy); 5242480093f4SDimitry Andric 5243fe6060f1SDimitry Andric // Special case: vXi8 mul reductions are performed as vXi16. 
5244fe6060f1SDimitry Andric if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) { 5245fe6060f1SDimitry Andric auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16); 5246fe6060f1SDimitry Andric auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements()); 5247fe6060f1SDimitry Andric return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy, 5248fe6060f1SDimitry Andric TargetTransformInfo::CastContextHint::None, 5249fe6060f1SDimitry Andric CostKind) + 5250fe6060f1SDimitry Andric getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind); 5251fe6060f1SDimitry Andric } 5252fe6060f1SDimitry Andric 5253fe6060f1SDimitry Andric InstructionCost ArithmeticCost = 0; 52545ffd83dbSDimitry Andric if (LT.first != 1 && MTy.isVector() && 52555ffd83dbSDimitry Andric MTy.getVectorNumElements() < ValVTy->getNumElements()) { 52565ffd83dbSDimitry Andric // Type needs to be split. We need LT.first - 1 arithmetic ops. 52575ffd83dbSDimitry Andric auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(), 52585ffd83dbSDimitry Andric MTy.getVectorNumElements()); 52595ffd83dbSDimitry Andric ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind); 52605ffd83dbSDimitry Andric ArithmeticCost *= LT.first - 1; 52615ffd83dbSDimitry Andric } 52620b57cec5SDimitry Andric 5263349cc55cSDimitry Andric if (ST->useSLMArithCosts()) 526406c3fb27SDimitry Andric if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy)) 52655ffd83dbSDimitry Andric return ArithmeticCost + Entry->Cost; 5266480093f4SDimitry Andric 52670b57cec5SDimitry Andric if (ST->hasAVX()) 526806c3fb27SDimitry Andric if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) 52695ffd83dbSDimitry Andric return ArithmeticCost + Entry->Cost; 52700b57cec5SDimitry Andric 52718bcb0991SDimitry Andric if (ST->hasSSE2()) 527206c3fb27SDimitry Andric if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) 52735ffd83dbSDimitry Andric return ArithmeticCost + Entry->Cost; 
52740b57cec5SDimitry Andric 5275480093f4SDimitry Andric // FIXME: These assume a naive kshift+binop lowering, which is probably 5276480093f4SDimitry Andric // conservative in most cases. 5277480093f4SDimitry Andric static const CostTblEntry AVX512BoolReduction[] = { 5278480093f4SDimitry Andric { ISD::AND, MVT::v2i1, 3 }, 5279480093f4SDimitry Andric { ISD::AND, MVT::v4i1, 5 }, 5280480093f4SDimitry Andric { ISD::AND, MVT::v8i1, 7 }, 5281480093f4SDimitry Andric { ISD::AND, MVT::v16i1, 9 }, 5282480093f4SDimitry Andric { ISD::AND, MVT::v32i1, 11 }, 5283480093f4SDimitry Andric { ISD::AND, MVT::v64i1, 13 }, 5284480093f4SDimitry Andric { ISD::OR, MVT::v2i1, 3 }, 5285480093f4SDimitry Andric { ISD::OR, MVT::v4i1, 5 }, 5286480093f4SDimitry Andric { ISD::OR, MVT::v8i1, 7 }, 5287480093f4SDimitry Andric { ISD::OR, MVT::v16i1, 9 }, 5288480093f4SDimitry Andric { ISD::OR, MVT::v32i1, 11 }, 5289480093f4SDimitry Andric { ISD::OR, MVT::v64i1, 13 }, 5290480093f4SDimitry Andric }; 5291480093f4SDimitry Andric 52920b57cec5SDimitry Andric static const CostTblEntry AVX2BoolReduction[] = { 52930b57cec5SDimitry Andric { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp 52940b57cec5SDimitry Andric { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp 52950b57cec5SDimitry Andric { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp 52960b57cec5SDimitry Andric { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp 52970b57cec5SDimitry Andric }; 52980b57cec5SDimitry Andric 52990b57cec5SDimitry Andric static const CostTblEntry AVX1BoolReduction[] = { 53000b57cec5SDimitry Andric { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp 53010b57cec5SDimitry Andric { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp 53020b57cec5SDimitry Andric { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp 53030b57cec5SDimitry Andric { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp 53040b57cec5SDimitry Andric { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp 53050b57cec5SDimitry Andric { ISD::OR, MVT::v8i32, 2 
}, // vmovmskps + cmp 53060b57cec5SDimitry Andric { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp 53070b57cec5SDimitry Andric { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp 53080b57cec5SDimitry Andric }; 53090b57cec5SDimitry Andric 53100b57cec5SDimitry Andric static const CostTblEntry SSE2BoolReduction[] = { 53110b57cec5SDimitry Andric { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp 53120b57cec5SDimitry Andric { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp 53130b57cec5SDimitry Andric { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp 53140b57cec5SDimitry Andric { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp 53150b57cec5SDimitry Andric { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp 53160b57cec5SDimitry Andric { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp 53170b57cec5SDimitry Andric { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp 53180b57cec5SDimitry Andric { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp 53190b57cec5SDimitry Andric }; 53200b57cec5SDimitry Andric 53210b57cec5SDimitry Andric // Handle bool allof/anyof patterns. 53225ffd83dbSDimitry Andric if (ValVTy->getElementType()->isIntegerTy(1)) { 5323fe6060f1SDimitry Andric InstructionCost ArithmeticCost = 0; 53245ffd83dbSDimitry Andric if (LT.first != 1 && MTy.isVector() && 53255ffd83dbSDimitry Andric MTy.getVectorNumElements() < ValVTy->getNumElements()) { 53265ffd83dbSDimitry Andric // Type needs to be split. We need LT.first - 1 arithmetic ops. 
53275ffd83dbSDimitry Andric auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(), 53285ffd83dbSDimitry Andric MTy.getVectorNumElements()); 53295ffd83dbSDimitry Andric ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind); 53305ffd83dbSDimitry Andric ArithmeticCost *= LT.first - 1; 53315ffd83dbSDimitry Andric } 53325ffd83dbSDimitry Andric 5333480093f4SDimitry Andric if (ST->hasAVX512()) 5334480093f4SDimitry Andric if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy)) 53355ffd83dbSDimitry Andric return ArithmeticCost + Entry->Cost; 53360b57cec5SDimitry Andric if (ST->hasAVX2()) 53370b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy)) 53385ffd83dbSDimitry Andric return ArithmeticCost + Entry->Cost; 53390b57cec5SDimitry Andric if (ST->hasAVX()) 53400b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy)) 53415ffd83dbSDimitry Andric return ArithmeticCost + Entry->Cost; 53420b57cec5SDimitry Andric if (ST->hasSSE2()) 53430b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy)) 53445ffd83dbSDimitry Andric return ArithmeticCost + Entry->Cost; 53455ffd83dbSDimitry Andric 5346fe6060f1SDimitry Andric return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind); 53475ffd83dbSDimitry Andric } 53485ffd83dbSDimitry Andric 53495ffd83dbSDimitry Andric unsigned NumVecElts = ValVTy->getNumElements(); 53505ffd83dbSDimitry Andric unsigned ScalarSize = ValVTy->getScalarSizeInBits(); 53515ffd83dbSDimitry Andric 53525ffd83dbSDimitry Andric // Special case power of 2 reductions where the scalar type isn't changed 53535ffd83dbSDimitry Andric // by type legalization. 
53545ffd83dbSDimitry Andric if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits()) 5355fe6060f1SDimitry Andric return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind); 53565ffd83dbSDimitry Andric 5357fe6060f1SDimitry Andric InstructionCost ReductionCost = 0; 53585ffd83dbSDimitry Andric 53595ffd83dbSDimitry Andric auto *Ty = ValVTy; 53605ffd83dbSDimitry Andric if (LT.first != 1 && MTy.isVector() && 53615ffd83dbSDimitry Andric MTy.getVectorNumElements() < ValVTy->getNumElements()) { 53625ffd83dbSDimitry Andric // Type needs to be split. We need LT.first - 1 arithmetic ops. 53635ffd83dbSDimitry Andric Ty = FixedVectorType::get(ValVTy->getElementType(), 53645ffd83dbSDimitry Andric MTy.getVectorNumElements()); 53655ffd83dbSDimitry Andric ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind); 53665ffd83dbSDimitry Andric ReductionCost *= LT.first - 1; 53675ffd83dbSDimitry Andric NumVecElts = MTy.getVectorNumElements(); 53685ffd83dbSDimitry Andric } 53695ffd83dbSDimitry Andric 53705ffd83dbSDimitry Andric // Now handle reduction with the legal type, taking into account size changes 53715ffd83dbSDimitry Andric // at each level. 53725ffd83dbSDimitry Andric while (NumVecElts > 1) { 53735ffd83dbSDimitry Andric // Determine the size of the remaining vector we need to reduce. 53745ffd83dbSDimitry Andric unsigned Size = NumVecElts * ScalarSize; 53755ffd83dbSDimitry Andric NumVecElts /= 2; 53765ffd83dbSDimitry Andric // If we're reducing from 256/512 bits, use an extract_subvector. 
53775ffd83dbSDimitry Andric if (Size > 128) { 53785ffd83dbSDimitry Andric auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts); 53795ffd83dbSDimitry Andric ReductionCost += 5380bdd1243dSDimitry Andric getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, CostKind, 5381bdd1243dSDimitry Andric NumVecElts, SubTy); 53825ffd83dbSDimitry Andric Ty = SubTy; 53835ffd83dbSDimitry Andric } else if (Size == 128) { 53845ffd83dbSDimitry Andric // Reducing from 128 bits is a permute of v2f64/v2i64. 53855ffd83dbSDimitry Andric FixedVectorType *ShufTy; 53865ffd83dbSDimitry Andric if (ValVTy->isFloatingPointTy()) 53875ffd83dbSDimitry Andric ShufTy = 53885ffd83dbSDimitry Andric FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2); 53895ffd83dbSDimitry Andric else 53905ffd83dbSDimitry Andric ShufTy = 53915ffd83dbSDimitry Andric FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2); 5392bdd1243dSDimitry Andric ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 5393bdd1243dSDimitry Andric std::nullopt, CostKind, 0, nullptr); 53945ffd83dbSDimitry Andric } else if (Size == 64) { 53955ffd83dbSDimitry Andric // Reducing from 64 bits is a shuffle of v4f32/v4i32. 53965ffd83dbSDimitry Andric FixedVectorType *ShufTy; 53975ffd83dbSDimitry Andric if (ValVTy->isFloatingPointTy()) 53985ffd83dbSDimitry Andric ShufTy = 53995ffd83dbSDimitry Andric FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4); 54005ffd83dbSDimitry Andric else 54015ffd83dbSDimitry Andric ShufTy = 54025ffd83dbSDimitry Andric FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4); 5403bdd1243dSDimitry Andric ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 5404bdd1243dSDimitry Andric std::nullopt, CostKind, 0, nullptr); 54055ffd83dbSDimitry Andric } else { 54065ffd83dbSDimitry Andric // Reducing from smaller size is a shift by immediate. 
54075ffd83dbSDimitry Andric auto *ShiftTy = FixedVectorType::get( 54085ffd83dbSDimitry Andric Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size); 54095ffd83dbSDimitry Andric ReductionCost += getArithmeticInstrCost( 54105ffd83dbSDimitry Andric Instruction::LShr, ShiftTy, CostKind, 5411bdd1243dSDimitry Andric {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 5412bdd1243dSDimitry Andric {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None}); 54135ffd83dbSDimitry Andric } 54145ffd83dbSDimitry Andric 54155ffd83dbSDimitry Andric // Add the arithmetic op for this level. 54165ffd83dbSDimitry Andric ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind); 54175ffd83dbSDimitry Andric } 54185ffd83dbSDimitry Andric 54195ffd83dbSDimitry Andric // Add the final extract element to the cost. 5420bdd1243dSDimitry Andric return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 5421bdd1243dSDimitry Andric CostKind, 0, nullptr, nullptr); 54225ffd83dbSDimitry Andric } 54235ffd83dbSDimitry Andric 542406c3fb27SDimitry Andric InstructionCost X86TTIImpl::getMinMaxCost(Intrinsic::ID IID, Type *Ty, 542506c3fb27SDimitry Andric TTI::TargetCostKind CostKind, 542606c3fb27SDimitry Andric FastMathFlags FMF) { 542706c3fb27SDimitry Andric IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF); 542806c3fb27SDimitry Andric return getIntrinsicInstrCost(ICA, CostKind); 54290b57cec5SDimitry Andric } 54300b57cec5SDimitry Andric 5431fe6060f1SDimitry Andric InstructionCost 543206c3fb27SDimitry Andric X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy, 543306c3fb27SDimitry Andric FastMathFlags FMF, 54345ffd83dbSDimitry Andric TTI::TargetCostKind CostKind) { 5435bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); 54360b57cec5SDimitry Andric 54370b57cec5SDimitry Andric MVT MTy = LT.second; 54380b57cec5SDimitry Andric 54390b57cec5SDimitry Andric int ISD; 54400b57cec5SDimitry Andric if 
(ValTy->isIntOrIntVectorTy()) { 544106c3fb27SDimitry Andric ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN 544206c3fb27SDimitry Andric : ISD::SMIN; 54430b57cec5SDimitry Andric } else { 54440b57cec5SDimitry Andric assert(ValTy->isFPOrFPVectorTy() && 54450b57cec5SDimitry Andric "Expected float point or integer vector type."); 544606c3fb27SDimitry Andric ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum) 544706c3fb27SDimitry Andric ? ISD::FMINNUM 544806c3fb27SDimitry Andric : ISD::FMINIMUM; 54490b57cec5SDimitry Andric } 54500b57cec5SDimitry Andric 54510b57cec5SDimitry Andric // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput 54520b57cec5SDimitry Andric // and make it as the cost. 54530b57cec5SDimitry Andric 545406c3fb27SDimitry Andric static const CostTblEntry SSE2CostTbl[] = { 54555ffd83dbSDimitry Andric {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw 54565ffd83dbSDimitry Andric {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw 54575ffd83dbSDimitry Andric {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw 54580b57cec5SDimitry Andric }; 54590b57cec5SDimitry Andric 546006c3fb27SDimitry Andric static const CostTblEntry SSE41CostTbl[] = { 54615ffd83dbSDimitry Andric {ISD::SMIN, MVT::v2i16, 3}, // same as sse2 54625ffd83dbSDimitry Andric {ISD::SMIN, MVT::v4i16, 5}, // same as sse2 54635ffd83dbSDimitry Andric {ISD::UMIN, MVT::v2i16, 5}, // same as sse2 54645ffd83dbSDimitry Andric {ISD::UMIN, MVT::v4i16, 7}, // same as sse2 54655ffd83dbSDimitry Andric {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor 54665ffd83dbSDimitry Andric {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax 54675ffd83dbSDimitry Andric {ISD::SMIN, MVT::v2i8, 3}, // pminsb 54685ffd83dbSDimitry Andric {ISD::SMIN, MVT::v4i8, 5}, // pminsb 54695ffd83dbSDimitry Andric {ISD::SMIN, MVT::v8i8, 7}, // pminsb 54705ffd83dbSDimitry Andric {ISD::SMIN, MVT::v16i8, 6}, 54715ffd83dbSDimitry Andric {ISD::UMIN, 
MVT::v2i8, 3}, // same as sse2 54725ffd83dbSDimitry Andric {ISD::UMIN, MVT::v4i8, 5}, // same as sse2 54735ffd83dbSDimitry Andric {ISD::UMIN, MVT::v8i8, 7}, // same as sse2 54745ffd83dbSDimitry Andric {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax 54750b57cec5SDimitry Andric }; 54760b57cec5SDimitry Andric 547706c3fb27SDimitry Andric static const CostTblEntry AVX1CostTbl[] = { 54785ffd83dbSDimitry Andric {ISD::SMIN, MVT::v16i16, 6}, 54795ffd83dbSDimitry Andric {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax 54805ffd83dbSDimitry Andric {ISD::SMIN, MVT::v32i8, 8}, 54815ffd83dbSDimitry Andric {ISD::UMIN, MVT::v32i8, 8}, 54820b57cec5SDimitry Andric }; 54830b57cec5SDimitry Andric 548406c3fb27SDimitry Andric static const CostTblEntry AVX512BWCostTbl[] = { 54855ffd83dbSDimitry Andric {ISD::SMIN, MVT::v32i16, 8}, 54865ffd83dbSDimitry Andric {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax 54875ffd83dbSDimitry Andric {ISD::SMIN, MVT::v64i8, 10}, 54885ffd83dbSDimitry Andric {ISD::UMIN, MVT::v64i8, 10}, 54890b57cec5SDimitry Andric }; 54900b57cec5SDimitry Andric 54915ffd83dbSDimitry Andric // Before legalizing the type, give a chance to look up illegal narrow types 54925ffd83dbSDimitry Andric // in the table. 54935ffd83dbSDimitry Andric // FIXME: Is there a better way to do this? 
54945ffd83dbSDimitry Andric EVT VT = TLI->getValueType(DL, ValTy); 54955ffd83dbSDimitry Andric if (VT.isSimple()) { 54965ffd83dbSDimitry Andric MVT MTy = VT.getSimpleVT(); 54975ffd83dbSDimitry Andric if (ST->hasBWI()) 549806c3fb27SDimitry Andric if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) 54995ffd83dbSDimitry Andric return Entry->Cost; 55000b57cec5SDimitry Andric 55010b57cec5SDimitry Andric if (ST->hasAVX()) 550206c3fb27SDimitry Andric if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) 55035ffd83dbSDimitry Andric return Entry->Cost; 55040b57cec5SDimitry Andric 55050b57cec5SDimitry Andric if (ST->hasSSE41()) 550606c3fb27SDimitry Andric if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) 55075ffd83dbSDimitry Andric return Entry->Cost; 55080b57cec5SDimitry Andric 55090b57cec5SDimitry Andric if (ST->hasSSE2()) 551006c3fb27SDimitry Andric if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) 55115ffd83dbSDimitry Andric return Entry->Cost; 55120b57cec5SDimitry Andric } 55130b57cec5SDimitry Andric 55145ffd83dbSDimitry Andric auto *ValVTy = cast<FixedVectorType>(ValTy); 55155ffd83dbSDimitry Andric unsigned NumVecElts = ValVTy->getNumElements(); 55165ffd83dbSDimitry Andric 55175ffd83dbSDimitry Andric auto *Ty = ValVTy; 5518fe6060f1SDimitry Andric InstructionCost MinMaxCost = 0; 55195ffd83dbSDimitry Andric if (LT.first != 1 && MTy.isVector() && 55205ffd83dbSDimitry Andric MTy.getVectorNumElements() < ValVTy->getNumElements()) { 55215ffd83dbSDimitry Andric // Type needs to be split. We need LT.first - 1 operations ops. 
55225ffd83dbSDimitry Andric Ty = FixedVectorType::get(ValVTy->getElementType(), 55235ffd83dbSDimitry Andric MTy.getVectorNumElements()); 552406c3fb27SDimitry Andric MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF); 55255ffd83dbSDimitry Andric MinMaxCost *= LT.first - 1; 55265ffd83dbSDimitry Andric NumVecElts = MTy.getVectorNumElements(); 55275ffd83dbSDimitry Andric } 55285ffd83dbSDimitry Andric 55295ffd83dbSDimitry Andric if (ST->hasBWI()) 553006c3fb27SDimitry Andric if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) 55315ffd83dbSDimitry Andric return MinMaxCost + Entry->Cost; 55325ffd83dbSDimitry Andric 55335ffd83dbSDimitry Andric if (ST->hasAVX()) 553406c3fb27SDimitry Andric if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) 55355ffd83dbSDimitry Andric return MinMaxCost + Entry->Cost; 55365ffd83dbSDimitry Andric 55375ffd83dbSDimitry Andric if (ST->hasSSE41()) 553806c3fb27SDimitry Andric if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) 55395ffd83dbSDimitry Andric return MinMaxCost + Entry->Cost; 55405ffd83dbSDimitry Andric 55415ffd83dbSDimitry Andric if (ST->hasSSE2()) 554206c3fb27SDimitry Andric if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) 55435ffd83dbSDimitry Andric return MinMaxCost + Entry->Cost; 55445ffd83dbSDimitry Andric 55455ffd83dbSDimitry Andric unsigned ScalarSize = ValTy->getScalarSizeInBits(); 55465ffd83dbSDimitry Andric 55475ffd83dbSDimitry Andric // Special case power of 2 reductions where the scalar type isn't changed 55485ffd83dbSDimitry Andric // by type legalization. 55495ffd83dbSDimitry Andric if (!isPowerOf2_32(ValVTy->getNumElements()) || 55505ffd83dbSDimitry Andric ScalarSize != MTy.getScalarSizeInBits()) 555106c3fb27SDimitry Andric return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind); 55525ffd83dbSDimitry Andric 55535ffd83dbSDimitry Andric // Now handle reduction with the legal type, taking into account size changes 55545ffd83dbSDimitry Andric // at each level. 
55555ffd83dbSDimitry Andric while (NumVecElts > 1) { 55565ffd83dbSDimitry Andric // Determine the size of the remaining vector we need to reduce. 55575ffd83dbSDimitry Andric unsigned Size = NumVecElts * ScalarSize; 55585ffd83dbSDimitry Andric NumVecElts /= 2; 55595ffd83dbSDimitry Andric // If we're reducing from 256/512 bits, use an extract_subvector. 55605ffd83dbSDimitry Andric if (Size > 128) { 55615ffd83dbSDimitry Andric auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts); 5562bdd1243dSDimitry Andric MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, 5563bdd1243dSDimitry Andric CostKind, NumVecElts, SubTy); 55645ffd83dbSDimitry Andric Ty = SubTy; 55655ffd83dbSDimitry Andric } else if (Size == 128) { 55665ffd83dbSDimitry Andric // Reducing from 128 bits is a permute of v2f64/v2i64. 55675ffd83dbSDimitry Andric VectorType *ShufTy; 55685ffd83dbSDimitry Andric if (ValTy->isFloatingPointTy()) 55695ffd83dbSDimitry Andric ShufTy = 55705ffd83dbSDimitry Andric FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2); 55715ffd83dbSDimitry Andric else 55725ffd83dbSDimitry Andric ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2); 5573bdd1243dSDimitry Andric MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 5574bdd1243dSDimitry Andric std::nullopt, CostKind, 0, nullptr); 55755ffd83dbSDimitry Andric } else if (Size == 64) { 55765ffd83dbSDimitry Andric // Reducing from 64 bits is a shuffle of v4f32/v4i32. 
55775ffd83dbSDimitry Andric FixedVectorType *ShufTy; 55785ffd83dbSDimitry Andric if (ValTy->isFloatingPointTy()) 55795ffd83dbSDimitry Andric ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4); 55805ffd83dbSDimitry Andric else 55815ffd83dbSDimitry Andric ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4); 5582bdd1243dSDimitry Andric MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 5583bdd1243dSDimitry Andric std::nullopt, CostKind, 0, nullptr); 55845ffd83dbSDimitry Andric } else { 55855ffd83dbSDimitry Andric // Reducing from smaller size is a shift by immediate. 55865ffd83dbSDimitry Andric auto *ShiftTy = FixedVectorType::get( 55875ffd83dbSDimitry Andric Type::getIntNTy(ValTy->getContext(), Size), 128 / Size); 55885ffd83dbSDimitry Andric MinMaxCost += getArithmeticInstrCost( 55895ffd83dbSDimitry Andric Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput, 5590bdd1243dSDimitry Andric {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 5591bdd1243dSDimitry Andric {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None}); 55925ffd83dbSDimitry Andric } 55935ffd83dbSDimitry Andric 55945ffd83dbSDimitry Andric // Add the arithmetic op for this level. 559506c3fb27SDimitry Andric MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF); 55965ffd83dbSDimitry Andric } 55975ffd83dbSDimitry Andric 55985ffd83dbSDimitry Andric // Add the final extract element to the cost. 5599bdd1243dSDimitry Andric return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 5600bdd1243dSDimitry Andric CostKind, 0, nullptr, nullptr); 56010b57cec5SDimitry Andric } 56020b57cec5SDimitry Andric 56030b57cec5SDimitry Andric /// Calculate the cost of materializing a 64-bit value. This helper 56040b57cec5SDimitry Andric /// method might only calculate a fraction of a larger immediate. Therefore it 56050b57cec5SDimitry Andric /// is valid to return a cost of ZERO. 
5606fe6060f1SDimitry Andric InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) { 56070b57cec5SDimitry Andric if (Val == 0) 56080b57cec5SDimitry Andric return TTI::TCC_Free; 56090b57cec5SDimitry Andric 56100b57cec5SDimitry Andric if (isInt<32>(Val)) 56110b57cec5SDimitry Andric return TTI::TCC_Basic; 56120b57cec5SDimitry Andric 56130b57cec5SDimitry Andric return 2 * TTI::TCC_Basic; 56140b57cec5SDimitry Andric } 56150b57cec5SDimitry Andric 5616fe6060f1SDimitry Andric InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, 56175ffd83dbSDimitry Andric TTI::TargetCostKind CostKind) { 56180b57cec5SDimitry Andric assert(Ty->isIntegerTy()); 56190b57cec5SDimitry Andric 56200b57cec5SDimitry Andric unsigned BitSize = Ty->getPrimitiveSizeInBits(); 56210b57cec5SDimitry Andric if (BitSize == 0) 56220b57cec5SDimitry Andric return ~0U; 56230b57cec5SDimitry Andric 56240b57cec5SDimitry Andric // Never hoist constants larger than 128bit, because this might lead to 56250b57cec5SDimitry Andric // incorrect code generation or assertions in codegen. 56260b57cec5SDimitry Andric // Fixme: Create a cost model for types larger than i128 once the codegen 56270b57cec5SDimitry Andric // issues have been fixed. 56280b57cec5SDimitry Andric if (BitSize > 128) 56290b57cec5SDimitry Andric return TTI::TCC_Free; 56300b57cec5SDimitry Andric 56310b57cec5SDimitry Andric if (Imm == 0) 56320b57cec5SDimitry Andric return TTI::TCC_Free; 56330b57cec5SDimitry Andric 56340b57cec5SDimitry Andric // Sign-extend all constants to a multiple of 64-bit. 56350b57cec5SDimitry Andric APInt ImmVal = Imm; 56360b57cec5SDimitry Andric if (BitSize % 64 != 0) 56370b57cec5SDimitry Andric ImmVal = Imm.sext(alignTo(BitSize, 64)); 56380b57cec5SDimitry Andric 56390b57cec5SDimitry Andric // Split the constant into 64-bit chunks and calculate the cost for each 56400b57cec5SDimitry Andric // chunk. 
5641fe6060f1SDimitry Andric InstructionCost Cost = 0; 56420b57cec5SDimitry Andric for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { 56430b57cec5SDimitry Andric APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); 56440b57cec5SDimitry Andric int64_t Val = Tmp.getSExtValue(); 56450b57cec5SDimitry Andric Cost += getIntImmCost(Val); 56460b57cec5SDimitry Andric } 56470b57cec5SDimitry Andric // We need at least one instruction to materialize the constant. 5648fe6060f1SDimitry Andric return std::max<InstructionCost>(1, Cost); 56490b57cec5SDimitry Andric } 56500b57cec5SDimitry Andric 5651fe6060f1SDimitry Andric InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, 5652e8d8bef9SDimitry Andric const APInt &Imm, Type *Ty, 5653e8d8bef9SDimitry Andric TTI::TargetCostKind CostKind, 5654e8d8bef9SDimitry Andric Instruction *Inst) { 56550b57cec5SDimitry Andric assert(Ty->isIntegerTy()); 56560b57cec5SDimitry Andric 56570b57cec5SDimitry Andric unsigned BitSize = Ty->getPrimitiveSizeInBits(); 56580b57cec5SDimitry Andric // There is no cost model for constants with a bit size of 0. Return TCC_Free 56590b57cec5SDimitry Andric // here, so that constant hoisting will ignore this constant. 56600b57cec5SDimitry Andric if (BitSize == 0) 56610b57cec5SDimitry Andric return TTI::TCC_Free; 56620b57cec5SDimitry Andric 56630b57cec5SDimitry Andric unsigned ImmIdx = ~0U; 56640b57cec5SDimitry Andric switch (Opcode) { 56650b57cec5SDimitry Andric default: 56660b57cec5SDimitry Andric return TTI::TCC_Free; 56670b57cec5SDimitry Andric case Instruction::GetElementPtr: 56680b57cec5SDimitry Andric // Always hoist the base address of a GetElementPtr. This prevents the 56690b57cec5SDimitry Andric // creation of new constants for every base constant that gets constant 56700b57cec5SDimitry Andric // folded with the offset. 
56710b57cec5SDimitry Andric if (Idx == 0) 56720b57cec5SDimitry Andric return 2 * TTI::TCC_Basic; 56730b57cec5SDimitry Andric return TTI::TCC_Free; 56740b57cec5SDimitry Andric case Instruction::Store: 56750b57cec5SDimitry Andric ImmIdx = 0; 56760b57cec5SDimitry Andric break; 56770b57cec5SDimitry Andric case Instruction::ICmp: 56780b57cec5SDimitry Andric // This is an imperfect hack to prevent constant hoisting of 56790b57cec5SDimitry Andric // compares that might be trying to check if a 64-bit value fits in 56800b57cec5SDimitry Andric // 32-bits. The backend can optimize these cases using a right shift by 32. 56810b57cec5SDimitry Andric // Ideally we would check the compare predicate here. There also other 56820b57cec5SDimitry Andric // similar immediates the backend can use shifts for. 56830b57cec5SDimitry Andric if (Idx == 1 && Imm.getBitWidth() == 64) { 56840b57cec5SDimitry Andric uint64_t ImmVal = Imm.getZExtValue(); 56850b57cec5SDimitry Andric if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff) 56860b57cec5SDimitry Andric return TTI::TCC_Free; 56870b57cec5SDimitry Andric } 56880b57cec5SDimitry Andric ImmIdx = 1; 56890b57cec5SDimitry Andric break; 56900b57cec5SDimitry Andric case Instruction::And: 56910b57cec5SDimitry Andric // We support 64-bit ANDs with immediates with 32-bits of leading zeroes 56920b57cec5SDimitry Andric // by using a 32-bit operation with implicit zero extension. Detect such 56930b57cec5SDimitry Andric // immediates here as the normal path expects bit 31 to be sign extended. 5694bdd1243dSDimitry Andric if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.isIntN(32)) 56950b57cec5SDimitry Andric return TTI::TCC_Free; 56960b57cec5SDimitry Andric ImmIdx = 1; 56970b57cec5SDimitry Andric break; 56980b57cec5SDimitry Andric case Instruction::Add: 56990b57cec5SDimitry Andric case Instruction::Sub: 57000b57cec5SDimitry Andric // For add/sub, we can use the opposite instruction for INT32_MIN. 
57010b57cec5SDimitry Andric if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000) 57020b57cec5SDimitry Andric return TTI::TCC_Free; 57030b57cec5SDimitry Andric ImmIdx = 1; 57040b57cec5SDimitry Andric break; 57050b57cec5SDimitry Andric case Instruction::UDiv: 57060b57cec5SDimitry Andric case Instruction::SDiv: 57070b57cec5SDimitry Andric case Instruction::URem: 57080b57cec5SDimitry Andric case Instruction::SRem: 57090b57cec5SDimitry Andric // Division by constant is typically expanded later into a different 57100b57cec5SDimitry Andric // instruction sequence. This completely changes the constants. 57110b57cec5SDimitry Andric // Report them as "free" to stop ConstantHoist from marking them as opaque. 57120b57cec5SDimitry Andric return TTI::TCC_Free; 57130b57cec5SDimitry Andric case Instruction::Mul: 57140b57cec5SDimitry Andric case Instruction::Or: 57150b57cec5SDimitry Andric case Instruction::Xor: 57160b57cec5SDimitry Andric ImmIdx = 1; 57170b57cec5SDimitry Andric break; 57180b57cec5SDimitry Andric // Always return TCC_Free for the shift value of a shift instruction. 
57190b57cec5SDimitry Andric case Instruction::Shl: 57200b57cec5SDimitry Andric case Instruction::LShr: 57210b57cec5SDimitry Andric case Instruction::AShr: 57220b57cec5SDimitry Andric if (Idx == 1) 57230b57cec5SDimitry Andric return TTI::TCC_Free; 57240b57cec5SDimitry Andric break; 57250b57cec5SDimitry Andric case Instruction::Trunc: 57260b57cec5SDimitry Andric case Instruction::ZExt: 57270b57cec5SDimitry Andric case Instruction::SExt: 57280b57cec5SDimitry Andric case Instruction::IntToPtr: 57290b57cec5SDimitry Andric case Instruction::PtrToInt: 57300b57cec5SDimitry Andric case Instruction::BitCast: 57310b57cec5SDimitry Andric case Instruction::PHI: 57320b57cec5SDimitry Andric case Instruction::Call: 57330b57cec5SDimitry Andric case Instruction::Select: 57340b57cec5SDimitry Andric case Instruction::Ret: 57350b57cec5SDimitry Andric case Instruction::Load: 57360b57cec5SDimitry Andric break; 57370b57cec5SDimitry Andric } 57380b57cec5SDimitry Andric 57390b57cec5SDimitry Andric if (Idx == ImmIdx) { 574006c3fb27SDimitry Andric uint64_t NumConstants = divideCeil(BitSize, 64); 5741fe6060f1SDimitry Andric InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); 57420b57cec5SDimitry Andric return (Cost <= NumConstants * TTI::TCC_Basic) 57430b57cec5SDimitry Andric ? 
static_cast<int>(TTI::TCC_Free) 57440b57cec5SDimitry Andric : Cost; 57450b57cec5SDimitry Andric } 57460b57cec5SDimitry Andric 57475ffd83dbSDimitry Andric return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); 57480b57cec5SDimitry Andric } 57490b57cec5SDimitry Andric 5750fe6060f1SDimitry Andric InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, 57515ffd83dbSDimitry Andric const APInt &Imm, Type *Ty, 57525ffd83dbSDimitry Andric TTI::TargetCostKind CostKind) { 57530b57cec5SDimitry Andric assert(Ty->isIntegerTy()); 57540b57cec5SDimitry Andric 57550b57cec5SDimitry Andric unsigned BitSize = Ty->getPrimitiveSizeInBits(); 57560b57cec5SDimitry Andric // There is no cost model for constants with a bit size of 0. Return TCC_Free 57570b57cec5SDimitry Andric // here, so that constant hoisting will ignore this constant. 57580b57cec5SDimitry Andric if (BitSize == 0) 57590b57cec5SDimitry Andric return TTI::TCC_Free; 57600b57cec5SDimitry Andric 57610b57cec5SDimitry Andric switch (IID) { 57620b57cec5SDimitry Andric default: 57630b57cec5SDimitry Andric return TTI::TCC_Free; 57640b57cec5SDimitry Andric case Intrinsic::sadd_with_overflow: 57650b57cec5SDimitry Andric case Intrinsic::uadd_with_overflow: 57660b57cec5SDimitry Andric case Intrinsic::ssub_with_overflow: 57670b57cec5SDimitry Andric case Intrinsic::usub_with_overflow: 57680b57cec5SDimitry Andric case Intrinsic::smul_with_overflow: 57690b57cec5SDimitry Andric case Intrinsic::umul_with_overflow: 5770bdd1243dSDimitry Andric if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32)) 57710b57cec5SDimitry Andric return TTI::TCC_Free; 57720b57cec5SDimitry Andric break; 57730b57cec5SDimitry Andric case Intrinsic::experimental_stackmap: 5774bdd1243dSDimitry Andric if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64))) 57750b57cec5SDimitry Andric return TTI::TCC_Free; 57760b57cec5SDimitry Andric break; 57770b57cec5SDimitry Andric case Intrinsic::experimental_patchpoint_void: 
5778*0fca6ea1SDimitry Andric case Intrinsic::experimental_patchpoint: 5779bdd1243dSDimitry Andric if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64))) 57800b57cec5SDimitry Andric return TTI::TCC_Free; 57810b57cec5SDimitry Andric break; 57820b57cec5SDimitry Andric } 57835ffd83dbSDimitry Andric return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); 57840b57cec5SDimitry Andric } 57850b57cec5SDimitry Andric 5786fe6060f1SDimitry Andric InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode, 5787fe6060f1SDimitry Andric TTI::TargetCostKind CostKind, 5788fe6060f1SDimitry Andric const Instruction *I) { 57895ffd83dbSDimitry Andric if (CostKind != TTI::TCK_RecipThroughput) 57905ffd83dbSDimitry Andric return Opcode == Instruction::PHI ? 0 : 1; 57915ffd83dbSDimitry Andric // Branches are assumed to be predicted. 5792fe6060f1SDimitry Andric return 0; 57930b57cec5SDimitry Andric } 57940b57cec5SDimitry Andric 5795e8d8bef9SDimitry Andric int X86TTIImpl::getGatherOverhead() const { 5796e8d8bef9SDimitry Andric // Some CPUs have more overhead for gather. The specified overhead is relative 5797e8d8bef9SDimitry Andric // to the Load operation. "2" is the number provided by Intel architects. This 5798e8d8bef9SDimitry Andric // parameter is used for cost estimation of Gather Op and comparison with 5799e8d8bef9SDimitry Andric // other alternatives. 5800e8d8bef9SDimitry Andric // TODO: Remove the explicit hasAVX512()?, That would mean we would only 5801e8d8bef9SDimitry Andric // enable gather with a -march. 
5802e8d8bef9SDimitry Andric if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather())) 5803e8d8bef9SDimitry Andric return 2; 5804e8d8bef9SDimitry Andric 5805e8d8bef9SDimitry Andric return 1024; 5806e8d8bef9SDimitry Andric } 5807e8d8bef9SDimitry Andric 5808e8d8bef9SDimitry Andric int X86TTIImpl::getScatterOverhead() const { 5809e8d8bef9SDimitry Andric if (ST->hasAVX512()) 5810e8d8bef9SDimitry Andric return 2; 5811e8d8bef9SDimitry Andric 5812e8d8bef9SDimitry Andric return 1024; 5813e8d8bef9SDimitry Andric } 5814e8d8bef9SDimitry Andric 5815e8d8bef9SDimitry Andric // Return an average cost of Gather / Scatter instruction, maybe improved later. 5816*0fca6ea1SDimitry Andric InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode, 5817*0fca6ea1SDimitry Andric TTI::TargetCostKind CostKind, 5818*0fca6ea1SDimitry Andric Type *SrcVTy, const Value *Ptr, 5819*0fca6ea1SDimitry Andric Align Alignment, 5820fe6060f1SDimitry Andric unsigned AddressSpace) { 58210b57cec5SDimitry Andric 58220b57cec5SDimitry Andric assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost"); 58235ffd83dbSDimitry Andric unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements(); 58240b57cec5SDimitry Andric 58250b57cec5SDimitry Andric // Try to reduce index size from 64 bit (default for GEP) 58260b57cec5SDimitry Andric // to 32. It is essential for VF 16. If the index can't be reduced to 32, the 58270b57cec5SDimitry Andric // operation will use 16 x 64 indices which do not fit in a zmm and needs 58280b57cec5SDimitry Andric // to split. Also check that the base pointer is the same for all lanes, 58290b57cec5SDimitry Andric // and that there's at most one variable index. 
58305ffd83dbSDimitry Andric auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) { 58310b57cec5SDimitry Andric unsigned IndexSize = DL.getPointerSizeInBits(); 58325ffd83dbSDimitry Andric const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); 58330b57cec5SDimitry Andric if (IndexSize < 64 || !GEP) 58340b57cec5SDimitry Andric return IndexSize; 58350b57cec5SDimitry Andric 58360b57cec5SDimitry Andric unsigned NumOfVarIndices = 0; 58375ffd83dbSDimitry Andric const Value *Ptrs = GEP->getPointerOperand(); 58380b57cec5SDimitry Andric if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs)) 58390b57cec5SDimitry Andric return IndexSize; 584006c3fb27SDimitry Andric for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) { 584106c3fb27SDimitry Andric if (isa<Constant>(GEP->getOperand(I))) 58420b57cec5SDimitry Andric continue; 584306c3fb27SDimitry Andric Type *IndxTy = GEP->getOperand(I)->getType(); 58445ffd83dbSDimitry Andric if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy)) 58455ffd83dbSDimitry Andric IndxTy = IndexVTy->getElementType(); 58460b57cec5SDimitry Andric if ((IndxTy->getPrimitiveSizeInBits() == 64 && 584706c3fb27SDimitry Andric !isa<SExtInst>(GEP->getOperand(I))) || 58480b57cec5SDimitry Andric ++NumOfVarIndices > 1) 58490b57cec5SDimitry Andric return IndexSize; // 64 58500b57cec5SDimitry Andric } 58510b57cec5SDimitry Andric return (unsigned)32; 58520b57cec5SDimitry Andric }; 58530b57cec5SDimitry Andric 58540b57cec5SDimitry Andric // Trying to reduce IndexSize to 32 bits for vector 16. 58550b57cec5SDimitry Andric // By default the IndexSize is equal to pointer size. 58560b57cec5SDimitry Andric unsigned IndexSize = (ST->hasAVX512() && VF >= 16) 58570b57cec5SDimitry Andric ? 
getIndexSizeInBits(Ptr, DL) 58580b57cec5SDimitry Andric : DL.getPointerSizeInBits(); 58590b57cec5SDimitry Andric 58605ffd83dbSDimitry Andric auto *IndexVTy = FixedVectorType::get( 58615ffd83dbSDimitry Andric IntegerType::get(SrcVTy->getContext(), IndexSize), VF); 5862bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy); 5863bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy); 5864fe6060f1SDimitry Andric InstructionCost::CostType SplitFactor = 5865fe6060f1SDimitry Andric *std::max(IdxsLT.first, SrcLT.first).getValue(); 58660b57cec5SDimitry Andric if (SplitFactor > 1) { 58670b57cec5SDimitry Andric // Handle splitting of vector of pointers 58685ffd83dbSDimitry Andric auto *SplitSrcTy = 58695ffd83dbSDimitry Andric FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor); 5870*0fca6ea1SDimitry Andric return SplitFactor * getGSVectorCost(Opcode, CostKind, SplitSrcTy, Ptr, 5871*0fca6ea1SDimitry Andric Alignment, AddressSpace); 58720b57cec5SDimitry Andric } 58730b57cec5SDimitry Andric 5874*0fca6ea1SDimitry Andric // If we didn't split, this will be a single gather/scatter instruction. 5875*0fca6ea1SDimitry Andric if (CostKind == TTI::TCK_CodeSize) 5876*0fca6ea1SDimitry Andric return 1; 5877*0fca6ea1SDimitry Andric 58780b57cec5SDimitry Andric // The gather / scatter cost is given by Intel architects. It is a rough 58790b57cec5SDimitry Andric // number since we are looking at one instruction in a time. 5880*0fca6ea1SDimitry Andric const int GSOverhead = (Opcode == Instruction::Load) ? 
getGatherOverhead() 5881e8d8bef9SDimitry Andric : getScatterOverhead(); 58820b57cec5SDimitry Andric return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), 58835ffd83dbSDimitry Andric MaybeAlign(Alignment), AddressSpace, 5884*0fca6ea1SDimitry Andric CostKind); 58850b57cec5SDimitry Andric } 58860b57cec5SDimitry Andric 58870b57cec5SDimitry Andric /// Calculate the cost of Gather / Scatter operation 5888fe6060f1SDimitry Andric InstructionCost X86TTIImpl::getGatherScatterOpCost( 5889fe6060f1SDimitry Andric unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask, 5890fe6060f1SDimitry Andric Align Alignment, TTI::TargetCostKind CostKind, 58915ffd83dbSDimitry Andric const Instruction *I = nullptr) { 5892480093f4SDimitry Andric if ((Opcode == Instruction::Load && 589304eeddc0SDimitry Andric (!isLegalMaskedGather(SrcVTy, Align(Alignment)) || 589404eeddc0SDimitry Andric forceScalarizeMaskedGather(cast<VectorType>(SrcVTy), 589504eeddc0SDimitry Andric Align(Alignment)))) || 5896480093f4SDimitry Andric (Opcode == Instruction::Store && 589704eeddc0SDimitry Andric (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) || 589804eeddc0SDimitry Andric forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy), 589904eeddc0SDimitry Andric Align(Alignment))))) 5900*0fca6ea1SDimitry Andric return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask, 5901*0fca6ea1SDimitry Andric Alignment, CostKind, I); 59020b57cec5SDimitry Andric 5903*0fca6ea1SDimitry Andric assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter"); 5904*0fca6ea1SDimitry Andric PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); 5905*0fca6ea1SDimitry Andric if (!PtrTy && Ptr->getType()->isVectorTy()) 5906*0fca6ea1SDimitry Andric PtrTy = dyn_cast<PointerType>( 5907*0fca6ea1SDimitry Andric cast<VectorType>(Ptr->getType())->getElementType()); 5908*0fca6ea1SDimitry Andric assert(PtrTy && "Unexpected type for Ptr argument"); 5909*0fca6ea1SDimitry Andric unsigned 
AddressSpace = PtrTy->getAddressSpace(); 5910*0fca6ea1SDimitry Andric return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment, 5911*0fca6ea1SDimitry Andric AddressSpace); 59120b57cec5SDimitry Andric } 59130b57cec5SDimitry Andric 591481ad6265SDimitry Andric bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1, 591581ad6265SDimitry Andric const TargetTransformInfo::LSRCost &C2) { 59160b57cec5SDimitry Andric // X86 specific here are "instruction number 1st priority". 59170b57cec5SDimitry Andric return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, 59180b57cec5SDimitry Andric C1.NumIVMuls, C1.NumBaseAdds, 59190b57cec5SDimitry Andric C1.ScaleCost, C1.ImmCost, C1.SetupCost) < 59200b57cec5SDimitry Andric std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, 59210b57cec5SDimitry Andric C2.NumIVMuls, C2.NumBaseAdds, 59220b57cec5SDimitry Andric C2.ScaleCost, C2.ImmCost, C2.SetupCost); 59230b57cec5SDimitry Andric } 59240b57cec5SDimitry Andric 59250b57cec5SDimitry Andric bool X86TTIImpl::canMacroFuseCmp() { 59260b57cec5SDimitry Andric return ST->hasMacroFusion() || ST->hasBranchFusion(); 59270b57cec5SDimitry Andric } 59280b57cec5SDimitry Andric 59295ffd83dbSDimitry Andric bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) { 5930*0fca6ea1SDimitry Andric Type *ScalarTy = DataTy->getScalarType(); 5931*0fca6ea1SDimitry Andric 5932*0fca6ea1SDimitry Andric // The backend can't handle a single element vector w/o CFCMOV. 
5933*0fca6ea1SDimitry Andric if (isa<VectorType>(DataTy) && cast<FixedVectorType>(DataTy)->getNumElements() == 1) 5934*0fca6ea1SDimitry Andric return ST->hasCF() && hasConditionalLoadStoreForType(ScalarTy); 5935*0fca6ea1SDimitry Andric 59360b57cec5SDimitry Andric if (!ST->hasAVX()) 59370b57cec5SDimitry Andric return false; 59380b57cec5SDimitry Andric 59390b57cec5SDimitry Andric if (ScalarTy->isPointerTy()) 59400b57cec5SDimitry Andric return true; 59410b57cec5SDimitry Andric 59420b57cec5SDimitry Andric if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) 59430b57cec5SDimitry Andric return true; 59440b57cec5SDimitry Andric 5945fcaf7f86SDimitry Andric if (ScalarTy->isHalfTy() && ST->hasBWI()) 5946349cc55cSDimitry Andric return true; 5947349cc55cSDimitry Andric 594806c3fb27SDimitry Andric if (ScalarTy->isBFloatTy() && ST->hasBF16()) 594906c3fb27SDimitry Andric return true; 595006c3fb27SDimitry Andric 59510b57cec5SDimitry Andric if (!ScalarTy->isIntegerTy()) 59520b57cec5SDimitry Andric return false; 59530b57cec5SDimitry Andric 59540b57cec5SDimitry Andric unsigned IntWidth = ScalarTy->getIntegerBitWidth(); 59550b57cec5SDimitry Andric return IntWidth == 32 || IntWidth == 64 || 59560b57cec5SDimitry Andric ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI()); 59570b57cec5SDimitry Andric } 59580b57cec5SDimitry Andric 59595ffd83dbSDimitry Andric bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) { 59608bcb0991SDimitry Andric return isLegalMaskedLoad(DataType, Alignment); 59610b57cec5SDimitry Andric } 59620b57cec5SDimitry Andric 59638bcb0991SDimitry Andric bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) { 59640b57cec5SDimitry Andric unsigned DataSize = DL.getTypeStoreSize(DataType); 59650b57cec5SDimitry Andric // The only supported nontemporal loads are for aligned vectors of 16 or 32 59660b57cec5SDimitry Andric // bytes. 
Note that 32-byte nontemporal vector loads are supported by AVX2 59670b57cec5SDimitry Andric // (the equivalent stores only require AVX). 59680b57cec5SDimitry Andric if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32)) 59690b57cec5SDimitry Andric return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2(); 59700b57cec5SDimitry Andric 59710b57cec5SDimitry Andric return false; 59720b57cec5SDimitry Andric } 59730b57cec5SDimitry Andric 59748bcb0991SDimitry Andric bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) { 59750b57cec5SDimitry Andric unsigned DataSize = DL.getTypeStoreSize(DataType); 59760b57cec5SDimitry Andric 59770b57cec5SDimitry Andric // SSE4A supports nontemporal stores of float and double at arbitrary 59780b57cec5SDimitry Andric // alignment. 59790b57cec5SDimitry Andric if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy())) 59800b57cec5SDimitry Andric return true; 59810b57cec5SDimitry Andric 59820b57cec5SDimitry Andric // Besides the SSE4A subtarget exception above, only aligned stores are 59830b57cec5SDimitry Andric // available nontemporaly on any other subtarget. And only stores with a size 59840b57cec5SDimitry Andric // of 4..32 bytes (powers of 2, only) are permitted. 59850b57cec5SDimitry Andric if (Alignment < DataSize || DataSize < 4 || DataSize > 32 || 59860b57cec5SDimitry Andric !isPowerOf2_32(DataSize)) 59870b57cec5SDimitry Andric return false; 59880b57cec5SDimitry Andric 59890b57cec5SDimitry Andric // 32-byte vector nontemporal stores are supported by AVX (the equivalent 59900b57cec5SDimitry Andric // loads require AVX2). 
59910b57cec5SDimitry Andric if (DataSize == 32) 59920b57cec5SDimitry Andric return ST->hasAVX(); 5993349cc55cSDimitry Andric if (DataSize == 16) 59940b57cec5SDimitry Andric return ST->hasSSE1(); 59950b57cec5SDimitry Andric return true; 59960b57cec5SDimitry Andric } 59970b57cec5SDimitry Andric 599881ad6265SDimitry Andric bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy, 599981ad6265SDimitry Andric ElementCount NumElements) const { 600081ad6265SDimitry Andric // movddup 600181ad6265SDimitry Andric return ST->hasSSE3() && !NumElements.isScalable() && 600281ad6265SDimitry Andric NumElements.getFixedValue() == 2 && 600381ad6265SDimitry Andric ElementTy == Type::getDoubleTy(ElementTy->getContext()); 600481ad6265SDimitry Andric } 600581ad6265SDimitry Andric 6006*0fca6ea1SDimitry Andric bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) { 60070b57cec5SDimitry Andric if (!isa<VectorType>(DataTy)) 60080b57cec5SDimitry Andric return false; 60090b57cec5SDimitry Andric 60100b57cec5SDimitry Andric if (!ST->hasAVX512()) 60110b57cec5SDimitry Andric return false; 60120b57cec5SDimitry Andric 60130b57cec5SDimitry Andric // The backend can't handle a single element vector. 
60145ffd83dbSDimitry Andric if (cast<FixedVectorType>(DataTy)->getNumElements() == 1) 60150b57cec5SDimitry Andric return false; 60160b57cec5SDimitry Andric 60175ffd83dbSDimitry Andric Type *ScalarTy = cast<VectorType>(DataTy)->getElementType(); 60180b57cec5SDimitry Andric 60190b57cec5SDimitry Andric if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) 60200b57cec5SDimitry Andric return true; 60210b57cec5SDimitry Andric 60220b57cec5SDimitry Andric if (!ScalarTy->isIntegerTy()) 60230b57cec5SDimitry Andric return false; 60240b57cec5SDimitry Andric 60250b57cec5SDimitry Andric unsigned IntWidth = ScalarTy->getIntegerBitWidth(); 60260b57cec5SDimitry Andric return IntWidth == 32 || IntWidth == 64 || 60270b57cec5SDimitry Andric ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2()); 60280b57cec5SDimitry Andric } 60290b57cec5SDimitry Andric 6030*0fca6ea1SDimitry Andric bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) { 6031*0fca6ea1SDimitry Andric return isLegalMaskedExpandLoad(DataTy, Alignment); 60320b57cec5SDimitry Andric } 60330b57cec5SDimitry Andric 6034349cc55cSDimitry Andric bool X86TTIImpl::supportsGather() const { 60350b57cec5SDimitry Andric // Some CPUs have better gather performance than others. 60360b57cec5SDimitry Andric // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only 60370b57cec5SDimitry Andric // enable gather with a -march. 6038349cc55cSDimitry Andric return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()); 6039349cc55cSDimitry Andric } 6040349cc55cSDimitry Andric 604104eeddc0SDimitry Andric bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) { 604204eeddc0SDimitry Andric // Gather / Scatter for vector 2 is not profitable on KNL / SKX 604304eeddc0SDimitry Andric // Vector-4 of gather/scatter instruction does not exist on KNL. 
We can extend 604404eeddc0SDimitry Andric // it to 8 elements, but zeroing upper bits of the mask vector will add more 604504eeddc0SDimitry Andric // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO: 604604eeddc0SDimitry Andric // Check, maybe the gather/scatter instruction is better in the VariableMask 604704eeddc0SDimitry Andric // case. 604804eeddc0SDimitry Andric unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements(); 604904eeddc0SDimitry Andric return NumElts == 1 || 605004eeddc0SDimitry Andric (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX()))); 605104eeddc0SDimitry Andric } 605204eeddc0SDimitry Andric 60538a4dda33SDimitry Andric bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy, Align Alignment) { 60540b57cec5SDimitry Andric Type *ScalarTy = DataTy->getScalarType(); 60550b57cec5SDimitry Andric if (ScalarTy->isPointerTy()) 60560b57cec5SDimitry Andric return true; 60570b57cec5SDimitry Andric 60580b57cec5SDimitry Andric if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) 60590b57cec5SDimitry Andric return true; 60600b57cec5SDimitry Andric 60610b57cec5SDimitry Andric if (!ScalarTy->isIntegerTy()) 60620b57cec5SDimitry Andric return false; 60630b57cec5SDimitry Andric 60640b57cec5SDimitry Andric unsigned IntWidth = ScalarTy->getIntegerBitWidth(); 60650b57cec5SDimitry Andric return IntWidth == 32 || IntWidth == 64; 60660b57cec5SDimitry Andric } 60670b57cec5SDimitry Andric 60688a4dda33SDimitry Andric bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) { 60698a4dda33SDimitry Andric if (!supportsGather() || !ST->preferGather()) 60708a4dda33SDimitry Andric return false; 60718a4dda33SDimitry Andric return isLegalMaskedGatherScatter(DataTy, Alignment); 60728a4dda33SDimitry Andric } 60738a4dda33SDimitry Andric 607481ad6265SDimitry Andric bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, 607581ad6265SDimitry Andric unsigned Opcode1, 607681ad6265SDimitry Andric const 
SmallBitVector &OpcodeMask) const { 607781ad6265SDimitry Andric // ADDSUBPS 4xf32 SSE3 607881ad6265SDimitry Andric // VADDSUBPS 4xf32 AVX 607981ad6265SDimitry Andric // VADDSUBPS 8xf32 AVX2 608081ad6265SDimitry Andric // ADDSUBPD 2xf64 SSE3 608181ad6265SDimitry Andric // VADDSUBPD 2xf64 AVX 608281ad6265SDimitry Andric // VADDSUBPD 4xf64 AVX2 608381ad6265SDimitry Andric 608481ad6265SDimitry Andric unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements(); 608581ad6265SDimitry Andric assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible"); 608681ad6265SDimitry Andric if (!isPowerOf2_32(NumElements)) 608781ad6265SDimitry Andric return false; 608881ad6265SDimitry Andric // Check the opcode pattern. We apply the mask on the opcode arguments and 608981ad6265SDimitry Andric // then check if it is what we expect. 609081ad6265SDimitry Andric for (int Lane : seq<int>(0, NumElements)) { 609181ad6265SDimitry Andric unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0; 609281ad6265SDimitry Andric // We expect FSub for even lanes and FAdd for odd lanes. 609381ad6265SDimitry Andric if (Lane % 2 == 0 && Opc != Instruction::FSub) 609481ad6265SDimitry Andric return false; 609581ad6265SDimitry Andric if (Lane % 2 == 1 && Opc != Instruction::FAdd) 609681ad6265SDimitry Andric return false; 609781ad6265SDimitry Andric } 609881ad6265SDimitry Andric // Now check that the pattern is supported by the target ISA. 
609981ad6265SDimitry Andric Type *ElemTy = cast<VectorType>(VecTy)->getElementType(); 610081ad6265SDimitry Andric if (ElemTy->isFloatTy()) 610181ad6265SDimitry Andric return ST->hasSSE3() && NumElements % 4 == 0; 610281ad6265SDimitry Andric if (ElemTy->isDoubleTy()) 610381ad6265SDimitry Andric return ST->hasSSE3() && NumElements % 2 == 0; 610481ad6265SDimitry Andric return false; 610581ad6265SDimitry Andric } 610681ad6265SDimitry Andric 61075ffd83dbSDimitry Andric bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) { 61080b57cec5SDimitry Andric // AVX2 doesn't support scatter 61098a4dda33SDimitry Andric if (!ST->hasAVX512() || !ST->preferScatter()) 61100b57cec5SDimitry Andric return false; 61118a4dda33SDimitry Andric return isLegalMaskedGatherScatter(DataType, Alignment); 61120b57cec5SDimitry Andric } 61130b57cec5SDimitry Andric 61140b57cec5SDimitry Andric bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) { 61150b57cec5SDimitry Andric EVT VT = TLI->getValueType(DL, DataType); 61160b57cec5SDimitry Andric return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT); 61170b57cec5SDimitry Andric } 61180b57cec5SDimitry Andric 6119bdd1243dSDimitry Andric bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction* I) { 6120bdd1243dSDimitry Andric // FDIV is always expensive, even if it has a very low uop count. 6121bdd1243dSDimitry Andric // TODO: Still necessary for recent CPUs with low latency/throughput fdiv? 
6122bdd1243dSDimitry Andric if (I->getOpcode() == Instruction::FDiv) 6123bdd1243dSDimitry Andric return true; 6124bdd1243dSDimitry Andric 6125bdd1243dSDimitry Andric return BaseT::isExpensiveToSpeculativelyExecute(I); 6126bdd1243dSDimitry Andric } 6127bdd1243dSDimitry Andric 61280b57cec5SDimitry Andric bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) { 61290b57cec5SDimitry Andric return false; 61300b57cec5SDimitry Andric } 61310b57cec5SDimitry Andric 61320b57cec5SDimitry Andric bool X86TTIImpl::areInlineCompatible(const Function *Caller, 61330b57cec5SDimitry Andric const Function *Callee) const { 61340b57cec5SDimitry Andric const TargetMachine &TM = getTLI()->getTargetMachine(); 61350b57cec5SDimitry Andric 61360b57cec5SDimitry Andric // Work this as a subsetting of subtarget features. 61370b57cec5SDimitry Andric const FeatureBitset &CallerBits = 61380b57cec5SDimitry Andric TM.getSubtargetImpl(*Caller)->getFeatureBits(); 61390b57cec5SDimitry Andric const FeatureBitset &CalleeBits = 61400b57cec5SDimitry Andric TM.getSubtargetImpl(*Callee)->getFeatureBits(); 61410b57cec5SDimitry Andric 614204eeddc0SDimitry Andric // Check whether features are the same (apart from the ignore list). 61430b57cec5SDimitry Andric FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList; 61440b57cec5SDimitry Andric FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList; 614504eeddc0SDimitry Andric if (RealCallerBits == RealCalleeBits) 614604eeddc0SDimitry Andric return true; 614704eeddc0SDimitry Andric 614804eeddc0SDimitry Andric // If the features are a subset, we need to additionally check for calls 614904eeddc0SDimitry Andric // that may become ABI-incompatible as a result of inlining. 
615004eeddc0SDimitry Andric if ((RealCallerBits & RealCalleeBits) != RealCalleeBits) 615104eeddc0SDimitry Andric return false; 615204eeddc0SDimitry Andric 615304eeddc0SDimitry Andric for (const Instruction &I : instructions(Callee)) { 615404eeddc0SDimitry Andric if (const auto *CB = dyn_cast<CallBase>(&I)) { 6155439352acSDimitry Andric // Having more target features is fine for inline ASM. 6156439352acSDimitry Andric if (CB->isInlineAsm()) 6157439352acSDimitry Andric continue; 6158439352acSDimitry Andric 615904eeddc0SDimitry Andric SmallVector<Type *, 8> Types; 616004eeddc0SDimitry Andric for (Value *Arg : CB->args()) 616104eeddc0SDimitry Andric Types.push_back(Arg->getType()); 616204eeddc0SDimitry Andric if (!CB->getType()->isVoidTy()) 616304eeddc0SDimitry Andric Types.push_back(CB->getType()); 616404eeddc0SDimitry Andric 616504eeddc0SDimitry Andric // Simple types are always ABI compatible. 616604eeddc0SDimitry Andric auto IsSimpleTy = [](Type *Ty) { 616704eeddc0SDimitry Andric return !Ty->isVectorTy() && !Ty->isAggregateType(); 616804eeddc0SDimitry Andric }; 616904eeddc0SDimitry Andric if (all_of(Types, IsSimpleTy)) 617004eeddc0SDimitry Andric continue; 617104eeddc0SDimitry Andric 617204eeddc0SDimitry Andric if (Function *NestedCallee = CB->getCalledFunction()) { 617304eeddc0SDimitry Andric // Assume that intrinsics are always ABI compatible. 617404eeddc0SDimitry Andric if (NestedCallee->isIntrinsic()) 617504eeddc0SDimitry Andric continue; 617604eeddc0SDimitry Andric 617704eeddc0SDimitry Andric // Do a precise compatibility check. 617804eeddc0SDimitry Andric if (!areTypesABICompatible(Caller, NestedCallee, Types)) 617904eeddc0SDimitry Andric return false; 618004eeddc0SDimitry Andric } else { 618104eeddc0SDimitry Andric // We don't know the target features of the callee, 618204eeddc0SDimitry Andric // assume it is incompatible. 
618304eeddc0SDimitry Andric return false; 618404eeddc0SDimitry Andric } 618504eeddc0SDimitry Andric } 618604eeddc0SDimitry Andric } 618704eeddc0SDimitry Andric return true; 61880b57cec5SDimitry Andric } 61890b57cec5SDimitry Andric 61900eae32dcSDimitry Andric bool X86TTIImpl::areTypesABICompatible(const Function *Caller, 61910eae32dcSDimitry Andric const Function *Callee, 61920eae32dcSDimitry Andric const ArrayRef<Type *> &Types) const { 61930eae32dcSDimitry Andric if (!BaseT::areTypesABICompatible(Caller, Callee, Types)) 61940b57cec5SDimitry Andric return false; 61950b57cec5SDimitry Andric 61960b57cec5SDimitry Andric // If we get here, we know the target features match. If one function 61970b57cec5SDimitry Andric // considers 512-bit vectors legal and the other does not, consider them 61980b57cec5SDimitry Andric // incompatible. 61990b57cec5SDimitry Andric const TargetMachine &TM = getTLI()->getTargetMachine(); 62000b57cec5SDimitry Andric 62015ffd83dbSDimitry Andric if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() == 62025ffd83dbSDimitry Andric TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs()) 62035ffd83dbSDimitry Andric return true; 62045ffd83dbSDimitry Andric 62055ffd83dbSDimitry Andric // Consider the arguments compatible if they aren't vectors or aggregates. 62065ffd83dbSDimitry Andric // FIXME: Look at the size of vectors. 62075ffd83dbSDimitry Andric // FIXME: Look at the element types of aggregates to see if there are vectors. 
62080eae32dcSDimitry Andric return llvm::none_of(Types, 62090eae32dcSDimitry Andric [](Type *T) { return T->isVectorTy() || T->isAggregateType(); }); 62100b57cec5SDimitry Andric } 62110b57cec5SDimitry Andric 62120b57cec5SDimitry Andric X86TTIImpl::TTI::MemCmpExpansionOptions 62130b57cec5SDimitry Andric X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { 62140b57cec5SDimitry Andric TTI::MemCmpExpansionOptions Options; 62150b57cec5SDimitry Andric Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); 62160b57cec5SDimitry Andric Options.NumLoadsPerBlock = 2; 62175ffd83dbSDimitry Andric // All GPR and vector loads can be unaligned. 62185ffd83dbSDimitry Andric Options.AllowOverlappingLoads = true; 62190b57cec5SDimitry Andric if (IsZeroCmp) { 62200b57cec5SDimitry Andric // Only enable vector loads for equality comparison. Right now the vector 62210b57cec5SDimitry Andric // version is not as fast for three way compare (see #33329). 62220b57cec5SDimitry Andric const unsigned PreferredWidth = ST->getPreferVectorWidth(); 62235f757f3fSDimitry Andric if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512()) 62245f757f3fSDimitry Andric Options.LoadSizes.push_back(64); 6225480093f4SDimitry Andric if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32); 62260b57cec5SDimitry Andric if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16); 62270b57cec5SDimitry Andric } 62280b57cec5SDimitry Andric if (ST->is64Bit()) { 62290b57cec5SDimitry Andric Options.LoadSizes.push_back(8); 62300b57cec5SDimitry Andric } 62310b57cec5SDimitry Andric Options.LoadSizes.push_back(4); 62320b57cec5SDimitry Andric Options.LoadSizes.push_back(2); 62330b57cec5SDimitry Andric Options.LoadSizes.push_back(1); 62340b57cec5SDimitry Andric return Options; 62350b57cec5SDimitry Andric } 62360b57cec5SDimitry Andric 6237349cc55cSDimitry Andric bool X86TTIImpl::prefersVectorizedAddressing() const { 6238349cc55cSDimitry Andric return 
supportsGather(); 6239349cc55cSDimitry Andric } 6240349cc55cSDimitry Andric 6241349cc55cSDimitry Andric bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const { 6242349cc55cSDimitry Andric return false; 6243349cc55cSDimitry Andric } 6244349cc55cSDimitry Andric 62450b57cec5SDimitry Andric bool X86TTIImpl::enableInterleavedAccessVectorization() { 62460b57cec5SDimitry Andric // TODO: We expect this to be beneficial regardless of arch, 62470b57cec5SDimitry Andric // but there are currently some unexplained performance artifacts on Atom. 62480b57cec5SDimitry Andric // As a temporary solution, disable on Atom. 62490b57cec5SDimitry Andric return !(ST->isAtom()); 62500b57cec5SDimitry Andric } 62510b57cec5SDimitry Andric 62520b57cec5SDimitry Andric // Get estimation for interleaved load/store operations and strided load. 62530b57cec5SDimitry Andric // \p Indices contains indices for strided load. 62540b57cec5SDimitry Andric // \p Factor - the factor of interleaving. 62550b57cec5SDimitry Andric // AVX-512 provides 3-src shuffles that significantly reduces the cost. 6256fe6060f1SDimitry Andric InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( 62575ffd83dbSDimitry Andric unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, 62585ffd83dbSDimitry Andric ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace, 62595ffd83dbSDimitry Andric TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) { 62600b57cec5SDimitry Andric // VecTy for interleave memop is <VF*Factor x Elt>. 62610b57cec5SDimitry Andric // So, for VF=4, Interleave Factor = 3, Element type = i32 we have 62620b57cec5SDimitry Andric // VecTy = <12 x i32>. 62630b57cec5SDimitry Andric 62640b57cec5SDimitry Andric // Calculate the number of memory operations (NumOfMemOps), required 62650b57cec5SDimitry Andric // for load/store the VecTy. 
6266bdd1243dSDimitry Andric MVT LegalVT = getTypeLegalizationCost(VecTy).second; 62670b57cec5SDimitry Andric unsigned VecTySize = DL.getTypeStoreSize(VecTy); 62680b57cec5SDimitry Andric unsigned LegalVTSize = LegalVT.getStoreSize(); 62690b57cec5SDimitry Andric unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; 62700b57cec5SDimitry Andric 62710b57cec5SDimitry Andric // Get the cost of one memory operation. 62725ffd83dbSDimitry Andric auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(), 62730b57cec5SDimitry Andric LegalVT.getVectorNumElements()); 6274349cc55cSDimitry Andric InstructionCost MemOpCost; 62754824e7fdSDimitry Andric bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps; 62764824e7fdSDimitry Andric if (UseMaskedMemOp) 6277349cc55cSDimitry Andric MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment, 6278349cc55cSDimitry Andric AddressSpace, CostKind); 6279349cc55cSDimitry Andric else 6280349cc55cSDimitry Andric MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment), 6281349cc55cSDimitry Andric AddressSpace, CostKind); 62820b57cec5SDimitry Andric 62835ffd83dbSDimitry Andric unsigned VF = VecTy->getNumElements() / Factor; 6284*0fca6ea1SDimitry Andric MVT VT = 6285*0fca6ea1SDimitry Andric MVT::getVectorVT(TLI->getSimpleValueType(DL, VecTy->getScalarType()), VF); 62860b57cec5SDimitry Andric 6287349cc55cSDimitry Andric InstructionCost MaskCost; 62884824e7fdSDimitry Andric if (UseMaskedMemOp) { 6289349cc55cSDimitry Andric APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements()); 6290349cc55cSDimitry Andric for (unsigned Index : Indices) { 6291349cc55cSDimitry Andric assert(Index < Factor && "Invalid index for interleaved memory op"); 6292349cc55cSDimitry Andric for (unsigned Elm = 0; Elm < VF; Elm++) 6293349cc55cSDimitry Andric DemandedLoadStoreElts.setBit(Index + Elm * Factor); 6294349cc55cSDimitry Andric } 6295349cc55cSDimitry Andric 62964824e7fdSDimitry Andric Type *I1Type = 
Type::getInt1Ty(VecTy->getContext()); 6297349cc55cSDimitry Andric 6298349cc55cSDimitry Andric MaskCost = getReplicationShuffleCost( 62994824e7fdSDimitry Andric I1Type, Factor, VF, 6300349cc55cSDimitry Andric UseMaskForGaps ? DemandedLoadStoreElts 6301349cc55cSDimitry Andric : APInt::getAllOnes(VecTy->getNumElements()), 6302349cc55cSDimitry Andric CostKind); 6303349cc55cSDimitry Andric 6304349cc55cSDimitry Andric // The Gaps mask is invariant and created outside the loop, therefore the 6305349cc55cSDimitry Andric // cost of creating it is not accounted for here. However if we have both 6306349cc55cSDimitry Andric // a MaskForGaps and some other mask that guards the execution of the 6307349cc55cSDimitry Andric // memory access, we need to account for the cost of And-ing the two masks 6308349cc55cSDimitry Andric // inside the loop. 6309349cc55cSDimitry Andric if (UseMaskForGaps) { 63104824e7fdSDimitry Andric auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements()); 6311349cc55cSDimitry Andric MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind); 6312349cc55cSDimitry Andric } 6313349cc55cSDimitry Andric } 6314349cc55cSDimitry Andric 63150b57cec5SDimitry Andric if (Opcode == Instruction::Load) { 63160b57cec5SDimitry Andric // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl) 63170b57cec5SDimitry Andric // contain the cost of the optimized shuffle sequence that the 63180b57cec5SDimitry Andric // X86InterleavedAccess pass will generate. 63190b57cec5SDimitry Andric // The cost of loads and stores are computed separately from the table. 63200b57cec5SDimitry Andric 63210b57cec5SDimitry Andric // X86InterleavedAccess support only the following interleaved-access group. 
63220b57cec5SDimitry Andric static const CostTblEntry AVX512InterleavedLoadTbl[] = { 63230b57cec5SDimitry Andric {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8 63240b57cec5SDimitry Andric {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8 63250b57cec5SDimitry Andric {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8 63260b57cec5SDimitry Andric }; 63270b57cec5SDimitry Andric 63280b57cec5SDimitry Andric if (const auto *Entry = 63290b57cec5SDimitry Andric CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT)) 6330349cc55cSDimitry Andric return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost; 63310b57cec5SDimitry Andric //If an entry does not exist, fallback to the default implementation. 63320b57cec5SDimitry Andric 63330b57cec5SDimitry Andric // Kind of shuffle depends on number of loaded values. 63340b57cec5SDimitry Andric // If we load the entire data in one register, we can use a 1-src shuffle. 63350b57cec5SDimitry Andric // Otherwise, we'll merge 2 sources in each operation. 63360b57cec5SDimitry Andric TTI::ShuffleKind ShuffleKind = 63370b57cec5SDimitry Andric (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc; 63380b57cec5SDimitry Andric 6339bdd1243dSDimitry Andric InstructionCost ShuffleCost = getShuffleCost( 6340bdd1243dSDimitry Andric ShuffleKind, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr); 63410b57cec5SDimitry Andric 63420b57cec5SDimitry Andric unsigned NumOfLoadsInInterleaveGrp = 63430b57cec5SDimitry Andric Indices.size() ? 
Indices.size() : Factor; 63445ffd83dbSDimitry Andric auto *ResultTy = FixedVectorType::get(VecTy->getElementType(), 63455ffd83dbSDimitry Andric VecTy->getNumElements() / Factor); 6346fe6060f1SDimitry Andric InstructionCost NumOfResults = 6347bdd1243dSDimitry Andric getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp; 63480b57cec5SDimitry Andric 63490b57cec5SDimitry Andric // About a half of the loads may be folded in shuffles when we have only 63504824e7fdSDimitry Andric // one result. If we have more than one result, or the loads are masked, 63514824e7fdSDimitry Andric // we do not fold loads at all. 63520b57cec5SDimitry Andric unsigned NumOfUnfoldedLoads = 63534824e7fdSDimitry Andric UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2; 63540b57cec5SDimitry Andric 63550b57cec5SDimitry Andric // Get a number of shuffle operations per result. 63560b57cec5SDimitry Andric unsigned NumOfShufflesPerResult = 63570b57cec5SDimitry Andric std::max((unsigned)1, (unsigned)(NumOfMemOps - 1)); 63580b57cec5SDimitry Andric 63590b57cec5SDimitry Andric // The SK_MergeTwoSrc shuffle clobbers one of src operands. 63600b57cec5SDimitry Andric // When we have more than one destination, we need additional instructions 63610b57cec5SDimitry Andric // to keep sources. 6362fe6060f1SDimitry Andric InstructionCost NumOfMoves = 0; 63630b57cec5SDimitry Andric if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc) 63640b57cec5SDimitry Andric NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2; 63650b57cec5SDimitry Andric 6366fe6060f1SDimitry Andric InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost + 6367349cc55cSDimitry Andric MaskCost + NumOfUnfoldedLoads * MemOpCost + 6368349cc55cSDimitry Andric NumOfMoves; 63690b57cec5SDimitry Andric 63700b57cec5SDimitry Andric return Cost; 63710b57cec5SDimitry Andric } 63720b57cec5SDimitry Andric 63730b57cec5SDimitry Andric // Store. 
63740b57cec5SDimitry Andric assert(Opcode == Instruction::Store && 63750b57cec5SDimitry Andric "Expected Store Instruction at this point"); 63760b57cec5SDimitry Andric // X86InterleavedAccess support only the following interleaved-access group. 63770b57cec5SDimitry Andric static const CostTblEntry AVX512InterleavedStoreTbl[] = { 63780b57cec5SDimitry Andric {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store) 63790b57cec5SDimitry Andric {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store) 63800b57cec5SDimitry Andric {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store) 63810b57cec5SDimitry Andric 63820b57cec5SDimitry Andric {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store) 63830b57cec5SDimitry Andric {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store) 63840b57cec5SDimitry Andric {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store) 63850b57cec5SDimitry Andric {4, MVT::v64i8, 24} // interleave 4 x 32i8 into 256i8 (and store) 63860b57cec5SDimitry Andric }; 63870b57cec5SDimitry Andric 63880b57cec5SDimitry Andric if (const auto *Entry = 63890b57cec5SDimitry Andric CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT)) 6390349cc55cSDimitry Andric return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost; 63910b57cec5SDimitry Andric //If an entry does not exist, fallback to the default implementation. 63920b57cec5SDimitry Andric 63930b57cec5SDimitry Andric // There is no strided stores meanwhile. And store can't be folded in 63940b57cec5SDimitry Andric // shuffle. 63950b57cec5SDimitry Andric unsigned NumOfSources = Factor; // The number of values to be merged. 
6396bdd1243dSDimitry Andric InstructionCost ShuffleCost = getShuffleCost( 6397bdd1243dSDimitry Andric TTI::SK_PermuteTwoSrc, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr); 63980b57cec5SDimitry Andric unsigned NumOfShufflesPerStore = NumOfSources - 1; 63990b57cec5SDimitry Andric 64000b57cec5SDimitry Andric // The SK_MergeTwoSrc shuffle clobbers one of src operands. 64010b57cec5SDimitry Andric // We need additional instructions to keep sources. 64020b57cec5SDimitry Andric unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2; 6403fe6060f1SDimitry Andric InstructionCost Cost = 6404349cc55cSDimitry Andric MaskCost + 6405fe6060f1SDimitry Andric NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) + 64060b57cec5SDimitry Andric NumOfMoves; 64070b57cec5SDimitry Andric return Cost; 64080b57cec5SDimitry Andric } 64090b57cec5SDimitry Andric 6410fe6060f1SDimitry Andric InstructionCost X86TTIImpl::getInterleavedMemoryOpCost( 6411349cc55cSDimitry Andric unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices, 64125ffd83dbSDimitry Andric Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, 64135ffd83dbSDimitry Andric bool UseMaskForCond, bool UseMaskForGaps) { 6414349cc55cSDimitry Andric auto *VecTy = cast<FixedVectorType>(BaseTy); 6415349cc55cSDimitry Andric 641606c3fb27SDimitry Andric auto isSupportedOnAVX512 = [&](Type *VecTy) { 64175ffd83dbSDimitry Andric Type *EltTy = cast<VectorType>(VecTy)->getElementType(); 64180b57cec5SDimitry Andric if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) || 64190b57cec5SDimitry Andric EltTy->isIntegerTy(32) || EltTy->isPointerTy()) 64200b57cec5SDimitry Andric return true; 6421fcaf7f86SDimitry Andric if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy()) 642206c3fb27SDimitry Andric return ST->hasBWI(); 642306c3fb27SDimitry Andric if (EltTy->isBFloatTy()) 642406c3fb27SDimitry Andric return ST->hasBF16(); 64250b57cec5SDimitry Andric return false; 
64260b57cec5SDimitry Andric }; 642706c3fb27SDimitry Andric if (ST->hasAVX512() && isSupportedOnAVX512(VecTy)) 64285ffd83dbSDimitry Andric return getInterleavedMemoryOpCostAVX512( 6429349cc55cSDimitry Andric Opcode, VecTy, Factor, Indices, Alignment, 64305ffd83dbSDimitry Andric AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps); 6431349cc55cSDimitry Andric 6432349cc55cSDimitry Andric if (UseMaskForCond || UseMaskForGaps) 6433349cc55cSDimitry Andric return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 6434349cc55cSDimitry Andric Alignment, AddressSpace, CostKind, 6435349cc55cSDimitry Andric UseMaskForCond, UseMaskForGaps); 6436349cc55cSDimitry Andric 6437349cc55cSDimitry Andric // Get estimation for interleaved load/store operations for SSE-AVX2. 6438349cc55cSDimitry Andric // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow 6439349cc55cSDimitry Andric // computing the cost using a generic formula as a function of generic 6440349cc55cSDimitry Andric // shuffles. We therefore use a lookup table instead, filled according to 6441349cc55cSDimitry Andric // the instruction sequences that codegen currently generates. 6442349cc55cSDimitry Andric 6443349cc55cSDimitry Andric // VecTy for interleave memop is <VF*Factor x Elt>. 6444349cc55cSDimitry Andric // So, for VF=4, Interleave Factor = 3, Element type = i32 we have 6445349cc55cSDimitry Andric // VecTy = <12 x i32>. 6446bdd1243dSDimitry Andric MVT LegalVT = getTypeLegalizationCost(VecTy).second; 6447349cc55cSDimitry Andric 6448349cc55cSDimitry Andric // This function can be called with VecTy=<6xi128>, Factor=3, in which case 6449349cc55cSDimitry Andric // the VF=2, while v2i128 is an unsupported MVT vector type 6450349cc55cSDimitry Andric // (see MachineValueType.h::getVectorVT()). 
6451349cc55cSDimitry Andric if (!LegalVT.isVector()) 6452349cc55cSDimitry Andric return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 6453349cc55cSDimitry Andric Alignment, AddressSpace, CostKind); 6454349cc55cSDimitry Andric 6455349cc55cSDimitry Andric unsigned VF = VecTy->getNumElements() / Factor; 6456349cc55cSDimitry Andric Type *ScalarTy = VecTy->getElementType(); 6457349cc55cSDimitry Andric // Deduplicate entries, model floats/pointers as appropriately-sized integers. 6458349cc55cSDimitry Andric if (!ScalarTy->isIntegerTy()) 6459349cc55cSDimitry Andric ScalarTy = 6460349cc55cSDimitry Andric Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy)); 6461349cc55cSDimitry Andric 6462349cc55cSDimitry Andric // Get the cost of all the memory operations. 6463349cc55cSDimitry Andric // FIXME: discount dead loads. 6464349cc55cSDimitry Andric InstructionCost MemOpCosts = getMemoryOpCost( 6465349cc55cSDimitry Andric Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind); 6466349cc55cSDimitry Andric 6467349cc55cSDimitry Andric auto *VT = FixedVectorType::get(ScalarTy, VF); 6468349cc55cSDimitry Andric EVT ETy = TLI->getValueType(DL, VT); 6469349cc55cSDimitry Andric if (!ETy.isSimple()) 6470349cc55cSDimitry Andric return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 6471349cc55cSDimitry Andric Alignment, AddressSpace, CostKind); 6472349cc55cSDimitry Andric 6473349cc55cSDimitry Andric // TODO: Complete for other data-types and strides. 6474349cc55cSDimitry Andric // Each combination of Stride, element bit width and VF results in a different 6475349cc55cSDimitry Andric // sequence; The cost tables are therefore accessed with: 6476349cc55cSDimitry Andric // Factor (stride) and VectorType=VFxiN. 6477349cc55cSDimitry Andric // The Cost accounts only for the shuffle sequence; 6478349cc55cSDimitry Andric // The cost of the loads/stores is accounted for separately. 
6479349cc55cSDimitry Andric // 6480349cc55cSDimitry Andric static const CostTblEntry AVX2InterleavedLoadTbl[] = { 6481349cc55cSDimitry Andric {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8 6482349cc55cSDimitry Andric {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8 6483349cc55cSDimitry Andric {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8 6484349cc55cSDimitry Andric {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8 6485349cc55cSDimitry Andric {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8 6486349cc55cSDimitry Andric 6487349cc55cSDimitry Andric {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16 6488349cc55cSDimitry Andric {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16 6489349cc55cSDimitry Andric {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16 6490349cc55cSDimitry Andric 6491349cc55cSDimitry Andric {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32 6492349cc55cSDimitry Andric {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32 6493349cc55cSDimitry Andric {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32 6494349cc55cSDimitry Andric 6495349cc55cSDimitry Andric {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64 6496349cc55cSDimitry Andric {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64 6497349cc55cSDimitry Andric {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64 6498349cc55cSDimitry Andric {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64 6499349cc55cSDimitry Andric 6500349cc55cSDimitry Andric {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8 6501349cc55cSDimitry Andric {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8 6502349cc55cSDimitry Andric {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8 6503349cc55cSDimitry Andric {3, MVT::v16i8, 11}, // (load 48i8 and) 
deinterleave into 3 x 16i8 6504349cc55cSDimitry Andric {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8 6505349cc55cSDimitry Andric 6506349cc55cSDimitry Andric {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16 6507349cc55cSDimitry Andric {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16 6508349cc55cSDimitry Andric {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16 6509349cc55cSDimitry Andric {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16 6510349cc55cSDimitry Andric {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16 6511349cc55cSDimitry Andric 6512349cc55cSDimitry Andric {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32 6513349cc55cSDimitry Andric {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32 6514349cc55cSDimitry Andric {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32 6515349cc55cSDimitry Andric {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32 6516349cc55cSDimitry Andric {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32 6517349cc55cSDimitry Andric 6518349cc55cSDimitry Andric {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64 6519349cc55cSDimitry Andric {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64 6520349cc55cSDimitry Andric {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64 6521349cc55cSDimitry Andric {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64 6522349cc55cSDimitry Andric 6523349cc55cSDimitry Andric {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8 6524349cc55cSDimitry Andric {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8 6525349cc55cSDimitry Andric {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8 6526349cc55cSDimitry Andric {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8 6527349cc55cSDimitry Andric {4, MVT::v32i8, 56}, // 
(load 128i8 and) deinterleave into 4 x 32i8 6528349cc55cSDimitry Andric 6529349cc55cSDimitry Andric {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16 6530349cc55cSDimitry Andric {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16 6531349cc55cSDimitry Andric {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16 6532349cc55cSDimitry Andric {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16 6533349cc55cSDimitry Andric {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16 6534349cc55cSDimitry Andric 6535349cc55cSDimitry Andric {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32 6536349cc55cSDimitry Andric {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32 6537349cc55cSDimitry Andric {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32 6538349cc55cSDimitry Andric {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32 6539349cc55cSDimitry Andric {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32 6540349cc55cSDimitry Andric 6541349cc55cSDimitry Andric {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64 6542349cc55cSDimitry Andric {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64 6543349cc55cSDimitry Andric {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64 6544349cc55cSDimitry Andric {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64 6545349cc55cSDimitry Andric 6546349cc55cSDimitry Andric {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8 6547349cc55cSDimitry Andric {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8 6548349cc55cSDimitry Andric {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8 6549349cc55cSDimitry Andric {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8 6550349cc55cSDimitry Andric {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8 6551349cc55cSDimitry 
Andric 6552349cc55cSDimitry Andric {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16 6553349cc55cSDimitry Andric {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16 6554349cc55cSDimitry Andric {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16 6555349cc55cSDimitry Andric {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16 6556349cc55cSDimitry Andric {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16 6557349cc55cSDimitry Andric 6558349cc55cSDimitry Andric {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32 6559349cc55cSDimitry Andric {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32 6560349cc55cSDimitry Andric {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32 6561349cc55cSDimitry Andric {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32 6562349cc55cSDimitry Andric 6563349cc55cSDimitry Andric {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64 6564349cc55cSDimitry Andric {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64 6565349cc55cSDimitry Andric {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64 6566349cc55cSDimitry Andric 6567349cc55cSDimitry Andric {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32 6568349cc55cSDimitry Andric }; 6569349cc55cSDimitry Andric 6570349cc55cSDimitry Andric static const CostTblEntry SSSE3InterleavedLoadTbl[] = { 6571349cc55cSDimitry Andric {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16 6572349cc55cSDimitry Andric }; 6573349cc55cSDimitry Andric 6574349cc55cSDimitry Andric static const CostTblEntry SSE2InterleavedLoadTbl[] = { 6575349cc55cSDimitry Andric {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16 6576349cc55cSDimitry Andric {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16 6577349cc55cSDimitry Andric 6578349cc55cSDimitry Andric {2, MVT::v2i32, 2}, // (load 
4i32 and) deinterleave into 2 x 2i32 6579349cc55cSDimitry Andric {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32 6580349cc55cSDimitry Andric 6581349cc55cSDimitry Andric {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64 6582349cc55cSDimitry Andric }; 6583349cc55cSDimitry Andric 6584349cc55cSDimitry Andric static const CostTblEntry AVX2InterleavedStoreTbl[] = { 6585349cc55cSDimitry Andric {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store) 6586349cc55cSDimitry Andric {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store) 6587349cc55cSDimitry Andric 6588349cc55cSDimitry Andric {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store) 6589349cc55cSDimitry Andric {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store) 6590349cc55cSDimitry Andric {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store) 6591349cc55cSDimitry Andric 6592349cc55cSDimitry Andric {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store) 6593349cc55cSDimitry Andric {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store) 6594349cc55cSDimitry Andric {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store) 6595349cc55cSDimitry Andric {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store) 6596349cc55cSDimitry Andric 6597349cc55cSDimitry Andric {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store) 6598349cc55cSDimitry Andric {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store) 6599349cc55cSDimitry Andric {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store) 6600349cc55cSDimitry Andric {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store) 6601349cc55cSDimitry Andric {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store) 6602349cc55cSDimitry Andric 6603349cc55cSDimitry Andric {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store) 6604349cc55cSDimitry Andric {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 
12i8 (and store) 6605349cc55cSDimitry Andric {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store) 6606349cc55cSDimitry Andric {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store) 6607349cc55cSDimitry Andric {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store) 6608349cc55cSDimitry Andric 6609349cc55cSDimitry Andric {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store) 6610349cc55cSDimitry Andric {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store) 6611349cc55cSDimitry Andric {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store) 6612349cc55cSDimitry Andric {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store) 6613349cc55cSDimitry Andric {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store) 6614349cc55cSDimitry Andric 6615349cc55cSDimitry Andric {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store) 6616349cc55cSDimitry Andric {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store) 6617349cc55cSDimitry Andric {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store) 6618349cc55cSDimitry Andric {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store) 6619349cc55cSDimitry Andric {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store) 6620349cc55cSDimitry Andric 6621349cc55cSDimitry Andric {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store) 6622349cc55cSDimitry Andric {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store) 6623349cc55cSDimitry Andric {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store) 6624349cc55cSDimitry Andric {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store) 6625349cc55cSDimitry Andric 6626349cc55cSDimitry Andric {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store) 6627349cc55cSDimitry Andric {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store) 6628349cc55cSDimitry Andric {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 
(and store) 6629349cc55cSDimitry Andric {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store) 6630349cc55cSDimitry Andric {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store) 6631349cc55cSDimitry Andric 6632349cc55cSDimitry Andric {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store) 6633349cc55cSDimitry Andric {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store) 6634349cc55cSDimitry Andric {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store) 6635349cc55cSDimitry Andric {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store) 6636349cc55cSDimitry Andric {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store) 6637349cc55cSDimitry Andric 6638349cc55cSDimitry Andric {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store) 6639349cc55cSDimitry Andric {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store) 6640349cc55cSDimitry Andric {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store) 6641349cc55cSDimitry Andric {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store) 6642349cc55cSDimitry Andric {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store) 6643349cc55cSDimitry Andric 6644349cc55cSDimitry Andric {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store) 6645349cc55cSDimitry Andric {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store) 6646349cc55cSDimitry Andric {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store) 6647349cc55cSDimitry Andric {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store) 6648349cc55cSDimitry Andric 6649349cc55cSDimitry Andric {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store) 6650349cc55cSDimitry Andric {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store) 6651349cc55cSDimitry Andric {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store) 6652349cc55cSDimitry Andric {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 
96i8 (and store) 6653349cc55cSDimitry Andric {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store) 6654349cc55cSDimitry Andric 6655349cc55cSDimitry Andric {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store) 6656349cc55cSDimitry Andric {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store) 6657349cc55cSDimitry Andric {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store) 6658349cc55cSDimitry Andric {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store) 6659349cc55cSDimitry Andric {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store) 6660349cc55cSDimitry Andric 6661349cc55cSDimitry Andric {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store) 6662349cc55cSDimitry Andric {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store) 6663349cc55cSDimitry Andric {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store) 6664349cc55cSDimitry Andric {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store) 6665349cc55cSDimitry Andric 6666349cc55cSDimitry Andric {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store) 6667349cc55cSDimitry Andric {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store) 6668349cc55cSDimitry Andric {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store) 6669349cc55cSDimitry Andric }; 6670349cc55cSDimitry Andric 6671349cc55cSDimitry Andric static const CostTblEntry SSE2InterleavedStoreTbl[] = { 6672349cc55cSDimitry Andric {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store) 6673349cc55cSDimitry Andric {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store) 6674349cc55cSDimitry Andric {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store) 6675349cc55cSDimitry Andric 6676349cc55cSDimitry Andric {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store) 6677349cc55cSDimitry Andric {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store) 6678349cc55cSDimitry Andric 
6679349cc55cSDimitry Andric {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store) 6680349cc55cSDimitry Andric }; 6681349cc55cSDimitry Andric 6682349cc55cSDimitry Andric if (Opcode == Instruction::Load) { 6683349cc55cSDimitry Andric auto GetDiscountedCost = [Factor, NumMembers = Indices.size(), 6684349cc55cSDimitry Andric MemOpCosts](const CostTblEntry *Entry) { 6685349cc55cSDimitry Andric // NOTE: this is just an approximation! 6686349cc55cSDimitry Andric // It can over/under -estimate the cost! 6687349cc55cSDimitry Andric return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor); 6688349cc55cSDimitry Andric }; 6689349cc55cSDimitry Andric 66900b57cec5SDimitry Andric if (ST->hasAVX2()) 6691349cc55cSDimitry Andric if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor, 6692349cc55cSDimitry Andric ETy.getSimpleVT())) 6693349cc55cSDimitry Andric return GetDiscountedCost(Entry); 6694349cc55cSDimitry Andric 6695349cc55cSDimitry Andric if (ST->hasSSSE3()) 6696349cc55cSDimitry Andric if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor, 6697349cc55cSDimitry Andric ETy.getSimpleVT())) 6698349cc55cSDimitry Andric return GetDiscountedCost(Entry); 6699349cc55cSDimitry Andric 6700349cc55cSDimitry Andric if (ST->hasSSE2()) 6701349cc55cSDimitry Andric if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor, 6702349cc55cSDimitry Andric ETy.getSimpleVT())) 6703349cc55cSDimitry Andric return GetDiscountedCost(Entry); 6704349cc55cSDimitry Andric } else { 6705349cc55cSDimitry Andric assert(Opcode == Instruction::Store && 6706349cc55cSDimitry Andric "Expected Store Instruction at this point"); 6707349cc55cSDimitry Andric assert((!Indices.size() || Indices.size() == Factor) && 6708349cc55cSDimitry Andric "Interleaved store only supports fully-interleaved groups."); 6709349cc55cSDimitry Andric if (ST->hasAVX2()) 6710349cc55cSDimitry Andric if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor, 
6711349cc55cSDimitry Andric ETy.getSimpleVT())) 6712349cc55cSDimitry Andric return MemOpCosts + Entry->Cost; 6713349cc55cSDimitry Andric 6714349cc55cSDimitry Andric if (ST->hasSSE2()) 6715349cc55cSDimitry Andric if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor, 6716349cc55cSDimitry Andric ETy.getSimpleVT())) 6717349cc55cSDimitry Andric return MemOpCosts + Entry->Cost; 6718349cc55cSDimitry Andric } 67190b57cec5SDimitry Andric 67200b57cec5SDimitry Andric return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 67215ffd83dbSDimitry Andric Alignment, AddressSpace, CostKind, 67220b57cec5SDimitry Andric UseMaskForCond, UseMaskForGaps); 67230b57cec5SDimitry Andric }

/// Cost of using the addressing mode described by \p BaseGV, \p BaseOffset,
/// \p HasBaseReg and \p Scale for an access of type \p Ty in address space
/// \p AddrSpace.
///
/// \returns 0 if the addressing mode is legal and \p Scale is zero, 1 if it
/// is legal and \p Scale is non-zero, and -1 if the target lowering reports
/// the addressing mode as not legal.
InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                                 StackOffset BaseOffset,
                                                 bool HasBaseReg, int64_t Scale,
                                                 unsigned AddrSpace) const {
  // A scaling factor is never free.
  // A folded, indexed instruction of the form inst (reg1, reg2, scale) takes
  // two allocations in the out-of-order engine, where the plain form
  // inst (reg1) takes only one.
  // For example,
  //   vaddps (%rsi,%rdx), %ymm0, %ymm1
  // needs two allocations (one for the load and one for the computation),
  // while
  //   vaddps (%rsi), %ymm0, %ymm1
  // needs a single allocation, which leaves allocations available to other
  // operations and means fewer micro operations to execute.
  //
  // On some X86 architectures the situation is worse still: for stores, a
  // complex addressing mode forces the instruction onto the "load" ports
  // rather than the dedicated "store" port.
  // For example, on Haswell:
  //   vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
  //   vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
  TargetLoweringBase::AddrMode Mode;
  Mode.BaseGV = BaseGV;
  Mode.BaseOffs = BaseOffset.getFixed();
  Mode.HasBaseReg = HasBaseReg;
  Mode.Scale = Scale;
  Mode.ScalableOffset = BaseOffset.getScalable();

  // An addressing mode the target cannot encode is reported with -1.
  if (!getTLI()->isLegalAddressingMode(DL, Mode, Ty, AddrSpace))
    return -1;

  // Scale stands for reg2 * scale, so charge 1 as soon as a second register
  // comes into play.
  return Mode.Scale != 0;
}

/// Branch misprediction penalty reported for X86.
///
/// \returns the fixed value 14; no subtarget information is consulted.
InstructionCost X86TTIImpl::getBranchMispredictPenalty() const {
  // TODO: Hook MispredictPenalty of SchedMachineModel into this.
  constexpr int FixedMispredictPenalty = 14;
  return FixedMispredictPenalty;
}