//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// A note about the cost model numbers used below: they correspond to some
/// "generic" X86 CPU rather than a specific CPU model. Usually the numbers
/// correspond to the CPU where the feature first appeared. For example, if we
/// do Subtarget.hasSSE42() in the lookups below, the cost is based on Nehalem
/// as that was the first CPU to support that feature level and thus most
/// likely has the worst case cost, although we may discard an outlying worst
/// cost from one CPU (e.g. Atom).
///
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem / Silvermont
///   AVX     - Sandy Bridge / Jaguar / Bulldozer
///   AVX2    - Haswell / Ryzen
///   AVX-512 - Xeon Phi / Skylake
///
/// And some examples of instruction target dependent costs (latency):
///                   divss     sqrtss    rsqrtss
///   AMD K7          11-16     19        3
///   Piledriver      9-24      13-15     5
///   Jaguar          14        16        2
///   Pentium II,III  18        30        2
///   Nehalem         7-14      7-18      3
///   Haswell         10-13     11        5
///
/// Interpreting the 4 TargetCostKind types:
/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
/// values reported by the CPU scheduler models (and llvm-mca).
/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
/// actual encoding size of the instruction.
/// TCK_SizeAndLatency should match the worst case micro-op counts reported by
/// the CPU scheduler models (and llvm-mca), to ensure that they are
/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values
/// which are often used as the cost thresholds where TCK_SizeAndLatency is
/// requested.
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

// Helper struct to store/access costs for each cost kind.
// TODO: Move this to allow other targets to use it?
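// A cost left at its default of ~0U is treated as unknown: operator[] below
// returns std::nullopt for it, letting table lookups fall through to the
// generic cost computation for that cost kind.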
struct CostKindCosts {
  unsigned RecipThroughputCost = ~0U;
  unsigned LatencyCost = ~0U;
  unsigned CodeSizeCost = ~0U;
  unsigned SizeAndLatencyCost = ~0U;

  std::optional<unsigned>
  operator[](TargetTransformInfo::TargetCostKind Kind) const {
    unsigned Cost = ~0U;
    switch (Kind) {
    case TargetTransformInfo::TCK_RecipThroughput:
      Cost = RecipThroughputCost;
      break;
    case TargetTransformInfo::TCK_Latency:
      Cost = LatencyCost;
      break;
    case TargetTransformInfo::TCK_CodeSize:
      Cost = CodeSizeCost;
      break;
    case TargetTransformInfo::TCK_SizeAndLatency:
      Cost = SizeAndLatencyCost;
      break;
    }
    if (Cost == ~0U)
      return std::nullopt;
    return Cost;
  }
};
using CostKindTblEntry = CostTblEntryT<CostKindCosts>;
using TypeConversionCostKindTblEntry =
    TypeConversionCostTblEntryT<CostKindCosts>;

TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  // instructions is inefficient. Once the problem is fixed, we should
  // call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

std::optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    // - Penryn
    // - Nehalem
    // - Westmere
    // - Sandy Bridge
    // - Ivy Bridge
    // - Haswell
    // - Broadwell
    // - Skylake
    // - Kabylake
    return 32 * 1024; // 32 KByte
  case TargetTransformInfo::CacheLevel::L2D:
    // - Penryn
    // - Nehalem
    // - Westmere
    // - Sandy Bridge
    // - Ivy Bridge
    // - Haswell
    // - Broadwell
    // - Skylake
    // - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  // - Penryn
  // - Nehalem
  // - Westmere
  // - Sandy Bridge
  // - Ivy Bridge
  // - Haswell
  // - Broadwell
  // - Skylake
  // - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    [[fallthrough]];
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    if (!Vector && ST->hasEGPR())
      return 32;
    return 16;
  }
  return 8;
}

bool X86TTIImpl::hasConditionalLoadStoreForType(Type *Ty) const {
  if (!ST->hasCF())
    return false;
  if (!Ty)
    return true;
  // Conditional faulting is supported by CFCMOV, which only accepts
  // 16/32/64-bit operands.
  // TODO: Support f32/f64 with VMOVSS/VMOVSD with zero mask when it's
  // profitable.
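  // e.g. i32 and <1 x i64> qualify below, while i8 (too narrow) and
  // <4 x i32> (more than one element) do not.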
  auto *VTy = dyn_cast<FixedVectorType>(Ty);
  if (!Ty->isIntegerTy() && (!VTy || VTy->getNumElements() != 1))
    return false;
  auto *ScalarTy = Ty->getScalarType();
  switch (cast<IntegerType>(ScalarTy)->getBitWidth()) {
  default:
    return false;
  case 16:
  case 32:
  case 64:
    return true;
  }
}

TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512)
      return TypeSize::getFixed(512);
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return TypeSize::getFixed(256);
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return TypeSize::getFixed(128);
    return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
      .getFixedValue();
}

unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller unroll the loop instead, which saves the
  // overflow check and memory check cost.
  if (VF.isScalar())
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

InstructionCost X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // vXi8 multiplications are always promoted to vXi16.
  // Sub-128-bit types can be extended/packed more efficiently.
  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
      Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
    Type *WideVecTy =
        VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
    return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info,
                                  Op2Info);
  }

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
      (LT.second.getScalarType() == MVT::i32 ||
       LT.second.getScalarType() == MVT::i64)) {
    // Check if the operands can be represented as a smaller datatype.
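    // e.g. a v2i64 multiply whose operands are both zero-extended from i32
    // can be performed with a single PMULUDQ (see below).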
    bool Op1Signed = false, Op2Signed = false;
    unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
    unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
    unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
    bool SignedMode = Op1Signed || Op2Signed;

    // If both vXi32 are representable as i15 and at least one is constant,
    // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
    // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
    if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
        LT.second.getScalarType() == MVT::i32) {
      bool Op1Constant =
          isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
      bool Op2Constant =
          isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
      bool Op1Sext = isa<SExtInst>(Args[0]) &&
                     (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
      bool Op2Sext = isa<SExtInst>(Args[1]) &&
                     (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));

      bool IsZeroExtended = !Op1Signed || !Op2Signed;
      bool IsConstant = Op1Constant || Op2Constant;
      bool IsSext = Op1Sext || Op2Sext;
      if (IsConstant || IsZeroExtended || IsSext)
        LT.second =
            MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
    }

    // Check if the vXi32 operands can be shrunk into a smaller datatype.
    // This should match the codegen from reduceVMULWidth.
    // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
    if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }

    // If both vXi64 are representable as (unsigned) i32, then we can perform
    // the multiply with a single PMULUDQ instruction.
    // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
    if (!SignedMode && OpMinSize <= 32 &&
        LT.second.getScalarType() == MVT::i64)
      ISD = X86ISD::PMULUDQ;
  }

  // Vector multiply by pow2 will be simplified to shifts.
  // Vector multiply by -pow2 will be simplified to shifts/negates.
  if (ISD == ISD::MUL && Op2Info.isConstant() &&
      (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
    InstructionCost Cost =
        getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
                               Op1Info.getNoProps(), Op2Info.getNoProps());
    if (Op2Info.isNegatedPowerOf2())
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
    return Cost;
  }

  // On X86, vector signed division by a power-of-two constant is normally
  // expanded to the sequence SRA + SRL + ADD + SRA.
  // The OperandValue properties may not be the same as that of the previous
  // operation; conservatively assume OP_None.
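  // e.g. for i32 X sdiv 8: T = X >>s 31 (sra); T = T >>u 29 (srl);
  // X = X + T (add); Res = X >>s 3 (sra).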
  if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    InstructionCost Cost =
        2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());

    if (ISD == ISD::SREM) {
      // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
      Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                     Op1Info.getNoProps(),
                                     Op2Info.getNoProps());
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
                                     Op1Info.getNoProps(),
                                     Op2Info.getNoProps());
    }

    return Cost;
  }

  // Vector unsigned division/remainder will be simplified to shifts/masks.
  if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                    Op1Info.getNoProps(),
                                    Op2Info.getNoProps());
    // UREM
    return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
                                  Op1Info.getNoProps(), Op2Info.getNoProps());
  }

  static const CostKindTblEntry GFNIUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
    if (const auto *Entry =
            CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psraw
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v64i8,  { 2, 12,  5,  6 } }, // psllw + pand.
    { ISD::SRL,  MVT::v64i8,  { 2, 12,  5,  6 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v64i8,  { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v16i16, { 2,  7,  4,  4 } }, // psllw + split.
    { ISD::SRL,  MVT::v16i16, { 2,  7,  4,  4 } }, // psrlw + split.
    { ISD::SRA,  MVT::v16i16, { 2,  7,  4,  4 } }, // psraw + split.

    { ISD::SHL,  MVT::v8i32,  { 1,  1,  1,  1 } }, // pslld
    { ISD::SRL,  MVT::v8i32,  { 1,  1,  1,  1 } }, // psrld
    { ISD::SRA,  MVT::v8i32,  { 1,  1,  1,  1 } }, // psrad
    { ISD::SHL,  MVT::v16i32, { 1,  1,  1,  1 } }, // pslld
    { ISD::SRL,  MVT::v16i32, { 1,  1,  1,  1 } }, // psrld
    { ISD::SRA,  MVT::v16i32, { 1,  1,  1,  1 } }, // psrad

    { ISD::SRA,  MVT::v2i64,  { 1,  1,  1,  1 } }, // psraq
    { ISD::SHL,  MVT::v4i64,  { 1,  1,  1,  1 } }, // psllq
    { ISD::SRL,  MVT::v4i64,  { 1,  1,  1,  1 } }, // psrlq
    { ISD::SRA,  MVT::v4i64,  { 1,  1,  1,  1 } }, // psraq
    { ISD::SHL,  MVT::v8i64,  { 1,  1,  1,  1 } }, // psllq
    { ISD::SRL,  MVT::v8i64,  { 1,  1,  1,  1 } }, // psrlq
    { ISD::SRA,  MVT::v8i64,  { 1,  1,  1,  1 } }, // psraq

    { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1,  8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1,  8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 2,  8, 2, 4 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 2,  8, 2, 4 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16,  { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v8i16,  { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v8i16,  { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v16i16, { 2, 2, 1, 2 } }, // psllw
    { ISD::SRL, MVT::v16i16, { 2, 2, 1, 2 } }, // psrlw
    { ISD::SRA, MVT::v16i16, { 2, 2, 1, 2 } }, // psraw

    { ISD::SHL, MVT::v4i32,  { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v4i32,  { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v4i32,  { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL, MVT::v8i32,  { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v8i32,  { 2, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v8i32,  { 2, 2, 1, 2 } }, // psrad

    { ISD::SHL, MVT::v2i64,  { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v2i64,  { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v2i64,  { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL, MVT::v4i64,  { 2, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64,  { 2, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64,  { 4, 4, 3, 6 } }, // psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8,  { 2, 7,  2,  3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  { 2, 7,  2,  3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  { 3, 9,  5,  6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  { 4, 7,  7,  8 } }, // 2*(psllw + pand) + split.
    { ISD::SRL, MVT::v32i8,  { 4, 7,  7,  8 } }, // 2*(psrlw + pand) + split.
    { ISD::SRA, MVT::v32i8,  { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SHL, MVT::v8i16,  { 1, 2,  1,  1 } }, // psllw.
    { ISD::SRL, MVT::v8i16,  { 1, 2,  1,  1 } }, // psrlw.
    { ISD::SRA, MVT::v8i16,  { 1, 2,  1,  1 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 3, 6,  4,  5 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 3, 6,  4,  5 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 3, 6,  4,  5 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32,  { 1, 2,  1,  1 } }, // pslld.
    { ISD::SRL, MVT::v4i32,  { 1, 2,  1,  1 } }, // psrld.
    { ISD::SRA, MVT::v4i32,  { 1, 2,  1,  1 } }, // psrad.
    { ISD::SHL, MVT::v8i32,  { 3, 6,  4,  5 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32,  { 3, 6,  4,  5 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32,  { 3, 6,  4,  5 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64,  { 1, 2,  1,  1 } }, // psllq.
    { ISD::SRL, MVT::v2i64,  { 1, 2,  1,  1 } }, // psrlq.
    { ISD::SRA, MVT::v2i64,  { 2, 3,  3,  3 } }, // psrad + shuffle.
    { ISD::SHL, MVT::v4i64,  { 3, 6,  4,  5 } }, // 2 x psllq + split.
    { ISD::SRL, MVT::v4i64,  { 3, 6,  4,  5 } }, // 2 x psrlq + split.
    { ISD::SRA, MVT::v4i64,  { 5, 7,  8,  9 } }, // 2 x psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  // XOP has faster vXi8 shifts.
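  // (vXi8 entries are skipped here so such shifts can instead hit the
  // XOPShiftCostTable lookup further below.)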
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8,  { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16,  { 1, 1, 1, 1 } }, // psllw.
    { ISD::SRL, MVT::v8i16,  { 1, 1, 1, 1 } }, // psrlw.
    { ISD::SRA, MVT::v8i16,  { 1, 1, 1, 1 } }, // psraw.

    { ISD::SHL, MVT::v4i32,  { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v4i32,  { 1, 1, 1, 1 } }, // psrld.
    { ISD::SRA, MVT::v4i32,  { 1, 1, 1, 1 } }, // psrad.

    { ISD::SHL, MVT::v2i64,  { 1, 1, 1, 1 } }, // psllq.
    { ISD::SRL, MVT::v2i64,  { 1, 1, 1, 1 } }, // psrlq.
    { ISD::SRA, MVT::v2i64,  { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, {  6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16, {  8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, {  6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, {  8 } }, // vpmulhuw+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v16i16, {  6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16, {  8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, {  6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16, {  8 } }, // vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v8i32,  { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32,  { 19 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32,  { 19 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8,  { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v32i8,  { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8,  { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v8i32,  { 32 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v8i32,  { 38 } }, // vpmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  { 32 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32,  { 42 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  if (Op2Info.isConstant() && ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE41ConstCostTable[] = {
    { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE41())
    if (const auto *Entry =
            CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v8i16, {  6 } }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16, {  8 } }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v8i16, {  6 } }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16, {  8 } }, // pmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8,  { 3, 5, 5, 7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  { 3,10, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8,  { 4, 8, 7, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8,  { 5,10,10,13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8,  { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8,  { 4, 8, 7,10 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8,  { 5,10,10,15 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psraw
  };

  if (ST->hasBWI() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformCostTable[] = {
    { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad

    { ISD::SRA, MVT::v2i64,  { 1, 2, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v4i64,  { 1, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64,  { 1, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64,  { 1, 4, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v8i64,  { 1, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v8i64,  { 1, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v8i64,  { 1, 4, 1, 2 } }, // psraq
  };

  if (ST->hasAVX512() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8,  { 3, 5, 5, 7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  { 3, 9, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8,  { 4, 8, 7, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8,  { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16,  { 1, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16,  { 1, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16,  { 1, 2, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
    { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
    { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.

    { ISD::SHL, MVT::v4i32,  { 1, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32,  { 1, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v4i32,  { 1, 2, 1, 2 } }, // psrad
    { ISD::SHL, MVT::v8i32,  { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v8i32,  { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v8i32,  { 2, 4, 2, 3 } }, // psrad

    { ISD::SHL, MVT::v2i64,  { 1, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v2i64,  { 1, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v2i64,  { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64,  { 2, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64,  { 2, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64,  { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
  };

  if (ST->hasAVX2() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8,  {  4,  4, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  {  4,  8, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  {  6,  6, 9,13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  {  7,  8,11,14 } }, // psllw + pand + split.
    { ISD::SRL, MVT::v32i8,  {  7,  9,10,14 } }, // psrlw + pand + split.
    { ISD::SRA, MVT::v32i8,  { 10, 11,16,21 } }, // psrlw, pand, pxor, psubb + split.

    { ISD::SHL, MVT::v8i16,  {  1,  3, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16,  {  1,  3, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16,  {  1,  3, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, {  3,  7, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, {  3,  7, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, {  3,  7, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32,  {  1,  3, 1, 2 } }, // pslld.
    { ISD::SRL, MVT::v4i32,  {  1,  3, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32,  {  1,  3, 1, 2 } }, // psrad.
    { ISD::SHL, MVT::v8i32,  {  3,  7, 5, 7 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32,  {  3,  7, 5, 7 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32,  {  3,  7, 5, 7 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64,  {  1,  3, 1, 2 } }, // psllq.
    { ISD::SRL, MVT::v2i64,  {  1,  3, 1, 2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64,  {  3,  4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64,  {  3,  7, 4, 6 } }, // psllq + split.
    { ISD::SRL, MVT::v4i64,  {  3,  7, 4, 6 } }, // psrlq + split.
    { ISD::SRA, MVT::v4i64,  {  6,  7,10,13 } }, // 2 x (2 x psrad + shuffle) + split.
  };

  // XOP has faster vXi8 shifts.
  if (ST->hasAVX() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8,  {  9, 10, 6, 9 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  {  9, 13, 5, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  { 11, 15, 9,13 } }, // pcmpgtb sequence.

    { ISD::SHL, MVT::v8i16,  {  2,  2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16,  {  2,  2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16,  {  2,  2, 1, 2 } }, // psraw.

    { ISD::SHL, MVT::v4i32,  {  2,  2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32,  {  2,  2, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32,  {  2,  2, 1, 2 } }, // psrad.

    { ISD::SHL, MVT::v2i64,  {  2,  2, 1, 2 } }, // psllq.
    { ISD::SRL, MVT::v2i64,  {  2,  2, 1, 2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64,  {  5,  9, 5, 7 } }, // 2*psrlq + xor + sub.
  };

  if (ST->hasSSE2() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512DQCostTable[] = {
    { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } }  // pmullq
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWCostTable[] = {
    { ISD::SHL, MVT::v16i8,  {  4,  8, 4, 5 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v16i8,  {  4,  8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v16i8,  {  4,  8, 4, 5 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v32i8,  {  4, 23,11,16 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v32i8,  {  4, 30,12,18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v32i8,  {  6, 13,24,30 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v64i8,  {  6, 19,13,15 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v64i8,  {  7, 27,15,18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v64i8,  { 15, 15,30,30 } }, // extend/vpsravw/pack sequence.

    { ISD::SHL, MVT::v8i16,  { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v8i16,  { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v8i16,  { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw

    { ISD::ADD, MVT::v64i8,  { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw

    { ISD::ADD, MVT::v32i8,  { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
    { ISD::ADD, MVT::v8i32,  { 1, 1, 1, 1 } }, // paddd
    { ISD::ADD, MVT::v4i64,  { 1, 1, 1, 1 } }, // paddq

    { ISD::SUB, MVT::v64i8,  { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw

    { ISD::MUL, MVT::v16i8,  { 4, 12, 4, 5 } }, // extend/pmullw/trunc
    { ISD::MUL, MVT::v32i8,  { 3, 10, 7,10 } }, // pmaddubsw
    { ISD::MUL, MVT::v64i8,  { 3, 11, 7,10 } }, // pmaddubsw
    { ISD::MUL, MVT::v32i16, { 1,  5, 1, 1 } }, // pmullw

    { ISD::SUB, MVT::v32i8,  { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
    { ISD::SUB, MVT::v8i32,  { 1, 1, 1, 1 } }, // psubd
    { ISD::SUB, MVT::v4i64,  { 1, 1, 1, 1 } }, // psubq
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512CostTable[] = {
    { ISD::SHL, MVT::v64i8,  { 15, 19,27,33 } }, // vpblendv+split sequence.
    { ISD::SRL, MVT::v64i8,  { 15, 19,30,36 } }, // vpblendv+split sequence.
    { ISD::SRA, MVT::v64i8,  { 37, 37,51,63 } }, // vpblendv+split sequence.

    { ISD::SHL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRA, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence.

    { ISD::SHL, MVT::v4i32,  {  1,  1, 1, 1 } },
    { ISD::SRL, MVT::v4i32,  {  1,  1, 1, 1 } },
    { ISD::SRA, MVT::v4i32,  {  1,  1, 1, 1 } },
    { ISD::SHL, MVT::v8i32,  {  1,  1, 1, 1 } },
    { ISD::SRL, MVT::v8i32,  {  1,  1, 1, 1 } },
    { ISD::SRA, MVT::v8i32,  {  1,  1, 1, 1 } },
    { ISD::SHL, MVT::v16i32, {  1,  1, 1, 1 } },
    { ISD::SRL, MVT::v16i32, {  1,  1, 1, 1 } },
    { ISD::SRA, MVT::v16i32, {  1,  1, 1, 1 } },

    { ISD::SHL, MVT::v2i64,  {  1,  1, 1, 1 } },
    { ISD::SRL, MVT::v2i64,  {  1,  1, 1, 1 } },
    { ISD::SRA, MVT::v2i64,  {  1,  1, 1, 1 } },
    { ISD::SHL, MVT::v4i64,  {  1,  1, 1, 1 } },
    { ISD::SRL, MVT::v4i64,  {  1,  1, 1, 1 } },
    { ISD::SRA, MVT::v4i64,  {  1,  1, 1, 1 } },
    { ISD::SHL, MVT::v8i64,  {  1,  1, 1, 1 } },
    { ISD::SRL, MVT::v8i64,  {  1,  1, 1, 1 } },
    { ISD::SRA, MVT::v8i64,  {  1,  1, 1, 1 } },

    { ISD::ADD, MVT::v64i8,  {  3,  7, 5, 5 } }, // 2*paddb + split
    { ISD::ADD, MVT::v32i16, {  3,  7, 5, 5 } }, // 2*paddw + split

    { ISD::SUB, MVT::v64i8,  {  3,  7, 5, 5 } }, // 2*psubb + split
    { ISD::SUB, MVT::v32i16, {  3,  7, 5, 5 } }, // 2*psubw + split

    { ISD::AND, MVT::v32i8,  {  1,  1, 1, 1 } },
    { ISD::AND, MVT::v16i16, {  1,  1, 1, 1 } },
    { ISD::AND, MVT::v8i32,  {  1,  1, 1, 1 } },
    { ISD::AND, MVT::v4i64,  {  1,  1, 1, 1 } },

    { ISD::OR,  MVT::v32i8,  {  1,  1, 1, 1 } },
    { ISD::OR,  MVT::v16i16, {  1,  1, 1, 1 } },
    { ISD::OR,  MVT::v8i32,  {  1,  1, 1, 1 } },
    { ISD::OR,  MVT::v4i64,  {  1,  1, 1, 1 } },

    { ISD::XOR, MVT::v32i8,  {  1,  1, 1, 1 } },
    { ISD::XOR, MVT::v16i16, {  1,  1, 1, 1 } },
    { ISD::XOR, MVT::v8i32,  {  1,  1, 1, 1 } },
    { ISD::XOR, MVT::v4i64,  {  1,  1, 1, 1 } },

    { ISD::MUL, MVT::v16i32, {  1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i32,  {  1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v4i32,  {  1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i64,  {  6,  9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::i64,    {  1 } },           // Skylake from http://www.agner.org/

    { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v8f64,  {  1,  1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v4f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f64,    {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f64,    {  4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  {  4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,  {  8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f64,  { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/

    { ISD::FNEG, MVT::v16f32, {  1,  1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v16f32, {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v16f32, {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v16f32, {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f32,    {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f32,    {  3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,  {  3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,  {  5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi64/vXi32 are legal on AVX2 even though we declare them as
    // custom, so that we can detect the cases where the shift amount is a
    // scalar.
    { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
    { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(),
                                    Op2Info.getNoProps());
  }

  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        Op2Info.isConstant())
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(),
                                    Op2Info.getNoProps());

    if (const auto *Entry =
            CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

  static const CostKindTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL, MVT::v16i8,  { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v16i8,  { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v16i8,  { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v8i16,  { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v8i16,  { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v8i16,  { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v4i32,  { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v4i32,  { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v4i32,  { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v2i64,  { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v2i64,  { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v2i64,  { 2, 3, 1, 1 } },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL, MVT::v32i8,  { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v32i8,  { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v32i8,  { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v8i32,  { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v8i32,  { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v8i32,  { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v4i64,  { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v4i64,  { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v4i64,  { 6, 7, 5, 6 } },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

  if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
    MVT VT = LT.second;
    // A vector shift left by a non-uniform constant can be lowered into a
    // vector multiply.
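    // e.g. shl X, <1, 2, 3, 4> becomes mul X, <2, 4, 8, 16>.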
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }

  static const CostKindTblEntry GLMCostTable[] = {
    { ISD::FDIV, MVT::f32,   { 18, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
    { ISD::FDIV, MVT::f64,   { 33, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SLMCostTable[] = {
    { ISD::MUL,  MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
    { ISD::MUL,  MVT::v8i16, {  2,  5, 1, 1 } }, // pmullw
    { ISD::FMUL, MVT::f64,   {  2,  5, 1, 1 } }, // mulsd
    { ISD::FMUL, MVT::f32,   {  1,  4, 1, 1 } }, // mulss
    { ISD::FMUL, MVT::v2f64, {  4,  7, 1, 1 } }, // mulpd
    { ISD::FMUL, MVT::v4f32, {  2,  5, 1, 1 } }, // mulps
    { ISD::FDIV, MVT::f32,   { 17, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
    { ISD::FDIV, MVT::f64,   { 32, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
    { ISD::FADD, MVT::v2f64, {  2,  4, 1, 1 } }, // addpd
    { ISD::FSUB, MVT::v2f64, {  2,  4, 1, 1 } }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long multiplies(3),
    // shifts(3) and adds(2).
    // SLM muldq throughput is 2, shift throughput is 1, and addq throughput
    // is 4, thus: 3X2 (muldq) + 3X1 (shift) + 2X4 (addq) = 17.
    { ISD::MUL,  MVT::v2i64, { 17, 22, 9, 9 } },
    // SLM addq/subq throughput is 4.
    { ISD::ADD,  MVT::v2i64, {  4,  2, 1, 2 } },
    { ISD::SUB,  MVT::v2i64, {  4,  2, 1, 2 } },
  };

  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2CostTable[] = {
    { ISD::SHL, MVT::v16i8,  {  6, 21,11,16 } }, // vpblendvb sequence.
    { ISD::SHL, MVT::v32i8,  {  6, 23,11,22 } }, // vpblendvb sequence.
    { ISD::SHL, MVT::v8i16,  {  5, 18, 5,10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SHL, MVT::v16i16, {  8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRL, MVT::v16i8,  {  6, 27,12,18 } }, // vpblendvb sequence.
    { ISD::SRL, MVT::v32i8,  {  8, 30,12,24 } }, // vpblendvb sequence.
    { ISD::SRL, MVT::v8i16,  {  5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL, MVT::v16i16, {  8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRA, MVT::v16i8,  { 17, 17,24,30 } }, // vpblendvb sequence.
    { ISD::SRA, MVT::v32i8,  { 18, 20,24,43 } }, // vpblendvb sequence.
    { ISD::SRA, MVT::v8i16,  {  5, 11, 5,10 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v16i16, {  8, 10,10,14 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v2i64,  {  4,  5, 5, 5 } }, // srl/xor/sub sequence.
    { ISD::SRA, MVT::v4i64,  {  8,  8, 5, 9 } }, // srl/xor/sub sequence.

    { ISD::SUB, MVT::v32i8,  {  1,  1, 1, 2 } }, // psubb
    { ISD::ADD, MVT::v32i8,  {  1,  1, 1, 2 } }, // paddb
    { ISD::SUB, MVT::v16i16, {  1,  1, 1, 2 } }, // psubw
    { ISD::ADD, MVT::v16i16, {  1,  1, 1, 2 } }, // paddw
    { ISD::SUB, MVT::v8i32,  {  1,  1, 1, 2 } }, // psubd
    { ISD::ADD, MVT::v8i32,  {  1,  1, 1, 2 } }, // paddd
    { ISD::SUB, MVT::v4i64,  {  1,  1, 1, 2 } }, // psubq
    { ISD::ADD, MVT::v4i64,  {  1,  1, 1, 2 } }, // paddq

    { ISD::MUL, MVT::v16i8,  {  5, 18, 6,12 } }, // extend/pmullw/pack
    { ISD::MUL, MVT::v32i8,  {  4,  8, 8,16 } }, // pmaddubsw
    { ISD::MUL, MVT::v16i16, {  2,  5, 1, 2 } }, // pmullw
    { ISD::MUL, MVT::v8i32,  {  4, 10, 1, 2 } }, // pmulld
    { ISD::MUL, MVT::v4i32,  {  2, 10, 1, 2 } }, // pmulld
    { ISD::MUL, MVT::v4i64,  {  6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::v2i64,  {  6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add

    { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v4f64, {  1,  1, 1, 2 } }, // vxorpd
    { ISD::FNEG, MVT::v8f32, {  1,  1, 1, 2 } }, // vxorps

    { ISD::FADD, MVT::f64,   {  1,  4, 1, 1 } }, // vaddsd
    { ISD::FADD, MVT::f32,   {  1,  4, 1, 1 } }, // vaddss
    { ISD::FADD, MVT::v2f64, {  1,  4, 1, 1 } }, // vaddpd
    { ISD::FADD, MVT::v4f32, {  1,  4, 1, 1 } }, // vaddps
    { ISD::FADD, MVT::v4f64, {  1,  4, 1, 2 } }, // vaddpd
    { ISD::FADD, MVT::v8f32, {  1,  4, 1, 2 } }, // vaddps

    { ISD::FSUB, MVT::f64,   {  1,  4, 1, 1 } }, // vsubsd
    { ISD::FSUB, MVT::f32,   {  1,  4, 1, 1 } }, // vsubss
    { ISD::FSUB, MVT::v2f64, {  1,  4, 1, 1 } }, // vsubpd
    { ISD::FSUB, MVT::v4f32, {  1,  4, 1, 1 } }, // vsubps
    { ISD::FSUB, MVT::v4f64, {  1,  4, 1, 2 } }, // vsubpd
    { ISD::FSUB, MVT::v8f32, {  1,  4, 1, 2 } }, // vsubps

    { ISD::FMUL, MVT::f64,   {  1,  5, 1, 1 } }, // vmulsd
    { ISD::FMUL, MVT::f32,   {  1,  5, 1, 1 } }, // vmulss
    { ISD::FMUL, MVT::v2f64, {  1,  5, 1, 1 } }, // vmulpd
    { ISD::FMUL, MVT::v4f32, {  1,  5, 1, 1 } }, // vmulps
    { ISD::FMUL, MVT::v4f64, {  1,  5, 1, 2 } }, // vmulpd
    { ISD::FMUL, MVT::v8f32, {  1,  5, 1, 2 } }, // vmulps

    { ISD::FDIV, MVT::f32,   {  7, 13, 1, 1 } }, // vdivss
    { ISD::FDIV, MVT::v4f32, {  7, 13, 1, 1 } }, // vdivps
    { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
    { ISD::FDIV, MVT::f64,   { 14, 20, 1, 1 } }, // vdivsd
    { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
    { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL, MVT::v32i8,  { 10, 11, 18, 19 } }, // pmaddubsw + split
    { ISD::MUL, MVT::v16i8,  {  5,  6,  8, 12 } }, // 2*pmaddubsw/3*and/psllw/or
    { ISD::MUL, MVT::v16i16, {  4,  8,  5,  6 } }, // pmullw + split
    { ISD::MUL, MVT::v8i32,  {  5,  8,  5, 10 } }, // pmulld + split
    { ISD::MUL, MVT::v4i32,  {  2,  5,  1,  3 } }, // pmulld
    { ISD::MUL, MVT::v4i64,  { 12, 15, 19, 20 } },

    { ISD::AND, MVT::v32i8,  {  1,  1,  1,  2 } }, // vandps
    { ISD::AND, MVT::v16i16, {  1,  1,  1,  2 } }, // vandps
    { ISD::AND, MVT::v8i32,  {  1,  1,  1,  2 } }, // vandps
    { ISD::AND, MVT::v4i64,  {  1,  1,  1,  2 } }, // vandps

    { ISD::OR,  MVT::v32i8,  {  1,  1,  1,  2 } }, // vorps
    { ISD::OR,  MVT::v16i16, {  1,  1,  1,  2 } }, // vorps
    { ISD::OR,  MVT::v8i32,  {  1,  1,  1,  2 } }, // vorps
    { ISD::OR,  MVT::v4i64,  {  1,  1,  1,  2 } }, // vorps

    { ISD::XOR, MVT::v32i8,  {  1,  1,  1,  2 } }, // vxorps
    { ISD::XOR, MVT::v16i16, {  1,  1,  1,  2 } }, // vxorps
    { ISD::XOR, MVT::v8i32,  {  1,  1,  1,  2 } }, // vxorps
    { ISD::XOR, MVT::v4i64,  {  1,  1,  1,  2 } }, // vxorps

    { ISD::SUB, MVT::v32i8,  {  4,  2,  5,  6 } }, // psubb + split
    { ISD::ADD, MVT::v32i8,  {  4,  2,  5,  6 } }, // paddb + split
    { ISD::SUB, MVT::v16i16, {  4,  2,  5,  6 } }, // psubw + split
    { ISD::ADD, MVT::v16i16, {  4,  2,  5,  6 } }, // paddw + split
    { ISD::SUB, MVT::v8i32,  {  4,  2,  5,  6 } }, // psubd + split
    { ISD::ADD, MVT::v8i32,  {  4,  2,  5,  6 } }, // paddd + split
    { ISD::SUB, MVT::v4i64,  {  4,  2,  5,  6 } }, // psubq + split
    { ISD::ADD, MVT::v4i64,  {  4,  2,  5,  6 } }, // paddq + split
    { ISD::SUB, MVT::v2i64,  {  1,  1,  1,  1 } }, // psubq
    { ISD::ADD, MVT::v2i64,  {  1,  1,  1,  1 } }, // paddq

    { ISD::SHL, MVT::v16i8,  { 10, 21,11,17 } }, // pblendvb sequence.
    { ISD::SHL, MVT::v32i8,  { 22, 22,27,40 } }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v8i16,  {  6,  9,11,11 } }, // pblendvb sequence.
    { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v4i32,  {  3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
    { ISD::SHL, MVT::v8i32,  {  9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
    { ISD::SHL, MVT::v2i64,  {  2,  4, 4, 6 } }, // Shift each lane + blend.
    { ISD::SHL, MVT::v4i64,  {  6,  7,11,15 } }, // Shift each lane + blend + split.

    { ISD::SRL, MVT::v16i8,  { 11, 27,12,18 } }, // pblendvb sequence.
    { ISD::SRL, MVT::v32i8,  { 23, 23,30,43 } }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v8i16,  { 13, 16,14,22 } }, // pblendvb sequence.
    { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v4i32,  {  6,  7,12,16 } }, // Shift each lane + blend.
    { ISD::SRL, MVT::v8i32,  { 14, 14,26,34 } }, // Shift each lane + blend + split.
    { ISD::SRL, MVT::v2i64,  {  2,  4, 4, 6 } }, // Shift each lane + blend.
    { ISD::SRL, MVT::v4i64,  {  6,  7,11,15 } }, // Shift each lane + blend + split.

    { ISD::SRA, MVT::v16i8,  { 21, 22,24,36 } }, // pblendvb sequence.
    { ISD::SRA, MVT::v32i8,  { 44, 45,51,76 } }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v8i16,  { 13, 16,14,22 } }, // pblendvb sequence.
    { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v4i32,  {  6,  7,12,16 } }, // Shift each lane + blend.
    { ISD::SRA, MVT::v8i32,  { 14, 14,26,34 } }, // Shift each lane + blend + split.
    { ISD::SRA, MVT::v2i64,  {  5,  6,10,14 } }, // Shift each lane + blend.
1253 { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split. 1254 1255 { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/ 1256 { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/ 1257 1258 { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ 1259 { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ 1260 { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ 1261 { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ 1262 { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/ 1263 { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/ 1264 1265 { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ 1266 { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ 1267 { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ 1268 { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ 1269 { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/ 1270 { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/ 1271 1272 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/ 1273 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/ 1274 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/ 1275 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/ 1276 { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/ 1277 { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/ 1278 1279 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/ 1280 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/ 1281 { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/ 1282 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/ 1283 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/ 1284 { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/ 1285 }; 1286 1287 if (ST->hasAVX()) 1288 if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second)) 1289 if (auto KindCost = Entry->Cost[CostKind]) 1290 return LT.first * *KindCost; 1291 1292 static const CostKindTblEntry SSE42CostTable[] = { 1293 { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ 1294 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ 1295 { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ 1296 { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ 1297 1298 { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ 1299 { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ 1300 { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ 1301 { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ 1302 1303 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/ 1304 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/ 1305 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/ 1306 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // 
// Nehalem from http://www.agner.org/
1307
1308 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1309 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1310 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1311 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1312
1313 { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
1314 };
1315
1316 if (ST->hasSSE42())
1317 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1318 if (auto KindCost = Entry->Cost[CostKind])
1319 return LT.first * *KindCost;
1320
1321 static const CostKindTblEntry SSE41CostTable[] = {
1322 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1323 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1324 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1325
1326 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1327 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1328 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1329 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1330
1331 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1332 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1333 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1334 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1335
1336 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1337 };
1338
1339 if (ST->hasSSE41())
1340 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1341 if (auto KindCost = Entry->Cost[CostKind])
1342 return LT.first * *KindCost;
1343
1344 static const CostKindTblEntry SSSE3CostTable[] = {
1345 { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or
1346 };
1347
1348 if (ST->hasSSSE3())
1349 if (const auto *Entry = CostTableLookup(SSSE3CostTable, ISD, LT.second))
1350 if (auto KindCost = Entry->Cost[CostKind])
1351 return LT.first * *KindCost;
1352
1353 static const CostKindTblEntry SSE2CostTable[] = {
1354 // We don't correctly identify costs of casts because they are marked as
1355 // custom.
1356 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1357 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1358 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1359 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1360
1361 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1362 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1363 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1364 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1365
1366 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1367 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1368 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1369 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
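// (x86 has no packed 64-bit arithmetic right shift until AVX-512's
// VPSRAQ, which is why the v2i64 SRA entries above cost an emulation
// sequence rather than a single instruction.)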
1370 1371 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand 1372 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand 1373 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand 1374 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand 1375 1376 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por 1377 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por 1378 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por 1379 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por 1380 1381 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor 1382 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor 1383 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor 1384 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor 1385 1386 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq 1387 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq 1388 1389 { ISD::MUL, MVT::v16i8, { 6, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack 1390 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw 1391 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle 1392 { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add 1393 1394 { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } }, 1395 1396 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1397 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1398 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1399 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1400 1401 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1402 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1403 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1404 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1405 1406 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1407 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1408 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1409 1410 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1411 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1412 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1413 1414 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1415 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1416 }; 1417 1418 if (ST->hasSSE2()) 1419 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second)) 1420 if (auto KindCost = Entry->Cost[CostKind]) 1421 return LT.first * *KindCost; 1422 1423 static const CostKindTblEntry SSE1CostTable[] = { 1424 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/ 1425 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/ 1426 1427 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/ 1428 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/ 1429 1430 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/ 1431 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/ 1432 1433 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/ 1434 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from 
// http://www.agner.org/
1435
1436 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1437 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1438 };
1439
1440 if (ST->hasSSE1())
1441 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1442 if (auto KindCost = Entry->Cost[CostKind])
1443 return LT.first * *KindCost;
1444
1445 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1446 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1447 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1448 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1449 };
1450
1451 if (ST->is64Bit())
1452 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1453 if (auto KindCost = Entry->Cost[CostKind])
1454 return LT.first * *KindCost;
1455
1456 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1457 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1458 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1459 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1460
1461 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1462 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1463 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1464
1465 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1466 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1467 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1468
1469 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1470 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1471 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1472 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1473 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1474 };
1475
1476 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1477 if (auto KindCost = Entry->Cost[CostKind])
1478 return LT.first * *KindCost;
1479
1480 // It is not a good idea to vectorize division. We have to scalarize it and
1481 // in the process we will often end up having to spill regular
1482 // registers. The overhead of division is going to dominate most kernels
1483 // anyway, so try hard to prevent vectorization of division - it is
1484 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1485 // to hide "20 cycles" for each lane.
1486 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1487 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1488 ISD == ISD::UREM)) {
1489 InstructionCost ScalarCost =
1490 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1491 Op1Info.getNoProps(), Op2Info.getNoProps());
1492 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1493 }
1494
1495 // Handle some basic single instruction code size cases.
1496 if (CostKind == TTI::TCK_CodeSize) {
1497 switch (ISD) {
1498 case ISD::FADD:
1499 case ISD::FSUB:
1500 case ISD::FMUL:
1501 case ISD::FDIV:
1502 case ISD::FNEG:
1503 case ISD::AND:
1504 case ISD::OR:
1505 case ISD::XOR:
1506 return LT.first;
1508 }
1509 }
1510
1511 // Fallback to the default implementation.
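// BasicTTIImpl derives a cost from type legalization and, if the operation
// is not natively supported, scalarization - typically a more conservative
// estimate than the tuned tables above.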
1512 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1513 Args, CxtI);
1514 }
1515
1516 InstructionCost
1517 X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
1518 unsigned Opcode1, const SmallBitVector &OpcodeMask,
1519 TTI::TargetCostKind CostKind) const {
1520 if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1521 return TTI::TCC_Basic;
1522 return InstructionCost::getInvalid();
1523 }
1524
1525 InstructionCost X86TTIImpl::getShuffleCost(
1526 TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef<int> Mask,
1527 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
1528 ArrayRef<const Value *> Args, const Instruction *CxtI) {
1529 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1530 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1531 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
1532
1533 Kind = improveShuffleKindFromMask(Kind, Mask, BaseTp, Index, SubTp);
1534
1535 // If all args are constant then this will be constant folded away.
1536 if (!Args.empty() &&
1537 all_of(Args, [](const Value *Arg) { return isa<Constant>(Arg); }))
1538 return TTI::TCC_Free;
1539
1540 // Recognize a basic concat_vector shuffle.
1541 if (Kind == TTI::SK_PermuteTwoSrc &&
1542 Mask.size() == (2 * BaseTp->getElementCount().getKnownMinValue()) &&
1543 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1544 return getShuffleCost(TTI::SK_InsertSubvector,
1545 VectorType::getDoubleElementsVectorType(BaseTp), Mask,
1546 CostKind, Mask.size() / 2, BaseTp);
1547
1548 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1549 if (Kind == TTI::SK_Transpose)
1550 Kind = TTI::SK_PermuteTwoSrc;
1551
1552 if (Kind == TTI::SK_Broadcast) {
1553 // For Broadcasts we are splatting the first element from the first input
1554 // register, so we only need to reference that input and all the output
1555 // registers are the same.
1556 LT.first = 1;
1557
1558 // If we're broadcasting a load then AVX/AVX2 can do this for free.
1559 using namespace PatternMatch;
1560 if (!Args.empty() && match(Args[0], m_OneUse(m_Load(m_Value()))) &&
1561 (ST->hasAVX2() ||
1562 (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32)))
1563 return TTI::TCC_Free;
1564 }
1565
1566 // Attempt to detect a cheaper in-lane shuffle, avoiding 128-bit subvector
1567 // permutation.
1568 // Attempt to detect a shuffle mask with a single defined element.
1569 bool IsInLaneShuffle = false;
1570 bool IsSingleElementMask = false;
1571 if (BaseTp->getPrimitiveSizeInBits() > 0 &&
1572 (BaseTp->getPrimitiveSizeInBits() % 128) == 0 &&
1573 BaseTp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
1574 Mask.size() == BaseTp->getElementCount().getKnownMinValue()) {
1575 unsigned NumLanes = BaseTp->getPrimitiveSizeInBits() / 128;
1576 unsigned NumEltsPerLane = Mask.size() / NumLanes;
1577 if ((Mask.size() % NumLanes) == 0) {
1578 IsInLaneShuffle = all_of(enumerate(Mask), [&](const auto &P) {
1579 return P.value() == PoisonMaskElem ||
1580 ((P.value() % Mask.size()) / NumEltsPerLane) ==
1581 (P.index() / NumEltsPerLane);
1582 });
1583 IsSingleElementMask =
1584 (Mask.size() - 1) == static_cast<unsigned>(count_if(Mask, [](int M) {
1585 return M == PoisonMaskElem;
1586 }));
1587 }
1588 }
1589
1590 // Treat <X x bfloat> shuffles as <X x half>.
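// (bf16 and f16 have the same 16-bit lane size and shuffles never inspect
// the element encoding, so the f16 shuffle costs apply unchanged.)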
1591 if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16) 1592 LT.second = LT.second.changeVectorElementType(MVT::f16); 1593 1594 // Subvector extractions are free if they start at the beginning of a 1595 // vector and cheap if the subvectors are aligned. 1596 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) { 1597 int NumElts = LT.second.getVectorNumElements(); 1598 if ((Index % NumElts) == 0) 1599 return TTI::TCC_Free; 1600 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp); 1601 if (SubLT.second.isVector()) { 1602 int NumSubElts = SubLT.second.getVectorNumElements(); 1603 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) 1604 return SubLT.first; 1605 // Handle some cases for widening legalization. For now we only handle 1606 // cases where the original subvector was naturally aligned and evenly 1607 // fit in its legalized subvector type. 1608 // FIXME: Remove some of the alignment restrictions. 1609 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit 1610 // vectors. 1611 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements(); 1612 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 && 1613 (NumSubElts % OrigSubElts) == 0 && 1614 LT.second.getVectorElementType() == 1615 SubLT.second.getVectorElementType() && 1616 LT.second.getVectorElementType().getSizeInBits() == 1617 BaseTp->getElementType()->getPrimitiveSizeInBits()) { 1618 assert(NumElts >= NumSubElts && NumElts > OrigSubElts && 1619 "Unexpected number of elements!"); 1620 auto *VecTy = FixedVectorType::get(BaseTp->getElementType(), 1621 LT.second.getVectorNumElements()); 1622 auto *SubTy = FixedVectorType::get(BaseTp->getElementType(), 1623 SubLT.second.getVectorNumElements()); 1624 int ExtractIndex = alignDown((Index % NumElts), NumSubElts); 1625 InstructionCost ExtractCost = getShuffleCost( 1626 TTI::SK_ExtractSubvector, VecTy, {}, CostKind, ExtractIndex, SubTy); 1627 1628 // If the original size is 32-bits or more, we can use pshufd. Otherwise 1629 // if we have SSSE3 we can use pshufb. 1630 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3()) 1631 return ExtractCost + 1; // pshufd or pshufb 1632 1633 assert(SubTp->getPrimitiveSizeInBits() == 16 && 1634 "Unexpected vector size"); 1635 1636 return ExtractCost + 2; // worst case pshufhw + pshufd 1637 } 1638 } 1639 // If the extract subvector is not optimal, treat it as single op shuffle. 1640 Kind = TTI::SK_PermuteSingleSrc; 1641 } 1642 1643 // Subvector insertions are cheap if the subvectors are aligned. 1644 // Note that in general, the insertion starting at the beginning of a vector 1645 // isn't free, because we need to preserve the rest of the wide vector, 1646 // but if the destination vector legalizes to the same width as the subvector 1647 // then the insertion will simplify to a (free) register copy. 1648 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) { 1649 int NumElts = LT.second.getVectorNumElements(); 1650 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp); 1651 if (SubLT.second.isVector()) { 1652 int NumSubElts = SubLT.second.getVectorNumElements(); 1653 bool MatchingTypes = 1654 NumElts == NumSubElts && 1655 (SubTp->getElementCount().getKnownMinValue() % NumSubElts) == 0; 1656 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) 1657 return MatchingTypes ? TTI::TCC_Free : SubLT.first; 1658 } 1659 1660 // Attempt to match MOVSS (Idx == 0) or INSERTPS pattern. 
This will have
1661 // been matched by improveShuffleKindFromMask as a SK_InsertSubvector of
1662 // v1f32 (legalized to f32) into a v4f32.
1663 if (LT.first == 1 && LT.second == MVT::v4f32 && SubLT.first == 1 &&
1664 SubLT.second == MVT::f32 && (Index == 0 || ST->hasSSE41()))
1665 return 1;
1666
1667 // If the insertion isn't aligned, treat it like a 2-op shuffle.
1668 Kind = TTI::SK_PermuteTwoSrc;
1669 }
1670
1671 // Handle some common (illegal) sub-vector types as they are often very cheap
1672 // to shuffle even on targets without PSHUFB.
1673 EVT VT = TLI->getValueType(DL, BaseTp);
1674 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1675 !ST->hasSSSE3()) {
1676 static const CostKindTblEntry SSE2SubVectorShuffleTbl[] = {
1677 {TTI::SK_Broadcast, MVT::v4i16, {1,1,1,1}}, // pshuflw
1678 {TTI::SK_Broadcast, MVT::v2i16, {1,1,1,1}}, // pshuflw
1679 {TTI::SK_Broadcast, MVT::v8i8, {2,2,2,2}}, // punpck/pshuflw
1680 {TTI::SK_Broadcast, MVT::v4i8, {2,2,2,2}}, // punpck/pshuflw
1681 {TTI::SK_Broadcast, MVT::v2i8, {1,1,1,1}}, // punpck
1682
1683 {TTI::SK_Reverse, MVT::v4i16, {1,1,1,1}}, // pshuflw
1684 {TTI::SK_Reverse, MVT::v2i16, {1,1,1,1}}, // pshuflw
1685 {TTI::SK_Reverse, MVT::v4i8, {3,3,3,3}}, // punpck/pshuflw/packus
1686 {TTI::SK_Reverse, MVT::v2i8, {1,1,1,1}}, // punpck
1687
1688 {TTI::SK_Splice, MVT::v4i16, {2,2,2,2}}, // punpck+psrldq
1689 {TTI::SK_Splice, MVT::v2i16, {2,2,2,2}}, // punpck+psrldq
1690 {TTI::SK_Splice, MVT::v4i8, {2,2,2,2}}, // punpck+psrldq
1691 {TTI::SK_Splice, MVT::v2i8, {2,2,2,2}}, // punpck+psrldq
1692
1693 {TTI::SK_PermuteTwoSrc, MVT::v4i16, {2,2,2,2}}, // punpck/pshuflw
1694 {TTI::SK_PermuteTwoSrc, MVT::v2i16, {2,2,2,2}}, // punpck/pshuflw
1695 {TTI::SK_PermuteTwoSrc, MVT::v8i8, {7,7,7,7}}, // punpck/pshuflw
1696 {TTI::SK_PermuteTwoSrc, MVT::v4i8, {4,4,4,4}}, // punpck/pshuflw
1697 {TTI::SK_PermuteTwoSrc, MVT::v2i8, {2,2,2,2}}, // punpck
1698
1699 {TTI::SK_PermuteSingleSrc, MVT::v4i16, {1,1,1,1}}, // pshuflw
1700 {TTI::SK_PermuteSingleSrc, MVT::v2i16, {1,1,1,1}}, // pshuflw
1701 {TTI::SK_PermuteSingleSrc, MVT::v8i8, {5,5,5,5}}, // punpck/pshuflw
1702 {TTI::SK_PermuteSingleSrc, MVT::v4i8, {3,3,3,3}}, // punpck/pshuflw
1703 {TTI::SK_PermuteSingleSrc, MVT::v2i8, {1,1,1,1}}, // punpck
1704 };
1705
1706 if (ST->hasSSE2())
1707 if (const auto *Entry =
1708 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1709 if (auto KindCost = Entry->Cost[CostKind])
1710 return LT.first * *KindCost;
1711 }
1712
1713 // We are going to permute multiple sources and the result will be in multiple
1714 // destinations. We provide an accurate cost only for splits where the element
1715 // type remains the same.
1716 if (LT.first != 1) {
1717 MVT LegalVT = LT.second;
1718 if (LegalVT.isVector() &&
1719 LegalVT.getVectorElementType().getSizeInBits() ==
1720 BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1721 LegalVT.getVectorNumElements() <
1722 cast<FixedVectorType>(BaseTp)->getNumElements()) {
1723 unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1724 unsigned LegalVTSize = LegalVT.getStoreSize();
1725 // Number of source vectors after legalization:
1726 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1727 // Number of destination vectors after legalization:
1728 InstructionCost NumOfDests = LT.first;
1729
1730 auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1731 LegalVT.getVectorNumElements());
1732
1733 if (!Mask.empty() && NumOfDests.isValid()) {
1734 // Try to perform better estimation of the permutation.
1735 // 1. Split the source/destination vectors into real registers.
1736 // 2. Analyze the mask to identify which real registers are
1737 // permuted. If more than one source register is used to build a
1738 // destination register, the cost for this destination register
1739 // is (Number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only one
1740 // source register is used, build the mask and calculate the cost as a
1741 // PermuteSingleSrc.
1742 // Also, for the single-register permute we try to identify whether the
1743 // destination register is just a copy of the source register or a
1744 // copy of the previous destination register (the cost is
1745 // TTI::TCC_Basic). If the source register is just reused, the cost for
1746 // this operation is TTI::TCC_Free.
1747 NumOfDests =
1748 getTypeLegalizationCost(
1749 FixedVectorType::get(BaseTp->getElementType(), Mask.size()))
1750 .first;
1751 unsigned E = *NumOfDests.getValue();
1752 unsigned NormalizedVF =
1753 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1754 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1755 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1756 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1757 copy(Mask, NormalizedMask.begin());
1758 unsigned PrevSrcReg = 0;
1759 ArrayRef<int> PrevRegMask;
1760 InstructionCost Cost = 0;
1761 processShuffleMasks(
1762 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1763 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1764 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1765 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1766 // Check if the previous register can be just copied to the next
1767 // one.
1768 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1769 PrevRegMask != RegMask)
1770 Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1771 RegMask, CostKind, 0, nullptr);
1772 else
1773 // Just a copy of previous destination register.
1774 Cost += TTI::TCC_Basic;
1775 return;
1776 }
1777 if (SrcReg != DestReg &&
1778 any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
1779 // Just a copy of the source register.
1780 Cost += TTI::TCC_Free;
1781 }
1782 PrevSrcReg = SrcReg;
1783 PrevRegMask = RegMask;
1784 },
1785 [this, SingleOpTy, CostKind,
1786 &Cost](ArrayRef<int> RegMask, unsigned /*Unused*/,
1787 unsigned /*Unused*/, bool /*Unused*/) {
1788 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
1789 CostKind, 0, nullptr);
1790 });
1791 return Cost;
1792 }
1793
1794 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1795 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1796 {}, CostKind, 0, nullptr);
1797 }
1798
1799 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
1800 }
1801
1802 // If we're just moving a single element around (probably as an alternative to
1803 // extracting it), we can assume this is cheap.
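// (IsSingleElementMask means only one mask element is defined, and
// IsInLaneShuffle means no element crosses a 128-bit lane, so a single
// in-lane blend/permute suffices.)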
1804 if (LT.first == 1 && IsInLaneShuffle && IsSingleElementMask) 1805 return TTI::TCC_Basic; 1806 1807 static const CostKindTblEntry AVX512VBMIShuffleTbl[] = { 1808 { TTI::SK_Reverse, MVT::v64i8, { 1, 1, 1, 1 } }, // vpermb 1809 { TTI::SK_Reverse, MVT::v32i8, { 1, 1, 1, 1 } }, // vpermb 1810 { TTI::SK_PermuteSingleSrc, MVT::v64i8, { 1, 1, 1, 1 } }, // vpermb 1811 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 1, 1, 1, 1 } }, // vpermb 1812 { TTI::SK_PermuteTwoSrc, MVT::v64i8, { 2, 2, 2, 2 } }, // vpermt2b 1813 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 2, 2, 2, 2 } }, // vpermt2b 1814 { TTI::SK_PermuteTwoSrc, MVT::v16i8, { 2, 2, 2, 2 } } // vpermt2b 1815 }; 1816 1817 if (ST->hasVBMI()) 1818 if (const auto *Entry = 1819 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second)) 1820 if (auto KindCost = Entry->Cost[CostKind]) 1821 return LT.first * *KindCost; 1822 1823 static const CostKindTblEntry AVX512BWShuffleTbl[] = { 1824 { TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw 1825 { TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw 1826 { TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb 1827 1828 { TTI::SK_Reverse, MVT::v32i16, { 2, 2, 2, 2 } }, // vpermw 1829 { TTI::SK_Reverse, MVT::v32f16, { 2, 2, 2, 2 } }, // vpermw 1830 { TTI::SK_Reverse, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw 1831 { TTI::SK_Reverse, MVT::v64i8, { 2, 2, 2, 2 } }, // pshufb + vshufi64x2 1832 1833 { TTI::SK_PermuteSingleSrc, MVT::v32i16, { 2, 2, 2, 2 } }, // vpermw 1834 { TTI::SK_PermuteSingleSrc, MVT::v32f16, { 2, 2, 2, 2 } }, // vpermw 1835 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw 1836 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw 1837 { TTI::SK_PermuteSingleSrc, MVT::v64i8, { 8, 8, 8, 8 } }, // extend to v32i16 1838 1839 { TTI::SK_PermuteTwoSrc, MVT::v32i16,{ 2, 2, 2, 2 } }, // vpermt2w 1840 { TTI::SK_PermuteTwoSrc, MVT::v32f16,{ 2, 2, 2, 2 } }, // vpermt2w 1841 { TTI::SK_PermuteTwoSrc, MVT::v16i16,{ 2, 2, 2, 2 } }, // vpermt2w 1842 { TTI::SK_PermuteTwoSrc, MVT::v8i16, { 2, 2, 2, 2 } }, // vpermt2w 1843 { TTI::SK_PermuteTwoSrc, MVT::v64i8, { 19, 19, 19, 19 } }, // 6 * v32i8 + 1 1844 1845 { TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vblendmw 1846 { TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vblendmb 1847 1848 { TTI::SK_Splice, MVT::v32i16, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr 1849 { TTI::SK_Splice, MVT::v32f16, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr 1850 { TTI::SK_Splice, MVT::v64i8, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr 1851 }; 1852 1853 if (ST->hasBWI()) 1854 if (const auto *Entry = 1855 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second)) 1856 if (auto KindCost = Entry->Cost[CostKind]) 1857 return LT.first * *KindCost; 1858 1859 static const CostKindTblEntry AVX512ShuffleTbl[] = { 1860 {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd 1861 {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss 1862 {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq 1863 {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd 1864 {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw 1865 {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw 1866 {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb 1867 1868 {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd 1869 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps 1870 {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq 1871 {TTI::SK_Reverse, MVT::v16i32, 
{ 1, 3, 1, 1 } }, // vpermd
1872 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1873 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1874 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1875
1876 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // valignq
1877 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // valignq
1878 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // valignd
1879 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // valignd
1880 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // valignq
1881 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // valignq
1882 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // valignd
1883 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // valignd
1884 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1885 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1886 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1887
1888 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1889 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1890 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1891 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1892 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1893 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1894 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1895 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1896 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1897 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1898 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1899 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1900 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1901
1902 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd
1903 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1904 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q
1905 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1906 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd
1907 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps
1908 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q
1909 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d
1910 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd
1911 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps
1912 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q
1913 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d
1914
1915 // FIXME: This just applies the type legalization cost rules above
1916 // assuming these completely split.
1917 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1918 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1919 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
1920 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
1921 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
1922 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
1923
1924 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1925 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1926 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
1927 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
1928 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1929 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
1930 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1931 };
1932
1933 if (ST->hasAVX512())
1934 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1935 if (auto KindCost = Entry->Cost[CostKind])
1936 return LT.first * *KindCost;
1937
1938 static const CostKindTblEntry AVX2InLaneShuffleTbl[] = {
1939 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 1, 1, 1, 1 } }, // vpshufb
1940 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 1, 1, 1, 1 } }, // vpshufb
1941 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 1, 1, 1, 1 } }, // vpshufb
1942
1943 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // 2*vshufpd + vblendpd
1944 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // 2*vshufps + vblendps
1945 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // 2*vpshufd + vpblendd
1946 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // 2*vpshufd + vpblendd
1947 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
1948 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
1949 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
1950 };
1951
1952 if (IsInLaneShuffle && ST->hasAVX2())
1953 if (const auto *Entry =
1954 CostTableLookup(AVX2InLaneShuffleTbl, Kind, LT.second))
1955 if (auto KindCost = Entry->Cost[CostKind])
1956 return LT.first * *KindCost;
1957
1958 static const CostKindTblEntry AVX2ShuffleTbl[] = {
1959 { TTI::SK_Broadcast, MVT::v4f64, { 1, 1, 1, 1 } }, // vbroadcastsd
1960 { TTI::SK_Broadcast, MVT::v8f32, { 1, 1, 1, 1 } }, // vbroadcastss
1961 { TTI::SK_Broadcast, MVT::v4i64, { 1, 1, 1, 1 } }, // vpbroadcastq
1962 { TTI::SK_Broadcast, MVT::v8i32, { 1, 1, 1, 1 } }, // vpbroadcastd
1963 { TTI::SK_Broadcast, MVT::v16i16, { 1, 1, 1, 1 } }, // vpbroadcastw
1964 { TTI::SK_Broadcast, MVT::v16f16, { 1, 1, 1, 1 } }, // vpbroadcastw
1965 { TTI::SK_Broadcast, MVT::v32i8, { 1, 1, 1, 1 } }, // vpbroadcastb
1966
1967 { TTI::SK_Reverse, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermpd
1968 { TTI::SK_Reverse, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermps
1969 { TTI::SK_Reverse, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermq
1970 { TTI::SK_Reverse, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermd
1971 { TTI::SK_Reverse, MVT::v16i16, { 2, 2, 2, 2 } }, // vperm2i128 + pshufb
1972 { TTI::SK_Reverse, MVT::v16f16, { 2, 2, 2, 2 } }, // vperm2i128 + pshufb
1973 { TTI::SK_Reverse, MVT::v32i8, { 2, 2, 2, 2 } }, // vperm2i128 + pshufb
1974
1975 { TTI::SK_Select, MVT::v16i16, { 1, 1, 1, 1 } }, // vpblendvb
1976 { TTI::SK_Select, MVT::v16f16, { 1, 1, 1, 1 } }, // vpblendvb
1977 { TTI::SK_Select, MVT::v32i8, { 1, 1, 1, 1 } }, // vpblendvb
1978
1979 { TTI::SK_Splice, MVT::v8i32, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
1980 { TTI::SK_Splice, MVT::v8f32, { 2, 2,
2, 2 } }, // vperm2i128 + vpalignr 1981 { TTI::SK_Splice, MVT::v16i16, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr 1982 { TTI::SK_Splice, MVT::v16f16, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr 1983 { TTI::SK_Splice, MVT::v32i8, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr 1984 1985 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermpd 1986 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermps 1987 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermq 1988 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermd 1989 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 4, 4, 4, 4 } }, 1990 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 4, 4, 4, 4 } }, 1991 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } }, 1992 1993 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 3, 3, 3, 3 } }, // 2*vpermpd + vblendpd 1994 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 3, 3, 3, 3 } }, // 2*vpermps + vblendps 1995 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 3, 3, 3, 3 } }, // 2*vpermq + vpblendd 1996 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 3, 3, 3, 3 } }, // 2*vpermd + vpblendd 1997 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 7, 7, 7, 7 } }, 1998 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 7, 7, 7, 7 } }, 1999 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 7, 7, 7, 7 } }, 2000 }; 2001 2002 if (ST->hasAVX2()) 2003 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second)) 2004 if (auto KindCost = Entry->Cost[CostKind]) 2005 return LT.first * *KindCost; 2006 2007 static const CostKindTblEntry XOPShuffleTbl[] = { 2008 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2pd 2009 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2ps 2010 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2pd 2011 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2ps 2012 { TTI::SK_PermuteSingleSrc, MVT::v16i16,{ 4, 4, 4, 4 } }, // vextractf128 + 2*vpperm 2013 // + vinsertf128 2014 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } }, // vextractf128 + 2*vpperm 2015 // + vinsertf128 2016 2017 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 6*vpperm 2018 // + vinsertf128 2019 2020 { TTI::SK_PermuteTwoSrc, MVT::v8i16, { 1, 1, 1, 1 } }, // vpperm 2021 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 9, 9, 9, 9 } }, // 2*vextractf128 + 6*vpperm 2022 // + vinsertf128 2023 { TTI::SK_PermuteTwoSrc, MVT::v16i8, { 1, 1, 1, 1 } }, // vpperm 2024 }; 2025 2026 if (ST->hasXOP()) 2027 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second)) 2028 if (auto KindCost = Entry->Cost[CostKind]) 2029 return LT.first * *KindCost; 2030 2031 static const CostKindTblEntry AVX1InLaneShuffleTbl[] = { 2032 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermilpd 2033 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermilpd 2034 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermilps 2035 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermilps 2036 2037 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb 2038 // + vpor + vinsertf128 2039 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb 2040 // + vpor + vinsertf128 2041 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb 2042 // + vpor + vinsertf128 2043 2044 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // 2*vshufpd + vblendpd 2045 { TTI::SK_PermuteTwoSrc, 
MVT::v8f32, { 2, 2, 2, 2 } }, // 2*vshufps + vblendps 2046 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // 2*vpermilpd + vblendpd 2047 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // 2*vpermilps + vblendps 2048 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb 2049 // + 2*vpor + vinsertf128 2050 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb 2051 // + 2*vpor + vinsertf128 2052 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb 2053 // + 2*vpor + vinsertf128 2054 }; 2055 2056 if (IsInLaneShuffle && ST->hasAVX()) 2057 if (const auto *Entry = 2058 CostTableLookup(AVX1InLaneShuffleTbl, Kind, LT.second)) 2059 if (auto KindCost = Entry->Cost[CostKind]) 2060 return LT.first * *KindCost; 2061 2062 static const CostKindTblEntry AVX1ShuffleTbl[] = { 2063 {TTI::SK_Broadcast, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vpermilpd 2064 {TTI::SK_Broadcast, MVT::v8f32, {2,2,2,2}}, // vperm2f128 + vpermilps 2065 {TTI::SK_Broadcast, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vpermilpd 2066 {TTI::SK_Broadcast, MVT::v8i32, {2,2,2,2}}, // vperm2f128 + vpermilps 2067 {TTI::SK_Broadcast, MVT::v16i16, {3,3,3,3}}, // vpshuflw + vpshufd + vinsertf128 2068 {TTI::SK_Broadcast, MVT::v16f16, {3,3,3,3}}, // vpshuflw + vpshufd + vinsertf128 2069 {TTI::SK_Broadcast, MVT::v32i8, {2,2,2,2}}, // vpshufb + vinsertf128 2070 2071 {TTI::SK_Reverse, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vpermilpd 2072 {TTI::SK_Reverse, MVT::v8f32, {2,2,2,2}}, // vperm2f128 + vpermilps 2073 {TTI::SK_Reverse, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vpermilpd 2074 {TTI::SK_Reverse, MVT::v8i32, {2,2,2,2}}, // vperm2f128 + vpermilps 2075 {TTI::SK_Reverse, MVT::v16i16, {4,4,4,4}}, // vextractf128 + 2*pshufb 2076 // + vinsertf128 2077 {TTI::SK_Reverse, MVT::v16f16, {4,4,4,4}}, // vextractf128 + 2*pshufb 2078 // + vinsertf128 2079 {TTI::SK_Reverse, MVT::v32i8, {4,4,4,4}}, // vextractf128 + 2*pshufb 2080 // + vinsertf128 2081 2082 {TTI::SK_Select, MVT::v4i64, {1,1,1,1}}, // vblendpd 2083 {TTI::SK_Select, MVT::v4f64, {1,1,1,1}}, // vblendpd 2084 {TTI::SK_Select, MVT::v8i32, {1,1,1,1}}, // vblendps 2085 {TTI::SK_Select, MVT::v8f32, {1,1,1,1}}, // vblendps 2086 {TTI::SK_Select, MVT::v16i16, {3,3,3,3}}, // vpand + vpandn + vpor 2087 {TTI::SK_Select, MVT::v16f16, {3,3,3,3}}, // vpand + vpandn + vpor 2088 {TTI::SK_Select, MVT::v32i8, {3,3,3,3}}, // vpand + vpandn + vpor 2089 2090 {TTI::SK_Splice, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + shufpd 2091 {TTI::SK_Splice, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + shufpd 2092 {TTI::SK_Splice, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps 2093 {TTI::SK_Splice, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps 2094 {TTI::SK_Splice, MVT::v16i16, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128 2095 {TTI::SK_Splice, MVT::v16f16, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128 2096 {TTI::SK_Splice, MVT::v32i8, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128 2097 2098 {TTI::SK_PermuteSingleSrc, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vshufpd 2099 {TTI::SK_PermuteSingleSrc, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vshufpd 2100 {TTI::SK_PermuteSingleSrc, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps 2101 {TTI::SK_PermuteSingleSrc, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps 2102 {TTI::SK_PermuteSingleSrc, MVT::v16i16,{8,8,8,8}}, // vextractf128 + 4*pshufb 2103 // + 2*por + vinsertf128 2104 {TTI::SK_PermuteSingleSrc, MVT::v16f16,{8,8,8,8}}, // vextractf128 + 4*pshufb 2105 
// + 2*por + vinsertf128 2106 {TTI::SK_PermuteSingleSrc, MVT::v32i8, {8,8,8,8}}, // vextractf128 + 4*pshufb 2107 // + 2*por + vinsertf128 2108 2109 {TTI::SK_PermuteTwoSrc, MVT::v4f64, {3,3,3,3}}, // 2*vperm2f128 + vshufpd 2110 {TTI::SK_PermuteTwoSrc, MVT::v4i64, {3,3,3,3}}, // 2*vperm2f128 + vshufpd 2111 {TTI::SK_PermuteTwoSrc, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps 2112 {TTI::SK_PermuteTwoSrc, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps 2113 {TTI::SK_PermuteTwoSrc, MVT::v16i16,{15,15,15,15}}, // 2*vextractf128 + 8*pshufb 2114 // + 4*por + vinsertf128 2115 {TTI::SK_PermuteTwoSrc, MVT::v16f16,{15,15,15,15}}, // 2*vextractf128 + 8*pshufb 2116 // + 4*por + vinsertf128 2117 {TTI::SK_PermuteTwoSrc, MVT::v32i8, {15,15,15,15}}, // 2*vextractf128 + 8*pshufb 2118 // + 4*por + vinsertf128 2119 }; 2120 2121 if (ST->hasAVX()) 2122 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second)) 2123 if (auto KindCost = Entry->Cost[CostKind]) 2124 return LT.first * *KindCost; 2125 2126 static const CostKindTblEntry SSE41ShuffleTbl[] = { 2127 {TTI::SK_Select, MVT::v2i64, {1,1,1,1}}, // pblendw 2128 {TTI::SK_Select, MVT::v2f64, {1,1,1,1}}, // movsd 2129 {TTI::SK_Select, MVT::v4i32, {1,1,1,1}}, // pblendw 2130 {TTI::SK_Select, MVT::v4f32, {1,1,1,1}}, // blendps 2131 {TTI::SK_Select, MVT::v8i16, {1,1,1,1}}, // pblendw 2132 {TTI::SK_Select, MVT::v8f16, {1,1,1,1}}, // pblendw 2133 {TTI::SK_Select, MVT::v16i8, {1,1,1,1}} // pblendvb 2134 }; 2135 2136 if (ST->hasSSE41()) 2137 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second)) 2138 if (auto KindCost = Entry->Cost[CostKind]) 2139 return LT.first * *KindCost; 2140 2141 static const CostKindTblEntry SSSE3ShuffleTbl[] = { 2142 {TTI::SK_Broadcast, MVT::v8i16, {1, 1, 1, 1}}, // pshufb 2143 {TTI::SK_Broadcast, MVT::v8f16, {1, 1, 1, 1}}, // pshufb 2144 {TTI::SK_Broadcast, MVT::v16i8, {1, 1, 1, 1}}, // pshufb 2145 2146 {TTI::SK_Reverse, MVT::v8i16, {1, 1, 1, 1}}, // pshufb 2147 {TTI::SK_Reverse, MVT::v8f16, {1, 1, 1, 1}}, // pshufb 2148 {TTI::SK_Reverse, MVT::v16i8, {1, 1, 1, 1}}, // pshufb 2149 2150 {TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por 2151 {TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por 2152 {TTI::SK_Select, MVT::v16i8, {3, 3, 3, 3}}, // 2*pshufb + por 2153 2154 {TTI::SK_Splice, MVT::v4i32, {1, 1, 1, 1}}, // palignr 2155 {TTI::SK_Splice, MVT::v4f32, {1, 1, 1, 1}}, // palignr 2156 {TTI::SK_Splice, MVT::v8i16, {1, 1, 1, 1}}, // palignr 2157 {TTI::SK_Splice, MVT::v8f16, {1, 1, 1, 1}}, // palignr 2158 {TTI::SK_Splice, MVT::v16i8, {1, 1, 1, 1}}, // palignr 2159 2160 {TTI::SK_PermuteSingleSrc, MVT::v8i16, {1, 1, 1, 1}}, // pshufb 2161 {TTI::SK_PermuteSingleSrc, MVT::v8f16, {1, 1, 1, 1}}, // pshufb 2162 {TTI::SK_PermuteSingleSrc, MVT::v16i8, {1, 1, 1, 1}}, // pshufb 2163 2164 {TTI::SK_PermuteTwoSrc, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por 2165 {TTI::SK_PermuteTwoSrc, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por 2166 {TTI::SK_PermuteTwoSrc, MVT::v16i8, {3, 3, 3, 3}}, // 2*pshufb + por 2167 }; 2168 2169 if (ST->hasSSSE3()) 2170 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second)) 2171 if (auto KindCost = Entry->Cost[CostKind]) 2172 return LT.first * *KindCost; 2173 2174 static const CostKindTblEntry SSE2ShuffleTbl[] = { 2175 {TTI::SK_Broadcast, MVT::v2f64, {1, 1, 1, 1}}, // shufpd 2176 {TTI::SK_Broadcast, MVT::v2i64, {1, 1, 1, 1}}, // pshufd 2177 {TTI::SK_Broadcast, MVT::v4i32, {1, 1, 1, 1}}, // pshufd 2178 {TTI::SK_Broadcast, MVT::v8i16, {2, 2, 2, 2}}, // 
// pshuflw + pshufd
2179 {TTI::SK_Broadcast, MVT::v8f16, {2, 2, 2, 2}}, // pshuflw + pshufd
2180 {TTI::SK_Broadcast, MVT::v16i8, {3, 3, 3, 3}}, // unpck + pshuflw + pshufd
2181
2182 {TTI::SK_Reverse, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2183 {TTI::SK_Reverse, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2184 {TTI::SK_Reverse, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2185 {TTI::SK_Reverse, MVT::v8i16, {3, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
2186 {TTI::SK_Reverse, MVT::v8f16, {3, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
2187 {TTI::SK_Reverse, MVT::v16i8, {9, 9, 9, 9}}, // 2*pshuflw + 2*pshufhw
2188 // + 2*pshufd + 2*unpck + packus
2189
2190 {TTI::SK_Select, MVT::v2i64, {1, 1, 1, 1}}, // movsd
2191 {TTI::SK_Select, MVT::v2f64, {1, 1, 1, 1}}, // movsd
2192 {TTI::SK_Select, MVT::v4i32, {2, 2, 2, 2}}, // 2*shufps
2193 {TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // pand + pandn + por
2194 {TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // pand + pandn + por
2195 {TTI::SK_Select, MVT::v16i8, {3, 3, 3, 3}}, // pand + pandn + por
2196
2197 {TTI::SK_Splice, MVT::v2i64, {1, 1, 1, 1}}, // shufpd
2198 {TTI::SK_Splice, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2199 {TTI::SK_Splice, MVT::v4i32, {2, 2, 2, 2}}, // 2*{unpck,movsd,pshufd}
2200 {TTI::SK_Splice, MVT::v8i16, {3, 3, 3, 3}}, // psrldq + pslldq + por
2201 {TTI::SK_Splice, MVT::v8f16, {3, 3, 3, 3}}, // psrldq + pslldq + por
2202 {TTI::SK_Splice, MVT::v16i8, {3, 3, 3, 3}}, // psrldq + pslldq + por
2203
2204 {TTI::SK_PermuteSingleSrc, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2205 {TTI::SK_PermuteSingleSrc, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2206 {TTI::SK_PermuteSingleSrc, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2207 {TTI::SK_PermuteSingleSrc, MVT::v8i16, {3, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
2208 // + pshufd/unpck
2209 {TTI::SK_PermuteSingleSrc, MVT::v8f16, {3, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
2210 // + pshufd/unpck
2211 {TTI::SK_PermuteSingleSrc, MVT::v16i8, {8, 10, 10, 10}}, // 2*pshuflw + 2*pshufhw
2212 // + 2*pshufd + 2*unpck + 2*packus
2213
2214 {TTI::SK_PermuteTwoSrc, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2215 {TTI::SK_PermuteTwoSrc, MVT::v2i64, {1, 1, 1, 1}}, // shufpd
2216 {TTI::SK_PermuteTwoSrc, MVT::v4i32, {2, 2, 2, 2}}, // 2*{unpck,movsd,pshufd}
2217 {TTI::SK_PermuteTwoSrc, MVT::v8i16, {6, 8, 8, 8}}, // blend+permute
2218 {TTI::SK_PermuteTwoSrc, MVT::v8f16, {6, 8, 8, 8}}, // blend+permute
2219 {TTI::SK_PermuteTwoSrc, MVT::v16i8, {11, 13, 13, 13}}, // blend+permute
2220 };
2221
2222 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2223 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2224 };
2225
2226 if (ST->hasSSE2()) {
2227 bool IsLoad =
2228 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2229 if (ST->hasSSE3() && IsLoad)
2230 if (const auto *Entry =
2231 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2232 assert(isLegalBroadcastLoad(BaseTp->getElementType(),
2233 LT.second.getVectorElementCount()) &&
2234 "Table entry missing from isLegalBroadcastLoad()");
2235 return LT.first * Entry->Cost;
2236 }
2237
2238 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2239 if (auto KindCost = Entry->Cost[CostKind])
2240 return LT.first * *KindCost;
2241 }
2242
2243 static const CostKindTblEntry SSE1ShuffleTbl[] = {
2244 { TTI::SK_Broadcast, MVT::v4f32, {1,1,1,1} }, // shufps
2245 { TTI::SK_Reverse, MVT::v4f32, {1,1,1,1} }, // shufps
2246 { TTI::SK_Select, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2247 { TTI::SK_Splice, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2248 { TTI::SK_PermuteSingleSrc,
MVT::v4f32, {1,1,1,1} }, // shufps
2249 { TTI::SK_PermuteTwoSrc, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2250 };
2251
2252 if (ST->hasSSE1()) {
2253 if (LT.first == 1 && LT.second == MVT::v4f32 && Mask.size() == 4) {
2254 // SHUFPS: both pairs must come from the same source register.
2255 auto MatchSHUFPS = [](int X, int Y) {
2256 return X < 0 || Y < 0 || ((X & 4) == (Y & 4));
2257 };
2258 if (MatchSHUFPS(Mask[0], Mask[1]) && MatchSHUFPS(Mask[2], Mask[3]))
2259 return 1;
2260 }
2261 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2262 if (auto KindCost = Entry->Cost[CostKind])
2263 return LT.first * *KindCost;
2264 }
2265
2266 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
2267 }
2268
2269 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2270 Type *Src,
2271 TTI::CastContextHint CCH,
2272 TTI::TargetCostKind CostKind,
2273 const Instruction *I) {
2274 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2275 assert(ISD && "Invalid opcode");
2276
2277 // The cost tables include both specific, custom (non-legal) src/dst type
2278 // conversions and generic, legalized types. We check for custom conversions
2279 // first, before falling back to legalization.
2280 // FIXME: Need a better design of the cost table to handle non-simple types and
2281 // potentially massive combinations (elem_num x src_type x dst_type).
2282 static const TypeConversionCostKindTblEntry AVX512BWConversionTbl[]{
2283 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2284 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2285
2286 // Mask sign extend has an instruction.
2287 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2288 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2289 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2290 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2291 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2292 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2293 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2294 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2295 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2296 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2297 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2298 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2299 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2300 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2301 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, { 1, 1, 1, 1 } },
2302 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, { 1, 1, 1, 1 } },
2303 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, { 1, 1, 1, 1 } },
2304
2305 // Mask zero extend is a sext + shift.
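// (A sketch of the lowering being costed: vpmovm2b/vpmovm2w materializes
// 0/-1 lanes from the mask register, then a logical shift right leaves
// 0/1 - that extra shift is the second uop in the throughput column.)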
2306 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } }, 2307 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } }, 2308 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } }, 2309 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } }, 2310 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } }, 2311 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } }, 2312 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } }, 2313 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } }, 2314 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } }, 2315 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } }, 2316 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } }, 2317 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } }, 2318 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } }, 2319 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } }, 2320 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, { 2, 1, 1, 1 } }, 2321 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, { 2, 1, 1, 1 } }, 2322 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, { 2, 1, 1, 1 } }, 2323 2324 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } }, 2325 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } }, 2326 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } }, 2327 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } }, 2328 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } }, 2329 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } }, 2330 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } }, 2331 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } }, 2332 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } }, 2333 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } }, 2334 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } }, 2335 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } }, 2336 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } }, 2337 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } }, 2338 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, { 2, 1, 1, 1 } }, 2339 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, { 2, 1, 1, 1 } }, 2340 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, { 2, 1, 1, 1 } }, 2341 2342 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 2, 1, 1, 1 } }, 2343 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // widen to zmm 2344 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, { 2, 1, 1, 1 } }, // vpmovwb 2345 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, { 2, 1, 1, 1 } }, // vpmovwb 2346 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, { 2, 1, 1, 1 } }, // vpmovwb 2347 }; 2348 2349 static const TypeConversionCostKindTblEntry AVX512DQConversionTbl[] = { 2350 // Mask sign extend has an instruction. 2351 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, 2352 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } }, 2353 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, 2354 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, 2355 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, 2356 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, { 1, 1, 1, 1 } }, 2357 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } }, 2358 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } }, 2359 2360 // Mask zero extend is a sext + shift. 
2361 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } },
2362 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1 } },
2363 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } },
2364 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } },
2365 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } },
2366 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, { 2, 1, 1, 1 } },
2367 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } },
2368 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } },
2369
2370 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2371 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2372 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2373 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2374 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2375 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } },
2376 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } },
2377 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, { 2, 1, 1, 1 } },
2378
2379 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2380 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2381
2382 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2383 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2384
2385 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2386 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2387
2388 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2389 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2390 };
2391
2392 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2393 // 256-bit wide vectors.
2394
2395 static const TypeConversionCostKindTblEntry AVX512FConversionTbl[] = {
2396 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 1, 1, 1, 1 } },
2397 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, { 3, 1, 1, 1 } },
2398 { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, { 4, 1, 1, 1 } }, // 2*vcvtps2pd+vextractf64x4
2399 { ISD::FP_EXTEND, MVT::v16f32, MVT::v16f16, { 1, 1, 1, 1 } }, // vcvtph2ps
2400 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
2401 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 1, 1, 1, 1 } },
2402 { ISD::FP_ROUND, MVT::v16f16, MVT::v16f32, { 1, 1, 1, 1 } }, // vcvtps2ph
2403
2404 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2405 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2406 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2407 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2408 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2409 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2410 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2411 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2412 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2413 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2414 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2415 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2416 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2417 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2418 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2419 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, { 2, 1, 1, 1 } }, // vpmovdb
2420 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, { 2, 1, 1, 1 } }, // vpmovdb
2421 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2422 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2423 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2424 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2425 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2426 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, { 2, 1, 1, 1 } }, // vpmovqb
2427 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, { 1, 1, 1, 1 } }, // vpshufb
2428 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2429 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2430 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2431 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2432 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2433 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2434 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2435 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, { 1, 1, 1, 1 } }, // vpmovqd
2436 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // zmm vpmovqd
2437 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, { 5, 1, 1, 1 } },// 2*vpmovqd+concat+vpmovdb
2438
2439 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } }, // extend to v16i32
2440 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 8, 1, 1, 1 } },
2441 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, { 8, 1, 1, 1 } },
2442
2443 // Sign extend is zmm vpternlogd+vpmovdb.
2444 // Zero extend is zmm broadcast load+vpmovdb.
2445 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 3, 1, 1, 1 } },
2446 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 4, 1, 1, 1 } },
2447 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 3, 1, 1, 1 } },
2448 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 4, 1, 1, 1 } },
2449 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 3, 1, 1, 1 } },
2450 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 4, 1, 1, 1 } },
2451 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 3, 1, 1, 1 } },
2452 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 4, 1, 1, 1 } },
2453
2454 // Sign extend is zmm vpternlogd+vpmovdw.
2455 // Zero extend is zmm vpternlogd+vpmovdw+vpsrlw.
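// Illustrative sketch of the v8i16 <- v8i1 sign extend named above (assuming
// AVX512F without AVX512BW; exact codegen may differ):
//   vpternlogd $0xff, %zmm0, %zmm0, %zmm0 {%k1} {z} ; -1 / 0 dwords
//   vpmovdw    %zmm0, %ymm0                         ; narrow dwords to words
// The zero-extend variant appends a vpsrlw to map -1 down to 1.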
2456 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 3, 1, 1, 1 } }, 2457 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } }, 2458 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 3, 1, 1, 1 } }, 2459 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } }, 2460 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 3, 1, 1, 1 } }, 2461 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } }, 2462 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 3, 1, 1, 1 } }, 2463 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } }, 2464 2465 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogd 2466 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld 2467 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogd 2468 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld 2469 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // zmm vpternlogd 2470 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld 2471 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogq 2472 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq 2473 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogq 2474 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq 2475 2476 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd 2477 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld 2478 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogq 2479 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq 2480 2481 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } }, 2482 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } }, 2483 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } }, 2484 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } }, 2485 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } }, 2486 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } }, 2487 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } }, 2488 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } }, 2489 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } }, 2490 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } }, 2491 2492 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right 2493 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right 2494 2495 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } }, 2496 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } }, 2497 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } }, 2498 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } }, 2499 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } }, 2500 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } }, 2501 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } }, 2502 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } }, 2503 2504 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } }, 2505 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } }, 2506 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } }, 2507 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } }, 2508 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } }, 2509 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 
1, 1 } }, 2510 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } }, 2511 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } }, 2512 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, {26, 1, 1, 1 } }, 2513 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 5, 1, 1, 1 } }, 2514 2515 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } }, 2516 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, { 7, 1, 1, 1 } }, 2517 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64, {15, 1, 1, 1 } }, 2518 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32, {11, 1, 1, 1 } }, 2519 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64, {31, 1, 1, 1 } }, 2520 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } }, 2521 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, { 7, 1, 1, 1 } }, 2522 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, { 5, 1, 1, 1 } }, 2523 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64, {15, 1, 1, 1 } }, 2524 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } }, 2525 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, { 3, 1, 1, 1 } }, 2526 2527 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } }, 2528 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } }, 2529 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, { 3, 1, 1, 1 } }, 2530 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, { 1, 1, 1, 1 } }, 2531 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, { 3, 1, 1, 1 } }, 2532 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, { 3, 1, 1, 1 } }, 2533 }; 2534 2535 static const TypeConversionCostKindTblEntry AVX512BWVLConversionTbl[] { 2536 // Mask sign extend has an instruction. 2537 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } }, 2538 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } }, 2539 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } }, 2540 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } }, 2541 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } }, 2542 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } }, 2543 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } }, 2544 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } }, 2545 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } }, 2546 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } }, 2547 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } }, 2548 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } }, 2549 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } }, 2550 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } }, 2551 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, { 1, 1, 1, 1 } }, 2552 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, { 1, 1, 1, 1 } }, 2553 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, { 1, 1, 1, 1 } }, 2554 2555 // Mask zero extend is a sext + shift. 
2556 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } }, 2557 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } }, 2558 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } }, 2559 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } }, 2560 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } }, 2561 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } }, 2562 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } }, 2563 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } }, 2564 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } }, 2565 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } }, 2566 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } }, 2567 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } }, 2568 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } }, 2569 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } }, 2570 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, { 2, 1, 1, 1 } }, 2571 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, { 2, 1, 1, 1 } }, 2572 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, { 2, 1, 1, 1 } }, 2573 2574 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } }, 2575 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } }, 2576 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } }, 2577 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } }, 2578 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } }, 2579 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } }, 2580 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } }, 2581 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } }, 2582 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } }, 2583 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } }, 2584 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } }, 2585 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } }, 2586 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } }, 2587 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } }, 2588 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, { 2, 1, 1, 1 } }, 2589 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, { 2, 1, 1, 1 } }, 2590 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, { 2, 1, 1, 1 } }, 2591 2592 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, 2593 }; 2594 2595 static const TypeConversionCostKindTblEntry AVX512DQVLConversionTbl[] = { 2596 // Mask sign extend has an instruction. 2597 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, 2598 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } }, 2599 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, 2600 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, { 1, 1, 1, 1 } }, 2601 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, 2602 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, { 1, 1, 1, 1 } }, 2603 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } }, 2604 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, 2605 2606 // Mask zero extend is a sext + shift. 
2607 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, 2608 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1 } }, 2609 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, 2610 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, { 2, 1, 1, 1 } }, 2611 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, 2612 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, { 2, 1, 1, 1 } }, 2613 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } }, 2614 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, 2615 2616 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, { 2, 1, 1, 1 } }, 2617 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } }, 2618 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, 2619 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } }, 2620 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, 2621 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, 2622 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, { 2, 1, 1, 1 } }, 2623 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, 2624 2625 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } }, 2626 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } }, 2627 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } }, 2628 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } }, 2629 2630 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } }, 2631 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } }, 2632 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } }, 2633 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } }, 2634 2635 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } }, 2636 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } }, 2637 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } }, 2638 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } }, 2639 2640 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } }, 2641 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } }, 2642 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } }, 2643 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } }, 2644 }; 2645 2646 static const TypeConversionCostKindTblEntry AVX512VLConversionTbl[] = { 2647 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd 2648 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd 2649 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd 2650 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 8, 1, 1, 1 } }, // split+2*v8i8 2651 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq 2652 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq 2653 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq 2654 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 8, 1, 1, 1 } }, // split+2*v8i16 2655 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd 2656 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd 2657 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd 2658 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd 2659 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq 2660 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq 2661 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // vpmovqd 2662 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqb 2663 { ISD::TRUNCATE, 
MVT::v4i16, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqw 2664 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, { 2, 1, 1, 1 } }, // vpmovwb 2665 2666 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb 2667 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb 2668 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 5, 1, 1, 1 } }, 2669 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 6, 1, 1, 1 } }, 2670 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 5, 1, 1, 1 } }, 2671 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 6, 1, 1, 1 } }, 2672 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 5, 1, 1, 1 } }, 2673 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 6, 1, 1, 1 } }, 2674 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, {10, 1, 1, 1 } }, 2675 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, {12, 1, 1, 1 } }, 2676 2677 // sign extend is vpcmpeq+maskedmove+vpmovdw 2678 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw 2679 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } }, 2680 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 5, 1, 1, 1 } }, 2681 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } }, 2682 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 5, 1, 1, 1 } }, 2683 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } }, 2684 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 5, 1, 1, 1 } }, 2685 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, {10, 1, 1, 1 } }, 2686 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, {12, 1, 1, 1 } }, 2687 2688 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogd 2689 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld 2690 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogd 2691 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld 2692 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogd 2693 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld 2694 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd 2695 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld 2696 2697 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogq 2698 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq 2699 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogq 2700 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq 2701 2702 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } }, 2703 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } }, 2704 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } }, 2705 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } }, 2706 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } }, 2707 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } }, 2708 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } }, 2709 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } }, 2710 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } }, 2711 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } }, 2712 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } }, 2713 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } }, 2714 2715 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } }, 2716 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } }, 2717 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } }, 2718 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } }, 2719 2720 { ISD::UINT_TO_FP, MVT::f32, 
MVT::i64, { 1, 1, 1, 1 } }, 2721 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } }, 2722 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } }, 2723 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } }, 2724 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } }, 2725 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } }, 2726 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 1, 1, 1, 1 } }, 2727 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } }, 2728 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } }, 2729 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } }, 2730 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 5, 1, 1, 1 } }, 2731 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } }, 2732 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 5, 1, 1, 1 } }, 2733 2734 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } }, 2735 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } }, 2736 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, { 5, 1, 1, 1 } }, 2737 2738 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } }, 2739 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } }, 2740 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } }, 2741 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } }, 2742 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } }, 2743 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } }, 2744 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } }, 2745 }; 2746 2747 static const TypeConversionCostKindTblEntry AVX2ConversionTbl[] = { 2748 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } }, 2749 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } }, 2750 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } }, 2751 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } }, 2752 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } }, 2753 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } }, 2754 2755 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } }, 2756 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } }, 2757 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } }, 2758 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } }, 2759 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } }, 2760 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } }, 2761 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } }, 2762 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } }, 2763 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } }, 2764 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } }, 2765 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } }, 2766 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } }, 2767 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } }, 2768 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } }, 2769 2770 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, 2771 2772 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 4, 1, 1, 1 } }, 2773 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 4, 1, 1, 1 } }, 2774 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 1, 1, 1, 1 } }, 2775 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 1, 1, 1, 1 } }, 2776 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 1, 1, 1, 1 } }, 2777 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 4, 1, 1, 1 } }, 2778 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 4, 1, 1, 1 } }, 2779 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 1, 1, 1, 1 } }, 2780 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 1, 
1, 1, 1 } }, 2781 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 5, 1, 1, 1 } }, 2782 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, 2783 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 2, 1, 1, 1 } }, 2784 2785 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 3, 1, 1, 1 } }, 2786 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 3, 1, 1, 1 } }, 2787 2788 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } }, 2789 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } }, 2790 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } }, 2791 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 3, 1, 1, 1 } }, 2792 2793 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 3, 1, 1, 1 } }, 2794 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 3, 1, 1, 1 } }, 2795 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } }, 2796 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } }, 2797 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } }, 2798 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 4, 1, 1, 1 } }, 2799 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 3, 1, 1, 1 } }, 2800 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 4, 1, 1, 1 } }, 2801 2802 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } }, 2803 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } }, 2804 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } }, 2805 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } }, 2806 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } }, 2807 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } }, 2808 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 3, 1, 1, 1 } }, 2809 2810 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } }, 2811 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } }, 2812 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } }, 2813 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } }, 2814 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 2, 1, 1, 1 } }, 2815 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 1, 1, 1, 1 } }, 2816 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 2, 1, 1, 1 } }, 2817 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } }, 2818 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } }, 2819 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } }, 2820 }; 2821 2822 static const TypeConversionCostKindTblEntry AVXConversionTbl[] = { 2823 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } }, 2824 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } }, 2825 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } }, 2826 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } }, 2827 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } }, 2828 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } }, 2829 2830 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } }, 2831 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } }, 2832 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } }, 2833 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } }, 2834 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } }, 2835 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } }, 2836 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } }, 2837 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } }, 2838 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } }, 2839 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } }, 2840 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } }, 2841 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 
1, 1 } }, 2842 2843 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 4, 1, 1, 1 } }, 2844 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 5, 1, 1, 1 } }, 2845 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 4, 1, 1, 1 } }, 2846 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 9, 1, 1, 1 } }, 2847 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, {11, 1, 1, 1 } }, 2848 2849 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 6, 1, 1, 1 } }, 2850 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 6, 1, 1, 1 } }, 2851 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // and+extract+packuswb 2852 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 5, 1, 1, 1 } }, 2853 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } }, 2854 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 5, 1, 1, 1 } }, 2855 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 3, 1, 1, 1 } }, // and+extract+2*packusdw 2856 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 2, 1, 1, 1 } }, 2857 2858 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, { 3, 1, 1, 1 } }, 2859 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, { 3, 1, 1, 1 } }, 2860 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, { 8, 1, 1, 1 } }, 2861 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } }, 2862 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } }, 2863 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } }, 2864 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } }, 2865 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } }, 2866 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } }, 2867 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } }, 2868 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 5, 1, 1, 1 } }, 2869 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 8, 1, 1, 1 } }, 2870 2871 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, { 7, 1, 1, 1 } }, 2872 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, { 7, 1, 1, 1 } }, 2873 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, { 6, 1, 1, 1 } }, 2874 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } }, 2875 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } }, 2876 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } }, 2877 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } }, 2878 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 4, 1, 1, 1 } }, 2879 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 4, 1, 1, 1 } }, 2880 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } }, 2881 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 6, 1, 1, 1 } }, 2882 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 8, 1, 1, 1 } }, 2883 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, {10, 1, 1, 1 } }, 2884 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, {10, 1, 1, 1 } }, 2885 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {18, 1, 1, 1 } }, 2886 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } }, 2887 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, {10, 1, 1, 1 } }, 2888 2889 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } }, 2890 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } }, 2891 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } }, 2892 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } }, 2893 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } }, 2894 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } }, 2895 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } }, 2896 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } }, 2897 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 2, 1, 1, 1 } }, 2898 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 2, 1, 1, 1 } }, 2899 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 5, 1, 1, 1 } 
}, 2900 2901 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } }, 2902 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } }, 2903 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } }, 2904 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } }, 2905 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } }, 2906 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } }, 2907 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } }, 2908 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } }, 2909 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } }, 2910 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } }, 2911 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 6, 1, 1, 1 } }, 2912 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 7, 1, 1, 1 } }, 2913 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 7, 1, 1, 1 } }, 2914 2915 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, { 1, 1, 1, 1 } }, 2916 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, { 1, 1, 1, 1 } }, 2917 }; 2918 2919 static const TypeConversionCostKindTblEntry SSE41ConversionTbl[] = { 2920 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } }, 2921 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } }, 2922 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } }, 2923 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } }, 2924 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } }, 2925 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } }, 2926 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } }, 2927 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } }, 2928 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } }, 2929 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } }, 2930 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } }, 2931 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } }, 2932 2933 // These truncates end up widening elements. 
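// (The i1 result type is illegal and gets promoted to wider elements, so e.g.
// v2i1 <- v2i8 becomes a single pmovzxbq spreading the two source bytes
// across qword lanes; no actual narrowing instruction is emitted.)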
2934 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 1, 1, 1, 1 } }, // PMOVZXBQ
2935 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 1, 1, 1, 1 } }, // PMOVZXWQ
2936 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 1, 1, 1, 1 } }, // PMOVZXBD
2937
2938 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 2, 1, 1, 1 } },
2939 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 2, 1, 1, 1 } },
2940 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 2, 1, 1, 1 } },
2941
2942 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2943 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2944 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2945 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2946 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2947 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2948 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2949 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2950 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2951 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 1, 1, 1, 1 } },
2952 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2953
2954 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2955 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2956 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 4, 1, 1, 1 } },
2957 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 4, 1, 1, 1 } },
2958 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2959 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2960 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2961 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2962 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 3, 1, 1, 1 } },
2963 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
2964 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 2, 1, 1, 1 } },
2965 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {12, 1, 1, 1 } },
2966 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {22, 1, 1, 1 } },
2967 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 4, 1, 1, 1 } },
2968
2969 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
2970 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2971 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
2972 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2973 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
2974 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
2975 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
2976 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
2977 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2978 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2979
2980 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
2981 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
2982 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
2983 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
2984 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
2985 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
2986 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
2987 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
2988 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
2989 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2990 };
2991
2992 static const TypeConversionCostKindTblEntry SSE2ConversionTbl[] = {
2993 // These are somewhat magic numbers justified by comparing the
2994 // output of llvm-mca for our various supported
scheduler models 2995 // and basing it off the worst case scenario. 2996 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } }, 2997 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } }, 2998 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 3, 1, 1, 1 } }, 2999 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 3, 1, 1, 1 } }, 3000 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 3, 1, 1, 1 } }, 3001 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } }, 3002 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 3, 1, 1, 1 } }, 3003 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } }, 3004 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } }, 3005 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 4, 1, 1, 1 } }, 3006 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 8, 1, 1, 1 } }, 3007 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 8, 1, 1, 1 } }, 3008 3009 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } }, 3010 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } }, 3011 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 8, 1, 1, 1 } }, 3012 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 9, 1, 1, 1 } }, 3013 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } }, 3014 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 4, 1, 1, 1 } }, 3015 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 4, 1, 1, 1 } }, 3016 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } }, 3017 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 7, 1, 1, 1 } }, 3018 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 7, 1, 1, 1 } }, 3019 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } }, 3020 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, {15, 1, 1, 1 } }, 3021 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {18, 1, 1, 1 } }, 3022 3023 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } }, 3024 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } }, 3025 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } }, 3026 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } }, 3027 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } }, 3028 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } }, 3029 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } }, 3030 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } }, 3031 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } }, 3032 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } }, 3033 3034 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } }, 3035 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } }, 3036 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } }, 3037 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, {15, 1, 1, 1 } }, 3038 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } }, 3039 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } }, 3040 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } }, 3041 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } }, 3042 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 8, 1, 1, 1 } }, 3043 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 8, 1, 1, 1 } }, 3044 3045 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } }, 3046 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } }, 3047 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 2, 1, 1, 1 } }, 3048 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 3, 1, 1, 1 } }, 3049 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } }, 3050 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 2, 1, 1, 1 } }, 3051 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 2, 1, 1, 1 } }, 3052 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 3, 1, 1, 1 } }, 3053 { ISD::ZERO_EXTEND, 
MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } }, 3054 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 2, 1, 1, 1 } }, 3055 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } }, 3056 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 2, 1, 1, 1 } }, 3057 3058 // These truncates are really widening elements. 3059 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 1, 1, 1, 1 } }, // PSHUFD 3060 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } }, // PUNPCKLWD+DQ 3061 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // PUNPCKLBW+WD+PSHUFD 3062 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 1, 1, 1, 1 } }, // PUNPCKLWD 3063 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } }, // PUNPCKLBW+WD 3064 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 1, 1, 1, 1 } }, // PUNPCKLBW 3065 3066 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 2, 1, 1, 1 } }, // PAND+PACKUSWB 3067 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } }, 3068 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 3, 1, 1, 1 } }, // PAND+2*PACKUSWB 3069 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 7, 1, 1, 1 } }, 3070 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, { 1, 1, 1, 1 } }, 3071 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 3, 1, 1, 1 } }, 3072 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } }, 3073 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, {10, 1, 1, 1 } }, 3074 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 4, 1, 1, 1 } }, // PAND+3*PACKUSWB 3075 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 2, 1, 1, 1 } }, // PSHUFD+PSHUFLW 3076 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, { 1, 1, 1, 1 } }, // PSHUFD 3077 }; 3078 3079 static const TypeConversionCostKindTblEntry F16ConversionTbl[] = { 3080 { ISD::FP_ROUND, MVT::f16, MVT::f32, { 1, 1, 1, 1 } }, 3081 { ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, { 1, 1, 1, 1 } }, 3082 { ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, { 1, 1, 1, 1 } }, 3083 { ISD::FP_EXTEND, MVT::f32, MVT::f16, { 1, 1, 1, 1 } }, 3084 { ISD::FP_EXTEND, MVT::f64, MVT::f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd 3085 { ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, { 1, 1, 1, 1 } }, 3086 { ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, { 1, 1, 1, 1 } }, 3087 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd 3088 }; 3089 3090 // Attempt to map directly to (simple) MVT types to let us match custom entries. 3091 EVT SrcTy = TLI->getValueType(DL, Src); 3092 EVT DstTy = TLI->getValueType(DL, Dst); 3093 3094 // The function getSimpleVT only handles simple value types. 
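// e.g. <4 x i32> has the simple type MVT::v4i32 and can be matched against
// the custom tables directly, whereas a type such as <3 x i17> has no MVT
// equivalent and must take the legalized-type path further below.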
3095 if (SrcTy.isSimple() && DstTy.isSimple()) {
3096 MVT SimpleSrcTy = SrcTy.getSimpleVT();
3097 MVT SimpleDstTy = DstTy.getSimpleVT();
3098
3099 if (ST->useAVX512Regs()) {
3100 if (ST->hasBWI())
3101 if (const auto *Entry = ConvertCostTableLookup(
3102 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3103 if (auto KindCost = Entry->Cost[CostKind])
3104 return *KindCost;
3105
3106 if (ST->hasDQI())
3107 if (const auto *Entry = ConvertCostTableLookup(
3108 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3109 if (auto KindCost = Entry->Cost[CostKind])
3110 return *KindCost;
3111
3112 if (ST->hasAVX512())
3113 if (const auto *Entry = ConvertCostTableLookup(
3114 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3115 if (auto KindCost = Entry->Cost[CostKind])
3116 return *KindCost;
3117 }
3118
3119 if (ST->hasBWI())
3120 if (const auto *Entry = ConvertCostTableLookup(
3121 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3122 if (auto KindCost = Entry->Cost[CostKind])
3123 return *KindCost;
3124
3125 if (ST->hasDQI())
3126 if (const auto *Entry = ConvertCostTableLookup(
3127 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3128 if (auto KindCost = Entry->Cost[CostKind])
3129 return *KindCost;
3130
3131 if (ST->hasAVX512())
3132 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3133 SimpleDstTy, SimpleSrcTy))
3134 if (auto KindCost = Entry->Cost[CostKind])
3135 return *KindCost;
3136
3137 if (ST->hasAVX2()) {
3138 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3139 SimpleDstTy, SimpleSrcTy))
3140 if (auto KindCost = Entry->Cost[CostKind])
3141 return *KindCost;
3142 }
3143
3144 if (ST->hasAVX()) {
3145 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3146 SimpleDstTy, SimpleSrcTy))
3147 if (auto KindCost = Entry->Cost[CostKind])
3148 return *KindCost;
3149 }
3150
3151 if (ST->hasF16C()) {
3152 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3153 SimpleDstTy, SimpleSrcTy))
3154 if (auto KindCost = Entry->Cost[CostKind])
3155 return *KindCost;
3156 }
3157
3158 if (ST->hasSSE41()) {
3159 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3160 SimpleDstTy, SimpleSrcTy))
3161 if (auto KindCost = Entry->Cost[CostKind])
3162 return *KindCost;
3163 }
3164
3165 if (ST->hasSSE2()) {
3166 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3167 SimpleDstTy, SimpleSrcTy))
3168 if (auto KindCost = Entry->Cost[CostKind])
3169 return *KindCost;
3170 }
3171
3172 if ((ISD == ISD::FP_ROUND && SimpleDstTy == MVT::f16) ||
3173 (ISD == ISD::FP_EXTEND && SimpleSrcTy == MVT::f16)) {
3174 // fp16 conversions not covered by any table entries require a libcall.
3175 // Return a large (arbitrary) number to model this.
3176 return InstructionCost(64);
3177 }
3178 }
3179
3180 // Fall back to legalized types.
3181 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
3182 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
3183
3184 // If we're truncating to the same legalized type - just assume it's free.
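// (e.g. a trunc i32 -> i24: the illegal i24 is promoted back to i32 by
// legalization, so both sides share MVT::i32 and no instruction remains.)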
3185 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
3186 return TTI::TCC_Free;
3187
3188 if (ST->useAVX512Regs()) {
3189 if (ST->hasBWI())
3190 if (const auto *Entry = ConvertCostTableLookup(
3191 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
3192 if (auto KindCost = Entry->Cost[CostKind])
3193 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3194
3195 if (ST->hasDQI())
3196 if (const auto *Entry = ConvertCostTableLookup(
3197 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3198 if (auto KindCost = Entry->Cost[CostKind])
3199 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3200
3201 if (ST->hasAVX512())
3202 if (const auto *Entry = ConvertCostTableLookup(
3203 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3204 if (auto KindCost = Entry->Cost[CostKind])
3205 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3206 }
3207
3208 if (ST->hasBWI())
3209 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3210 LTDest.second, LTSrc.second))
3211 if (auto KindCost = Entry->Cost[CostKind])
3212 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3213
3214 if (ST->hasDQI())
3215 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3216 LTDest.second, LTSrc.second))
3217 if (auto KindCost = Entry->Cost[CostKind])
3218 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3219
3220 if (ST->hasAVX512())
3221 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3222 LTDest.second, LTSrc.second))
3223 if (auto KindCost = Entry->Cost[CostKind])
3224 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3225
3226 if (ST->hasAVX2())
3227 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3228 LTDest.second, LTSrc.second))
3229 if (auto KindCost = Entry->Cost[CostKind])
3230 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3231
3232 if (ST->hasAVX())
3233 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3234 LTDest.second, LTSrc.second))
3235 if (auto KindCost = Entry->Cost[CostKind])
3236 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3237
3238 if (ST->hasF16C()) {
3239 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3240 LTDest.second, LTSrc.second))
3241 if (auto KindCost = Entry->Cost[CostKind])
3242 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3243 }
3244
3245 if (ST->hasSSE41())
3246 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3247 LTDest.second, LTSrc.second))
3248 if (auto KindCost = Entry->Cost[CostKind])
3249 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3250
3251 if (ST->hasSSE2())
3252 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3253 LTDest.second, LTSrc.second))
3254 if (auto KindCost = Entry->Cost[CostKind])
3255 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3256
3257 // Fallback: for i8/i16 sitofp/uitofp cases, extend the source to i32 first
3258 // and then convert with sitofp (after a zext the value is non-negative).
3259 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3260 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3261 Type *ExtSrc = Src->getWithNewBitWidth(32);
3262 unsigned ExtOpc =
3263 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3264
3265 // For scalar loads the extend would be free.
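// (An i8/i16 scalar load feeding the convert becomes a movzx/movsx load,
// folding the extend into the memory access, so only the convert is charged.)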
3266 InstructionCost ExtCost = 0;
3267 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3268 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3269
3270 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3271 TTI::CastContextHint::None, CostKind);
3272 }
3273
3274 // Fallback for fptosi/fptoui to i8/i16: convert with fptosi to i32 and
3275 // truncate the result.
3276 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3277 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3278 Type *TruncDst = Dst->getWithNewBitWidth(32);
3279 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3280 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3281 TTI::CastContextHint::None, CostKind);
3282 }
3283
3284 // TODO: Allow non-throughput costs that aren't binary.
3285 auto AdjustCost = [&CostKind](InstructionCost Cost,
3286 InstructionCost N = 1) -> InstructionCost {
3287 if (CostKind != TTI::TCK_RecipThroughput)
3288 return Cost == 0 ? 0 : N;
3289 return Cost * N;
3290 };
3291 return AdjustCost(
3292 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3293 }
3294
3295 InstructionCost X86TTIImpl::getCmpSelInstrCost(
3296 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
3297 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
3298 TTI::OperandValueInfo Op2Info, const Instruction *I) {
3299 // Early out if this type isn't scalar/vector integer/float.
3300 if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3301 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3302 Op1Info, Op2Info, I);
3303
3304 // Legalize the type.
3305 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3306
3307 MVT MTy = LT.second;
3308
3309 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3310 assert(ISD && "Invalid opcode");
3311
3312 InstructionCost ExtraCost = 0;
3313 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3314 // Some vector comparison predicates cost extra instructions.
3315 // TODO: Adjust ExtraCost based on CostKind?
3316 // TODO: Should we invert this and assume worst case cmp costs
3317 // and reduce for particular predicates?
3318 if (MTy.isVector() &&
3319 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3320 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3321 ST->hasBWI())) {
3322 // Fall back to I if a specific predicate wasn't specified.
3323 CmpInst::Predicate Pred = VecPred;
3324 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3325 Pred == CmpInst::BAD_FCMP_PREDICATE))
3326 Pred = cast<CmpInst>(I)->getPredicate();
3327
3328 bool CmpWithConstant = false;
3329 if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3330 CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
3331
3332 switch (Pred) {
3333 case CmpInst::Predicate::ICMP_NE:
3334 // xor(cmpeq(x,y),-1)
3335 ExtraCost = CmpWithConstant ? 0 : 1;
3336 break;
3337 case CmpInst::Predicate::ICMP_SGE:
3338 case CmpInst::Predicate::ICMP_SLE:
3339 // xor(cmpgt(x,y),-1)
3340 ExtraCost = CmpWithConstant ? 0 : 1;
3341 break;
3342 case CmpInst::Predicate::ICMP_ULT:
3343 case CmpInst::Predicate::ICMP_UGT:
3344 // cmpgt(xor(x,signbit),xor(y,signbit))
3345 // xor(cmpeq(pmaxu(x,y),x),-1)
3346 ExtraCost = CmpWithConstant ? 1 : 2;
3347 break;
3348 case CmpInst::Predicate::ICMP_ULE:
3349 case CmpInst::Predicate::ICMP_UGE:
3350 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3351 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3352 // cmpeq(psubus(x,y),0)
3353 // cmpeq(pminu(x,y),x)
3354 ExtraCost = 1;
3355 } else {
3356 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3357 ExtraCost = CmpWithConstant ? 2 : 3;
3358 }
3359 break;
3360 case CmpInst::Predicate::FCMP_ONE:
3361 case CmpInst::Predicate::FCMP_UEQ:
3362 // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3363 // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3364 if (CondTy && !ST->hasAVX())
3365 return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3366 CmpInst::Predicate::FCMP_UNO, CostKind,
3367 Op1Info, Op2Info) +
3368 getCmpSelInstrCost(Opcode, ValTy, CondTy,
3369 CmpInst::Predicate::FCMP_OEQ, CostKind,
3370 Op1Info, Op2Info) +
3371 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3372
3373 break;
3374 case CmpInst::Predicate::BAD_ICMP_PREDICATE:
3375 case CmpInst::Predicate::BAD_FCMP_PREDICATE:
3376 // Assume worst case scenario and add the maximum extra cost.
3377 ExtraCost = 3;
3378 break;
3379 default:
3380 break;
3381 }
3382 }
3383 }
3384
3385 static const CostKindTblEntry SLMCostTbl[] = {
3386 // slm pcmpeq/pcmpgt throughput is 2
3387 { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
3388 // slm pblendvb/blendvpd/blendvps throughput is 4
3389 { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
3390 { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
3391 { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
3392 { ISD::SELECT, MVT::v4i32, { 4, 4, 1, 3 } }, // pblendvb
3393 { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
3394 { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
3395 };
3396
3397 static const CostKindTblEntry AVX512BWCostTbl[] = {
3398 { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
3399 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
3400 { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
3401 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },
3402
3403 { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
3404 { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
3405 };
3406
3407 static const CostKindTblEntry AVX512CostTbl[] = {
3408 { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
3409 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
3410 { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
3411 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },
3412
3413 { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
3414 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
3415 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3416 { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
3417 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
3418 { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
3419 { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },
3420
3421 { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
3422 { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
3423 { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
3424 { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
3425 { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
3426 { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
3427 { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
3428 { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
3429 { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
3430 { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
3431 { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
3432 { ISD::SELECT, MVT::v8f32, { 1, 1, 1, 1 } },
3433 { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
3434 { ISD::SELECT, MVT::f32, { 1, 1, 1, 1 } },
3435
3436 { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4
} }, 3437 { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } }, 3438 { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } }, 3439 { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } }, 3440 { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } }, 3441 { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } }, 3442 }; 3443 3444 static const CostKindTblEntry AVX2CostTbl[] = { 3445 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } }, 3446 { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } }, 3447 { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } }, 3448 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } }, 3449 { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } }, 3450 { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } }, 3451 3452 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } }, 3453 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } }, 3454 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } }, 3455 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } }, 3456 3457 { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd 3458 { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps 3459 { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb 3460 { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb 3461 { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb 3462 { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb 3463 }; 3464 3465 static const CostKindTblEntry XOPCostTbl[] = { 3466 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } }, 3467 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } }, 3468 }; 3469 3470 static const CostKindTblEntry AVX1CostTbl[] = { 3471 { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } }, 3472 { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } }, 3473 { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } }, 3474 { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } }, 3475 { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } }, 3476 { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } }, 3477 3478 // AVX1 does not support 8-wide integer compare. 3479 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } }, 3480 { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } }, 3481 { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } }, 3482 { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } }, 3483 3484 { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd 3485 { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps 3486 { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd 3487 { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps 3488 { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps 3489 { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps 3490 }; 3491 3492 static const CostKindTblEntry SSE42CostTbl[] = { 3493 { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } }, 3494 }; 3495 3496 static const CostKindTblEntry SSE41CostTbl[] = { 3497 { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } }, 3498 { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } }, 3499 3500 { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd 3501 { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd 3502 { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps 3503 { ISD::SELECT, MVT::f32 , { 2, 2, 1, 2 } }, // blendvps 3504 { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb 3505 { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb 3506 { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb 3507 { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb 3508 }; 3509 3510 static const CostKindTblEntry SSE2CostTbl[] = { 3511 { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } }, 3512 { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } }, 3513 3514 { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion 3515 { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } }, 3516 { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } }, 3517 { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } }, 
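
    // Note (editorial): pre-SSE41 there is no vector blend/select instruction,
    // so the SELECT entries below assume the classic and/andnot/or bit-select
    // expansion named in their comments, hence the code size of 3.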
    { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
    { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
    { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por
    { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por
    { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por
    { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por
  };

  static const CostKindTblEntry SSE1CostTbl[] = {
    { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } },
    { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } },

    { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
    { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
  };

  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasXOP())
    if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  // Assume a 3cy latency for fp select ops.
  if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
    if (ValTy->getScalarType()->isFloatingPointTy())
      return 3;

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                   Op1Info, Op2Info, I);
}

unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }

InstructionCost
X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) {
  // Costs should match the codegen from:
  // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
  // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
  // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
  // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
  // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll

  // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
  // specialized in these tables yet.
  static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
    { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
    { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
    { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
    { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
    { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } },
    { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } },
    { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } },
  };
  static const CostKindTblEntry AVX512BITALGCostTbl[] = {
    { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
    { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
    { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
  };
  static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
    { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
  };
  static const CostKindTblEntry AVX512CDCostTbl[] = {
    { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
    { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
    { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
    { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
    { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
    { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
    { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
    { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
    { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
    { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
    { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
    { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },

    { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
    { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
    { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
    { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
    { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
    { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
  };
  static const CostKindTblEntry AVX512BWCostTbl[] = {
    { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
    { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
    { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
    { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
    { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
    { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
    { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
    { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
    { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
    { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
    { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
    { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
    { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
    { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
    { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
    { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
    { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
    { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
    { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
    { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
    { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
    { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
    { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
    { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
    { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
    { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
    { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
    { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
    { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
    { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
    { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
    { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
    { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
    { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
    { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
    { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
    { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
    { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
    { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
    { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
    { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
    { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
    { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
    { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
    { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
    { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
    { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
    { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
    { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
    { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
    { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
    { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
    { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
    { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
    { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
    { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
    { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
    { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } },
    { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } },
    { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } },
    { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } },
    { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } },
    { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } },
    { ISD::SADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
    { ISD::SADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
    { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
    { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
    { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
    { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
    { ISD::SMULO, MVT::v32i16, { 3, 6, 4, 4 } },
    { ISD::SMULO, MVT::v64i8, { 8, 21, 17, 18 } },
    { ISD::UMULO, MVT::v32i16, { 2, 5, 3, 3 } },
    { ISD::UMULO, MVT::v64i8, { 8, 15, 15, 16 } },
    { ISD::SSUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
    { ISD::SSUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
    { ISD::UADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
    { ISD::UADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
    { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
    { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
    { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
    { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
    { ISD::USUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
    { ISD::USUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
  };
  static const CostKindTblEntry AVX512CostTbl[] = {
    { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
    { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
    { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
    { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
    { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
    { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
    { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
    { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
    { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
    { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
    { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
    { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
    { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
    { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
    { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
    { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
    { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
    { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
    { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
    { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
    { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
    { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
    { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } },
    { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } },
    { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } },
    { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } },
    { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } },
    { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SADDSAT, MVT::v2i64, { 3, 3, 8, 9 } },
    { ISD::SADDSAT, MVT::v4i64, { 2, 2, 6, 7 } },
    { ISD::SADDSAT, MVT::v8i64, { 3, 3, 6, 7 } },
    { ISD::SADDSAT, MVT::v4i32, { 2, 2, 6, 7 } },
    { ISD::SADDSAT, MVT::v8i32, { 2, 2, 6, 7 } },
    { ISD::SADDSAT, MVT::v16i32, { 3, 3, 6, 7 } },
    { ISD::SADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
    { ISD::SADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
    { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
    { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
    { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
    { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
    { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
    { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
    { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
    { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
    { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
    { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
    { ISD::SMULO, MVT::v8i64, { 44, 44, 81, 93 } },
    { ISD::SMULO, MVT::v16i32, { 5, 12, 9, 11 } },
    { ISD::SMULO, MVT::v32i16, { 6, 12, 17, 17 } },
    { ISD::SMULO, MVT::v64i8, { 22, 28, 42, 42 } },
    { ISD::SSUBSAT, MVT::v2i64, { 2, 13, 9, 10 } },
    { ISD::SSUBSAT, MVT::v4i64, { 2, 15, 7, 8 } },
    { ISD::SSUBSAT, MVT::v8i64, { 2, 14, 7, 8 } },
    { ISD::SSUBSAT, MVT::v4i32, { 2, 14, 7, 8 } },
    { ISD::SSUBSAT, MVT::v8i32, { 2, 15, 7, 8 } },
    { ISD::SSUBSAT, MVT::v16i32, { 2, 14, 7, 8 } },
    { ISD::SSUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
    { ISD::SSUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
    { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
    { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
    { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
    { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
    { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
    { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
    { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
    { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
    { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
    { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
    { ISD::UMULO, MVT::v8i64, { 52, 52, 95, 104 } },
    { ISD::UMULO, MVT::v16i32, { 5, 12, 8, 10 } },
    { ISD::UMULO, MVT::v32i16, { 5, 13, 16, 16 } },
    { ISD::UMULO, MVT::v64i8, { 18, 24, 30, 30 } },
    { ISD::UADDSAT, MVT::v2i64, { 1, 4, 4, 4 } },
    { ISD::UADDSAT, MVT::v4i64, { 1, 4, 4, 4 } },
    { ISD::UADDSAT, MVT::v8i64, { 1, 4, 4, 4 } },
    { ISD::UADDSAT, MVT::v4i32, { 1, 2, 4, 4 } },
    { ISD::UADDSAT, MVT::v8i32, { 1, 2, 4, 4 } },
    { ISD::UADDSAT, MVT::v16i32, { 2, 2, 4, 4 } },
    { ISD::UADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
    { ISD::UADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
    { ISD::USUBSAT, MVT::v2i64, { 1, 4, 2, 2 } },
    { ISD::USUBSAT, MVT::v4i64, { 1, 4, 2, 2 } },
    { ISD::USUBSAT, MVT::v8i64, { 1, 4, 2, 2 } },
    { ISD::USUBSAT, MVT::v8i32, { 1, 2, 2, 2 } },
    { ISD::USUBSAT, MVT::v16i32, { 1, 2, 2, 2 } },
    { ISD::USUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
    { ISD::USUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
    { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
    { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
    { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
    { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
    { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
    { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
    { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
    { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
    { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
    { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
  };
  static const CostKindTblEntry XOPCostTbl[] = {
    { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
    { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
    { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
    { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
    { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
    { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
    { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
    { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
    { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
    { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
    { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
    { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
    // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
    { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
    { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
    { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
    { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
    { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
    { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
    { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
    { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
    { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
    { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
    { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
    { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
    { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
    { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
    { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
    { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } },
    { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } },
    { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } },
    { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } },
    { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } },
    { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } },
    { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } },
    { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } },
    { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } },
  };
  static const CostKindTblEntry AVX2CostTbl[] = {
    { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
    { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
    { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
    { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
    { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
    { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
    { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
    { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
    { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
    { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
    { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
    { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
    { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
    { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
    { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
    { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
    { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
    { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
    { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
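    // Editorial note: the CTLZ/CTPOP/CTTZ entries below assume the PSHUFB
    // nibble-LUT style expansions produced by codegen (see the vector-lzcnt /
    // vector-popcnt / vector-tzcnt tests referenced above), not per-element
    // scalarization.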
    { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
    { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
    { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
    { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
    { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
    { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
    { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
    { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
    { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
    { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
    { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
    { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
    { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
    { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
    { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
    { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
    { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
    { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
    { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
    { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
    { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
    { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
    { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
    { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
    { ISD::SADDSAT, MVT::v2i64, { 4, 13, 8, 11 } },
    { ISD::SADDSAT, MVT::v4i64, { 3, 10, 8, 12 } },
    { ISD::SADDSAT, MVT::v4i32, { 2, 6, 7, 9 } },
    { ISD::SADDSAT, MVT::v8i32, { 4, 6, 7, 13 } },
    { ISD::SADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
    { ISD::SADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
    { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
    { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
    { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
    { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
    { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
    { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
    { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
    { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
    { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
    { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
    { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
    { ISD::SMULO, MVT::v2i64, { 8, 8, 13, 15 } },
    { ISD::SMULO, MVT::v8i32, { 8, 20, 13, 24 } },
    { ISD::SMULO, MVT::v4i32, { 5, 15, 11, 12 } },
    { ISD::SMULO, MVT::v16i16, { 4, 14, 8, 14 } },
    { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
    { ISD::SMULO, MVT::v32i8, { 9, 15, 18, 35 } },
    { ISD::SMULO, MVT::v16i8, { 6, 22, 14, 21 } },
    { ISD::SSUBSAT, MVT::v2i64, { 4, 13, 9, 13 } },
    { ISD::SSUBSAT, MVT::v4i64, { 4, 15, 9, 13 } },
    { ISD::SSUBSAT, MVT::v4i32, { 3, 14, 9, 11 } },
    { ISD::SSUBSAT, MVT::v8i32, { 4, 15, 9, 16 } },
    { ISD::SSUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
    { ISD::SSUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
    { ISD::UADDSAT, MVT::v2i64, { 2, 8, 6, 6 } },
    { ISD::UADDSAT, MVT::v4i64, { 3, 8, 6, 10 } },
    { ISD::UADDSAT, MVT::v8i32, { 2, 2, 4, 8 } },
    { ISD::UADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
    { ISD::UADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
    { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
    { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
    { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
    { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
    { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
    { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
    { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
    { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
    { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
    { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
    { ISD::UMULO, MVT::v4i64, { 24, 24, 39, 43 } },
    { ISD::UMULO, MVT::v2i64, { 10, 10, 15, 19 } },
    { ISD::UMULO, MVT::v8i32, { 8, 11, 13, 23 } },
    { ISD::UMULO, MVT::v4i32, { 5, 12, 11, 12 } },
    { ISD::UMULO, MVT::v16i16, { 4, 6, 8, 13 } },
    { ISD::UMULO, MVT::v8i16, { 2, 8, 6, 6 } },
    { ISD::UMULO, MVT::v32i8, { 9, 13, 17, 33 } },
    { ISD::UMULO, MVT::v16i8, { 6, 19, 13, 20 } },
    { ISD::USUBSAT, MVT::v2i64, { 2, 7, 6, 6 } },
    { ISD::USUBSAT, MVT::v4i64, { 3, 7, 6, 10 } },
    { ISD::USUBSAT, MVT::v8i32, { 2, 2, 2, 4 } },
    { ISD::USUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
    { ISD::USUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
    { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
    { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
    { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
    { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
    { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
    { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
    { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
    { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
    { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
    { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
    { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
    { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
  };
  static const CostKindTblEntry AVX1CostTbl[] = {
    { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
    { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
    { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
    { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
    { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
    { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
    { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
    { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
    { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
    { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
    { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
    { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
    { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
    { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
    { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
    { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
    { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
    { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
    { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
    { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
    { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
    { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
    { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
    { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
    { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
    { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
    { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
    { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
    { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
    { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
    { ISD::SADDSAT, MVT::v2i64, { 6, 13, 8, 11 } },
    { ISD::SADDSAT, MVT::v4i64, { 13, 20, 15, 25 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SADDSAT, MVT::v8i32, { 12, 18, 14, 24 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
    { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
    { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
    { ISD::SMULO, MVT::v2i64, { 9, 9, 13, 17 } },
    { ISD::SMULO, MVT::v8i32, { 15, 20, 24, 29 } },
    { ISD::SMULO, MVT::v4i32, { 7, 15, 11, 13 } },
    { ISD::SMULO, MVT::v16i16, { 8, 14, 14, 15 } },
    { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
    { ISD::SMULO, MVT::v32i8, { 20, 20, 37, 39 } },
    { ISD::SMULO, MVT::v16i8, { 9, 22, 18, 21 } },
    { ISD::SSUBSAT, MVT::v2i64, { 7, 13, 9, 13 } },
    { ISD::SSUBSAT, MVT::v4i64, { 15, 21, 18, 29 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SSUBSAT, MVT::v8i32, { 15, 19, 18, 29 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SSUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SSUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UADDSAT, MVT::v2i64, { 3, 8, 6, 6 } },
    { ISD::UADDSAT, MVT::v4i64, { 8, 11, 14, 15 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UADDSAT, MVT::v8i32, { 6, 6, 10, 11 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
    { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
    { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMULO, MVT::v4i64, { 24, 26, 39, 45 } },
    { ISD::UMULO, MVT::v2i64, { 10, 12, 15, 20 } },
    { ISD::UMULO, MVT::v8i32, { 14, 15, 23, 28 } },
    { ISD::UMULO, MVT::v4i32, { 7, 12, 11, 13 } },
    { ISD::UMULO, MVT::v16i16, { 7, 11, 13, 14 } },
    { ISD::UMULO, MVT::v8i16, { 3, 8, 6, 6 } },
    { ISD::UMULO, MVT::v32i8, { 19, 19, 35, 37 } },
    { ISD::UMULO, MVT::v16i8, { 9, 19, 17, 20 } },
    { ISD::USUBSAT, MVT::v2i64, { 3, 7, 6, 6 } },
    { ISD::USUBSAT, MVT::v4i64, { 8, 10, 14, 15 } }, // 2 x 128-bit Op + extract/insert
    { ISD::USUBSAT, MVT::v8i32, { 4, 4, 7, 8 } }, // 2 x 128-bit Op + extract/insert
    { ISD::USUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::USUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
    { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
    { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
    { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
    { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
    { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
    { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
    { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
    { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
    { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
    { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
    { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
  };
  static const CostKindTblEntry GFNICostTbl[] = {
    { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
    { X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
  };
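  // Illustrative example of how an entry is read, using the GFNI table above:
  // for ISD::BITREVERSE on MVT::v16i8 the tuple { 1, 6, 1, 2 } decodes, in
  // CostKindCosts field order, as RecipThroughput = 1, Latency = 6,
  // CodeSize = 1 and SizeAndLatency = 2, i.e. a single gf2p8affineqb with a
  // constant bit-reversal matrix operand.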
static const CostKindTblEntry GLMCostTbl[] = { 4169 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss 4170 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps 4171 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd 4172 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd 4173 }; 4174 static const CostKindTblEntry SLMCostTbl[] = { 4175 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } }, 4176 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } }, 4177 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } }, 4178 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss 4179 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps 4180 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd 4181 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd 4182 }; 4183 static const CostKindTblEntry SSE42CostTbl[] = { 4184 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS 4185 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS 4186 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD 4187 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD 4188 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/ 4189 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/ 4190 }; 4191 static const CostKindTblEntry SSE41CostTbl[] = { 4192 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X) 4193 { ISD::SADDSAT, MVT::v2i64, { 10, 14, 17, 21 } }, 4194 { ISD::SADDSAT, MVT::v4i32, { 5, 11, 8, 10 } }, 4195 { ISD::SSUBSAT, MVT::v2i64, { 12, 19, 25, 29 } }, 4196 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 10, 12 } }, 4197 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } }, 4198 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } }, 4199 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } }, 4200 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } }, 4201 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } }, 4202 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } }, 4203 { ISD::SMULO, MVT::v2i64, { 9, 11, 13, 17 } }, 4204 { ISD::SMULO, MVT::v4i32, { 20, 24, 13, 19 } }, 4205 { ISD::SMULO, MVT::v8i16, { 5, 9, 8, 8 } }, 4206 { ISD::SMULO, MVT::v16i8, { 13, 22, 24, 25 } }, 4207 { ISD::UADDSAT, MVT::v2i64, { 6, 13, 14, 14 } }, 4208 { ISD::UADDSAT, MVT::v4i32, { 2, 2, 4, 4 } }, 4209 { ISD::USUBSAT, MVT::v2i64, { 6, 10, 14, 14 } }, 4210 { ISD::USUBSAT, MVT::v4i32, { 1, 2, 2, 2 } }, 4211 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } }, 4212 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } }, 4213 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } }, 4214 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } }, 4215 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } }, 4216 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } }, 4217 { ISD::UMULO, MVT::v2i64, { 14, 20, 15, 20 } }, 4218 { ISD::UMULO, MVT::v4i32, { 19, 22, 12, 18 } }, 4219 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } }, 4220 { ISD::UMULO, MVT::v16i8, { 13, 19, 18, 20 } }, 4221 }; 4222 static const CostKindTblEntry SSSE3CostTbl[] = { 4223 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } }, 4224 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } }, 4225 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } }, 4226 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } }, 4227 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } }, 4228 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } }, 4229 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } }, 4230 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } }, 4231 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } }, 4232 { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } }, 4233 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } }, 4234 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } }, 4235 { 
ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } }, 4236 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } }, 4237 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } }, 4238 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } }, 4239 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } }, 4240 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } }, 4241 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } }, 4242 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } }, 4243 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } }, 4244 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } } 4245 }; 4246 static const CostKindTblEntry SSE2CostTbl[] = { 4247 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } }, 4248 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } }, 4249 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } }, 4250 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } }, 4251 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } }, 4252 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } }, 4253 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } }, 4254 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } }, 4255 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } }, 4256 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } }, 4257 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } }, 4258 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } }, 4259 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } }, 4260 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } }, 4261 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } }, 4262 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } }, 4263 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } }, 4264 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } }, 4265 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } }, 4266 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } }, 4267 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } }, 4268 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } }, 4269 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } }, 4270 { ISD::SADDSAT, MVT::v2i64, { 12, 14, 24, 24 } }, 4271 { ISD::SADDSAT, MVT::v4i32, { 6, 11, 11, 12 } }, 4272 { ISD::SADDSAT, MVT::v8i16, { 1, 2, 1, 1 } }, 4273 { ISD::SADDSAT, MVT::v16i8, { 1, 2, 1, 1 } }, 4274 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } }, 4275 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } }, 4276 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } }, 4277 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } }, 4278 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } }, 4279 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } }, 4280 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } }, 4281 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } }, 4282 { ISD::SMULO, MVT::v2i64, { 30, 33, 13, 23 } }, 4283 { ISD::SMULO, MVT::v4i32, { 20, 24, 23, 23 } }, 4284 { ISD::SMULO, MVT::v8i16, { 5, 10, 8, 8 } }, 4285 { ISD::SMULO, MVT::v16i8, { 13, 23, 24, 25 } }, 4286 { ISD::SSUBSAT, MVT::v2i64, { 16, 19, 31, 31 } }, 4287 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 12, 13 } }, 4288 { ISD::SSUBSAT, MVT::v8i16, { 1, 2, 1, 1 } }, 4289 { ISD::SSUBSAT, MVT::v16i8, { 1, 2, 1, 1 } }, 4290 { ISD::UADDSAT, MVT::v2i64, { 7, 13, 14, 14 } }, 4291 { ISD::UADDSAT, MVT::v4i32, { 4, 5, 7, 7 } }, 4292 { ISD::UADDSAT, MVT::v8i16, { 1, 2, 1, 1 } }, 4293 { ISD::UADDSAT, MVT::v16i8, { 1, 2, 1, 1 } }, 4294 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } }, 4295 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } }, 4296 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } }, 4297 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } }, 4298 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } }, 4299 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } }, 4300 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } }, 4301 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } }, 4302 { ISD::UMULO, MVT::v2i64, { 30, 33, 15, 29 } }, 4303 { ISD::UMULO, MVT::v4i32, { 19, 22, 14, 18 } }, 4304 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } }, 4305 { 
ISD::UMULO, MVT::v16i8, { 13, 19, 20, 20 } }, 4306 { ISD::USUBSAT, MVT::v2i64, { 7, 10, 14, 14 } }, 4307 { ISD::USUBSAT, MVT::v4i32, { 4, 4, 7, 7 } }, 4308 { ISD::USUBSAT, MVT::v8i16, { 1, 2, 1, 1 } }, 4309 { ISD::USUBSAT, MVT::v16i8, { 1, 2, 1, 1 } }, 4310 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, 4311 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } }, 4312 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/ 4313 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/ 4314 }; 4315 static const CostKindTblEntry SSE1CostTbl[] = { 4316 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, 4317 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } }, 4318 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/ 4319 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/ 4320 }; 4321 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets 4322 { ISD::CTTZ, MVT::i64, { 1, 1, 1, 1 } }, 4323 }; 4324 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets 4325 { ISD::CTTZ, MVT::i32, { 1, 1, 1, 1 } }, 4326 { ISD::CTTZ, MVT::i16, { 2, 1, 1, 1 } }, 4327 { ISD::CTTZ, MVT::i8, { 2, 1, 1, 1 } }, 4328 }; 4329 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets 4330 { ISD::CTLZ, MVT::i64, { 1, 1, 1, 1 } }, 4331 }; 4332 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets 4333 { ISD::CTLZ, MVT::i32, { 1, 1, 1, 1 } }, 4334 { ISD::CTLZ, MVT::i16, { 2, 1, 1, 1 } }, 4335 { ISD::CTLZ, MVT::i8, { 2, 1, 1, 1 } }, 4336 }; 4337 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets 4338 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt 4339 }; 4340 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets 4341 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt 4342 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext()) 4343 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext()) 4344 }; 4345 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets 4346 { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV 4347 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } }, 4348 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } }, 4349 { ISD::CTLZ, MVT::i64, { 1, 2, 3, 3 } }, // MOV+BSR+XOR 4350 { ISD::CTLZ, MVT::i32, { 1, 2, 3, 3 } }, // MOV+BSR+XOR 4351 { ISD::CTLZ, MVT::i16, { 2, 2, 3, 3 } }, // MOV+BSR+XOR 4352 { ISD::CTLZ, MVT::i8, { 2, 2, 4, 3 } }, // MOV+BSR+XOR 4353 { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR 4354 { ISD::CTTZ, MVT::i64, { 1, 2, 2, 2 } }, // MOV+BSF 4355 { ISD::CTTZ, MVT::i32, { 1, 2, 2, 2 } }, // MOV+BSF 4356 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 2 } }, // MOV+BSF 4357 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 2 } }, // MOV+BSF 4358 { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 1, 2 } }, // BSF 4359 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } }, 4360 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } }, 4361 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } }, 4362 { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } }, 4363 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } }, 4364 { ISD::SADDSAT, MVT::i64, { 4, 4, 7, 10 } }, 4365 { ISD::SSUBSAT, MVT::i64, { 4, 5, 8, 11 } }, 4366 { ISD::UADDSAT, MVT::i64, { 2, 3, 4, 7 } }, 4367 { ISD::USUBSAT, MVT::i64, { 2, 3, 4, 7 } }, 4368 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } }, 4369 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } }, 4370 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } }, 4371 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } }, 4372 { ISD::SADDO, MVT::i64, { 2, 2, 4, 6 } }, 4373 { ISD::UADDO, MVT::i64, { 2, 2, 4, 6 } }, 4374 { 
ISD::SMULO, MVT::i64, { 4, 4, 4, 6 } }, 4375 { ISD::UMULO, MVT::i64, { 8, 8, 4, 7 } }, 4376 }; 4377 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets 4378 { ISD::ABS, MVT::i32, { 1, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV 4379 { ISD::ABS, MVT::i16, { 2, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV 4380 { ISD::ABS, MVT::i8, { 2, 4, 4, 3 } }, // SUB+XOR+SRA 4381 { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } }, 4382 { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } }, 4383 { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } }, 4384 { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } }, 4385 { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL 4386 { ISD::CTLZ, MVT::i32, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV 4387 { ISD::CTLZ, MVT::i16, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV 4388 { ISD::CTLZ, MVT::i8, { 2, 2, 5, 6 } }, // BSR+XOR or BSR+XOR+CMOV 4389 { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 2, 2 } }, // BSR+XOR 4390 { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 2, 2 } }, // BSR+XOR 4391 { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR 4392 { ISD::CTTZ, MVT::i32, { 2, 2, 3, 3 } }, // TEST+BSF+CMOV/BRANCH 4393 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH 4394 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH 4395 { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 1, 2 } }, // BSF 4396 { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 2 } }, // BSF 4397 { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 2 } }, // BSF 4398 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } }, 4399 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } }, 4400 { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } }, 4401 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } }, 4402 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } }, 4403 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } }, 4404 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } }, 4405 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } }, 4406 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } }, 4407 { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } }, 4408 { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } }, 4409 { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } }, 4410 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } }, 4411 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } }, 4412 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } }, 4413 { ISD::SADDSAT, MVT::i32, { 3, 4, 6, 9 } }, 4414 { ISD::SADDSAT, MVT::i16, { 4, 4, 7, 10 } }, 4415 { ISD::SADDSAT, MVT::i8, { 4, 5, 8, 11 } }, 4416 { ISD::SSUBSAT, MVT::i32, { 4, 4, 7, 10 } }, 4417 { ISD::SSUBSAT, MVT::i16, { 4, 4, 7, 10 } }, 4418 { ISD::SSUBSAT, MVT::i8, { 4, 5, 8, 11 } }, 4419 { ISD::UADDSAT, MVT::i32, { 2, 3, 4, 7 } }, 4420 { ISD::UADDSAT, MVT::i16, { 2, 3, 4, 7 } }, 4421 { ISD::UADDSAT, MVT::i8, { 3, 3, 5, 8 } }, 4422 { ISD::USUBSAT, MVT::i32, { 2, 3, 4, 7 } }, 4423 { ISD::USUBSAT, MVT::i16, { 2, 3, 4, 7 } }, 4424 { ISD::USUBSAT, MVT::i8, { 3, 3, 5, 8 } }, 4425 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } }, 4426 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } }, 4427 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } }, 4428 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } }, 4429 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } }, 4430 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } }, 4431 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } }, 4432 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } }, 4433 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } }, 4434 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } }, 4435 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } }, 4436 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } }, 4437 { ISD::SADDO, MVT::i32, { 2, 2, 4, 6 } }, 4438 { ISD::SADDO, MVT::i16, { 2, 2, 4, 6 } }, 4439 { ISD::SADDO, MVT::i8, { 2, 2, 4, 6 } }, 4440 { ISD::UADDO, MVT::i32, { 2, 2, 4, 6 } }, 4441 { ISD::UADDO, MVT::i16, { 2, 2, 4, 6 } }, 4442 { ISD::UADDO, 
MVT::i8, { 2, 2, 4, 6 } }, 4443 { ISD::SMULO, MVT::i32, { 2, 2, 4, 6 } }, 4444 { ISD::SMULO, MVT::i16, { 5, 5, 4, 6 } }, 4445 { ISD::SMULO, MVT::i8, { 6, 6, 4, 6 } }, 4446 { ISD::UMULO, MVT::i32, { 6, 6, 4, 8 } }, 4447 { ISD::UMULO, MVT::i16, { 6, 6, 4, 9 } }, 4448 { ISD::UMULO, MVT::i8, { 6, 6, 4, 6 } }, 4449 }; 4450 4451 Type *RetTy = ICA.getReturnType(); 4452 Type *OpTy = RetTy; 4453 Intrinsic::ID IID = ICA.getID(); 4454 unsigned ISD = ISD::DELETED_NODE; 4455 switch (IID) { 4456 default: 4457 break; 4458 case Intrinsic::abs: 4459 ISD = ISD::ABS; 4460 break; 4461 case Intrinsic::bitreverse: 4462 ISD = ISD::BITREVERSE; 4463 break; 4464 case Intrinsic::bswap: 4465 ISD = ISD::BSWAP; 4466 break; 4467 case Intrinsic::ctlz: 4468 ISD = ISD::CTLZ; 4469 break; 4470 case Intrinsic::ctpop: 4471 ISD = ISD::CTPOP; 4472 break; 4473 case Intrinsic::cttz: 4474 ISD = ISD::CTTZ; 4475 break; 4476 case Intrinsic::fshl: 4477 ISD = ISD::FSHL; 4478 if (!ICA.isTypeBasedOnly()) { 4479 const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); 4480 if (Args[0] == Args[1]) { 4481 ISD = ISD::ROTL; 4482 // Handle uniform constant rotation amounts. 4483 // TODO: Handle funnel-shift cases. 4484 const APInt *Amt; 4485 if (Args[2] && 4486 PatternMatch::match(Args[2], PatternMatch::m_APIntAllowPoison(Amt))) 4487 ISD = X86ISD::VROTLI; 4488 } 4489 } 4490 break; 4491 case Intrinsic::fshr: 4492 // FSHR has same costs so don't duplicate. 4493 ISD = ISD::FSHL; 4494 if (!ICA.isTypeBasedOnly()) { 4495 const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); 4496 if (Args[0] == Args[1]) { 4497 ISD = ISD::ROTR; 4498 // Handle uniform constant rotation amount. 4499 // TODO: Handle funnel-shift cases. 4500 const APInt *Amt; 4501 if (Args[2] && 4502 PatternMatch::match(Args[2], PatternMatch::m_APIntAllowPoison(Amt))) 4503 ISD = X86ISD::VROTLI; 4504 } 4505 } 4506 break; 4507 case Intrinsic::lrint: 4508 case Intrinsic::llrint: { 4509 // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which 4510 // have the same costs as the CVTTP2SI (fptosi) instructions 4511 const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes(); 4512 return getCastInstrCost(Instruction::FPToSI, RetTy, ArgTys[0], 4513 TTI::CastContextHint::None, CostKind); 4514 } 4515 case Intrinsic::maxnum: 4516 case Intrinsic::minnum: 4517 // FMINNUM has same costs so don't duplicate. 4518 ISD = ISD::FMAXNUM; 4519 break; 4520 case Intrinsic::sadd_sat: 4521 ISD = ISD::SADDSAT; 4522 break; 4523 case Intrinsic::smax: 4524 ISD = ISD::SMAX; 4525 break; 4526 case Intrinsic::smin: 4527 ISD = ISD::SMIN; 4528 break; 4529 case Intrinsic::ssub_sat: 4530 ISD = ISD::SSUBSAT; 4531 break; 4532 case Intrinsic::uadd_sat: 4533 ISD = ISD::UADDSAT; 4534 break; 4535 case Intrinsic::umax: 4536 ISD = ISD::UMAX; 4537 break; 4538 case Intrinsic::umin: 4539 ISD = ISD::UMIN; 4540 break; 4541 case Intrinsic::usub_sat: 4542 ISD = ISD::USUBSAT; 4543 break; 4544 case Intrinsic::sqrt: 4545 ISD = ISD::FSQRT; 4546 break; 4547 case Intrinsic::sadd_with_overflow: 4548 case Intrinsic::ssub_with_overflow: 4549 // SSUBO has same costs so don't duplicate. 4550 ISD = ISD::SADDO; 4551 OpTy = RetTy->getContainedType(0); 4552 break; 4553 case Intrinsic::uadd_with_overflow: 4554 case Intrinsic::usub_with_overflow: 4555 // USUBO has same costs so don't duplicate. 
4556 ISD = ISD::UADDO; 4557 OpTy = RetTy->getContainedType(0); 4558 break; 4559 case Intrinsic::smul_with_overflow: 4560 ISD = ISD::SMULO; 4561 OpTy = RetTy->getContainedType(0); 4562 break; 4563 case Intrinsic::umul_with_overflow: 4564 ISD = ISD::UMULO; 4565 OpTy = RetTy->getContainedType(0); 4566 break; 4567 } 4568 4569 if (ISD != ISD::DELETED_NODE) { 4570 auto adjustTableCost = [&](int ISD, unsigned Cost, 4571 std::pair<InstructionCost, MVT> LT, 4572 FastMathFlags FMF) -> InstructionCost { 4573 InstructionCost LegalizationCost = LT.first; 4574 MVT MTy = LT.second; 4575 4576 // If there are no NANs to deal with, then these are reduced to a 4577 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we 4578 // assume is used in the non-fast case. 4579 if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) { 4580 if (FMF.noNaNs()) 4581 return LegalizationCost * 1; 4582 } 4583 4584 // For cases where some ops can be folded into a load/store, assume free. 4585 if (MTy.isScalarInteger()) { 4586 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) { 4587 if (const Instruction *II = ICA.getInst()) { 4588 if (II->hasOneUse() && isa<StoreInst>(II->user_back())) 4589 return TTI::TCC_Free; 4590 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) { 4591 if (LI->hasOneUse()) 4592 return TTI::TCC_Free; 4593 } 4594 } 4595 } 4596 } 4597 4598 return LegalizationCost * (int)Cost; 4599 }; 4600 4601 // Legalize the type. 4602 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy); 4603 MVT MTy = LT.second; 4604 4605 // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost. 4606 if (((ISD == ISD::CTTZ && !ST->hasBMI()) || 4607 (ISD == ISD::CTLZ && !ST->hasLZCNT())) && 4608 !MTy.isVector() && !ICA.isTypeBasedOnly()) { 4609 const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); 4610 if (auto *Cst = dyn_cast<ConstantInt>(Args[1])) 4611 if (Cst->isAllOnesValue()) 4612 ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF; 4613 } 4614 4615 // FSQRT is a single instruction. 
    if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize)
      return LT.first;

    if (ST->useGLMDivSqrtCosts())
      if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());

    if (ST->useSLMArithCosts())
      if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());

    if (ST->hasVBMI2())
      if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());

    if (ST->hasBITALG())
      if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());

    if (ST->hasVPOPCNTDQ())
      if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());

    if (ST->hasGFNI())
      if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());

    if (ST->hasCDI())
      if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());

    if (ST->hasBWI())
      if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());

    if (ST->hasAVX512())
      if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());

    if (ST->hasXOP())
      if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());

    if (ST->hasAVX2())
      if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());

    if (ST->hasAVX())
      if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());

    if (ST->hasSSE42())
      if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());

    if (ST->hasSSE41())
      if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());

    if (ST->hasSSSE3())
      if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());

    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());

    if (ST->hasSSE1())
      if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());

    if (ST->hasBMI()) {
      if (ST->is64Bit())
        if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
          if (auto KindCost = Entry->Cost[CostKind])
            return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());

      if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
    }

    if (ST->hasLZCNT()) {
      if (ST->is64Bit())
        if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
          if (auto KindCost = Entry->Cost[CostKind])
            return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());

      if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
    }

    if (ST->hasPOPCNT()) {
      if (ST->is64Bit())
        if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
          if (auto KindCost = Entry->Cost[CostKind])
            return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());

      if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
    }

    if (ST->is64Bit())
      if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());

    if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());

    // Without arg data, we need to compute the expanded costs of custom lowered
    // intrinsics to prevent use of the (very low) default costs.
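    // The generic funnel-shift expansion priced below is roughly:
    //   %inv = sub iN W, %amt        ; W = bit width
    //   %hi  = shl iN %a, %amt
    //   %lo  = lshr iN %b, %inv
    //   %res = or iN %hi, %lo
    // guarded by an and/icmp/select for the zero-amount case, i.e. the seven
    // per-op costs summed in the block that follows (a sketch, not the exact
    // DAG the backend emits).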
    if (ICA.isTypeBasedOnly() &&
        (IID == Intrinsic::fshl || IID == Intrinsic::fshr)) {
      Type *CondTy = RetTy->getWithNewBitWidth(1);
      InstructionCost Cost = 0;
      Cost += getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
      Cost += getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
      Cost += getArithmeticInstrCost(BinaryOperator::Shl, RetTy, CostKind);
      Cost += getArithmeticInstrCost(BinaryOperator::LShr, RetTy, CostKind);
      Cost += getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
      Cost += getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
                                 CmpInst::ICMP_EQ, CostKind);
      Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
                                 CmpInst::ICMP_EQ, CostKind);
      return Cost;
    }
  }

  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                               TTI::TargetCostKind CostKind,
                                               unsigned Index, Value *Op0,
                                               Value *Op1) {
  static const CostTblEntry SLMCostTbl[] = {
    { ISD::EXTRACT_VECTOR_ELT, MVT::i8,  4 },
    { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
    { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
    { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
  };

  assert(Val->isVectorTy() && "This must be a vector type");
  Type *ScalarType = Val->getScalarType();
  InstructionCost RegisterFileMoveCost = 0;

  // Non-immediate extraction/insertion can be handled as a sequence of
  // aliased loads+stores via the stack.
  if (Index == -1U && (Opcode == Instruction::ExtractElement ||
                       Opcode == Instruction::InsertElement)) {
    // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
    // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.

    // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
    assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
    Align VecAlign = DL.getPrefTypeAlign(Val);
    Align SclAlign = DL.getPrefTypeAlign(ScalarType);

    // Extract - store vector to stack, load scalar.
    if (Opcode == Instruction::ExtractElement) {
      return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
             getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
                             CostKind);
    }
    // Insert - store vector to stack, store scalar, load vector.
    if (Opcode == Instruction::InsertElement) {
      return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
             getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
                             CostKind) +
             getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
    }
  }

  if (Index != -1U && (Opcode == Instruction::ExtractElement ||
                       Opcode == Instruction::InsertElement)) {
    // Extraction of vXi1 elements is now efficiently handled by MOVMSK.
    if (Opcode == Instruction::ExtractElement &&
        ScalarType->getScalarSizeInBits() == 1 &&
        cast<FixedVectorType>(Val)->getNumElements() > 1)
      return 1;

    // Legalize the type.
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);

    // This type is legalized to a scalar type.
    if (!LT.second.isVector())
      return TTI::TCC_Free;

    // The type may be split. Normalize the index to the new type.
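    // e.g. extracting element 5 of a v8i32 on a bare SSE2 target: the type
    // splits into two v4i32 registers, so NumElts == 4 below and the index
    // becomes 5 % 4 == 1 within the second register.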
    unsigned SizeInBits = LT.second.getSizeInBits();
    unsigned NumElts = LT.second.getVectorNumElements();
    unsigned SubNumElts = NumElts;
    Index = Index % NumElts;

    // For >128-bit vectors, we need to extract higher 128-bit subvectors.
    // For inserts, we also need to insert the subvector back.
    if (SizeInBits > 128) {
      assert((SizeInBits % 128) == 0 && "Illegal vector");
      unsigned NumSubVecs = SizeInBits / 128;
      SubNumElts = NumElts / NumSubVecs;
      if (SubNumElts <= Index) {
        RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
        Index %= SubNumElts;
      }
    }

    MVT MScalarTy = LT.second.getScalarType();
    auto IsCheapPInsrPExtrInsertPS = [&]() {
      // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
      // Inserting f32 into index0 is just movss.
      // Also, assume insertps is relatively cheap on all >= SSE41 targets.
      return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
             (MScalarTy.isInteger() && ST->hasSSE41()) ||
             (MScalarTy == MVT::f32 && ST->hasSSE1() && Index == 0 &&
              Opcode == Instruction::InsertElement) ||
             (MScalarTy == MVT::f32 && ST->hasSSE41() &&
              Opcode == Instruction::InsertElement);
    };

    if (Index == 0) {
      // Floating point scalars are already located in index #0.
      // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
      // this is true for all of them.
      if (ScalarType->isFloatingPointTy() &&
          (Opcode != Instruction::InsertElement || !Op0 ||
           isa<UndefValue>(Op0)))
        return RegisterFileMoveCost;

      if (Opcode == Instruction::InsertElement &&
          isa_and_nonnull<UndefValue>(Op0)) {
        // Consider the gather cost to be cheap.
        if (isa_and_nonnull<LoadInst>(Op1))
          return RegisterFileMoveCost;
        if (!IsCheapPInsrPExtrInsertPS()) {
          // mov constant-to-GPR + movd/movq GPR -> XMM.
          if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
            return 2 + RegisterFileMoveCost;
          // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
          return 1 + RegisterFileMoveCost;
        }
      }

      // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
      if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
        return 1 + RegisterFileMoveCost;
    }

    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Unexpected vector opcode");
    if (ST->useSLMArithCosts())
      if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
        return Entry->Cost + RegisterFileMoveCost;

    // Consider cheap cases.
    if (IsCheapPInsrPExtrInsertPS())
      return 1 + RegisterFileMoveCost;

    // For extractions we just need to shuffle the element to index 0, which
    // should be very cheap (assume cost = 1). For insertions we need to shuffle
    // the elements to their destination. In both cases we must handle the
    // subvector move(s).
    // If the vector type is already less than 128-bits then don't reduce it.
    // TODO: Under what circumstances should we shuffle using the full width?
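    // The total below is thus: one shuffle (a two-source permute for inserts),
    // plus a GPR<->XMM transfer for integer scalars (IntOrFpCost), plus any
    // 128-bit subvector moves already accumulated in RegisterFileMoveCost.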
    InstructionCost ShuffleCost = 1;
    if (Opcode == Instruction::InsertElement) {
      auto *SubTy = cast<VectorType>(Val);
      EVT VT = TLI->getValueType(DL, Val);
      if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
        SubTy = FixedVectorType::get(ScalarType, SubNumElts);
      ShuffleCost =
          getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, {}, CostKind, 0, SubTy);
    }
    int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
    return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
  }

  return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
         RegisterFileMoveCost;
}

InstructionCost X86TTIImpl::getScalarizationOverhead(
    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
    TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
  assert(DemandedElts.getBitWidth() ==
             cast<FixedVectorType>(Ty)->getNumElements() &&
         "Vector size mismatch");

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  MVT MScalarTy = LT.second.getScalarType();
  unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
  InstructionCost Cost = 0;

  constexpr unsigned LaneBitWidth = 128;
  assert((LegalVectorBitWidth < LaneBitWidth ||
          (LegalVectorBitWidth % LaneBitWidth) == 0) &&
         "Illegal vector");

  const int NumLegalVectors = *LT.first.getValue();
  assert(NumLegalVectors >= 0 && "Negative cost!");

  // For insertions, an ISD::BUILD_VECTOR style vector initialization can be
  // much cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
  if (Insert) {
    if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
        (MScalarTy.isInteger() && ST->hasSSE41()) ||
        (MScalarTy == MVT::f32 && ST->hasSSE41())) {
      // For types we can insert directly, insertion into 128-bit sub vectors
      // is cheap, followed by a cheap chain of concatenations.
      if (LegalVectorBitWidth <= LaneBitWidth) {
        Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
                                                /*Extract*/ false, CostKind);
      } else {
        // In each 128-bit lane, if at least one index is demanded but not all
        // indices are demanded, and this lane is not the first 128-bit lane of
        // the legalized vector, then it needs an extracti128; if a 128-bit
        // lane has at least one demanded index, it needs an inserti128.

        // The following cases will help you build a better understanding:
        // assume we insert several elements into a v8i32 vector with AVX2.
        // Case#1: inserting into the 1st index needs vpinsrd + inserti128.
        // Case#2: inserting into the 5th index needs extracti128 + vpinsrd +
        //         inserti128.
        // Case#3: inserting into indices 4,5,6,7 needs 4*vpinsrd + inserti128.
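        // In general (a rough reading of the loops below): one extracti128 per
        // partially-demanded upper lane, one vpinsr* per demanded element, and
        // one inserti128 per affected lane, except lane 0 of a legal vector
        // whose lanes are all affected.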
        assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
        unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
        unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
        unsigned NumLegalElts =
            LT.second.getVectorNumElements() * NumLegalVectors;
        assert(NumLegalElts >= DemandedElts.getBitWidth() &&
               "Vector has been legalized to smaller element count");
        assert((NumLegalElts % NumLanesTotal) == 0 &&
               "Unexpected elts per lane");
        unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;

        APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
        auto *LaneTy =
            FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);

        for (unsigned I = 0; I != NumLanesTotal; ++I) {
          APInt LaneEltMask = WidenedDemandedElts.extractBits(
              NumEltsPerLane, NumEltsPerLane * I);
          if (LaneEltMask.isZero())
            continue;
          // FIXME: we don't need to extract if all non-demanded elements
          //        are legalization-inserted padding.
          if (!LaneEltMask.isAllOnes())
            Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
                                   I * NumEltsPerLane, LaneTy);
          Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
                                                  /*Extract*/ false, CostKind);
        }

        APInt AffectedLanes =
            APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
        APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
            AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
        for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
          for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
            unsigned I = NumLegalLanes * LegalVec + Lane;
            // No need to insert an unaffected lane; nor lane 0 of each legal
            // vector iff ALL lanes of that vector were affected and will be
            // inserted.
            if (!AffectedLanes[I] ||
                (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
              continue;
            Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, {}, CostKind,
                                   I * NumEltsPerLane, LaneTy);
          }
        }
      }
    } else if (LT.second.isVector()) {
      // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
      // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
      // series of UNPCK followed by CONCAT_VECTORS - all of these can be
      // considered cheap.
      if (Ty->isIntOrIntVectorTy())
        Cost += DemandedElts.popcount();

      // Get the smaller of the legalized or original pow2-extended number of
      // vector elements, which represents the number of unpacks we'll end up
      // performing.
      unsigned NumElts = LT.second.getVectorNumElements();
      unsigned Pow2Elts =
          PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
      Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
    }
  }

  if (Extract) {
    // vXi1 can be efficiently extracted with MOVMSK.
    // TODO: AVX512 predicate mask handling.
    // NOTE: This doesn't work well for roundtrip scalarization.
    if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
      unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
      unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
      unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
      return MOVMSKCost;
    }

    if (LT.second.isVector()) {
      unsigned NumLegalElts =
          LT.second.getVectorNumElements() * NumLegalVectors;
      assert(NumLegalElts >= DemandedElts.getBitWidth() &&
             "Vector has been legalized to smaller element count");

      // If we're extracting elements from a 128-bit subvector lane,
      // we only need to extract each lane once, not for every element.
      if (LegalVectorBitWidth > LaneBitWidth) {
        unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
        unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
        assert((NumLegalElts % NumLanesTotal) == 0 &&
               "Unexpected elts per lane");
        unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;

        // Add cost for each demanded 128-bit subvector extraction.
        // Luckily this is a lot easier than for insertion.
        APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
        auto *LaneTy =
            FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);

        for (unsigned I = 0; I != NumLanesTotal; ++I) {
          APInt LaneEltMask = WidenedDemandedElts.extractBits(
              NumEltsPerLane, I * NumEltsPerLane);
          if (LaneEltMask.isZero())
            continue;
          Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
                                 I * NumEltsPerLane, LaneTy);
          Cost += BaseT::getScalarizationOverhead(
              LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
        }

        return Cost;
      }
    }

    // Fallback to default extraction.
    Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
                                            Extract, CostKind);
  }

  return Cost;
}

InstructionCost
X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
                                      int VF, const APInt &DemandedDstElts,
                                      TTI::TargetCostKind CostKind) {
  const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
  // We don't differentiate element types here, only element bit width.
  EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);

  auto bailout = [&]() {
    return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
                                            DemandedDstElts, CostKind);
  };

  // For now, only deal with AVX512 cases.
  if (!ST->hasAVX512())
    return bailout();

  // Do we have a native shuffle for this element type, or should we promote?
  unsigned PromEltTyBits = EltTyBits;
  switch (EltTyBits) {
  case 32:
  case 64:
    break; // AVX512F.
  case 16:
    if (!ST->hasBWI())
      PromEltTyBits = 32; // promote to i32, AVX512F.
    break; // AVX512BW
  case 8:
    if (!ST->hasVBMI())
      PromEltTyBits = 32; // promote to i32, AVX512F.
    break; // AVX512VBMI
  case 1:
    // There is no support for shuffling i1 elements. We *must* promote.
    if (ST->hasBWI()) {
      if (ST->hasVBMI())
        PromEltTyBits = 8; // promote to i8, AVX512VBMI.
      else
        PromEltTyBits = 16; // promote to i16, AVX512BW.
      break;
    }
    PromEltTyBits = 32; // promote to i32, AVX512F.
    break;
  default:
    return bailout();
  }
  auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);

  auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
  auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);

  int NumDstElements = VF * ReplicationFactor;
  auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
  auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);

  // Legalize the types.
  MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
  MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
  MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
  MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
  // They should have legalized into vector types.
  if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
      !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
    return bailout();

  if (PromEltTyBits != EltTyBits) {
    // If we have to perform the shuffle with a wider elt type than our data
    // type, then we will first need to anyext (we don't care about the new
    // bits) the source elements, and then truncate Dst elements.
    InstructionCost PromotionCost;
    PromotionCost += getCastInstrCost(
        Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
        TargetTransformInfo::CastContextHint::None, CostKind);
    PromotionCost +=
        getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
                         /*Src=*/PromDstVecTy,
                         TargetTransformInfo::CastContextHint::None, CostKind);
    return PromotionCost + getReplicationShuffleCost(PromEltTy,
                                                     ReplicationFactor, VF,
                                                     DemandedDstElts, CostKind);
  }

  assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
         LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
         "We expect that the legalization doesn't affect the element width, "
         "doesn't coalesce/split elements.");

  unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
  unsigned NumDstVectors =
      divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);

  auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);

  // Not all the produced Dst elements may be demanded. In our case,
  // given that a single Dst vector is formed by a single shuffle,
  // if all elements that will form a single Dst vector aren't demanded,
  // then we won't need to do that shuffle, so adjust the cost accordingly.
  APInt DemandedDstVectors = APIntOps::ScaleBitMask(
      DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
  unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();

  InstructionCost SingleShuffleCost = getShuffleCost(
      TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/{}, CostKind,
      /*Index=*/0, /*SubTp=*/nullptr);
  return NumDstVectorsDemanded * SingleShuffleCost;
}

InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            MaybeAlign Alignment,
                                            unsigned AddressSpace,
                                            TTI::TargetCostKind CostKind,
                                            TTI::OperandValueInfo OpInfo,
                                            const Instruction *I) {
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput) {
    if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
      // Store instructions with both an index and a scale cost 2 uops.
      // Check the preceding GEP to identify non-const indices.
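      // e.g. a store to getelementptr(%base, %i) with a variable index %i
      // lowers to an indexed addressing mode like mov [rax + 4*rcx], edx;
      // that indexed form is what the 2 * TCC_Basic below accounts for.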
      if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
        if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
          return TTI::TCC_Basic * 2;
      }
    }
    return TTI::TCC_Basic;
  }

  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");
  // Type legalization can't handle structs.
  if (TLI->getValueType(DL, Src, true) == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind, OpInfo, I);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);

  auto *VTy = dyn_cast<FixedVectorType>(Src);

  InstructionCost Cost = 0;

  // Add a cost for constant load to vector.
  if (Opcode == Instruction::Store && OpInfo.isConstant())
    Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
                            /*AddressSpace=*/0, CostKind, OpInfo);

  // Handle the simple case of non-vectors.
  // NOTE: this assumes that legalization never creates vectors from scalars!
  if (!VTy || !LT.second.isVector()) {
    // Each load/store unit costs 1.
    return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
  }

  bool IsLoad = Opcode == Instruction::Load;

  Type *EltTy = VTy->getElementType();

  const int EltTyBits = DL.getTypeSizeInBits(EltTy);

  // Source of truth: how many elements were there in the original IR vector?
  const unsigned SrcNumElt = VTy->getNumElements();

  // How far have we gotten?
  int NumEltRemaining = SrcNumElt;
  // Note that we intentionally capture by reference, since NumEltRemaining
  // changes.
  auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };

  const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);

  // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
  const unsigned XMMBits = 128;
  if (XMMBits % EltTyBits != 0)
    // Vector size must be a multiple of the element size. I.e. no padding.
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind, OpInfo, I);
  const int NumEltPerXMM = XMMBits / EltTyBits;

  auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);

  for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
       NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
    // How many elements would a single op deal with at once?
    if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
      // Vector size must be a multiple of the element size. I.e. no padding.
      return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                    CostKind, OpInfo, I);
    int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;

    assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
    assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
            (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
           "Unless we haven't halved the op size yet, "
           "we have less than two op's sized units of work left.");

    auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
                          ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
                          : XMMVecTy;

    assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
           "After halving sizes, the vector elt count is no longer a multiple "
           "of the number of elements per operation?");
    auto *CoalescedVecTy =
        CurrNumEltPerOp == 1
            ? CurrVecTy
            : FixedVectorType::get(
                  IntegerType::get(Src->getContext(),
                                   EltTyBits * CurrNumEltPerOp),
                  CurrVecTy->getNumElements() / CurrNumEltPerOp);
    assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
               DL.getTypeSizeInBits(CurrVecTy) &&
           "Coalescing elements doesn't change vector width.");

    while (NumEltRemaining > 0) {
      assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");

      // Can we use this vector size, as per the remaining element count?
      // Iff the vector is naturally aligned, we can do a wide load regardless.
      if (NumEltRemaining < CurrNumEltPerOp &&
          (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
          CurrOpSizeBytes != 1)
        break; // Try the smaller vector size.

      // This isn't exactly right. We're using slow unaligned 32-byte accesses
      // as a proxy for a double-pumped AVX memory interface such as on
      // Sandybridge.
      // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
      // will be scalarized.
      if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
        Cost += 2;
      else if (CurrOpSizeBytes < 4)
        Cost += 2;
      else
        Cost += 1;

      // If we're loading a uniform value, then we don't need to split the
      // load; a single (widest) vector load can be reused by all the splits.
      if (IsLoad && OpInfo.isUniform())
        return Cost;

      bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;

      // If we have fully processed the previous reg, we need to replenish it.
      if (SubVecEltsLeft == 0) {
        SubVecEltsLeft += CurrVecTy->getNumElements();
        // And that's free only for the 0'th subvector of a legalized vector.
        if (!Is0thSubVec)
          Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
                                        : TTI::ShuffleKind::SK_ExtractSubvector,
                                 VTy, {}, CostKind, NumEltDone(), CurrVecTy);
      }

      // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
      // for smaller widths (32/16/8) we have to insert/extract them separately.
      // Again, it's free for the 0'th subreg (if the op is 32/64 bit wide,
      // but let's pretend that it is also true for 16/8 bit wide ops...)
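      // e.g. loading an upper i32 quarter of an XMM value costs a pinsrd
      // (or a scalar load plus a shuffle); that is what the scalarization
      // overhead charged below is modelling.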
      if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
        int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
        assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
        int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
        APInt DemandedElts =
            APInt::getBitsSet(CoalescedVecTy->getNumElements(),
                              CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
        assert(DemandedElts.popcount() == 1 && "Inserting single value");
        Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
                                         !IsLoad, CostKind);
      }

      SubVecEltsLeft -= CurrNumEltPerOp;
      NumEltRemaining -= CurrNumEltPerOp;
      Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
    }
  }

  assert(NumEltRemaining <= 0 && "Should have processed all the elements.");

  return Cost;
}

InstructionCost
X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
                                  unsigned AddressSpace,
                                  TTI::TargetCostKind CostKind) {
  bool IsLoad = (Instruction::Load == Opcode);
  bool IsStore = (Instruction::Store == Opcode);

  auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
  if (!SrcVTy)
    // For scalars, take the regular cost without a mask.
    return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);

  unsigned NumElem = SrcVTy->getNumElements();
  auto *MaskTy =
      FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
  if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
      (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
    // Scalarization.
    APInt DemandedElts = APInt::getAllOnes(NumElem);
    InstructionCost MaskSplitCost = getScalarizationOverhead(
        MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
    InstructionCost ScalarCompareCost = getCmpSelInstrCost(
        Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
        CmpInst::BAD_ICMP_PREDICATE, CostKind);
    InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
    InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
    InstructionCost ValueSplitCost = getScalarizationOverhead(
        SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
    InstructionCost MemopCost =
        NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
                                         Alignment, AddressSpace, CostKind);
    return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
  }

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
  auto VT = TLI->getValueType(DL, SrcVTy);
  InstructionCost Cost = 0;
  MVT Ty = LT.second;
  if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64)
    // APX masked load/store for scalar is cheap.
    return Cost + LT.first;

  if (VT.isSimple() && Ty != VT.getSimpleVT() &&
      LT.second.getVectorNumElements() == NumElem)
    // Promotion requires extend/truncate for data and a shuffle for mask.
    Cost +=
        getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, {}, CostKind, 0,
                       nullptr) +
        getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, {}, CostKind, 0, nullptr);

  else if (LT.first * Ty.getVectorNumElements() > NumElem) {
    auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
                                           Ty.getVectorNumElements());
    // Expanding requires filling the mask with zeroes.
    Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, {}, CostKind, 0,
                           MaskTy);
  }

  // Pre-AVX512 - each maskmov load costs 2, each store costs ~8.
  if (!ST->hasAVX512())
    return Cost + LT.first * (IsLoad ? 2 : 8);

  // AVX-512 masked load/store is cheaper.
  return Cost + LT.first;
}

InstructionCost
X86TTIImpl::getPointersChainCost(ArrayRef<const Value *> Ptrs,
                                 const Value *Base,
                                 const TTI::PointersChainInfo &Info,
                                 Type *AccessTy, TTI::TargetCostKind CostKind) {
  if (Info.isSameBase() && Info.isKnownStride()) {
    // If all the pointers have a known stride, all the differences are
    // translated into constants. X86 memory addressing allows encoding them
    // into the displacement, so we just need to take the base GEP cost.
    if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
      SmallVector<const Value *> Indices(BaseGEP->indices());
      return getGEPCost(BaseGEP->getSourceElementType(),
                        BaseGEP->getPointerOperand(), Indices, nullptr,
                        CostKind);
    }
    return TTI::TCC_Free;
  }
  return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
}

InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
                                                      ScalarEvolution *SE,
                                                      const SCEV *Ptr) {
  // Address computations in vectorized code with non-consecutive addresses
  // will likely result in more instructions compared to scalar code where
  // the computation can more often be merged into the index mode. The
  // resulting extra micro-ops can significantly decrease throughput.
  const unsigned NumVectorInstToHideOverhead = 10;

  // Cost modeling of strided access computation is hidden by the indexing
  // modes of X86 regardless of the stride value. We don't believe that there
  // is a difference between constant strided access in general and a constant
  // stride value which is less than or equal to 64.
  // Even in the case of a (loop-invariant) stride whose value is not known at
  // compile time, the address computation will not incur more than one extra
  // ADD instruction.
  if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
    // TODO: AVX2 is the current cut-off because we don't have correct
    //       interleaving costs for prior ISAs.
    if (!BaseT::isStridedAccess(Ptr))
      return NumVectorInstToHideOverhead;
    if (!BaseT::getConstantStrideStep(SE, Ptr))
      return 1;
  }

  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
}

InstructionCost
X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
                                       std::optional<FastMathFlags> FMF,
                                       TTI::TargetCostKind CostKind) {
  if (TTI::requiresOrderedReduction(FMF))
    return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);

  // We use the Intel Architecture Code Analyzer (IACA) to measure the
  // throughput and use that as the cost.
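  // A non-ordered reduction is costed as a log2(N)-deep shuffle+op tree:
  // e.g. a v4f32 fadd reduction is roughly two shuffles and two addps plus a
  // final extract, which is what the tables and the generic loop further
  // below approximate.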

  static const CostTblEntry SLMCostTbl[] = {
    { ISD::FADD, MVT::v2f64, 3 },
    { ISD::ADD,  MVT::v2i64, 5 },
  };

  static const CostTblEntry SSE2CostTbl[] = {
    { ISD::FADD, MVT::v2f64, 2 },
    { ISD::FADD, MVT::v2f32, 2 },
    { ISD::FADD, MVT::v4f32, 4 },
    { ISD::ADD,  MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
    { ISD::ADD,  MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
    { ISD::ADD,  MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
    { ISD::ADD,  MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
    { ISD::ADD,  MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
    { ISD::ADD,  MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
    { ISD::ADD,  MVT::v2i8,  2 },
    { ISD::ADD,  MVT::v4i8,  2 },
    { ISD::ADD,  MVT::v8i8,  2 },
    { ISD::ADD,  MVT::v16i8, 3 },
  };

  static const CostTblEntry AVX1CostTbl[] = {
    { ISD::FADD, MVT::v4f64,  3 },
    { ISD::FADD, MVT::v4f32,  3 },
    { ISD::FADD, MVT::v8f32,  4 },
    { ISD::ADD,  MVT::v2i64,  1 }, // The data reported by the IACA tool is "1.5".
    { ISD::ADD,  MVT::v4i64,  3 },
    { ISD::ADD,  MVT::v8i32,  5 },
    { ISD::ADD,  MVT::v16i16, 5 },
    { ISD::ADD,  MVT::v32i8,  4 },
  };

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // Before legalizing the type, give a chance to look up illegal narrow types
  // in the table.
  // FIXME: Is there a better way to do this?
  EVT VT = TLI->getValueType(DL, ValTy);
  if (VT.isSimple()) {
    MVT MTy = VT.getSimpleVT();
    if (ST->useSLMArithCosts())
      if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
        return Entry->Cost;

    if (ST->hasAVX())
      if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
        return Entry->Cost;

    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
        return Entry->Cost;
  }

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

  MVT MTy = LT.second;

  auto *ValVTy = cast<FixedVectorType>(ValTy);

  // Special case: vXi8 mul reductions are performed as vXi16.
  if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
    auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
    auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
    return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
  }

  InstructionCost ArithmeticCost = 0;
  if (LT.first != 1 && MTy.isVector() &&
      MTy.getVectorNumElements() < ValVTy->getNumElements()) {
    // Type needs to be split. We need LT.first - 1 arithmetic ops.
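    // e.g. a v16i32 add reduction on bare SSE2 splits into four v4i32
    // registers, which are first combined with three paddd ops (LT.first - 1)
    // before the single-register reduction is costed below.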
    auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
                                            MTy.getVectorNumElements());
    ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
    ArithmeticCost *= LT.first - 1;
  }

  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
      return ArithmeticCost + Entry->Cost;

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
      return ArithmeticCost + Entry->Cost;

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
      return ArithmeticCost + Entry->Cost;

  // FIXME: These assume a naive kshift+binop lowering, which is probably
  //        conservative in most cases.
  static const CostTblEntry AVX512BoolReduction[] = {
    { ISD::AND, MVT::v2i1,   3 },
    { ISD::AND, MVT::v4i1,   5 },
    { ISD::AND, MVT::v8i1,   7 },
    { ISD::AND, MVT::v16i1,  9 },
    { ISD::AND, MVT::v32i1, 11 },
    { ISD::AND, MVT::v64i1, 13 },
    { ISD::OR,  MVT::v2i1,   3 },
    { ISD::OR,  MVT::v4i1,   5 },
    { ISD::OR,  MVT::v8i1,   7 },
    { ISD::OR,  MVT::v16i1,  9 },
    { ISD::OR,  MVT::v32i1, 11 },
    { ISD::OR,  MVT::v64i1, 13 },
  };

  static const CostTblEntry AVX2BoolReduction[] = {
    { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
    { ISD::AND, MVT::v32i8,  2 }, // vpmovmskb + cmp
    { ISD::OR,  MVT::v16i16, 2 }, // vpmovmskb + cmp
    { ISD::OR,  MVT::v32i8,  2 }, // vpmovmskb + cmp
  };

  static const CostTblEntry AVX1BoolReduction[] = {
    { ISD::AND, MVT::v4i64,  2 }, // vmovmskpd + cmp
    { ISD::AND, MVT::v8i32,  2 }, // vmovmskps + cmp
    { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
    { ISD::AND, MVT::v32i8,  4 }, // vextractf128 + vpand + vpmovmskb + cmp
    { ISD::OR,  MVT::v4i64,  2 }, // vmovmskpd + cmp
    { ISD::OR,  MVT::v8i32,  2 }, // vmovmskps + cmp
    { ISD::OR,  MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
    { ISD::OR,  MVT::v32i8,  4 }, // vextractf128 + vpor + vpmovmskb + cmp
  };

  static const CostTblEntry SSE2BoolReduction[] = {
    { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
    { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
    { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
    { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
    { ISD::OR,  MVT::v2i64, 2 }, // movmskpd + cmp
    { ISD::OR,  MVT::v4i32, 2 }, // movmskps + cmp
    { ISD::OR,  MVT::v8i16, 2 }, // pmovmskb + cmp
    { ISD::OR,  MVT::v16i8, 2 }, // pmovmskb + cmp
  };

  // Handle bool allof/anyof patterns.
  if (ValVTy->getElementType()->isIntegerTy(1)) {
    InstructionCost ArithmeticCost = 0;
    if (LT.first != 1 && MTy.isVector() &&
        MTy.getVectorNumElements() < ValVTy->getNumElements()) {
      // Type needs to be split. We need LT.first - 1 arithmetic ops.
      auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
                                              MTy.getVectorNumElements());
      ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
      ArithmeticCost *= LT.first - 1;
    }

    if (ST->hasAVX512())
      if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
        return ArithmeticCost + Entry->Cost;
    if (ST->hasAVX2())
      if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
        return ArithmeticCost + Entry->Cost;
    if (ST->hasAVX())
      if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
        return ArithmeticCost + Entry->Cost;
    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
        return ArithmeticCost + Entry->Cost;

    return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
  }

  unsigned NumVecElts = ValVTy->getNumElements();
  unsigned ScalarSize = ValVTy->getScalarSizeInBits();

  // Special case power of 2 reductions where the scalar type isn't changed
  // by type legalization.
  if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
    return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);

  InstructionCost ReductionCost = 0;

  auto *Ty = ValVTy;
  if (LT.first != 1 && MTy.isVector() &&
      MTy.getVectorNumElements() < ValVTy->getNumElements()) {
    // Type needs to be split. We need LT.first - 1 arithmetic ops.
    Ty = FixedVectorType::get(ValVTy->getElementType(),
                              MTy.getVectorNumElements());
    ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
    ReductionCost *= LT.first - 1;
    NumVecElts = MTy.getVectorNumElements();
  }

  // Now handle reduction with the legal type, taking into account size changes
  // at each level.
  while (NumVecElts > 1) {
    // Determine the size of the remaining vector we need to reduce.
    unsigned Size = NumVecElts * ScalarSize;
    NumVecElts /= 2;
    // If we're reducing from 256/512 bits, use an extract_subvector.
    if (Size > 128) {
      auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
      ReductionCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {},
                                      CostKind, NumVecElts, SubTy);
      Ty = SubTy;
    } else if (Size == 128) {
      // Reducing from 128 bits is a permute of v2f64/v2i64.
      FixedVectorType *ShufTy;
      if (ValVTy->isFloatingPointTy())
        ShufTy =
            FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
      else
        ShufTy =
            FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
      ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
                                      CostKind, 0, nullptr);
    } else if (Size == 64) {
      // Reducing from 64 bits is a shuffle of v4f32/v4i32.
      FixedVectorType *ShufTy;
      if (ValVTy->isFloatingPointTy())
        ShufTy =
            FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
      else
        ShufTy =
            FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
      ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
                                      CostKind, 0, nullptr);
    } else {
      // Reducing from smaller size is a shift by immediate.
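      // e.g. for a v8i16 reduction the last step has two i16s left within 32
      // bits; a psrld-style shift by 16 lines them up for the final add.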
      auto *ShiftTy = FixedVectorType::get(
          Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
      ReductionCost += getArithmeticInstrCost(
          Instruction::LShr, ShiftTy, CostKind,
          {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
          {TargetTransformInfo::OK_UniformConstantValue,
           TargetTransformInfo::OP_None});
    }

    // Add the arithmetic op for this level.
    ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
  }

  // Add the final extract element to the cost.
  return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
                                            CostKind, 0, nullptr, nullptr);
}

InstructionCost X86TTIImpl::getMinMaxCost(Intrinsic::ID IID, Type *Ty,
                                          TTI::TargetCostKind CostKind,
                                          FastMathFlags FMF) {
  IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
  return getIntrinsicInstrCost(ICA, CostKind);
}

InstructionCost
X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
                                   FastMathFlags FMF,
                                   TTI::TargetCostKind CostKind) {
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

  MVT MTy = LT.second;

  int ISD;
  if (ValTy->isIntOrIntVectorTy()) {
    ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
                                                             : ISD::SMIN;
  } else {
    assert(ValTy->isFPOrFPVectorTy() &&
           "Expected floating point or integer vector type.");
    ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
              ? ISD::FMINNUM
              : ISD::FMINIMUM;
  }

  // We use the Intel Architecture Code Analyzer (IACA) to measure the
  // throughput and use that as the cost.

  static const CostTblEntry SSE2CostTbl[] = {
    {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
    {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
    {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
  };

  static const CostTblEntry SSE41CostTbl[] = {
    {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
    {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
    {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
    {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
    {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
    {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
    {ISD::SMIN, MVT::v2i8,  3}, // pminsb
    {ISD::SMIN, MVT::v4i8,  5}, // pminsb
    {ISD::SMIN, MVT::v8i8,  7}, // pminsb
    {ISD::SMIN, MVT::v16i8, 6},
    {ISD::UMIN, MVT::v2i8,  3}, // same as sse2
    {ISD::UMIN, MVT::v4i8,  5}, // same as sse2
    {ISD::UMIN, MVT::v8i8,  7}, // same as sse2
    {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
  };

  static const CostTblEntry AVX1CostTbl[] = {
    {ISD::SMIN, MVT::v16i16, 6},
    {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
    {ISD::SMIN, MVT::v32i8,  8},
    {ISD::UMIN, MVT::v32i8,  8},
  };

  static const CostTblEntry AVX512BWCostTbl[] = {
    {ISD::SMIN, MVT::v32i16, 8},
    {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
    {ISD::SMIN, MVT::v64i8,  10},
    {ISD::UMIN, MVT::v64i8,  10},
  };

  // Before legalizing the type, give a chance to look up illegal narrow types
  // in the table.
  // FIXME: Is there a better way to do this?
  EVT VT = TLI->getValueType(DL, ValTy);
  if (VT.isSimple()) {
    MVT MTy = VT.getSimpleVT();
    if (ST->hasBWI())
      if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
        return Entry->Cost;

    if (ST->hasAVX())
      if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
        return Entry->Cost;

    if (ST->hasSSE41())
      if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
        return Entry->Cost;

    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
        return Entry->Cost;
  }

  auto *ValVTy = cast<FixedVectorType>(ValTy);
  unsigned NumVecElts = ValVTy->getNumElements();

  auto *Ty = ValVTy;
  InstructionCost MinMaxCost = 0;
  if (LT.first != 1 && MTy.isVector() &&
      MTy.getVectorNumElements() < ValVTy->getNumElements()) {
    // Type needs to be split. We need LT.first - 1 min/max operations.
    Ty = FixedVectorType::get(ValVTy->getElementType(),
                              MTy.getVectorNumElements());
    MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
    MinMaxCost *= LT.first - 1;
    NumVecElts = MTy.getVectorNumElements();
  }

  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
      return MinMaxCost + Entry->Cost;

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
      return MinMaxCost + Entry->Cost;

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
      return MinMaxCost + Entry->Cost;

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
      return MinMaxCost + Entry->Cost;

  unsigned ScalarSize = ValTy->getScalarSizeInBits();

  // Special case power of 2 reductions where the scalar type isn't changed
  // by type legalization.
  if (!isPowerOf2_32(ValVTy->getNumElements()) ||
      ScalarSize != MTy.getScalarSizeInBits())
    return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);

  // Now handle reduction with the legal type, taking into account size changes
  // at each level.
  while (NumVecElts > 1) {
    // Determine the size of the remaining vector we need to reduce.
    unsigned Size = NumVecElts * ScalarSize;
    NumVecElts /= 2;
    // If we're reducing from 256/512 bits, use an extract_subvector.
    if (Size > 128) {
      auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
      MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
                                   NumVecElts, SubTy);
      Ty = SubTy;
    } else if (Size == 128) {
      // Reducing from 128 bits is a permute of v2f64/v2i64.
      VectorType *ShufTy;
      if (ValTy->isFloatingPointTy())
        ShufTy =
            FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
      else
        ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
      MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
                                   CostKind, 0, nullptr);
    } else if (Size == 64) {
      // Reducing from 64 bits is a shuffle of v4f32/v4i32.
      FixedVectorType *ShufTy;
      if (ValTy->isFloatingPointTy())
        ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
      else
        ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
      MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
                                   CostKind, 0, nullptr);
    } else {
      // Reducing from smaller size is a shift by immediate.
      auto *ShiftTy = FixedVectorType::get(
          Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
      MinMaxCost += getArithmeticInstrCost(
          Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
          {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
          {TargetTransformInfo::OK_UniformConstantValue,
           TargetTransformInfo::OP_None});
    }

    // Add the min/max op for this level.
    MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
  }

  // Add the final extract element to the cost.
  return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
                                         CostKind, 0, nullptr, nullptr);
}

/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) {
  if (Val == 0)
    return TTI::TCC_Free;

  if (isInt<32>(Val))
    return TTI::TCC_Basic;

  return 2 * TTI::TCC_Basic;
}

InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                          TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Never hoist constants larger than 128 bits, because this might lead to
  // incorrect code generation or assertions in codegen.
  // FIXME: Create a cost model for types larger than i128 once the codegen
  //        issues have been fixed.
  if (BitSize > 128)
    return TTI::TCC_Free;

  if (Imm == 0)
    return TTI::TCC_Free;

  // Sign-extend all constants to a multiple of 64 bits.
  APInt ImmVal = Imm;
  if (BitSize % 64 != 0)
    ImmVal = Imm.sext(alignTo(BitSize, 64));

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
  InstructionCost Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max<InstructionCost>(1, Cost);
}

InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                              const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind,
                                              Instruction *Inst) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  unsigned ImmBitWidth = Imm.getBitWidth();

  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
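    // Returning a non-free cost here (2 * TCC_Basic) is what nudges
    // ConstantHoisting towards keeping one materialized base address that the
    // constant-folded offsets can then address relative to.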
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::ICmp:
    // This is an imperfect hack to prevent constant hoisting of
    // compares that might be trying to check if a 64-bit value fits in
    // 32-bits. The backend can optimize these cases using a right shift by 32.
    // Ideally we would check the compare predicate here. There are also other
    // similar immediates the backend can use shifts for.
    if (Idx == 1 && ImmBitWidth == 64) {
      uint64_t ImmVal = Imm.getZExtValue();
      if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
        return TTI::TCC_Free;
    }
    ImmIdx = 1;
    break;
  case Instruction::And:
    // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
    // by using a 32-bit operation with implicit zero extension. Detect such
    // immediates here as the normal path expects bit 31 to be sign extended.
    if (Idx == 1 && ImmBitWidth == 64 && Imm.isIntN(32))
      return TTI::TCC_Free;
    // If we have BMI then we can use BEXTR/BZHI to mask out upper i64 bits.
    if (Idx == 1 && ImmBitWidth == 64 && ST->is64Bit() && ST->hasBMI() &&
        Imm.isMask())
      return X86TTIImpl::getIntImmCost(ST->hasBMI2() ? 255 : 65535);
    ImmIdx = 1;
    break;
  case Instruction::Add:
  case Instruction::Sub:
    // For add/sub, we can use the opposite instruction for INT32_MIN.
    if (Idx == 1 && ImmBitWidth == 64 && Imm.getZExtValue() == 0x80000000)
      return TTI::TCC_Free;
    ImmIdx = 1;
    break;
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    // Division by constant is typically expanded later into a different
    // instruction sequence. This completely changes the constants.
    // Report them as "free" to stop ConstantHoist from marking them as opaque.
    return TTI::TCC_Free;
  case Instruction::Mul:
  case Instruction::Or:
  case Instruction::Xor:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    uint64_t NumConstants = divideCeil(BitSize, 64);
    InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }

  return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                                const APInt &Imm, Type *Ty,
                                                TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
      return TTI::TCC_Free;
    break;
  }
  return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return Opcode == Instruction::PHI ? TTI::TCC_Free : TTI::TCC_Basic;
  // Branches are assumed to be predicted.
  return TTI::TCC_Free;
}

int X86TTIImpl::getGatherOverhead() const {
  // Some CPUs have more overhead for gather. The specified overhead is
  // relative to the Load operation. "2" is the number provided by Intel
  // architects. This parameter is used for cost estimation of Gather Ops and
  // comparison with other alternatives.
  // TODO: Remove the explicit hasAVX512()? That would mean we would only
  // enable gather with a -march.
  if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
    return 2;

  return 1024;
}

int X86TTIImpl::getScatterOverhead() const {
  if (ST->hasAVX512())
    return 2;

  return 1024;
}

// Return an average cost of a Gather / Scatter instruction; may be improved
// later.
InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
                                            TTI::TargetCostKind CostKind,
                                            Type *SrcVTy, const Value *Ptr,
                                            Align Alignment,
                                            unsigned AddressSpace) {

  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
  unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();

  // Try to reduce the index size from 64 bits (the default for GEP) to 32.
  // This is essential for VF 16. If the index can't be reduced to 32 bits,
  // the operation will use 16 x 64 indices, which do not fit in a zmm
  // register and need to be split. Also check that the base pointer is the
  // same for all lanes, and that there's at most one variable index.
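  // For example (illustrative): a v16f32 gather with 64-bit indices needs
  // 16 x i64 index lanes, which do not fit in a single zmm register, so the
  // operation must be split; if the indices are known to be sign-extended
  // from i32, one 16 x i32 index vector (a single zmm) suffices.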
  auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
    unsigned IndexSize = DL.getPointerSizeInBits();
    const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
    if (IndexSize < 64 || !GEP)
      return IndexSize;

    unsigned NumOfVarIndices = 0;
    const Value *Ptrs = GEP->getPointerOperand();
    if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
      return IndexSize;
    for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
      if (isa<Constant>(GEP->getOperand(I)))
        continue;
      Type *IndxTy = GEP->getOperand(I)->getType();
      if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
        IndxTy = IndexVTy->getElementType();
      if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
           !isa<SExtInst>(GEP->getOperand(I))) ||
          ++NumOfVarIndices > 1)
        return IndexSize; // 64
    }
    return (unsigned)32;
  };

  // Try to reduce the index size to 32 bits for vectors of 16 or more
  // elements. By default the index size equals the pointer size.
  unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
                           ? getIndexSizeInBits(Ptr, DL)
                           : DL.getPointerSizeInBits();

  auto *IndexVTy = FixedVectorType::get(
      IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
  std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
  std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
  InstructionCost::CostType SplitFactor =
      *std::max(IdxsLT.first, SrcLT.first).getValue();
  if (SplitFactor > 1) {
    // Handle splitting of vectors of pointers.
    auto *SplitSrcTy =
        FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
    return SplitFactor * getGSVectorCost(Opcode, CostKind, SplitSrcTy, Ptr,
                                         Alignment, AddressSpace);
  }

  // If we didn't split, this will be a single gather/scatter instruction.
  if (CostKind == TTI::TCK_CodeSize)
    return 1;

  // The gather / scatter cost is given by Intel architects. It is a rough
  // number since we are looking at one instruction at a time.
  const int GSOverhead = (Opcode == Instruction::Load) ? getGatherOverhead()
                                                       : getScatterOverhead();
  return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
                                           MaybeAlign(Alignment), AddressSpace,
                                           CostKind);
}

/// Calculate the cost of a Gather / Scatter operation.
InstructionCost X86TTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind,
    const Instruction *I = nullptr) {
  if ((Opcode == Instruction::Load &&
       (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
        forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
                                   Align(Alignment)))) ||
      (Opcode == Instruction::Store &&
       (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
        forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
                                    Align(Alignment)))))
    return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
  if (!PtrTy && Ptr->getType()->isVectorTy())
    PtrTy = dyn_cast<PointerType>(
        cast<VectorType>(Ptr->getType())->getElementType());
  assert(PtrTy && "Unexpected type for Ptr argument");
  unsigned AddressSpace = PtrTy->getAddressSpace();
  return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
                         AddressSpace);
}

bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                               const TargetTransformInfo::LSRCost &C2) {
  // X86-specific here is "instruction number 1st priority".
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
                  C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
                  C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}

bool X86TTIImpl::canMacroFuseCmp() {
  return ST->hasMacroFusion() || ST->hasBranchFusion();
}

bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
  Type *ScalarTy = DataTy->getScalarType();

  // The backend can't handle a single element vector w/o CFCMOV.
  if (isa<VectorType>(DataTy) &&
      cast<FixedVectorType>(DataTy)->getNumElements() == 1)
    return ST->hasCF() && hasConditionalLoadStoreForType(ScalarTy);

  if (!ST->hasAVX())
    return false;

  if (ScalarTy->isPointerTy())
    return true;

  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
    return true;

  if (ScalarTy->isHalfTy() && ST->hasBWI())
    return true;

  if (ScalarTy->isBFloatTy() && ST->hasBF16())
    return true;

  if (!ScalarTy->isIntegerTy())
    return false;

  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
  return IntWidth == 32 || IntWidth == 64 ||
         ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
}

bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
  return isLegalMaskedLoad(DataType, Alignment);
}

bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
  unsigned DataSize = DL.getTypeStoreSize(DataType);
  // The only supported nontemporal loads are for aligned vectors of 16 or 32
  // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
  // (the equivalent stores only require AVX).
  if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
    return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();

  return false;
}

bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
  unsigned DataSize = DL.getTypeStoreSize(DataType);

  // SSE4A supports nontemporal stores of float and double at arbitrary
  // alignment.
  if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
    return true;

  // Besides the SSE4A subtarget exception above, only aligned stores are
  // available nontemporally on any other subtarget. And only stores with a
  // size of 4..32 bytes (powers of 2 only) are permitted.
  if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
      !isPowerOf2_32(DataSize))
    return false;

  // 32-byte vector nontemporal stores are supported by AVX (the equivalent
  // loads require AVX2).
  if (DataSize == 32)
    return ST->hasAVX();
  if (DataSize == 16)
    return ST->hasSSE1();
  return true;
}

bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
                                      ElementCount NumElements) const {
  // movddup
  return ST->hasSSE3() && !NumElements.isScalable() &&
         NumElements.getFixedValue() == 2 &&
         ElementTy == Type::getDoubleTy(ElementTy->getContext());
}

bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) {
  if (!isa<VectorType>(DataTy))
    return false;

  if (!ST->hasAVX512())
    return false;

  // The backend can't handle a single element vector.
  if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
    return false;

  Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();

  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
    return true;

  if (!ScalarTy->isIntegerTy())
    return false;

  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
  return IntWidth == 32 || IntWidth == 64 ||
         ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
}

bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
  return isLegalMaskedExpandLoad(DataTy, Alignment);
}

bool X86TTIImpl::supportsGather() const {
  // Some CPUs have better gather performance than others.
  // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
  // enable gather with a -march.
  return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
}

bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
  // Gather / Scatter for vector 2 is not profitable on KNL / SKX.
  // A vector-4 gather/scatter instruction does not exist on KNL. We could
  // extend it to 8 elements, but zeroing the upper bits of the mask vector
  // will add more instructions. Right now we give the scalar cost of vector-4
  // for KNL.
  // TODO: Check whether the gather/scatter instruction is better in the
  // VariableMask case.
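  // E.g. (illustrative): a v2f64 gather on any AVX512 target, or a v4f32
  // gather on KNL (AVX512 without VLX), is forced to scalarize by the check
  // below.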
  unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
  return NumElts == 1 ||
         (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
}

bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy, Align Alignment) {
  Type *ScalarTy = DataTy->getScalarType();
  if (ScalarTy->isPointerTy())
    return true;

  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
    return true;

  if (!ScalarTy->isIntegerTy())
    return false;

  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
  return IntWidth == 32 || IntWidth == 64;
}

bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
  if (!supportsGather() || !ST->preferGather())
    return false;
  return isLegalMaskedGatherScatter(DataTy, Alignment);
}

bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
                                 unsigned Opcode1,
                                 const SmallBitVector &OpcodeMask) const {
  // ADDSUBPS  4xf32 SSE3
  // VADDSUBPS 4xf32 AVX
  // VADDSUBPS 8xf32 AVX2
  // ADDSUBPD  2xf64 SSE3
  // VADDSUBPD 2xf64 AVX
  // VADDSUBPD 4xf64 AVX2

  unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
  assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
  if (!isPowerOf2_32(NumElements))
    return false;
  // Check the opcode pattern. We apply the mask on the opcode arguments and
  // then check if it is what we expect.
  for (int Lane : seq<int>(0, NumElements)) {
    unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
    // We expect FSub for even lanes and FAdd for odd lanes.
    if (Lane % 2 == 0 && Opc != Instruction::FSub)
      return false;
    if (Lane % 2 == 1 && Opc != Instruction::FAdd)
      return false;
  }
  // Now check that the pattern is supported by the target ISA.
  Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
  if (ElemTy->isFloatTy())
    return ST->hasSSE3() && NumElements % 4 == 0;
  if (ElemTy->isDoubleTy())
    return ST->hasSSE3() && NumElements % 2 == 0;
  return false;
}

bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
  // AVX2 doesn't support scatter.
  if (!ST->hasAVX512() || !ST->preferScatter())
    return false;
  return isLegalMaskedGatherScatter(DataType, Alignment);
}

bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
  EVT VT = TLI->getValueType(DL, DataType);
  return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
}

bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction *I) {
  // FDIV is always expensive, even if it has a very low uop count.
  // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
  if (I->getOpcode() == Instruction::FDiv)
    return true;

  return BaseT::isExpensiveToSpeculativelyExecute(I);
}

bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
  return false;
}

bool X86TTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();

  // Work this as a subsetting of subtarget features.
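  // Sketch of the rule implemented below (illustrative): an SSE2 callee can
  // be inlined into an AVX2 caller because the caller's features are a
  // superset, but if the callee forwards vector or aggregate values to
  // further calls, the precise ABI check further down may still reject the
  // pair.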
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // Check whether the features are the same (apart from the ignore list).
  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
  if (RealCallerBits == RealCalleeBits)
    return true;

  // If the features are a subset, we need to additionally check for calls
  // that may become ABI-incompatible as a result of inlining.
  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
    return false;

  for (const Instruction &I : instructions(Callee)) {
    if (const auto *CB = dyn_cast<CallBase>(&I)) {
      // Having more target features is fine for inline ASM.
      if (CB->isInlineAsm())
        continue;

      SmallVector<Type *, 8> Types;
      for (Value *Arg : CB->args())
        Types.push_back(Arg->getType());
      if (!CB->getType()->isVoidTy())
        Types.push_back(CB->getType());

      // Simple types are always ABI compatible.
      auto IsSimpleTy = [](Type *Ty) {
        return !Ty->isVectorTy() && !Ty->isAggregateType();
      };
      if (all_of(Types, IsSimpleTy))
        continue;

      if (Function *NestedCallee = CB->getCalledFunction()) {
        // Assume that intrinsics are always ABI compatible.
        if (NestedCallee->isIntrinsic())
          continue;

        // Do a precise compatibility check.
        if (!areTypesABICompatible(Caller, NestedCallee, Types))
          return false;
      } else {
        // We don't know the target features of the callee;
        // assume it is incompatible.
        return false;
      }
    }
  }
  return true;
}

bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
                                       const Function *Callee,
                                       const ArrayRef<Type *> &Types) const {
  if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
    return false;

  // If we get here, we know the target features match. If one function
  // considers 512-bit vectors legal and the other does not, consider them
  // incompatible.
  const TargetMachine &TM = getTLI()->getTargetMachine();

  if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
      TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
    return true;

  // Consider the arguments compatible if they aren't vectors or aggregates.
  // FIXME: Look at the size of vectors.
  // FIXME: Look at the element types of aggregates to see if there are
  // vectors.
  return llvm::none_of(Types, [](Type *T) {
    return T->isVectorTy() || T->isAggregateType();
  });
}

X86TTIImpl::TTI::MemCmpExpansionOptions
X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  TTI::MemCmpExpansionOptions Options;
  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
  Options.NumLoadsPerBlock = 2;
  // All GPR and vector loads can be unaligned.
  Options.AllowOverlappingLoads = true;
  if (IsZeroCmp) {
    // Only enable vector loads for equality comparison. Right now the vector
    // version is not as fast for three-way compare (see #33329).
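    // E.g. (illustrative): on an AVX-512 target with a preferred vector
    // width of 512, an equality memcmp may use load sizes
    // {64, 32, 16, 8, 4, 2, 1}, while a three-way memcmp keeps only the GPR
    // sizes appended below.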
    const unsigned PreferredWidth = ST->getPreferVectorWidth();
    if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512())
      Options.LoadSizes.push_back(64);
    if (PreferredWidth >= 256 && ST->hasAVX())
      Options.LoadSizes.push_back(32);
    if (PreferredWidth >= 128 && ST->hasSSE2())
      Options.LoadSizes.push_back(16);
  }
  if (ST->is64Bit()) {
    Options.LoadSizes.push_back(8);
  }
  Options.LoadSizes.push_back(4);
  Options.LoadSizes.push_back(2);
  Options.LoadSizes.push_back(1);
  return Options;
}

bool X86TTIImpl::prefersVectorizedAddressing() const {
  return supportsGather();
}

bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
  return false;
}

bool X86TTIImpl::enableInterleavedAccessVectorization() {
  // TODO: We expect this to be beneficial regardless of arch,
  // but there are currently some unexplained performance artifacts on Atom.
  // As a temporary solution, disable on Atom.
  return !(ST->isAtom());
}

// Get an estimate for interleaved load/store operations and strided loads.
// \p Indices contains the indices for a strided load.
// \p Factor - the factor of interleaving.
// AVX-512 provides 3-src shuffles that significantly reduce the cost.
InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
    unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
    ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
    TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
  // VecTy for interleave memop is <VF*Factor x Elt>.
  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
  // VecTy = <12 x i32>.

  // Calculate the number of memory operations (NumOfMemOps) required
  // to load/store the VecTy.
  MVT LegalVT = getTypeLegalizationCost(VecTy).second;
  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
  unsigned LegalVTSize = LegalVT.getStoreSize();
  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;

  // Get the cost of one memory operation.
  auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
                                             LegalVT.getVectorNumElements());
  InstructionCost MemOpCost;
  bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
  if (UseMaskedMemOp)
    MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
                                      AddressSpace, CostKind);
  else
    MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
                                AddressSpace, CostKind);

  unsigned VF = VecTy->getNumElements() / Factor;
  MVT VT =
      MVT::getVectorVT(TLI->getSimpleValueType(DL, VecTy->getScalarType()), VF);

  InstructionCost MaskCost;
  if (UseMaskedMemOp) {
    APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
    for (unsigned Index : Indices) {
      assert(Index < Factor && "Invalid index for interleaved memory op");
      for (unsigned Elm = 0; Elm < VF; Elm++)
        DemandedLoadStoreElts.setBit(Index + Elm * Factor);
    }

    Type *I1Type = Type::getInt1Ty(VecTy->getContext());

    MaskCost = getReplicationShuffleCost(
        I1Type, Factor, VF,
        UseMaskForGaps ? DemandedLoadStoreElts
                       : APInt::getAllOnes(VecTy->getNumElements()),
        CostKind);

    // The Gaps mask is invariant and created outside the loop, therefore the
    // cost of creating it is not accounted for here. However, if we have both
    // a MaskForGaps and some other mask that guards the execution of the
    // memory access, we need to account for the cost of And-ing the two masks
    // inside the loop.
    if (UseMaskForGaps) {
      auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
      MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
    }
  }

  if (Opcode == Instruction::Load) {
    // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
    // contain the cost of the optimized shuffle sequence that the
    // X86InterleavedAccess pass will generate.
    // The cost of loads and stores is computed separately from the table.

    // X86InterleavedAccess supports only the following interleaved-access
    // groups.
    static const CostTblEntry AVX512InterleavedLoadTbl[] = {
        {3, MVT::v16i8, 12}, // (load 48i8 and) deinterleave into 3 x 16i8
        {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
        {3, MVT::v64i8, 22}, // (load 192i8 and) deinterleave into 3 x 64i8
    };

    if (const auto *Entry =
            CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
      return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
    // If an entry does not exist, fall back to the default implementation.

    // The kind of shuffle depends on the number of loaded values.
    // If we load the entire data in one register, we can use a 1-src shuffle.
    // Otherwise, we'll merge 2 sources in each operation.
    TTI::ShuffleKind ShuffleKind =
        (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;

    InstructionCost ShuffleCost =
        getShuffleCost(ShuffleKind, SingleMemOpTy, {}, CostKind, 0, nullptr);

    unsigned NumOfLoadsInInterleaveGrp =
        Indices.size() ? Indices.size() : Factor;
    auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
                                          VecTy->getNumElements() / Factor);
    InstructionCost NumOfResults =
        getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;

    // About half of the loads may be folded into shuffles when we have only
    // one result. If we have more than one result, or the loads are masked,
    // we do not fold loads at all.
    unsigned NumOfUnfoldedLoads =
        UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;

    // Get the number of shuffle operations per result.
    unsigned NumOfShufflesPerResult =
        std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));

    // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
    // When we have more than one destination, we need additional instructions
    // to keep the sources.
    InstructionCost NumOfMoves = 0;
    if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
      NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;

    InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
                           MaskCost + NumOfUnfoldedLoads * MemOpCost +
                           NumOfMoves;

    return Cost;
  }

  // Store.
  assert(Opcode == Instruction::Store &&
         "Expected Store Instruction at this point");
  // X86InterleavedAccess supports only the following interleaved-access
  // groups.
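  // E.g. (illustrative): storing an interleaved group of three v16i8 members
  // (such as packing separate R, G and B planes into RGB) matches the
  // {3, v16i8} entry below: 12 shuffles, with the stores costed separately.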
  static const CostTblEntry AVX512InterleavedStoreTbl[] = {
      {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
      {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
      {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)

      {4, MVT::v8i8, 10},  // interleave 4 x 8i8 into 32i8 (and store)
      {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
      {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
      {4, MVT::v64i8, 24}  // interleave 4 x 64i8 into 256i8 (and store)
  };

  if (const auto *Entry =
          CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
    return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
  // If an entry does not exist, fall back to the default implementation.

  // There are no strided stores at the moment, and a store can't be folded
  // into a shuffle.
  unsigned NumOfSources = Factor; // The number of values to be merged.
  InstructionCost ShuffleCost = getShuffleCost(
      TTI::SK_PermuteTwoSrc, SingleMemOpTy, {}, CostKind, 0, nullptr);
  unsigned NumOfShufflesPerStore = NumOfSources - 1;

  // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
  // We need additional instructions to keep the sources.
  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
  InstructionCost Cost =
      MaskCost +
      NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
      NumOfMoves;
  return Cost;
}

InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  auto *VecTy = cast<FixedVectorType>(BaseTy);

  auto isSupportedOnAVX512 = [&](Type *VecTy) {
    Type *EltTy = cast<VectorType>(VecTy)->getElementType();
    if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
        EltTy->isIntegerTy(32) || EltTy->isPointerTy())
      return true;
    if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
      return ST->hasBWI();
    if (EltTy->isBFloatTy())
      return ST->hasBF16();
    return false;
  };
  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
    return getInterleavedMemoryOpCostAVX512(
        Opcode, VecTy, Factor, Indices, Alignment,
        AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);

  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);

  // Get an estimate for interleaved load/store operations for SSE-AVX2.
  // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
  // computing the cost using a generic formula as a function of generic
  // shuffles. We therefore use a lookup table instead, filled according to
  // the instruction sequences that codegen currently generates.

  // VecTy for interleave memop is <VF*Factor x Elt>.
  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
  // VecTy = <12 x i32>.
  MVT LegalVT = getTypeLegalizationCost(VecTy).second;

  // This function can be called with VecTy=<6 x i128>, Factor=3, in which case
  // the VF=2, while v2i128 is an unsupported MVT vector type
  // (see MachineValueType.h::getVectorVT()).
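  // E.g. (illustrative, following the comment above): <6 x i128> legalizes to
  // scalar operations, LegalVT is not a vector, and the check below falls
  // back to the base-class estimate.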
  if (!LegalVT.isVector())
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind);

  unsigned VF = VecTy->getNumElements() / Factor;
  Type *ScalarTy = VecTy->getElementType();
  // Deduplicate entries, model floats/pointers as appropriately-sized
  // integers.
  if (!ScalarTy->isIntegerTy())
    ScalarTy =
        Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));

  // Get the cost of all the memory operations.
  // FIXME: discount dead loads.
  InstructionCost MemOpCosts = getMemoryOpCost(
      Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);

  auto *VT = FixedVectorType::get(ScalarTy, VF);
  EVT ETy = TLI->getValueType(DL, VT);
  if (!ETy.isSimple())
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind);

  // TODO: Complete for other data-types and strides.
  // Each combination of Stride, element bit width and VF results in a
  // different sequence; the cost tables are therefore accessed with:
  // Factor (stride) and VectorType=VFxiN.
  // The Cost accounts only for the shuffle sequence;
  // the cost of the loads/stores is accounted for separately.
  //
  static const CostTblEntry AVX2InterleavedLoadTbl[] = {
      {2, MVT::v2i8, 2},  // (load 4i8 and) deinterleave into 2 x 2i8
      {2, MVT::v4i8, 2},  // (load 8i8 and) deinterleave into 2 x 4i8
      {2, MVT::v8i8, 2},  // (load 16i8 and) deinterleave into 2 x 8i8
      {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
      {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8

      {2, MVT::v8i16, 6},   // (load 16i16 and) deinterleave into 2 x 8i16
      {2, MVT::v16i16, 9},  // (load 32i16 and) deinterleave into 2 x 16i16
      {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16

      {2, MVT::v8i32, 4},   // (load 16i32 and) deinterleave into 2 x 8i32
      {2, MVT::v16i32, 8},  // (load 32i32 and) deinterleave into 2 x 16i32
      {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32

      {2, MVT::v4i64, 4},   // (load 8i64 and) deinterleave into 2 x 4i64
      {2, MVT::v8i64, 8},   // (load 16i64 and) deinterleave into 2 x 8i64
      {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
      {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64

      {3, MVT::v2i8, 3},   // (load 6i8 and) deinterleave into 3 x 2i8
      {3, MVT::v4i8, 3},   // (load 12i8 and) deinterleave into 3 x 4i8
      {3, MVT::v8i8, 6},   // (load 24i8 and) deinterleave into 3 x 8i8
      {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
      {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8

      {3, MVT::v2i16, 5},   // (load 6i16 and) deinterleave into 3 x 2i16
      {3, MVT::v4i16, 7},   // (load 12i16 and) deinterleave into 3 x 4i16
      {3, MVT::v8i16, 9},   // (load 24i16 and) deinterleave into 3 x 8i16
      {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
      {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16

      {3, MVT::v2i32, 3},   // (load 6i32 and) deinterleave into 3 x 2i32
      {3, MVT::v4i32, 3},   // (load 12i32 and) deinterleave into 3 x 4i32
      {3, MVT::v8i32, 7},   // (load 24i32 and) deinterleave into 3 x 8i32
      {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
      {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32

      {3, MVT::v2i64, 1},   // (load 6i64 and) deinterleave into 3 x 2i64
      {3, MVT::v4i64, 5},   // (load 12i64 and) deinterleave into 3 x 4i64
      {3, MVT::v8i64, 10},  // (load 24i64 and) deinterleave into 3 x 8i64
      {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64

      {4, MVT::v2i8, 4},   // (load 8i8 and) deinterleave into 4 x 2i8
      {4, MVT::v4i8, 4},   // (load 16i8 and) deinterleave into 4 x 4i8
      {4, MVT::v8i8, 12},  // (load 32i8 and) deinterleave into 4 x 8i8
      {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
      {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8

      {4, MVT::v2i16, 6},    // (load 8i16 and) deinterleave into 4 x 2i16
      {4, MVT::v4i16, 17},   // (load 16i16 and) deinterleave into 4 x 4i16
      {4, MVT::v8i16, 33},   // (load 32i16 and) deinterleave into 4 x 8i16
      {4, MVT::v16i16, 75},  // (load 64i16 and) deinterleave into 4 x 16i16
      {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16

      {4, MVT::v2i32, 4},   // (load 8i32 and) deinterleave into 4 x 2i32
      {4, MVT::v4i32, 8},   // (load 16i32 and) deinterleave into 4 x 4i32
      {4, MVT::v8i32, 16},  // (load 32i32 and) deinterleave into 4 x 8i32
      {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
      {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32

      {4, MVT::v2i64, 6},   // (load 8i64 and) deinterleave into 4 x 2i64
      {4, MVT::v4i64, 8},   // (load 16i64 and) deinterleave into 4 x 4i64
      {4, MVT::v8i64, 20},  // (load 32i64 and) deinterleave into 4 x 8i64
      {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64

      {6, MVT::v2i8, 6},   // (load 12i8 and) deinterleave into 6 x 2i8
      {6, MVT::v4i8, 14},  // (load 24i8 and) deinterleave into 6 x 4i8
      {6, MVT::v8i8, 18},  // (load 48i8 and) deinterleave into 6 x 8i8
      {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
      {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8

      {6, MVT::v2i16, 13},   // (load 12i16 and) deinterleave into 6 x 2i16
      {6, MVT::v4i16, 9},    // (load 24i16 and) deinterleave into 6 x 4i16
      {6, MVT::v8i16, 39},   // (load 48i16 and) deinterleave into 6 x 8i16
      {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
      {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16

      {6, MVT::v2i32, 6},   // (load 12i32 and) deinterleave into 6 x 2i32
      {6, MVT::v4i32, 15},  // (load 24i32 and) deinterleave into 6 x 4i32
      {6, MVT::v8i32, 31},  // (load 48i32 and) deinterleave into 6 x 8i32
      {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32

      {6, MVT::v2i64, 6},  // (load 12i64 and) deinterleave into 6 x 2i64
      {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
      {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64

      {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
  };

  static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
      {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
  };

  static const CostTblEntry SSE2InterleavedLoadTbl[] = {
      {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
      {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16

      {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
      {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32

      {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
  };

  static const CostTblEntry AVX2InterleavedStoreTbl[] = {
      {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
      {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)

      {2, MVT::v8i16, 3},  // interleave 2 x 8i16 into 16i16 (and store)
      {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
      {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)

      {2, MVT::v4i32, 2},   // interleave 2 x 4i32 into 8i32 (and store)
      {2, MVT::v8i32, 4},   // interleave 2 x 8i32 into 16i32 (and store)
      {2, MVT::v16i32, 8},  // interleave 2 x 16i32 into 32i32 (and store)
      {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)

      {2, MVT::v2i64, 2},   // interleave 2 x 2i64 into 4i64 (and store)
      {2, MVT::v4i64, 4},   // interleave 2 x 4i64 into 8i64 (and store)
      {2, MVT::v8i64, 8},   // interleave 2 x 8i64 into 16i64 (and store)
      {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
      {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)

      {3, MVT::v2i8, 4},   // interleave 3 x 2i8 into 6i8 (and store)
      {3, MVT::v4i8, 4},   // interleave 3 x 4i8 into 12i8 (and store)
      {3, MVT::v8i8, 6},   // interleave 3 x 8i8 into 24i8 (and store)
      {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
      {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)

      {3, MVT::v2i16, 4},   // interleave 3 x 2i16 into 6i16 (and store)
      {3, MVT::v4i16, 6},   // interleave 3 x 4i16 into 12i16 (and store)
      {3, MVT::v8i16, 12},  // interleave 3 x 8i16 into 24i16 (and store)
      {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
      {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)

      {3, MVT::v2i32, 4},   // interleave 3 x 2i32 into 6i32 (and store)
      {3, MVT::v4i32, 5},   // interleave 3 x 4i32 into 12i32 (and store)
      {3, MVT::v8i32, 11},  // interleave 3 x 8i32 into 24i32 (and store)
      {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
      {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)

      {3, MVT::v2i64, 4},   // interleave 3 x 2i64 into 6i64 (and store)
      {3, MVT::v4i64, 6},   // interleave 3 x 4i64 into 12i64 (and store)
      {3, MVT::v8i64, 12},  // interleave 3 x 8i64 into 24i64 (and store)
      {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)

      {4, MVT::v2i8, 4},   // interleave 4 x 2i8 into 8i8 (and store)
      {4, MVT::v4i8, 4},   // interleave 4 x 4i8 into 16i8 (and store)
      {4, MVT::v8i8, 4},   // interleave 4 x 8i8 into 32i8 (and store)
      {4, MVT::v16i8, 8},  // interleave 4 x 16i8 into 64i8 (and store)
      {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)

      {4, MVT::v2i16, 2},   // interleave 4 x 2i16 into 8i16 (and store)
      {4, MVT::v4i16, 6},   // interleave 4 x 4i16 into 16i16 (and store)
      {4, MVT::v8i16, 10},  // interleave 4 x 8i16 into 32i16 (and store)
      {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
      {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)

      {4, MVT::v2i32, 5},   // interleave 4 x 2i32 into 8i32 (and store)
      {4, MVT::v4i32, 6},   // interleave 4 x 4i32 into 16i32 (and store)
      {4, MVT::v8i32, 16},  // interleave 4 x 8i32 into 32i32 (and store)
      {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
      {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)

      {4, MVT::v2i64, 6},   // interleave 4 x 2i64 into 8i64 (and store)
      {4, MVT::v4i64, 8},   // interleave 4 x 4i64 into 16i64 (and store)
      {4, MVT::v8i64, 20},  // interleave 4 x 8i64 into 32i64 (and store)
      {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)

      {6, MVT::v2i8, 7},   // interleave 6 x 2i8 into 12i8 (and store)
      {6, MVT::v4i8, 9},   // interleave 6 x 4i8 into 24i8 (and store)
      {6, MVT::v8i8, 16},  // interleave 6 x 8i8 into 48i8 (and store)
      {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
      {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)

      {6, MVT::v2i16, 10},  // interleave 6 x 2i16 into 12i16 (and store)
      {6, MVT::v4i16, 15},  // interleave 6 x 4i16 into 24i16 (and store)
      {6, MVT::v8i16, 21},  // interleave 6 x 8i16 into 48i16 (and store)
      {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
      {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)

      {6, MVT::v2i32, 9},   // interleave 6 x 2i32 into 12i32 (and store)
      {6, MVT::v4i32, 12},  // interleave 6 x 4i32 into 24i32 (and store)
      {6, MVT::v8i32, 33},  // interleave 6 x 8i32 into 48i32 (and store)
      {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)

      {6, MVT::v2i64, 8},  // interleave 6 x 2i64 into 12i64 (and store)
      {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
      {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
  };

  static const CostTblEntry SSE2InterleavedStoreTbl[] = {
      {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
      {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
      {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)

      {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
      {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)

      {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
  };

  if (Opcode == Instruction::Load) {
    auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
                              MemOpCosts](const CostTblEntry *Entry) {
      // NOTE: this is just an approximation!
      // It can over/under-estimate the cost!
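      // E.g. (illustrative): for the {4, v16i8, 24} load entry above, a group
      // in which only 2 of the 4 members are used is discounted to
      // ceil(2 * 24 / 4) = 12 on top of the memory-op cost.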
      return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
    };

    if (ST->hasAVX2())
      if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
                                              ETy.getSimpleVT()))
        return GetDiscountedCost(Entry);

    if (ST->hasSSSE3())
      if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
                                              ETy.getSimpleVT()))
        return GetDiscountedCost(Entry);

    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
                                              ETy.getSimpleVT()))
        return GetDiscountedCost(Entry);
  } else {
    assert(Opcode == Instruction::Store &&
           "Expected Store Instruction at this point");
    assert((!Indices.size() || Indices.size() == Factor) &&
           "Interleaved store only supports fully-interleaved groups.");
    if (ST->hasAVX2())
      if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
                                              ETy.getSimpleVT()))
        return MemOpCosts + Entry->Cost;

    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
                                              ETy.getSimpleVT()))
        return MemOpCosts + Entry->Cost;
  }

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace, CostKind,
                                           UseMaskForCond, UseMaskForGaps);
}

InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                                 StackOffset BaseOffset,
                                                 bool HasBaseReg, int64_t Scale,
                                                 unsigned AddrSpace) const {
  // Scaling factors are not free at all.
  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
  // will take 2 allocations in the out-of-order engine instead of 1
  // for plain addressing mode, i.e., inst (reg1).
  // E.g.,
  // vaddps (%rsi,%rdx), %ymm0, %ymm1
  // requires two allocations (one for the load, one for the computation),
  // whereas:
  // vaddps (%rsi), %ymm0, %ymm1
  // requires just one allocation, i.e., freeing allocations for other
  // operations and having fewer micro-operations to execute.
  //
  // For some X86 architectures, this is even worse because, for instance for
  // stores, the complex addressing mode forces the instruction to use the
  // "load" ports instead of the dedicated "store" port.
  // E.g., on Haswell:
  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
  TargetLoweringBase::AddrMode AM;
  AM.BaseGV = BaseGV;
  AM.BaseOffs = BaseOffset.getFixed();
  AM.HasBaseReg = HasBaseReg;
  AM.Scale = Scale;
  AM.ScalableOffset = BaseOffset.getScalable();
  if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
    // Scale represents reg2 * scale, thus account for 1
    // as soon as we use a second register.
    return AM.Scale != 0;
  return -1;
}

InstructionCost X86TTIImpl::getBranchMispredictPenalty() const {
  // TODO: Hook MispredictPenalty of SchedMachineModel into this.
  return 14;
}

bool X86TTIImpl::isVectorShiftByScalarCheap(Type *Ty) const {
  unsigned Bits = Ty->getScalarSizeInBits();

  // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
  // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
  if (ST->hasXOP() && (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
    return false;

  // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
  // shifts just as cheap as scalar ones.
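  // E.g. a single vpsllvd shifts every i32 lane of a YMM register by its own
  // per-lane amount, so splatting a scalar shift amount buys nothing here.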
  if (ST->hasAVX2() && (Bits == 32 || Bits == 64))
    return false;

  // AVX512BW has shifts such as vpsllvw.
  if (ST->hasBWI() && Bits == 16)
    return false;

  // Otherwise, it's significantly cheaper to shift by a scalar amount than by
  // a fully general vector.
  return true;
}

unsigned X86TTIImpl::getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
                                       Type *ScalarValTy) const {
  if (ST->hasF16C() && ScalarMemTy->isHalfTy()) {
    return 4;
  }
  return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
}

bool X86TTIImpl::isProfitableToSinkOperands(Instruction *I,
                                            SmallVectorImpl<Use *> &Ops) const {
  using namespace llvm::PatternMatch;

  FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
  if (!VTy)
    return false;

  if (I->getOpcode() == Instruction::Mul &&
      VTy->getElementType()->isIntegerTy(64)) {
    for (auto &Op : I->operands()) {
      // Make sure we are not already sinking this operand.
      if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
        continue;

      // Look for the PMULDQ pattern where the input is a sext_inreg from vXi32
      // or the PMULUDQ pattern where the input is a zext_inreg from vXi32.
      if (ST->hasSSE41() &&
          match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
                                 m_SpecificInt(32)))) {
        Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
        Ops.push_back(&Op);
      } else if (ST->hasSSE2() &&
                 match(Op.get(),
                       m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
        Ops.push_back(&Op);
      }
    }

    return !Ops.empty();
  }

  // A uniform shift amount in a vector shift or funnel shift may be much
  // cheaper than a generic variable vector shift, so make that pattern visible
  // to SDAG by sinking the shuffle instruction next to the shift.
  int ShiftAmountOpNum = -1;
  if (I->isShift())
    ShiftAmountOpNum = 1;
  else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
    if (II->getIntrinsicID() == Intrinsic::fshl ||
        II->getIntrinsicID() == Intrinsic::fshr)
      ShiftAmountOpNum = 2;
  }

  if (ShiftAmountOpNum == -1)
    return false;

  auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
  if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
      isVectorShiftByScalarCheap(I->getType())) {
    Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
    return true;
  }

  return false;
}
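// Illustrative note for the sinking heuristic above: given
//   %amt = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> zeroinitializer
//   %shl = shl <4 x i32> %x, %amt
// sinking %amt next to %shl lets SDAG see the splat and select a
// vector-by-scalar shift (the amount kept in the low element of an xmm)
// instead of a general per-lane variable shift.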