//===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "PPCTargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

#define DEBUG_TYPE "ppctti"

static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);

// This is currently only used for the data prefetch pass which is only enabled
// for BG/Q by default.
static cl::opt<unsigned>
CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64),
              cl::desc("The loop prefetch cache line size"));

static cl::opt<bool>
EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
                cl::desc("Enable using coldcc calling conv for cold "
                         "internal functions"));

//===----------------------------------------------------------------------===//
//
// PPC cost model.
//
//===----------------------------------------------------------------------===//

TargetTransformInfo::PopcntSupportKind
PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (ST->hasPOPCNTD() != PPCSubtarget::POPCNTD_Unavailable && TyWidth <= 64)
    return ST->hasPOPCNTD() == PPCSubtarget::POPCNTD_Slow ?
             TTI::PSK_SlowHardware : TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}

int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  if (Imm == 0)
    return TTI::TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Basic;

    if (isInt<32>(Imm.getSExtValue())) {
      // A constant that can be materialized using lis.
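      // For example, 0x00250000 loads with a single lis (which sets the high
      // halfword and zeroes the low one); a nonzero low halfword would also
      // need an ori, which is the 2 * TCC_Basic case below.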
      if ((Imm.getZExtValue() & 0xFFFF) == 0)
        return TTI::TCC_Basic;

      return 2 * TTI::TCC_Basic;
    }
  }

  return 4 * TTI::TCC_Basic;
}

int PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                              Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(IID, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return PPCTTIImpl::getIntImmCost(Imm, Ty);
}

int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                              Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Opcode, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  unsigned ImmIdx = ~0U;
  bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
       ZeroFree = false;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::And:
    RunFree = true; // (for the rotate-and-mask instructions)
    LLVM_FALLTHROUGH;
  case Instruction::Add:
  case Instruction::Or:
  case Instruction::Xor:
    ShiftedFree = true;
    LLVM_FALLTHROUGH;
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    ImmIdx = 1;
    break;
  case Instruction::ICmp:
    UnsignedFree = true;
    ImmIdx = 1;
    // Zero comparisons can use record-form instructions.
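    // (Dot-form instructions such as and., add. or subf. set CR0 by comparing
    // their result with zero, so no separate cmpwi/cmpdi is needed.)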
    LLVM_FALLTHROUGH;
  case Instruction::Select:
    ZeroFree = true;
    break;
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Ret:
  case Instruction::Load:
  case Instruction::Store:
    break;
  }

  if (ZeroFree && Imm == 0)
    return TTI::TCC_Free;

  if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;

    if (RunFree) {
      if (Imm.getBitWidth() <= 32 &&
          (isShiftedMask_32(Imm.getZExtValue()) ||
           isShiftedMask_32(~Imm.getZExtValue())))
        return TTI::TCC_Free;

      if (ST->isPPC64() &&
          (isShiftedMask_64(Imm.getZExtValue()) ||
           isShiftedMask_64(~Imm.getZExtValue())))
        return TTI::TCC_Free;
    }

    if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
      return TTI::TCC_Free;

    if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
      return TTI::TCC_Free;
  }

  return PPCTTIImpl::getIntImmCost(Imm, Ty);
}

unsigned PPCTTIImpl::getUserCost(const User *U,
                                 ArrayRef<const Value *> Operands) {
  if (U->getType()->isVectorTy()) {
    // Instructions that need to be split should cost more.
    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, U->getType());
    return LT.first * BaseT::getUserCost(U, Operands);
  }

  return BaseT::getUserCost(U, Operands);
}

void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP) {
  if (ST->getDarwinDirective() == PPC::DIR_A2) {
    // The A2 is in-order with a deep pipeline, and concatenation unrolling
    // helps expose latency-hiding opportunities to the instruction scheduler.
    UP.Partial = UP.Runtime = true;

    // We unroll a lot on the A2 (hundreds of instructions), and the benefits
    // often outweigh the cost of a division to compute the trip count.
    UP.AllowExpensiveTripCount = true;
  }

  BaseT::getUnrollingPreferences(L, SE, UP);
}

// This function returns true to allow using coldcc calling convention.
// Returning true results in coldcc being used for functions which are cold at
// all call sites when their callers do not call any other non-coldcc
// functions.
bool PPCTTIImpl::useColdCCForColdCall(Function &F) {
  return EnablePPCColdCC;
}

bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
  // On the A2, always unroll aggressively. For QPX unaligned loads, we depend
  // on combining the loads generated for consecutive accesses, and failure to
  // do so is particularly expensive. This makes aggressive interleaving much
  // more likely to be profitable here than concatenation unrolling alone.
  if (ST->getDarwinDirective() == PPC::DIR_A2)
    return true;

  return LoopHasReductions;
}

const PPCTTIImpl::TTI::MemCmpExpansionOptions *
PPCTTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
  static const auto Options = []() {
    TTI::MemCmpExpansionOptions Options;
    Options.LoadSizes.push_back(8);
    Options.LoadSizes.push_back(4);
    Options.LoadSizes.push_back(2);
    Options.LoadSizes.push_back(1);
    return Options;
  }();
  return &Options;
}

bool PPCTTIImpl::enableInterleavedAccessVectorization() {
  return true;
}

unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
  if (Vector && !ST->hasAltivec() && !ST->hasQPX())
    return 0;
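  // With VSX, the 32 FPRs and 32 Altivec VRs are aliased onto a single file
  // of 64 VSX registers (VSR0-VSR63), hence the larger count below.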
  return ST->hasVSX() ? 64 : 32;
}

unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) const {
  if (Vector) {
    if (ST->hasQPX()) return 256;
    if (ST->hasAltivec()) return 128;
    return 0;
  }

  if (ST->isPPC64())
    return 64;
  return 32;
}

unsigned PPCTTIImpl::getCacheLineSize() {
  // Check first if the user specified a custom line size.
  if (CacheLineSize.getNumOccurrences() > 0)
    return CacheLineSize;

  // On P7, P8 or P9 we have a cache line size of 128 bytes.
  unsigned Directive = ST->getDarwinDirective();
  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
      Directive == PPC::DIR_PWR9)
    return 128;

  // On other processors return a default of 64 bytes.
  return 64;
}

unsigned PPCTTIImpl::getPrefetchDistance() {
  // This seems like a reasonable default for the BG/Q (this pass is enabled,
  // by default, only on the BG/Q).
  return 300;
}

unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  unsigned Directive = ST->getDarwinDirective();
  // The 440 has no SIMD support, but floating-point instructions
  // have a 5-cycle latency, so unroll by 5x for latency hiding.
  if (Directive == PPC::DIR_440)
    return 5;

  // The A2 has no SIMD support, but floating-point instructions
  // have a 6-cycle latency, so unroll by 6x for latency hiding.
  if (Directive == PPC::DIR_A2)
    return 6;

  // FIXME: For lack of any better information, do no harm...
  if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
    return 1;

  // For P7 and P8, floating-point instructions have a 6-cycle latency and
  // there are two execution units, so unroll by 12x for latency hiding.
  // FIXME: Treat P9 the same as the previous generation until POWER9
  // scheduling is ready.
  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
      Directive == PPC::DIR_PWR9)
    return 12;

  // For most things, modern systems have two execution units (and
  // out-of-order execution).
  return 2;
}

// Adjust the cost of vector instructions on targets where there is overlap
// between the vector and scalar units, thereby reducing the overall throughput
// of vector code wrt. scalar code.
int PPCTTIImpl::vectorCostAdjustment(int Cost, unsigned Opcode, Type *Ty1,
                                     Type *Ty2) {
  if (!ST->vectorsUseTwoUnits() || !Ty1->isVectorTy())
    return Cost;

  std::pair<int, MVT> LT1 = TLI->getTypeLegalizationCost(DL, Ty1);
  // If type legalization involves splitting the vector, we don't want to
  // double the cost at every step - only the last step.
  if (LT1.first != 1 || !LT1.second.isVector())
    return Cost;

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  if (TLI->isOperationExpand(ISD, LT1.second))
    return Cost;

  if (Ty2) {
    std::pair<int, MVT> LT2 = TLI->getTypeLegalizationCost(DL, Ty2);
    if (LT2.first != 1 || !LT2.second.isVector())
      return Cost;
  }

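  // Example: a legal v2f64 add whose base cost is 1 is reported as 2 here,
  // steering the vectorizers away from marginally profitable code that would
  // oversubscribe the shared vector/scalar units.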
  return Cost * 2;
}

int PPCTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
    TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  // Fallback to the default implementation.
  int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
                                           Opd1PropInfo, Opd2PropInfo);
  return vectorCostAdjustment(Cost, Opcode, Ty, nullptr);
}

int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                               Type *SubTp) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

  // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). We need one such shuffle instruction for each actual
  // register (this is not true for arbitrary shuffles, but is true for the
  // structured types of shuffles covered by TTI::ShuffleKind).
  return vectorCostAdjustment(LT.first, Instruction::ShuffleVector, Tp,
                              nullptr);
}

int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                 const Instruction *I) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  int Cost = BaseT::getCastInstrCost(Opcode, Dst, Src);
  return vectorCostAdjustment(Cost, Opcode, Dst, Src);
}

int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                                   const Instruction *I) {
  int Cost = BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
  return vectorCostAdjustment(Cost, Opcode, ValTy, nullptr);
}

int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
  assert(Val->isVectorTy() && "This must be a vector type");

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  int Cost = BaseT::getVectorInstrCost(Opcode, Val, Index);
  Cost = vectorCostAdjustment(Cost, Opcode, Val, nullptr);

  if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
    // Double-precision scalars are already located in index #0 (or #1 if LE).
    if (ISD == ISD::EXTRACT_VECTOR_ELT &&
        Index == (ST->isLittleEndian() ? 1 : 0))
      return 0;

    return Cost;

  } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) {
    // Floating point scalars are already located in index #0.
    if (Index == 0)
      return 0;

    return Cost;

  } else if (Val->getScalarType()->isIntegerTy() && Index != -1U) {
    if (ST->hasP9Altivec()) {
      if (ISD == ISD::INSERT_VECTOR_ELT)
        // A move-to VSR and a permute/insert. Assume vector operation cost
        // for both (cost will be 2x on P9).
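        // (For a 32-bit element this is, roughly, a mtvsrwz followed by an
        // xxinsertw on POWER9; the exact instruction sequence is only an
        // illustration, not something the cost model depends on.)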
        return vectorCostAdjustment(2, Opcode, Val, nullptr);

      // It's an extract. Maybe we can do a cheap move-from VSR.
      unsigned EltSize = Val->getScalarSizeInBits();
      if (EltSize == 64) {
        unsigned MfvsrdIndex = ST->isLittleEndian() ? 1 : 0;
        if (Index == MfvsrdIndex)
          return 1;
      } else if (EltSize == 32) {
        unsigned MfvsrwzIndex = ST->isLittleEndian() ? 2 : 1;
        if (Index == MfvsrwzIndex)
          return 1;
      }

      // We need a vector extract (or mfvsrld). Assume vector operation cost.
      // The cost of the load constant for a vector extract is disregarded
      // (invariant, easily schedulable).
      return vectorCostAdjustment(1, Opcode, Val, nullptr);

    } else if (ST->hasDirectMove())
      // Assume permute has standard cost.
      // Assume move-to/move-from VSR have 2x standard cost.
      return 3;
  }

  // Estimated cost of a load-hit-store delay. This was obtained
  // experimentally as a minimum needed to prevent unprofitable
  // vectorization for the paq8p benchmark. It may need to be
  // raised further if other unprofitable cases remain.
  unsigned LHSPenalty = 2;
  if (ISD == ISD::INSERT_VECTOR_ELT)
    LHSPenalty += 7;

  // Vector element insert/extract with Altivec is very expensive,
  // because they require store and reload with the attendant
  // processor stall for load-hit-store. Until VSX is available,
  // these need to be estimated as very costly.
  if (ISD == ISD::EXTRACT_VECTOR_ELT ||
      ISD == ISD::INSERT_VECTOR_ELT)
    return LHSPenalty + Cost;

  return Cost;
}

int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                unsigned AddressSpace, const Instruction *I) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
  Cost = vectorCostAdjustment(Cost, Opcode, Src, nullptr);

  bool IsAltivecType = ST->hasAltivec() &&
                       (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
                        LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
  bool IsVSXType = ST->hasVSX() &&
                   (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);
  bool IsQPXType = ST->hasQPX() &&
                   (LT.second == MVT::v4f64 || LT.second == MVT::v4f32);

  // VSX has 32b/64b load instructions. Legalization can handle loading of
  // 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and
  // PPCTargetLowering can't compute the cost appropriately. So here we
  // explicitly check this case.
  unsigned MemBits = Src->getPrimitiveSizeInBits();
  if (Opcode == Instruction::Load && ST->hasVSX() && IsAltivecType &&
      (MemBits == 64 || (ST->hasP8Vector() && MemBits == 32)))
    return 1;

  // Aligned loads and stores are easy.
  unsigned SrcBytes = LT.second.getStoreSize();
  if (!SrcBytes || !Alignment || Alignment >= SrcBytes)
    return Cost;

  // If we can use the permutation-based load sequence, then this is also
  // relatively cheap (not counting loop-invariant instructions): one load plus
  // one permute (the last load in a series has extra cost, but we're
  // neglecting that here). Note that on the P7, we could do unaligned loads
  // for Altivec types using the VSX instructions, but that's more expensive
  // than using the permutation-based load sequence. On the P8, that's no
  // longer true.
  if (Opcode == Instruction::Load &&
      ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) &&
      Alignment >= LT.second.getScalarType().getStoreSize())
    return Cost + LT.first; // Add the cost of the permutations.

  // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
  // P7, unaligned vector loads are more expensive than the permutation-based
  // load sequence, so that might be used instead, but regardless, the net cost
  // is about the same (not counting loop-invariant instructions).
  if (IsVSXType || (ST->hasVSX() && IsAltivecType))
    return Cost;

  // Newer PPC supports unaligned memory access.
  if (TLI->allowsMisalignedMemoryAccesses(LT.second, 0))
    return Cost;

  // PPC in general does not support unaligned loads and stores. They'll need
  // to be decomposed based on the alignment factor.

  // Add the cost of each scalar load or store.
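  // For example, a 16-byte vector with 4-byte alignment is decomposed into
  // four 4-byte accesses, i.e. SrcBytes/Alignment - 1 = 3 extra operations
  // per legalized register.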
  Cost += LT.first * (SrcBytes / Alignment - 1);

  // For a vector type, there is also scalarization overhead (only for
  // stores; loads are expanded using the vector-load + permutation sequence,
  // which is much less expensive).
  if (Src->isVectorTy() && Opcode == Instruction::Store)
    for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
      Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);

  return Cost;
}

int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                           unsigned Factor,
                                           ArrayRef<unsigned> Indices,
                                           unsigned Alignment,
                                           unsigned AddressSpace,
                                           bool UseMaskForCond,
                                           bool UseMaskForGaps) {
  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace,
                                             UseMaskForCond, UseMaskForGaps);

  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy);

  // First, the cost of the load/store operation itself.
  int Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace);

  // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). For each result vector, we need one shuffle per incoming
  // vector (except that the first shuffle can take two incoming vectors
  // because it does not need to take itself).
  Cost += Factor * (LT.first - 1);

  return Cost;
}