//===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "PPCTargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

#define DEBUG_TYPE "ppctti"

static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);

// This is currently only used for the data prefetch pass which is only enabled
// for BG/Q by default.
static cl::opt<unsigned>
CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64),
              cl::desc("The loop prefetch cache line size"));

static cl::opt<bool>
EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
                cl::desc("Enable using coldcc calling conv for cold "
                         "internal functions"));

//===----------------------------------------------------------------------===//
//
// PPC cost model.
//
//===----------------------------------------------------------------------===//

TargetTransformInfo::PopcntSupportKind
PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (ST->hasPOPCNTD() != PPCSubtarget::POPCNTD_Unavailable && TyWidth <= 64)
    return ST->hasPOPCNTD() == PPCSubtarget::POPCNTD_Slow ?
             TTI::PSK_SlowHardware : TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}

int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  if (Imm == 0)
    return TTI::TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Basic;

    if (isInt<32>(Imm.getSExtValue())) {
      // A constant that can be materialized using lis.
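      // lis sets the high 16 bits of a register from an immediate, so a value
      // whose low 16 bits are zero costs a single instruction; other 32-bit
      // values need a second instruction (ori) to fill in the low half.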
      if ((Imm.getZExtValue() & 0xFFFF) == 0)
        return TTI::TCC_Basic;

      return 2 * TTI::TCC_Basic;
    }
  }

  return 4 * TTI::TCC_Basic;
}

int PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                              Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(IID, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return PPCTTIImpl::getIntImmCost(Imm, Ty);
}

int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                              Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Opcode, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  unsigned ImmIdx = ~0U;
  bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
       ZeroFree = false;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::And:
    RunFree = true; // (for the rotate-and-mask instructions)
    LLVM_FALLTHROUGH;
  case Instruction::Add:
  case Instruction::Or:
  case Instruction::Xor:
    ShiftedFree = true;
    LLVM_FALLTHROUGH;
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    ImmIdx = 1;
    break;
  case Instruction::ICmp:
    UnsignedFree = true;
    ImmIdx = 1;
    // Zero comparisons can use record-form instructions.
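    // The fall-through below also marks a zero operand of a select as free.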
    LLVM_FALLTHROUGH;
  case Instruction::Select:
    ZeroFree = true;
    break;
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Ret:
  case Instruction::Load:
  case Instruction::Store:
    break;
  }

  if (ZeroFree && Imm == 0)
    return TTI::TCC_Free;

  if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;

    if (RunFree) {
      if (Imm.getBitWidth() <= 32 &&
          (isShiftedMask_32(Imm.getZExtValue()) ||
           isShiftedMask_32(~Imm.getZExtValue())))
        return TTI::TCC_Free;

      if (ST->isPPC64() &&
          (isShiftedMask_64(Imm.getZExtValue()) ||
           isShiftedMask_64(~Imm.getZExtValue())))
        return TTI::TCC_Free;
    }

    if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
      return TTI::TCC_Free;

    if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
      return TTI::TCC_Free;
  }

  return PPCTTIImpl::getIntImmCost(Imm, Ty);
}

unsigned PPCTTIImpl::getUserCost(const User *U,
                                 ArrayRef<const Value *> Operands) {
  if (U->getType()->isVectorTy()) {
    // Instructions that need to be split should cost more.
    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, U->getType());
    return LT.first * BaseT::getUserCost(U, Operands);
  }

  return BaseT::getUserCost(U, Operands);
}

void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP) {
  if (ST->getDarwinDirective() == PPC::DIR_A2) {
    // The A2 is in-order with a deep pipeline, and concatenation unrolling
    // helps expose latency-hiding opportunities to the instruction scheduler.
    UP.Partial = UP.Runtime = true;

    // We unroll a lot on the A2 (hundreds of instructions), and the benefits
    // often outweigh the cost of a division to compute the trip count.
    UP.AllowExpensiveTripCount = true;
  }

  BaseT::getUnrollingPreferences(L, SE, UP);
}

// Returning true allows the coldcc calling convention to be used for functions
// that are cold at all call sites, when their callers do not call any other
// non-coldcc functions.
bool PPCTTIImpl::useColdCCForColdCall(Function &F) {
  return EnablePPCColdCC;
}

bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
  // On the A2, always unroll aggressively. For QPX unaligned loads, we depend
  // on combining the loads generated for consecutive accesses, and failure to
  // do so is particularly expensive. Aggressive interleaving makes that
  // combining much more likely (compared to only using concatenation
  // unrolling).
  if (ST->getDarwinDirective() == PPC::DIR_A2)
    return true;

  return LoopHasReductions;
}

const PPCTTIImpl::TTI::MemCmpExpansionOptions *
PPCTTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
  static const auto Options = []() {
    TTI::MemCmpExpansionOptions Options;
    Options.LoadSizes.push_back(8);
    Options.LoadSizes.push_back(4);
    Options.LoadSizes.push_back(2);
    Options.LoadSizes.push_back(1);
    return Options;
  }();
  return &Options;
}

bool PPCTTIImpl::enableInterleavedAccessVectorization() {
  return true;
}

unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
  if (Vector && !ST->hasAltivec() && !ST->hasQPX())
    return 0;
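  // The 64 VSX registers overlap the FPRs and the Altivec VRs, doubling the
  // register count compared to FP or Altivec alone.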
  return ST->hasVSX() ? 64 : 32;
}

unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) const {
  if (Vector) {
    if (ST->hasQPX()) return 256;
    if (ST->hasAltivec()) return 128;
    return 0;
  }

  if (ST->isPPC64())
    return 64;
  return 32;
}

unsigned PPCTTIImpl::getCacheLineSize() {
  // Check first if the user specified a custom line size.
  if (CacheLineSize.getNumOccurrences() > 0)
    return CacheLineSize;

  // On P7, P8 or P9 we have a cache line size of 128.
  unsigned Directive = ST->getDarwinDirective();
  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
      Directive == PPC::DIR_PWR9)
    return 128;

  // On other processors return a default of 64 bytes.
  return 64;
}

unsigned PPCTTIImpl::getPrefetchDistance() {
  // This seems like a reasonable default for the BG/Q (this pass is enabled, by
  // default, only on the BG/Q).
  return 300;
}

unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  unsigned Directive = ST->getDarwinDirective();
  // The 440 has no SIMD support, but floating-point instructions
  // have a 5-cycle latency, so unroll by 5x for latency hiding.
  if (Directive == PPC::DIR_440)
    return 5;

  // The A2 has no SIMD support, but floating-point instructions
  // have a 6-cycle latency, so unroll by 6x for latency hiding.
  if (Directive == PPC::DIR_A2)
    return 6;

  // FIXME: For lack of any better information, do no harm...
  if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
    return 1;

  // For P7 and P8, floating-point instructions have a 6-cycle latency and
  // there are two execution units, so unroll by 12x for latency hiding.
  // FIXME: the same for P9 as previous gen until POWER9 scheduling is ready
  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
      Directive == PPC::DIR_PWR9)
    return 12;

  // For most things, modern systems have two execution units (and
  // out-of-order execution).
  return 2;
}

int PPCTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
    TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
                                       Opd1PropInfo, Opd2PropInfo);
}

int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                               Type *SubTp) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

  // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). We need one such shuffle instruction for each actual
  // register (this is not true for arbitrary shuffles, but is true for the
  // structured types of shuffles covered by TTI::ShuffleKind).
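  // LT.first is the number of registers the legalized type occupies, which is
  // therefore also the number of permute instructions needed.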
  return LT.first;
}

int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                 const Instruction *I) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  return BaseT::getCastInstrCost(Opcode, Dst, Src);
}

int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                                   const Instruction *I) {
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}

int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
  assert(Val->isVectorTy() && "This must be a vector type");

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
    // Double-precision scalars are already located in index #0.
    if (Index == 0)
      return 0;

    return BaseT::getVectorInstrCost(Opcode, Val, Index);
  } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) {
    // Floating point scalars are already located in index #0.
    if (Index == 0)
      return 0;

    return BaseT::getVectorInstrCost(Opcode, Val, Index);
  }

  // Estimated cost of a load-hit-store delay. This was obtained
  // experimentally as a minimum needed to prevent unprofitable
  // vectorization for the paq8p benchmark. It may need to be
  // raised further if other unprofitable cases remain.
  unsigned LHSPenalty = 2;
  if (ISD == ISD::INSERT_VECTOR_ELT)
    LHSPenalty += 7;

  // Vector element insert/extract with Altivec is very expensive,
  // because they require store and reload with the attendant
  // processor stall for load-hit-store. Until VSX is available,
  // these need to be estimated as very costly.
  if (ISD == ISD::EXTRACT_VECTOR_ELT ||
      ISD == ISD::INSERT_VECTOR_ELT)
    return LHSPenalty + BaseT::getVectorInstrCost(Opcode, Val, Index);

  return BaseT::getVectorInstrCost(Opcode, Val, Index);
}

int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                unsigned AddressSpace, const Instruction *I) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);

  bool IsAltivecType = ST->hasAltivec() &&
                       (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
                        LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
  bool IsVSXType = ST->hasVSX() &&
                   (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);
  bool IsQPXType = ST->hasQPX() &&
                   (LT.second == MVT::v4f64 || LT.second == MVT::v4f32);

  // VSX has 32b/64b load instructions. Legalization can handle loading of
  // 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and
  // PPCTargetLowering can't compute the cost appropriately. So here we
  // explicitly check this case.
  unsigned MemBytes = Src->getPrimitiveSizeInBits();
  if (Opcode == Instruction::Load && ST->hasVSX() && IsAltivecType &&
      (MemBytes == 64 || (ST->hasP8Vector() && MemBytes == 32)))
    return 1;

  // Aligned loads and stores are easy.
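  // An access whose alignment is at least the legalized type's store size
  // needs no extra fixup code, so the base cost stands.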
  unsigned SrcBytes = LT.second.getStoreSize();
  if (!SrcBytes || !Alignment || Alignment >= SrcBytes)
    return Cost;

  // If we can use the permutation-based load sequence, then this is also
  // relatively cheap (not counting loop-invariant instructions): one load plus
  // one permute (the last load in a series has extra cost, but we're
  // neglecting that here). Note that on the P7, we could do unaligned loads
  // for Altivec types using the VSX instructions, but that's more expensive
  // than using the permutation-based load sequence. On the P8, that's no
  // longer true.
  if (Opcode == Instruction::Load &&
      ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) &&
      Alignment >= LT.second.getScalarType().getStoreSize())
    return Cost + LT.first; // Add the cost of the permutations.

  // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
  // P7, unaligned vector loads are more expensive than the permutation-based
  // load sequence, so that might be used instead, but regardless, the net cost
  // is about the same (not counting loop-invariant instructions).
  if (IsVSXType || (ST->hasVSX() && IsAltivecType))
    return Cost;

  // Newer PPC supports unaligned memory access.
  if (TLI->allowsMisalignedMemoryAccesses(LT.second, 0))
    return Cost;

  // PPC in general does not support unaligned loads and stores. They'll need
  // to be decomposed based on the alignment factor.

  // Add the cost of each scalar load or store.
  Cost += LT.first * (SrcBytes / Alignment - 1);

  // For a vector type, there is also scalarization overhead (only for
  // stores, loads are expanded using the vector-load + permutation sequence,
  // which is much less expensive).
  if (Src->isVectorTy() && Opcode == Instruction::Store)
    for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
      Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);

  return Cost;
}

int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                           unsigned Factor,
                                           ArrayRef<unsigned> Indices,
                                           unsigned Alignment,
                                           unsigned AddressSpace,
                                           bool UseMaskForCond,
                                           bool UseMaskForGaps) {
  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace,
                                             UseMaskForCond, UseMaskForGaps);

  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy);

  // First, the cost of the load/store operation.
  int Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace);

  // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). For each result vector, we need one shuffle per incoming
  // vector (except that the first shuffle can take two incoming vectors
  // because it does not need to take itself).
  Cost += Factor * (LT.first - 1);

  return Cost;
}