//===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "PPCTargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;

#define DEBUG_TYPE "ppctti"

static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);

// This is currently only used for the data prefetch pass which is only enabled
// for BG/Q by default.
static cl::opt<unsigned>
CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64),
              cl::desc("The loop prefetch cache line size"));

//===----------------------------------------------------------------------===//
//
// PPC cost model.
//
//===----------------------------------------------------------------------===//

TargetTransformInfo::PopcntSupportKind
PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (ST->hasPOPCNTD() && TyWidth <= 64)
    return TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}

int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  if (Imm == 0)
    return TTI::TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Basic;

    if (isInt<32>(Imm.getSExtValue())) {
      // A constant that can be materialized using lis.
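      // (lis places a sign-extended 16-bit immediate into the upper halfword,
      // so a value whose low 16 bits are zero takes a single instruction;
      // otherwise a lis/ori pair is needed, hence the 2 * TCC_Basic below.)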
      if ((Imm.getZExtValue() & 0xFFFF) == 0)
        return TTI::TCC_Basic;

      return 2 * TTI::TCC_Basic;
    }
  }

  return 4 * TTI::TCC_Basic;
}

int PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                              Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(IID, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return PPCTTIImpl::getIntImmCost(Imm, Ty);
}

int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                              Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Opcode, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  unsigned ImmIdx = ~0U;
  bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
       ZeroFree = false;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::And:
    RunFree = true; // (for the rotate-and-mask instructions)
    // Fallthrough...
  case Instruction::Add:
  case Instruction::Or:
  case Instruction::Xor:
    ShiftedFree = true;
    // Fallthrough...
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    ImmIdx = 1;
    break;
  case Instruction::ICmp:
    UnsignedFree = true;
    ImmIdx = 1;
    // Fallthrough... (zero comparisons can use record-form instructions)
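    // (Record-form "dot" instructions set CR0 from a comparison of their
    // result with zero, so a separate compare against zero can often be
    // avoided.)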
  case Instruction::Select:
    ZeroFree = true;
    break;
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Ret:
  case Instruction::Load:
  case Instruction::Store:
    break;
  }

  if (ZeroFree && Imm == 0)
    return TTI::TCC_Free;

  if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;

    if (RunFree) {
      if (Imm.getBitWidth() <= 32 &&
          (isShiftedMask_32(Imm.getZExtValue()) ||
           isShiftedMask_32(~Imm.getZExtValue())))
        return TTI::TCC_Free;

      if (ST->isPPC64() &&
          (isShiftedMask_64(Imm.getZExtValue()) ||
           isShiftedMask_64(~Imm.getZExtValue())))
        return TTI::TCC_Free;
    }

    if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
      return TTI::TCC_Free;

    if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
      return TTI::TCC_Free;
  }

  return PPCTTIImpl::getIntImmCost(Imm, Ty);
}

void PPCTTIImpl::getUnrollingPreferences(Loop *L,
                                         TTI::UnrollingPreferences &UP) {
  if (ST->getDarwinDirective() == PPC::DIR_A2) {
    // The A2 is in-order with a deep pipeline, and concatenation unrolling
    // helps expose latency-hiding opportunities to the instruction scheduler.
    UP.Partial = UP.Runtime = true;

    // We unroll a lot on the A2 (hundreds of instructions), and the benefits
    // often outweigh the cost of a division to compute the trip count.
    UP.AllowExpensiveTripCount = true;
  }

  BaseT::getUnrollingPreferences(L, UP);
}

bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
  // On the A2, always unroll aggressively. For QPX unaligned loads, we depend
  // on combining the loads generated for consecutive accesses, and failure to
  // do so is particularly expensive. Aggressive interleaving makes that
  // combining much more likely than concatenation unrolling alone would.
  if (ST->getDarwinDirective() == PPC::DIR_A2)
    return true;

  return LoopHasReductions;
}

bool PPCTTIImpl::enableInterleavedAccessVectorization() {
  return true;
}

unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
  if (Vector && !ST->hasAltivec() && !ST->hasQPX())
    return 0;
  return ST->hasVSX() ? 64 : 32;
}

unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) {
  if (Vector) {
    if (ST->hasQPX()) return 256;
    if (ST->hasAltivec()) return 128;
    return 0;
  }

  if (ST->isPPC64())
    return 64;
  return 32;
}

unsigned PPCTTIImpl::getCacheLineSize() {
  // This is currently only used for the data prefetch pass which is only
  // enabled for BG/Q by default.
  return CacheLineSize;
}

unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  unsigned Directive = ST->getDarwinDirective();
  // The 440 has no SIMD support, but floating-point instructions
  // have a 5-cycle latency, so unroll by 5x for latency hiding.
  if (Directive == PPC::DIR_440)
    return 5;

  // The A2 has no SIMD support, but floating-point instructions
  // have a 6-cycle latency, so unroll by 6x for latency hiding.
  if (Directive == PPC::DIR_A2)
    return 6;

  // FIXME: For lack of any better information, do no harm...
  if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
    return 1;

  // For P7 and P8, floating-point instructions have a 6-cycle latency and
  // there are two execution units, so unroll by 12x for latency hiding.
  if (Directive == PPC::DIR_PWR7 ||
      Directive == PPC::DIR_PWR8)
    return 12;

  // For most things, modern systems have two execution units (and
  // out-of-order execution).
  return 2;
}

int PPCTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
    TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  // Fall back to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
                                       Opd1PropInfo, Opd2PropInfo);
}

int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                               Type *SubTp) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

  // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). We need one such shuffle instruction for each actual
  // register (this is not true for arbitrary shuffles, but is true for the
  // structured types of shuffles covered by TTI::ShuffleKind).
  return LT.first;
}

int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  return BaseT::getCastInstrCost(Opcode, Dst, Src);
}

int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}

int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
  assert(Val->isVectorTy() && "This must be a vector type");

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
    // Double-precision scalars are already located in index #0.
    if (Index == 0)
      return 0;

    return BaseT::getVectorInstrCost(Opcode, Val, Index);
  } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) {
    // Floating point scalars are already located in index #0.
    if (Index == 0)
      return 0;

    return BaseT::getVectorInstrCost(Opcode, Val, Index);
  }

  // Estimated cost of a load-hit-store delay. This was obtained
  // experimentally as a minimum needed to prevent unprofitable
  // vectorization for the paq8p benchmark. It may need to be
  // raised further if other unprofitable cases remain.
  unsigned LHSPenalty = 2;
  if (ISD == ISD::INSERT_VECTOR_ELT)
    LHSPenalty += 7;

  // Vector element inserts/extracts with Altivec are very expensive,
  // because they require a store and reload, with the attendant
  // processor stall for load-hit-store. Until VSX is available,
  // these need to be estimated as very costly.
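  // (The penalty below is added on top of the target-independent estimate
  // rather than replacing it.)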
  if (ISD == ISD::EXTRACT_VECTOR_ELT ||
      ISD == ISD::INSERT_VECTOR_ELT)
    return LHSPenalty + BaseT::getVectorInstrCost(Opcode, Val, Index);

  return BaseT::getVectorInstrCost(Opcode, Val, Index);
}

int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                unsigned AddressSpace) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);

  // Aligned loads and stores are easy.
  unsigned SrcBytes = LT.second.getStoreSize();
  if (!SrcBytes || !Alignment || Alignment >= SrcBytes)
    return Cost;

  bool IsAltivecType = ST->hasAltivec() &&
                       (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
                        LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
  bool IsVSXType = ST->hasVSX() &&
                   (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);
  bool IsQPXType = ST->hasQPX() &&
                   (LT.second == MVT::v4f64 || LT.second == MVT::v4f32);

  // If we can use the permutation-based load sequence, then this is also
  // relatively cheap (not counting loop-invariant instructions): one load plus
  // one permute (the last load in a series has extra cost, but we're
  // neglecting that here). Note that on the P7, we should do unaligned loads
  // for Altivec types using the VSX instructions, but that's more expensive
  // than using the permutation-based load sequence. On the P8, that's no
  // longer true.
  if (Opcode == Instruction::Load &&
      ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) &&
      Alignment >= LT.second.getScalarType().getStoreSize())
    return Cost + LT.first; // Add the cost of the permutations.

  // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
  // P7, unaligned vector loads are more expensive than the permutation-based
  // load sequence, so that might be used instead, but regardless, the net cost
  // is about the same (not counting loop-invariant instructions).
  if (IsVSXType || (ST->hasVSX() && IsAltivecType))
    return Cost;

  // PPC in general does not support unaligned loads and stores. They'll need
  // to be decomposed based on the alignment factor.

  // Add the cost of each scalar load or store.
  Cost += LT.first*(SrcBytes/Alignment-1);

  // For a vector type, there is also scalarization overhead (only for
  // stores; loads are expanded using the vector-load + permutation sequence,
  // which is much less expensive).
  if (Src->isVectorTy() && Opcode == Instruction::Store)
    for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
      Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);

  return Cost;
}

int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                           unsigned Factor,
                                           ArrayRef<unsigned> Indices,
                                           unsigned Alignment,
                                           unsigned AddressSpace) {
  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy);

  // First, the cost of the load or store operation itself.
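  // (getMemoryOpCost above already folds in any penalty for a misaligned wide
  // access, so only the shuffle cost is added below.)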
  int Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace);

  // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). For each result vector, we need one shuffle per incoming
  // vector (except that the first shuffle can take two incoming vectors
  // because it does not need to take itself).
  Cost += Factor*(LT.first-1);

  return Cost;
}