//===-- PPCTargetTransformInfo.cpp - PPC specific TTI pass ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// PPC target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//

#include "PPC.h"
#include "PPCTargetMachine.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;

#define DEBUG_TYPE "ppctti"

static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);

// Declare the pass initialization routine locally as target-specific passes
// don't have a target-wide initialization entry point, and so we rely on the
// pass constructor initialization.
namespace llvm {
void initializePPCTTIPass(PassRegistry &);
}

namespace {

class PPCTTI final : public ImmutablePass, public TargetTransformInfo {
  const TargetMachine *TM;
  const PPCSubtarget *ST;
  const PPCTargetLowering *TLI;

public:
  PPCTTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
    llvm_unreachable("This pass cannot be directly constructed");
  }

  PPCTTI(const PPCTargetMachine *TM)
      : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
        TLI(TM->getSubtargetImpl()->getTargetLowering()) {
    initializePPCTTIPass(*PassRegistry::getPassRegistry());
  }

  void initializePass() override {
    pushTTIStack(this);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    TargetTransformInfo::getAnalysisUsage(AU);
  }

  /// Pass identification.
  static char ID;

  /// Provide necessary pointer adjustments for the two base classes.
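  /// Because PPCTTI multiply inherits from ImmutablePass and
  /// TargetTransformInfo, the TargetTransformInfo subobject lives at a
  /// nonzero offset within the object; clients that look this analysis up
  /// by TargetTransformInfo::ID need the explicit cast below to obtain a
  /// correctly adjusted pointer.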
  void *getAdjustedAnalysisPointer(const void *ID) override {
    if (ID == &TargetTransformInfo::ID)
      return (TargetTransformInfo*)this;
    return this;
  }

  /// \name Scalar TTI Implementations
  /// @{
  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;

  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                         Type *Ty) const override;
  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                         Type *Ty) const override;

  PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
  void getUnrollingPreferences(const Function *F, Loop *L,
                               UnrollingPreferences &UP) const override;

  /// @}

  /// \name Vector TTI Implementations
  /// @{

  unsigned getNumberOfRegisters(bool Vector) const override;
  unsigned getRegisterBitWidth(bool Vector) const override;
  unsigned getMaxInterleaveFactor() const override;
  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind,
                                  OperandValueKind, OperandValueProperties,
                                  OperandValueProperties) const override;
  unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
                          int Index, Type *SubTp) const override;
  unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
                            Type *Src) const override;
  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                              Type *CondTy) const override;
  unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
                              unsigned Index) const override;
  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                           unsigned AddressSpace) const override;

  /// @}
};

} // end anonymous namespace

INITIALIZE_AG_PASS(PPCTTI, TargetTransformInfo, "ppctti",
                   "PPC Target Transform Info", true, true, false)
char PPCTTI::ID = 0;

ImmutablePass *
llvm::createPPCTargetTransformInfoPass(const PPCTargetMachine *TM) {
  return new PPCTTI(TM);
}


//===----------------------------------------------------------------------===//
//
// PPC cost model.
//
//===----------------------------------------------------------------------===//

PPCTTI::PopcntSupportKind PPCTTI::getPopcntSupport(unsigned TyWidth) const {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (ST->hasPOPCNTD() && TyWidth <= 64)
    return PSK_FastHardware;
  return PSK_Software;
}

unsigned PPCTTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
  if (DisablePPCConstHoist)
    return TargetTransformInfo::getIntImmCost(Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  if (Imm == 0)
    return TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TCC_Basic;

    if (isInt<32>(Imm.getSExtValue())) {
      // A constant that can be materialized using lis.
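      // For example (register choice illustrative): 0x12340000 is a single
      // "lis r3, 0x1234", while 0x12345678 also needs a following
      // "ori r3, r3, 0x5678" -- the 2 * TCC_Basic case below.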
      if ((Imm.getZExtValue() & 0xFFFF) == 0)
        return TCC_Basic;

      return 2 * TCC_Basic;
    }
  }

  return 4 * TCC_Basic;
}

unsigned PPCTTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
                               const APInt &Imm, Type *Ty) const {
  if (DisablePPCConstHoist)
    return TargetTransformInfo::getIntImmCost(IID, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  switch (IID) {
  default: return TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
      return TCC_Free;
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TCC_Free;
    break;
  }
  return PPCTTI::getIntImmCost(Imm, Ty);
}

unsigned PPCTTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                               Type *Ty) const {
  if (DisablePPCConstHoist)
    return TargetTransformInfo::getIntImmCost(Opcode, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  unsigned ImmIdx = ~0U;
  bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
       ZeroFree = false;
  switch (Opcode) {
  default: return TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TCC_Basic;
    return TCC_Free;
  case Instruction::And:
    RunFree = true; // (for the rotate-and-mask instructions)
    // Fallthrough...
  case Instruction::Add:
  case Instruction::Or:
  case Instruction::Xor:
    ShiftedFree = true;
    // Fallthrough...
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    ImmIdx = 1;
    break;
  case Instruction::ICmp:
    UnsignedFree = true;
    ImmIdx = 1;
    // Fallthrough... (zero comparisons can use record-form instructions)
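    // (A record-form, dot-suffixed instruction such as "andi." sets CR0 as
    // if its result had been compared against zero, so an explicit compare
    // with zero can often be folded away.)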
  case Instruction::Select:
    ZeroFree = true;
    break;
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Ret:
  case Instruction::Load:
  case Instruction::Store:
    break;
  }

  if (ZeroFree && Imm == 0)
    return TCC_Free;

  if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TCC_Free;

    if (RunFree) {
      if (Imm.getBitWidth() <= 32 &&
          (isShiftedMask_32(Imm.getZExtValue()) ||
           isShiftedMask_32(~Imm.getZExtValue())))
        return TCC_Free;

      if (ST->isPPC64() &&
          (isShiftedMask_64(Imm.getZExtValue()) ||
           isShiftedMask_64(~Imm.getZExtValue())))
        return TCC_Free;
    }

    if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
      return TCC_Free;

    if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
      return TCC_Free;
  }

  return PPCTTI::getIntImmCost(Imm, Ty);
}

void PPCTTI::getUnrollingPreferences(const Function *F, Loop *L,
                                     UnrollingPreferences &UP) const {
  if (TM->getSubtarget<PPCSubtarget>(F).getDarwinDirective() == PPC::DIR_A2) {
    // The A2 is in-order with a deep pipeline, and concatenation unrolling
    // helps expose latency-hiding opportunities to the instruction scheduler.
    UP.Partial = UP.Runtime = true;
  }

  TargetTransformInfo::getUnrollingPreferences(F, L, UP);
}

unsigned PPCTTI::getNumberOfRegisters(bool Vector) const {
  if (Vector && !ST->hasAltivec())
    return 0;
  return ST->hasVSX() ? 64 : 32;
}

unsigned PPCTTI::getRegisterBitWidth(bool Vector) const {
  if (Vector) {
    if (ST->hasAltivec()) return 128;
    return 0;
  }

  if (ST->isPPC64())
    return 64;
  return 32;
}

unsigned PPCTTI::getMaxInterleaveFactor() const {
  unsigned Directive = ST->getDarwinDirective();
  // The 440 has no SIMD support, but floating-point instructions
  // have a 5-cycle latency, so unroll by 5x for latency hiding.
  if (Directive == PPC::DIR_440)
    return 5;

  // The A2 has no SIMD support, but floating-point instructions
  // have a 6-cycle latency, so unroll by 6x for latency hiding.
  if (Directive == PPC::DIR_A2)
    return 6;

  // FIXME: For lack of any better information, do no harm...
  if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
    return 1;

  // For most things, modern systems have two execution units (and
  // out-of-order execution).
  return 2;
}

unsigned PPCTTI::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, OperandValueKind Op1Info,
    OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo,
    OperandValueProperties Opd2PropInfo) const {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  // Fall back to the default implementation.
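  // (The default, target-independent implementation ultimately prices this
  // from type legalization; for example, a vector type that must be split
  // is charged once per resulting piece.)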
  return TargetTransformInfo::getArithmeticInstrCost(
      Opcode, Ty, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
}

unsigned PPCTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
                                Type *SubTp) const {
  return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
}

unsigned PPCTTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
}

unsigned PPCTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                    Type *CondTy) const {
  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}

unsigned PPCTTI::getVectorInstrCost(unsigned Opcode, Type *Val,
                                    unsigned Index) const {
  assert(Val->isVectorTy() && "This must be a vector type");

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
    // Double-precision scalars are already located in index #0.
    if (Index == 0)
      return 0;

    return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
  }

  // Estimated cost of a load-hit-store delay. This was obtained
  // experimentally as a minimum needed to prevent unprofitable
  // vectorization for the paq8p benchmark. It may need to be
  // raised further if other unprofitable cases remain.
  unsigned LHSPenalty = 2;
  if (ISD == ISD::INSERT_VECTOR_ELT)
    LHSPenalty += 7;

  // Vector element insert/extract with Altivec is very expensive because it
  // requires a store and reload, with the attendant processor stall for
  // load-hit-store. Until VSX is available, these operations need to be
  // estimated as very costly.
  if (ISD == ISD::EXTRACT_VECTOR_ELT ||
      ISD == ISD::INSERT_VECTOR_ELT)
    return LHSPenalty +
           TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);

  return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
}

unsigned PPCTTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                 unsigned AddressSpace) const {
  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  unsigned Cost =
    TargetTransformInfo::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);

  // VSX loads/stores support unaligned access.
  if (ST->hasVSX()) {
    if (LT.second == MVT::v2f64 || LT.second == MVT::v2i64)
      return Cost;
  }

  bool UnalignedAltivec =
    Src->isVectorTy() &&
    Src->getPrimitiveSizeInBits() >= LT.second.getSizeInBits() &&
    LT.second.getSizeInBits() == 128 &&
    Opcode == Instruction::Load;

  // PPC in general does not support unaligned loads and stores. They'll need
  // to be decomposed based on the alignment factor.
  unsigned SrcBytes = LT.second.getStoreSize();
  if (SrcBytes && Alignment && Alignment < SrcBytes && !UnalignedAltivec) {
    Cost += LT.first * (SrcBytes / Alignment - 1);

    // For a vector type, there is also scalarization overhead (only for
    // stores; loads are expanded using a vector-load + permutation sequence,
    // which is much less expensive).
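    // For example (numbers illustrative): a <4 x i32> store with 4-byte
    // alignment adds LT.first * (16/4 - 1) here, plus one element extract
    // per lane from the loop below.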
    if (Src->isVectorTy() && Opcode == Instruction::Store)
      for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
        Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);
  }

  return Cost;
}