//===-- PPCTargetTransformInfo.cpp - PPC specific TTI pass ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// PPC target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//

#include "PPC.h"
#include "PPCTargetMachine.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;

#define DEBUG_TYPE "ppctti"

static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);

// Declare the pass initialization routine locally as target-specific passes
// don't have a target-wide initialization entry point, and so we rely on the
// pass constructor initialization.
namespace llvm {
void initializePPCTTIPass(PassRegistry &);
}

namespace {

class PPCTTI final : public ImmutablePass, public TargetTransformInfo {
  const TargetMachine *TM;
  const PPCSubtarget *ST;
  const PPCTargetLowering *TLI;

public:
  PPCTTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
    llvm_unreachable("This pass cannot be directly constructed");
  }

  PPCTTI(const PPCTargetMachine *TM)
      : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
        TLI(TM->getSubtargetImpl()->getTargetLowering()) {
    initializePPCTTIPass(*PassRegistry::getPassRegistry());
  }

  void initializePass() override {
    pushTTIStack(this);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    TargetTransformInfo::getAnalysisUsage(AU);
  }

  /// Pass identification.
  static char ID;

  /// Provide necessary pointer adjustments for the two base classes.
  void *getAdjustedAnalysisPointer(const void *ID) override {
    if (ID == &TargetTransformInfo::ID)
      return (TargetTransformInfo*)this;
    return this;
  }

  /// \name Scalar TTI Implementations
  /// @{
  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;

  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                         Type *Ty) const override;
  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                         Type *Ty) const override;

  PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
  void getUnrollingPreferences(const Function *F, Loop *L,
                               UnrollingPreferences &UP) const override;

  /// @}

  /// \name Vector TTI Implementations
  /// @{

  unsigned getNumberOfRegisters(bool Vector) const override;
  unsigned getRegisterBitWidth(bool Vector) const override;
  unsigned getMaxInterleaveFactor() const override;
  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind,
                                  OperandValueKind, OperandValueProperties,
                                  OperandValueProperties) const override;
  unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
                          int Index, Type *SubTp) const override;
  unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
                            Type *Src) const override;
  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                              Type *CondTy) const override;
  unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
                              unsigned Index) const override;
  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                           unsigned AddressSpace) const override;

  /// @}
};

} // end anonymous namespace

INITIALIZE_AG_PASS(PPCTTI, TargetTransformInfo, "ppctti",
                   "PPC Target Transform Info", true, true, false)
char PPCTTI::ID = 0;

ImmutablePass *
llvm::createPPCTargetTransformInfoPass(const PPCTargetMachine *TM) {
  return new PPCTTI(TM);
}
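
// With the legacy pass manager, clients reach this implementation through the
// TargetTransformInfo analysis group rather than by naming PPCTTI directly.
// A minimal sketch of a typical query (assuming a pass that has declared a
// TargetTransformInfo dependency in its getAnalysisUsage):
//
//   const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
//   if (TTI.getPopcntSupport(64) == TargetTransformInfo::PSK_FastHardware) {
//     // Prefer a popcount-based sequence; PPCTTI returns PSK_FastHardware
//     // when the subtarget has POPCNTD (see getPopcntSupport below).
//   }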

//===----------------------------------------------------------------------===//
//
// PPC cost model.
//
//===----------------------------------------------------------------------===//

PPCTTI::PopcntSupportKind PPCTTI::getPopcntSupport(unsigned TyWidth) const {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (ST->hasPOPCNTD() && TyWidth <= 64)
    return PSK_FastHardware;
  return PSK_Software;
}

unsigned PPCTTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
  if (DisablePPCConstHoist)
    return TargetTransformInfo::getIntImmCost(Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  if (Imm == 0)
    return TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TCC_Basic;

    if (isInt<32>(Imm.getSExtValue())) {
      // A constant that can be materialized using lis.
      if ((Imm.getZExtValue() & 0xFFFF) == 0)
        return TCC_Basic;

      return 2 * TCC_Basic;
    }
  }

  return 4 * TCC_Basic;
}
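
// For reference, a sketch of how these cost tiers map onto materialization
// sequences (the instruction choices here are illustrative, not something the
// code above inspects):
//
//   li   r3, -42          ; isInt<16>:       1 instruction  -> TCC_Basic
//   lis  r3, 0x1234       ; low 16 bits zero: 1 instruction -> TCC_Basic
//   lis  r3, 0x1234       ; other 32-bit values: lis + ori
//   ori  r3, r3, 0x5678   ;                  2 instructions -> 2 * TCC_Basic
//
// Wider 64-bit constants additionally need a shift and a second lis/ori
// pair, which the 4 * TCC_Basic fallback approximates.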

unsigned PPCTTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
                               const APInt &Imm, Type *Ty) const {
  if (DisablePPCConstHoist)
    return TargetTransformInfo::getIntImmCost(IID, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  switch (IID) {
  default: return TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
      return TCC_Free;
    break;
  }
  return PPCTTI::getIntImmCost(Imm, Ty);
}

unsigned PPCTTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                               Type *Ty) const {
  if (DisablePPCConstHoist)
    return TargetTransformInfo::getIntImmCost(Opcode, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  unsigned ImmIdx = ~0U;
  bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
       ZeroFree = false;
  switch (Opcode) {
  default: return TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TCC_Basic;
    return TCC_Free;
  case Instruction::And:
    RunFree = true; // (for the rotate-and-mask instructions)
    // Fallthrough...
  case Instruction::Add:
  case Instruction::Or:
  case Instruction::Xor:
    ShiftedFree = true;
    // Fallthrough...
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    ImmIdx = 1;
    break;
  case Instruction::ICmp:
    UnsignedFree = true;
    ImmIdx = 1;
    // Fallthrough... (zero comparisons can use record-form instructions)
  case Instruction::Select:
    ZeroFree = true;
    break;
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Ret:
  case Instruction::Load:
  case Instruction::Store:
    break;
  }

  if (ZeroFree && Imm == 0)
    return TCC_Free;

  if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TCC_Free;

    if (RunFree) {
      if (Imm.getBitWidth() <= 32 &&
          (isShiftedMask_32(Imm.getZExtValue()) ||
           isShiftedMask_32(~Imm.getZExtValue())))
        return TCC_Free;

      if (ST->isPPC64() &&
          (isShiftedMask_64(Imm.getZExtValue()) ||
           isShiftedMask_64(~Imm.getZExtValue())))
        return TCC_Free;
    }

    if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
      return TCC_Free;

    if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
      return TCC_Free;
  }

  return PPCTTI::getIntImmCost(Imm, Ty);
}
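
// An illustrative sketch of the RunFree case: an `and` whose mask is a
// contiguous run of ones (or the complement of one) folds into a single
// rotate-and-mask instruction, so the mask constant never needs to be
// materialized in a register:
//
//   rlwinm r3, r3, 0, 20, 27  ; 32-bit mask 0x00000FF0: rotate-then-mask
//   rldicl r3, r3, 0, 40      ; 64-bit: clear the high 40 bits (mask 0xFFFFFF)
//
// The actual instruction selection happens elsewhere; here we only need to
// know that such masks cost no extra materialization instructions.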

void PPCTTI::getUnrollingPreferences(const Function *F, Loop *L,
                                     UnrollingPreferences &UP) const {
  if (TM->getSubtarget<PPCSubtarget>(F).getDarwinDirective() == PPC::DIR_A2) {
    // The A2 is in-order with a deep pipeline, and concatenation unrolling
    // helps expose latency-hiding opportunities to the instruction scheduler.
    UP.Partial = UP.Runtime = true;
  }

  TargetTransformInfo::getUnrollingPreferences(F, L, UP);
}

unsigned PPCTTI::getNumberOfRegisters(bool Vector) const {
  if (Vector && !ST->hasAltivec())
    return 0;
  return ST->hasVSX() ? 64 : 32;
}

unsigned PPCTTI::getRegisterBitWidth(bool Vector) const {
  if (Vector) {
    if (ST->hasAltivec()) return 128;
    return 0;
  }

  if (ST->isPPC64())
    return 64;
  return 32;
}

unsigned PPCTTI::getMaxInterleaveFactor() const {
  unsigned Directive = ST->getDarwinDirective();
  // The 440 has no SIMD support, but floating-point instructions
  // have a 5-cycle latency, so unroll by 5x for latency hiding.
  if (Directive == PPC::DIR_440)
    return 5;

  // The A2 has no SIMD support, but floating-point instructions
  // have a 6-cycle latency, so unroll by 6x for latency hiding.
  if (Directive == PPC::DIR_A2)
    return 6;

  // FIXME: For lack of any better information, do no harm...
  if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
    return 1;

  // For most things, modern systems have two execution units (and
  // out-of-order execution).
  return 2;
}

unsigned PPCTTI::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, OperandValueKind Op1Info,
    OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo,
    OperandValueProperties Opd2PropInfo) const {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  // Fallback to the default implementation.
  return TargetTransformInfo::getArithmeticInstrCost(
      Opcode, Ty, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
}

unsigned PPCTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
                                Type *SubTp) const {
  return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
}

unsigned PPCTTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
}

unsigned PPCTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                    Type *CondTy) const {
  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}

unsigned PPCTTI::getVectorInstrCost(unsigned Opcode, Type *Val,
                                    unsigned Index) const {
  assert(Val->isVectorTy() && "This must be a vector type");

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
    // Double-precision scalars are already located in index #0.
    if (Index == 0)
      return 0;

    return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
  }

  // Estimated cost of a load-hit-store delay. This was obtained
  // experimentally as a minimum needed to prevent unprofitable
  // vectorization for the paq8p benchmark. It may need to be
  // raised further if other unprofitable cases remain.
  unsigned LHSPenalty = 2;
  if (ISD == ISD::INSERT_VECTOR_ELT)
    LHSPenalty += 7;

  // Vector element insert/extract with Altivec is very expensive,
  // because they require store and reload with the attendant
  // processor stall for load-hit-store. Until VSX is available,
  // these need to be estimated as very costly.
  if (ISD == ISD::EXTRACT_VECTOR_ELT ||
      ISD == ISD::INSERT_VECTOR_ELT)
    return LHSPenalty +
      TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);

  return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
}
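
// Worked example of the Altivec penalty above: extracting an element from a
// <4 x float> costs the base TTI estimate plus LHSPenalty (2), while
// inserting one costs the base estimate plus 2 + 7 = 9, reflecting the
// store-vector/reload-scalar round trip through memory and the
// load-hit-store stall it provokes. These figures are tuning constants,
// not measured cycle counts.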

unsigned PPCTTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                 unsigned AddressSpace) const {
  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  unsigned Cost =
    TargetTransformInfo::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);

  // VSX loads/stores support unaligned access.
  if (ST->hasVSX()) {
    if (LT.second == MVT::v2f64 || LT.second == MVT::v2i64)
      return Cost;
  }

  bool UnalignedAltivec =
    Src->isVectorTy() &&
    Src->getPrimitiveSizeInBits() >= LT.second.getSizeInBits() &&
    LT.second.getSizeInBits() == 128 &&
    Opcode == Instruction::Load;

  // PPC in general does not support unaligned loads and stores. They'll need
  // to be decomposed based on the alignment factor.
  unsigned SrcBytes = LT.second.getStoreSize();
  if (SrcBytes && Alignment && Alignment < SrcBytes && !UnalignedAltivec) {
    Cost += LT.first*(SrcBytes/Alignment-1);

    // For a vector type, there is also scalarization overhead (only for
    // stores; loads are expanded using a vector-load + permutation sequence,
    // which is much less expensive).
    if (Src->isVectorTy() && Opcode == Instruction::Store)
      for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
        Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);
  }

  return Cost;
}
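
// Worked example of the unaligned-access penalty: a store of <4 x float>
// (SrcBytes = 16) with 4-byte alignment adds LT.first * (16/4 - 1) = 3 to
// the base cost, plus the cost of extracting each of the four elements for
// the scalarized store. An unaligned <4 x float> load on Altivec hardware
// avoids the penalty entirely (UnalignedAltivec), since it can be lowered
// to two aligned vector loads plus a permute.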