//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a TargetTransformInfo analysis pass specific to the
// SystemZ target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "SystemZTargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"

using namespace llvm;

#define DEBUG_TYPE "systemztti"

//===----------------------------------------------------------------------===//
//
// SystemZ cost model.
//
//===----------------------------------------------------------------------===//

static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse) {
  bool UsedAsMemCpySource = false;
  for (const User *U : V->users())
    if (const Instruction *User = dyn_cast<Instruction>(U)) {
      if (isa<BitCastInst>(User) || isa<GetElementPtrInst>(User)) {
        UsedAsMemCpySource |= isUsedAsMemCpySource(User, OtherUse);
        continue;
      }
      if (const MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(User)) {
        if (Memcpy->getOperand(1) == V && !Memcpy->isVolatile()) {
          UsedAsMemCpySource = true;
          continue;
        }
      }
      OtherUse = true;
    }
  return UsedAsMemCpySource;
}

static void countNumMemAccesses(const Value *Ptr, unsigned &NumStores,
                                unsigned &NumLoads, const Function *F) {
  if (!isa<PointerType>(Ptr->getType()))
    return;
  for (const User *U : Ptr->users())
    if (const Instruction *User = dyn_cast<Instruction>(U)) {
      if (User->getParent()->getParent() == F) {
        if (const auto *SI = dyn_cast<StoreInst>(User)) {
          if (SI->getPointerOperand() == Ptr && !SI->isVolatile())
            NumStores++;
        } else if (const auto *LI = dyn_cast<LoadInst>(User)) {
          if (LI->getPointerOperand() == Ptr && !LI->isVolatile())
            NumLoads++;
        } else if (const auto *GEP = dyn_cast<GetElementPtrInst>(User)) {
          if (GEP->getPointerOperand() == Ptr)
            countNumMemAccesses(GEP, NumStores, NumLoads, F);
        }
      }
    }
}

unsigned SystemZTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
  unsigned Bonus = 0;
  const Function *Caller = CB->getParent()->getParent();
  const Function *Callee = CB->getCalledFunction();
  if (!Callee)
    return 0;
  const Module *M = Caller->getParent();

  // Increase the threshold if an incoming argument is used only as a memcpy
  // source.
  for (const Argument &Arg : Callee->args()) {
    bool OtherUse = false;
    if (isUsedAsMemCpySource(&Arg, OtherUse) && !OtherUse) {
      Bonus = 1000;
      break;
    }
  }

  // Give a bonus for globals used heavily in both caller and callee.
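  // A global counts as heavily used when both the caller and the callee
  // perform more than 10 loads/stores of it (checked below).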
  std::set<const GlobalVariable *> CalleeGlobals;
  std::set<const GlobalVariable *> CallerGlobals;
  for (const GlobalVariable &Global : M->globals())
    for (const User *U : Global.users())
      if (const Instruction *User = dyn_cast<Instruction>(U)) {
        if (User->getParent()->getParent() == Callee)
          CalleeGlobals.insert(&Global);
        if (User->getParent()->getParent() == Caller)
          CallerGlobals.insert(&Global);
      }
  for (auto *GV : CalleeGlobals)
    if (CallerGlobals.count(GV)) {
      unsigned CalleeStores = 0, CalleeLoads = 0;
      unsigned CallerStores = 0, CallerLoads = 0;
      countNumMemAccesses(GV, CalleeStores, CalleeLoads, Callee);
      countNumMemAccesses(GV, CallerStores, CallerLoads, Caller);
      if ((CalleeStores + CalleeLoads) > 10 &&
          (CallerStores + CallerLoads) > 10) {
        Bonus = 1000;
        break;
      }
    }

  // Give a bonus when Callee accesses an Alloca of Caller heavily.
  unsigned NumStores = 0;
  unsigned NumLoads = 0;
  for (unsigned OpIdx = 0; OpIdx != Callee->arg_size(); ++OpIdx) {
    Value *CallerArg = CB->getArgOperand(OpIdx);
    Argument *CalleeArg = Callee->getArg(OpIdx);
    if (isa<AllocaInst>(CallerArg))
      countNumMemAccesses(CalleeArg, NumStores, NumLoads, Callee);
  }
  if (NumLoads > 10)
    Bonus += NumLoads * 50;
  if (NumStores > 10)
    Bonus += NumStores * 50;
  Bonus = std::min(Bonus, unsigned(1000));

  LLVM_DEBUG(if (Bonus)
               dbgs() << "++ SZTTI Adding inlining bonus: " << Bonus << "\n";);
  return Bonus;
}

InstructionCost SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // No cost model implemented yet for operations on integers wider than
  // 128 bits.
  if ((!ST->hasVector() && BitSize > 64) || BitSize > 128)
    return TTI::TCC_Free;

  if (Imm == 0)
    return TTI::TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    // Constants loaded via lgfi.
    if (isInt<32>(Imm.getSExtValue()))
      return TTI::TCC_Basic;
    // Constants loaded via llilf.
    if (isUInt<32>(Imm.getZExtValue()))
      return TTI::TCC_Basic;
    // Constants loaded via llihf:
    if ((Imm.getZExtValue() & 0xffffffff) == 0)
      return TTI::TCC_Basic;

    return 2 * TTI::TCC_Basic;
  }

  // i128 immediates are loaded from the constant pool.
  return 2 * TTI::TCC_Basic;
}

InstructionCost SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty,
                                                  TTI::TargetCostKind CostKind,
                                                  Instruction *Inst) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // No cost model implemented yet for operations on integers wider than
  // 64 bits.
  if (BitSize > 64)
    return TTI::TCC_Free;

  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
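    // (Idx 0 is the GEP's pointer operand, i.e. the base address.)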
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    if (Idx == 0 && Imm.getBitWidth() <= 64) {
      // Any 8-bit immediate store can be implemented via mvi.
      if (BitSize == 8)
        return TTI::TCC_Free;
      // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
      if (isInt<16>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::ICmp:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Comparisons against signed 32-bit immediates implemented via cgfi.
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
      // Comparisons against unsigned 32-bit immediates implemented via clgfi.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Add:
  case Instruction::Sub:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      // Or their negation, by swapping addition vs. subtraction.
      if (isUInt<32>(-Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Mul:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // We use msgfi to multiply by 32-bit signed immediates.
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Or:
  case Instruction::Xor:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Masks supported by oilf/xilf.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      // Masks supported by oihf/xihf.
      if ((Imm.getZExtValue() & 0xffffffff) == 0)
        return TTI::TCC_Free;
    }
    break;
  case Instruction::And:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Any 32-bit AND operation can be implemented via nilf.
      if (BitSize <= 32)
        return TTI::TCC_Free;
      // 64-bit masks supported by nilf.
      if (isUInt<32>(~Imm.getZExtValue()))
        return TTI::TCC_Free;
      // 64-bit masks supported by nilh.
      if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
        return TTI::TCC_Free;
      // Some 64-bit AND operations can be implemented via risbg.
      const SystemZInstrInfo *TII = ST->getInstrInfo();
      unsigned Start, End;
      if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    // Always return TCC_Free for the shift amount of a shift instruction.
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost
SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // No cost model implemented yet for operations on integers wider than
  // 64 bits.
  if (BitSize > 64)
    return TTI::TCC_Free;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    // These get expanded to include a normal addition/subtraction.
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      if (isUInt<32>(-Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    // These get expanded to include a normal multiplication.
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

TargetTransformInfo::PopcntSupportKind
SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
  if (ST->hasPopulationCount() && TyWidth <= 64)
    return TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}

void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                             TTI::UnrollingPreferences &UP,
                                             OptimizationRemarkEmitter *ORE) {
  // Find out if L contains a call, what the machine instruction count
  // estimate is, and how many stores there are.
  bool HasCall = false;
  InstructionCost NumStores = 0;
  for (auto &BB : L->blocks())
    for (auto &I : *BB) {
      if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (isLoweredToCall(F))
            HasCall = true;
          if (F->getIntrinsicID() == Intrinsic::memcpy ||
              F->getIntrinsicID() == Intrinsic::memset)
            NumStores++;
        } else { // indirect call.
          HasCall = true;
        }
      }
      if (isa<StoreInst>(&I)) {
        Type *MemAccessTy = I.getOperand(0)->getType();
        NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy,
                                     std::nullopt, 0, TTI::TCK_RecipThroughput);
      }
    }

  // The z13 processor will run out of store tags if too many stores
  // are fed into it too quickly. Therefore make sure there are not
  // too many stores in the resulting unrolled loop.
  unsigned const NumStoresVal = *NumStores.getValue();
  unsigned const Max = (NumStoresVal ? (12 / NumStoresVal) : UINT_MAX);

  if (HasCall) {
    // Only allow full unrolling if the loop has any calls.
    UP.FullUnrollMaxCount = Max;
    UP.MaxCount = 1;
    return;
  }

  UP.MaxCount = Max;
  if (UP.MaxCount <= 1)
    return;

  // Allow partial and runtime trip count unrolling.
  UP.Partial = UP.Runtime = true;

  UP.PartialThreshold = 75;
  UP.DefaultUnrollRuntimeCount = 4;

  // Allow expensive instructions in the pre-header of the loop.
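  // (i.e. potentially costly runtime trip-count computations are acceptable.)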
  UP.AllowExpensiveTripCount = true;

  UP.Force = true;
}

void SystemZTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

bool SystemZTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                   const TargetTransformInfo::LSRCost &C2) {
  // SystemZ specific: check instruction count (first), and don't care about
  // ImmCost, since offsets are checked explicitly.
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.SetupCost);
}

unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (!Vector)
    // Discount the stack pointer. Also leave out %r0, since it can't
    // be used in an address.
    return 14;
  if (ST->hasVector())
    return 32;
  return 0;
}

TypeSize
SystemZTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(64);
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(ST->hasVector() ? 128 : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
                                              unsigned NumStridedMemAccesses,
                                              unsigned NumPrefetches,
                                              bool HasCall) const {
  // Don't prefetch a loop with many far apart accesses.
  if (NumPrefetches > 16)
    return UINT_MAX;

  // Emit prefetch instructions for smaller strides in cases where we think
  // the hardware prefetcher might not be able to keep up.
  if (NumStridedMemAccesses > 32 && !HasCall &&
      (NumMemAccesses - NumStridedMemAccesses) * 32 <= NumStridedMemAccesses)
    return 1;

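  // Otherwise, the value returned below is the minimum access stride in bytes
  // for which emitting software prefetches is considered worthwhile.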
  return ST->hasMiscellaneousExtensions3() ? 8192 : 2048;
}

bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
  EVT VT = TLI->getValueType(DL, DataType);
  return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
}

static bool isFreeEltLoad(Value *Op) {
  if (isa<LoadInst>(Op) && Op->hasOneUse()) {
    const Instruction *UserI = cast<Instruction>(*Op->user_begin());
    return !isa<StoreInst>(UserI); // Prefer MVC
  }
  return false;
}

InstructionCost SystemZTTIImpl::getScalarizationOverhead(
    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
    TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
  unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
  InstructionCost Cost = 0;

  if (Insert && Ty->isIntOrIntVectorTy(64)) {
    // VLVGP will insert two GPRs with one instruction, while VLE will load
    // an element directly with no extra cost.
    assert((VL.empty() || VL.size() == NumElts) &&
           "Type does not match the number of values.");
    InstructionCost CurrVectorCost = 0;
    for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
      if (DemandedElts[Idx] && !(VL.size() && isFreeEltLoad(VL[Idx])))
        ++CurrVectorCost;
      if (Idx % 2 == 1) {
        Cost += std::min(InstructionCost(1), CurrVectorCost);
        CurrVectorCost = 0;
      }
    }
    Insert = false;
  }

  Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
                                          CostKind, VL);
  return Cost;
}

// Return the bit size for the scalar type or vector element
// type. getScalarSizeInBits() returns 0 for a pointer type.
static unsigned getScalarSizeInBits(Type *Ty) {
  unsigned Size =
      (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
  assert(Size > 0 && "Element must have non-zero size.");
  return Size;
}

// getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
// 3.
static unsigned getNumVectorRegs(Type *Ty) {
  auto *VTy = cast<FixedVectorType>(Ty);
  unsigned WideBits = getScalarSizeInBits(Ty) * VTy->getNumElements();
  assert(WideBits > 0 && "Could not compute size of vector");
  return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
}

InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Args, CxtI);

  // TODO: return a good value for BB-VECTORIZER that includes the
  // immediate loads, which we do not want to count for the loop
  // vectorizer, since they are hopefully hoisted out of the loop. This
  // would require a new parameter 'InLoop', but not sure if constant
  // args are common enough to motivate this.

  unsigned ScalarBits = Ty->getScalarSizeInBits();

  // There are three cases of division and remainder: Dividing with a register
  // needs a divide instruction. A divisor which is a power of two constant
  // can be implemented with a sequence of shifts. Any other constant needs a
  // multiply and shifts.
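  // (Division by any other constant is typically expanded into a multiply by a
  // "magic" reciprocal plus shifts, which DivMulSeqCost approximates.)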
  const unsigned DivInstrCost = 20;
  const unsigned DivMulSeqCost = 10;
  const unsigned SDivPow2Cost = 4;

  bool SignedDivRem =
      Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
  bool UnsignedDivRem =
      Opcode == Instruction::UDiv || Opcode == Instruction::URem;

  // Check for a constant divisor.
  bool DivRemConst = false;
  bool DivRemConstPow2 = false;
  if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
    if (const Constant *C = dyn_cast<Constant>(Args[1])) {
      const ConstantInt *CVal =
          (C->getType()->isVectorTy()
               ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
               : dyn_cast<const ConstantInt>(C));
      if (CVal && (CVal->getValue().isPowerOf2() ||
                   CVal->getValue().isNegatedPowerOf2()))
        DivRemConstPow2 = true;
      else
        DivRemConst = true;
    }
  }

  if (!Ty->isVectorTy()) {
    // These FP operations are supported with a dedicated instruction for
    // float, double and fp128 (base implementation assumes float generally
    // costs 2).
    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
        Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
      return 1;

    // There is no native support for FRem.
    if (Opcode == Instruction::FRem)
      return LIBCALL_COST;

    // Give a discount for some combined logical operations if supported.
    if (Args.size() == 2) {
      if (Opcode == Instruction::Xor) {
        for (const Value *A : Args) {
          if (const Instruction *I = dyn_cast<Instruction>(A))
            if (I->hasOneUse() &&
                (I->getOpcode() == Instruction::Or ||
                 I->getOpcode() == Instruction::And ||
                 I->getOpcode() == Instruction::Xor))
              if ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
                  (isInt128InVR(Ty) &&
                   (I->getOpcode() == Instruction::Or ||
                    ST->hasVectorEnhancements1())))
                return 0;
        }
      }
      else if (Opcode == Instruction::And || Opcode == Instruction::Or) {
        for (const Value *A : Args) {
          if (const Instruction *I = dyn_cast<Instruction>(A))
            if ((I->hasOneUse() && I->getOpcode() == Instruction::Xor) &&
                ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
                 (isInt128InVR(Ty) &&
                  (Opcode == Instruction::And ||
                   ST->hasVectorEnhancements1()))))
              return 0;
        }
      }
    }

    // Or requires one instruction, although it has custom handling for i64.
    if (Opcode == Instruction::Or)
      return 1;

    if (Opcode == Instruction::Xor && ScalarBits == 1) {
      if (ST->hasLoadStoreOnCond2())
        return 5; // 2 * (li 0; loc 1); xor
      return 7; // 2 * ipm sequences ; xor ; shift ; compare
    }

    if (DivRemConstPow2)
      return (SignedDivRem ? SDivPow2Cost : 1);
    if (DivRemConst)
      return DivMulSeqCost;
    if (SignedDivRem || UnsignedDivRem)
      return DivInstrCost;
  }
  else if (ST->hasVector()) {
    auto *VTy = cast<FixedVectorType>(Ty);
    unsigned VF = VTy->getNumElements();
    unsigned NumVectors = getNumVectorRegs(Ty);

    // These vector operations are custom handled, but are still supported
    // with one instruction per vector, regardless of element size.
    if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
        Opcode == Instruction::AShr) {
      return NumVectors;
    }

    if (DivRemConstPow2)
      return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
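    // A constant, non-power-of-2 vector divisor is costed as one
    // multiply-and-shift sequence per element plus the scalarization overhead.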
    if (DivRemConst) {
      SmallVector<Type *> Tys(Args.size(), Ty);
      return VF * DivMulSeqCost +
             BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
    }
    if (SignedDivRem || UnsignedDivRem) {
      if (ST->hasVectorEnhancements3() && ScalarBits >= 32)
        return NumVectors * DivInstrCost;
      else if (VF > 4)
        // Temporary hack: disable high vectorization factors with integer
        // division/remainder, which will get scalarized and handled with
        // GR128 registers. The mischeduler is not clever enough to avoid
        // spilling yet.
        return 1000;
    }

    // These FP operations are supported with a single vector instruction for
    // double (base implementation assumes float generally costs 2). For
    // FP128, the scalar cost is 1, and there is no overhead since the values
    // are already in scalar registers.
    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
        Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
      switch (ScalarBits) {
      case 32: {
        // The vector enhancements facility 1 provides v4f32 instructions.
        if (ST->hasVectorEnhancements1())
          return NumVectors;
        // Return the cost of multiple scalar invocations plus the cost of
        // inserting and extracting the values.
        InstructionCost ScalarCost =
            getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
        SmallVector<Type *> Tys(Args.size(), Ty);
        InstructionCost Cost =
            (VF * ScalarCost) +
            BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
        // FIXME: VF 2 for these FP operations is currently just as
        // expensive as for VF 4.
        if (VF == 2)
          Cost *= 2;
        return Cost;
      }
      case 64:
      case 128:
        return NumVectors;
      default:
        break;
      }
    }

    // There is no native support for FRem.
    if (Opcode == Instruction::FRem) {
      SmallVector<Type *> Tys(Args.size(), Ty);
      InstructionCost Cost =
          (VF * LIBCALL_COST) +
          BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
      // FIXME: VF 2 for float is currently just as expensive as for VF 4.
      if (VF == 2 && ScalarBits == 32)
        Cost *= 2;
      return Cost;
    }
  }

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                       Args, CxtI);
}

InstructionCost SystemZTTIImpl::getShuffleCost(
    TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
    TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
    ArrayRef<const Value *> Args, const Instruction *CxtI) {
  Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
  if (ST->hasVector()) {
    unsigned NumVectors = getNumVectorRegs(Tp);

    // TODO: Since fp32 is expanded, the shuffle cost should always be 0.

    // FP128 values are always in scalar registers, so there is no work
    // involved with a shuffle, except for broadcast. In that case register
    // moves are done with a single instruction per element.
    if (Tp->getScalarType()->isFP128Ty())
      return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);

    switch (Kind) {
    case TargetTransformInfo::SK_ExtractSubvector:
      // ExtractSubvector Index indicates start offset.

      // Extracting a subvector from first index is a noop.
      return (Index == 0 ? 0 : NumVectors);

    case TargetTransformInfo::SK_Broadcast:
      // The loop vectorizer calls here to figure out the extra cost of
      // broadcasting a loaded value to all elements of a vector. Since vlrep
      // loads and replicates with a single instruction, adjust the returned
      // value.
      return NumVectors - 1;

    default:

      // SystemZ supports single instruction permutation / replication.
      return NumVectors;
    }
  }

  return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}

// Return the log2 difference of the element sizes of the two vector types.
static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
  unsigned Bits0 = Ty0->getScalarSizeInBits();
  unsigned Bits1 = Ty1->getScalarSizeInBits();

  if (Bits1 > Bits0)
    return (Log2_32(Bits1) - Log2_32(Bits0));

  return (Log2_32(Bits0) - Log2_32(Bits1));
}

// Return the number of instructions needed to truncate SrcTy to DstTy.
unsigned SystemZTTIImpl::
getVectorTruncCost(Type *SrcTy, Type *DstTy) {
  assert(SrcTy->isVectorTy() && DstTy->isVectorTy());
  assert(SrcTy->getPrimitiveSizeInBits().getFixedValue() >
             DstTy->getPrimitiveSizeInBits().getFixedValue() &&
         "Packing must reduce size of vector type.");
  assert(cast<FixedVectorType>(SrcTy)->getNumElements() ==
             cast<FixedVectorType>(DstTy)->getNumElements() &&
         "Packing should not change number of elements.");

  // TODO: Since fp32 is expanded, the extract cost should always be 0.

  unsigned NumParts = getNumVectorRegs(SrcTy);
  if (NumParts <= 2)
    // Up to 2 vector registers can be truncated efficiently with pack or
    // permute. The latter requires an immediate mask to be loaded, which
    // typically gets hoisted out of a loop. TODO: return a good value for
    // BB-VECTORIZER that includes the immediate loads, which we do not want
    // to count for the loop vectorizer.
    return 1;

  unsigned Cost = 0;
  unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
  unsigned VF = cast<FixedVectorType>(SrcTy)->getNumElements();
  for (unsigned P = 0; P < Log2Diff; ++P) {
    if (NumParts > 1)
      NumParts /= 2;
    Cost += NumParts;
  }

  // Currently, a general mix of permutes and pack instructions is output by
  // isel, which follows the cost computation above, except for this case,
  // which is one instruction less:
  if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
      DstTy->getScalarSizeInBits() == 8)
    Cost--;

  return Cost;
}

// Return the cost of converting a vector bitmask produced by a compare
// (SrcTy), to the type of the select or extend instruction (DstTy).
unsigned SystemZTTIImpl::
getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) {
  assert(SrcTy->isVectorTy() && DstTy->isVectorTy() &&
         "Should only be called with vector types.");

  unsigned PackCost = 0;
  unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
  unsigned DstScalarBits = DstTy->getScalarSizeInBits();
  unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
  if (SrcScalarBits > DstScalarBits)
    // The bitmask will be truncated.
    PackCost = getVectorTruncCost(SrcTy, DstTy);
  else if (SrcScalarBits < DstScalarBits) {
    unsigned DstNumParts = getNumVectorRegs(DstTy);
    // Each vector select needs its part of the bitmask unpacked.
    PackCost = Log2Diff * DstNumParts;
    // Extra cost for moving part of mask before unpacking.
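    // (one instruction per destination part beyond the first.)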
    PackCost += DstNumParts - 1;
  }

  return PackCost;
}

// Return the type of the compared operands. This is needed to compute the
// cost for a Select / ZExt or SExt instruction.
static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
  Type *OpTy = nullptr;
  if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
    OpTy = CI->getOperand(0)->getType();
  else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
    if (LogicI->getNumOperands() == 2)
      if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
        if (isa<CmpInst>(LogicI->getOperand(1)))
          OpTy = CI0->getOperand(0)->getType();

  if (OpTy != nullptr) {
    if (VF == 1) {
      assert(!OpTy->isVectorTy() && "Expected scalar type");
      return OpTy;
    }
    // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
    // be either scalar or already vectorized with a same or lesser VF.
    Type *ElTy = OpTy->getScalarType();
    return FixedVectorType::get(ElTy, VF);
  }

  return nullptr;
}

// Get the cost of converting a boolean vector to a vector with same width
// and element size as Dst, plus the cost of zero extending if needed.
unsigned SystemZTTIImpl::
getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
                              const Instruction *I) {
  auto *DstVTy = cast<FixedVectorType>(Dst);
  unsigned VF = DstVTy->getNumElements();
  unsigned Cost = 0;
  // If we know the widths of the compared operands, get any cost of
  // converting them to match Dst. Otherwise assume same widths.
  Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
  if (CmpOpTy != nullptr)
    Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
  if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP)
    // One 'vn' per dst vector with an immediate mask.
    Cost += getNumVectorRegs(Dst);
  return Cost;
}

InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                                 Type *Src,
                                                 TTI::CastContextHint CCH,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) {
  // FIXME: Can the logic below also be used for these cost kinds?
  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) {
    auto BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
    return BaseCost == 0 ? BaseCost : 1;
  }

  unsigned DstScalarBits = Dst->getScalarSizeInBits();
  unsigned SrcScalarBits = Src->getScalarSizeInBits();

  if (!Src->isVectorTy()) {
    assert(!Dst->isVectorTy());

    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
      if (Src->isIntegerTy(128))
        return LIBCALL_COST;
      if (SrcScalarBits >= 32 ||
          (I != nullptr && isa<LoadInst>(I->getOperand(0))))
        return 1;
      return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
    }

    if ((Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) &&
        Dst->isIntegerTy(128))
      return LIBCALL_COST;

    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt)) {
      if (Src->isIntegerTy(1)) {
        if (DstScalarBits == 128) {
          if (Opcode == Instruction::SExt && ST->hasVectorEnhancements3())
            return 0; /*VCEQQ*/
          return 5 /*branch seq.*/;
        }

        if (ST->hasLoadStoreOnCond2())
          return 2; // li 0; loc 1

        // This should be an extension of a compare i1 result, which is done
        // with ipm and a varying sequence of instructions.
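        // (ipm copies the condition code into a GPR; the remaining
        // instructions shift/mask it into a 0/1 or 0/-1 result.)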
        unsigned Cost = 0;
        if (Opcode == Instruction::SExt)
          Cost = (DstScalarBits < 64 ? 3 : 4);
        if (Opcode == Instruction::ZExt)
          Cost = 3;
        Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
        if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
          // If operands of an fp-type were compared, this costs +1.
          Cost++;
        return Cost;
      }
      else if (isInt128InVR(Dst)) {
        // Extensions from GPR to i128 (in VR) typically cost two instructions,
        // but a zero-extending load would be just one extra instruction.
        if (Opcode == Instruction::ZExt && I != nullptr)
          if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
            if (Ld->hasOneUse())
              return 1;
        return 2;
      }
    }

    if (Opcode == Instruction::Trunc && isInt128InVR(Src) && I != nullptr) {
      if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
        if (Ld->hasOneUse())
          return 0; // Will be converted to GPR load.
      bool OnlyTruncatingStores = true;
      for (const User *U : I->users())
        if (!isa<StoreInst>(U)) {
          OnlyTruncatingStores = false;
          break;
        }
      if (OnlyTruncatingStores)
        return 0;
      return 2; // Vector element extraction.
    }
  }
  else if (ST->hasVector()) {
    // Vector to scalar cast.
    auto *SrcVecTy = cast<FixedVectorType>(Src);
    auto *DstVecTy = dyn_cast<FixedVectorType>(Dst);
    if (!DstVecTy) {
      // TODO: tune vector-to-scalar cast.
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
    }
    unsigned VF = SrcVecTy->getNumElements();
    unsigned NumDstVectors = getNumVectorRegs(Dst);
    unsigned NumSrcVectors = getNumVectorRegs(Src);

    if (Opcode == Instruction::Trunc) {
      if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
        return 0; // Check for NOOP conversions.
      return getVectorTruncCost(Src, Dst);
    }

    if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
      if (SrcScalarBits >= 8) {
        // ZExt will use either a single unpack or a vector permute.
        if (Opcode == Instruction::ZExt)
          return NumDstVectors;

        // SExt will be handled with one unpack per doubling of width.
        unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);

        // For types that span multiple vector registers, some additional
        // instructions are used to set up the unpacking.
        unsigned NumSrcVectorOps =
            (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
                            : (NumDstVectors / 2));

        return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
      }
      else if (SrcScalarBits == 1)
        return getBoolVecToIntConversionCost(Opcode, Dst, I);
    }

    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
        Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
      // TODO: Fix base implementation which could simplify things a bit here
      // (seems to miss on differentiating on scalar/vector types).

      // Only 64 bit vector conversions are natively supported before z15.
      if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) {
        if (SrcScalarBits == DstScalarBits)
          return NumDstVectors;

        if (SrcScalarBits == 1)
          return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
      }

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values. The base implementation does not
      // realize that float->int gets scalarized.
      InstructionCost ScalarCost = getCastInstrCost(
          Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind);
      InstructionCost TotCost = VF * ScalarCost;
      bool NeedsInserts = true, NeedsExtracts = true;
      // FP128 registers do not get inserted or extracted.
      if (DstScalarBits == 128 &&
          (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
        NeedsInserts = false;
      if (SrcScalarBits == 128 &&
          (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
        NeedsExtracts = false;

      TotCost += BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
                                                 NeedsExtracts, CostKind);
      TotCost += BaseT::getScalarizationOverhead(DstVecTy, NeedsInserts,
                                                 /*Extract*/ false, CostKind);

      // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
      if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
        TotCost *= 2;

      return TotCost;
    }

    if (Opcode == Instruction::FPTrunc) {
      if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
        return VF /*ldxbr/lexbr*/ +
               BaseT::getScalarizationOverhead(DstVecTy, /*Insert*/ true,
                                               /*Extract*/ false, CostKind);
      else // double -> float
        return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
    }

    if (Opcode == Instruction::FPExt) {
      if (SrcScalarBits == 32 && DstScalarBits == 64) {
        // float -> double is very rare and currently unoptimized. Instead of
        // using vldeb, which can do two at a time, all conversions are
        // scalarized.
        return VF * 2;
      }
      // -> fp128. VF * lxdb/lxeb + extraction of elements.
      return VF + BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
                                                  /*Extract*/ true, CostKind);
    }
  }

  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}

// Scalar i8 / i16 operations will typically be made after first extending
// the operands to i32.
static unsigned getOperandsExtensionCost(const Instruction *I) {
  unsigned ExtCost = 0;
  for (Value *Op : I->operands())
    // A load of i8 or i16 sign/zero extends to i32.
    if (!isa<LoadInst>(Op) && !isa<ConstantInt>(Op))
      ExtCost++;

  return ExtCost;
}

InstructionCost SystemZTTIImpl::getCmpSelInstrCost(
    unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
    TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
    TTI::OperandValueInfo Op2Info, const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     Op1Info, Op2Info);

  if (!ValTy->isVectorTy()) {
    switch (Opcode) {
    case Instruction::ICmp: {
      // A loaded value compared with 0 with multiple users becomes Load and
      // Test. The load is then not foldable, so return 0 cost for the ICmp.
      unsigned ScalarBits = ValTy->getScalarSizeInBits();
      if (I != nullptr && (ScalarBits == 32 || ScalarBits == 64))
        if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
          if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
            if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
                C->isZero())
              return 0;

      unsigned Cost = 1;
      if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
        Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
      return Cost;
    }
    case Instruction::Select:
      if (ValTy->isFloatingPointTy())
        return 4; // No LOC for FP - costs a conditional jump.

      // When selecting based on an i128 comparison, LOC / VSEL is possible
      // if i128 comparisons are directly supported.
      if (I != nullptr)
        if (ICmpInst *CI = dyn_cast<ICmpInst>(I->getOperand(0)))
          if (CI->getOperand(0)->getType()->isIntegerTy(128))
            return ST->hasVectorEnhancements3() ? 1 : 4;

      // Load On Condition / Select Register available, except for i128.
      return !isInt128InVR(ValTy) ? 1 : 4;
    }
  }
  else if (ST->hasVector()) {
    unsigned VF = cast<FixedVectorType>(ValTy)->getNumElements();

    // Called with a compare instruction.
    if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
      unsigned PredicateExtraCost = 0;
      if (I != nullptr) {
        // Some predicates cost one or two extra instructions.
        switch (cast<CmpInst>(I)->getPredicate()) {
        case CmpInst::Predicate::ICMP_NE:
        case CmpInst::Predicate::ICMP_UGE:
        case CmpInst::Predicate::ICMP_ULE:
        case CmpInst::Predicate::ICMP_SGE:
        case CmpInst::Predicate::ICMP_SLE:
          PredicateExtraCost = 1;
          break;
        case CmpInst::Predicate::FCMP_ONE:
        case CmpInst::Predicate::FCMP_ORD:
        case CmpInst::Predicate::FCMP_UEQ:
        case CmpInst::Predicate::FCMP_UNO:
          PredicateExtraCost = 2;
          break;
        default:
          break;
        }
      }

      // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
      // floats. FIXME: <2 x float> generates the same code as <4 x float>.
      unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
      unsigned NumVecs_cmp = getNumVectorRegs(ValTy);

      unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
      return Cost;
    }
    else { // Called with a select instruction.
      assert(Opcode == Instruction::Select);

      // We can figure out the extra cost of packing / unpacking if the
      // instruction was passed and the compare instruction is found.
      unsigned PackCost = 0;
      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
      if (CmpOpTy != nullptr)
        PackCost =
            getVectorBitmaskConversionCost(CmpOpTy, ValTy);

      return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
    }
  }

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                   Op1Info, Op2Info);
}

InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                   TTI::TargetCostKind CostKind,
                                                   unsigned Index, Value *Op0,
                                                   Value *Op1) {
  if (Opcode == Instruction::InsertElement) {
    // Vector Element Load.
    if (Op1 != nullptr && isFreeEltLoad(Op1))
      return 0;

    // vlvgp will insert two GPRs into a vector register, so count half the
    // number of instructions as an estimate when we don't have the full
    // picture (as in getScalarizationOverhead()).
    if (Val->isIntOrIntVectorTy(64))
      return ((Index % 2 == 0) ? 1 : 0);
  }

  if (Opcode == Instruction::ExtractElement) {
    int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);

    // Give a slight penalty for moving out of vector pipeline to FXU unit.
    if (Index == 0 && Val->isIntOrIntVectorTy())
      Cost += 1;

    return Cost;
  }

  return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
}

// Check if a load may be folded as a memory operand in its user.
bool SystemZTTIImpl::
isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
  if (!Ld->hasOneUse())
    return false;
  FoldedValue = Ld;
  const Instruction *UserI = cast<Instruction>(*Ld->user_begin());
  unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
  unsigned TruncBits = 0;
  unsigned SExtBits = 0;
  unsigned ZExtBits = 0;
  if (UserI->hasOneUse()) {
    unsigned UserBits = UserI->getType()->getScalarSizeInBits();
    if (isa<TruncInst>(UserI))
      TruncBits = UserBits;
    else if (isa<SExtInst>(UserI))
      SExtBits = UserBits;
    else if (isa<ZExtInst>(UserI))
      ZExtBits = UserBits;
  }
  if (TruncBits || SExtBits || ZExtBits) {
    FoldedValue = UserI;
    UserI = cast<Instruction>(*UserI->user_begin());
    // Load (single use) -> trunc/extend (single use) -> UserI
  }
  if ((UserI->getOpcode() == Instruction::Sub ||
       UserI->getOpcode() == Instruction::SDiv ||
       UserI->getOpcode() == Instruction::UDiv) &&
      UserI->getOperand(1) != FoldedValue)
    return false; // Not commutative, only RHS foldable.
  // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if an
  // extension was made of the load.
  unsigned LoadOrTruncBits =
      ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
  switch (UserI->getOpcode()) {
  case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
  case Instruction::Sub:
  case Instruction::ICmp:
    if (LoadedBits == 32 && ZExtBits == 64)
      return true;
    [[fallthrough]];
  case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
    if (UserI->getOpcode() != Instruction::ICmp) {
      if (LoadedBits == 16 &&
          (SExtBits == 32 ||
           (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
        return true;
      if (LoadOrTruncBits == 16)
        return true;
    }
    [[fallthrough]];
  case Instruction::SDiv: // SE: 32->64
    if (LoadedBits == 32 && SExtBits == 64)
      return true;
    [[fallthrough]];
  case Instruction::UDiv:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
    // This also makes sense for float operations, but disabled for now due
    // to regressions.
    // case Instruction::FCmp:
    // case Instruction::FAdd:
    // case Instruction::FSub:
    // case Instruction::FMul:
    // case Instruction::FDiv:

    // All possible extensions of memory checked above.

    // Comparison between memory and immediate.
    if (UserI->getOpcode() == Instruction::ICmp)
      if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
        if (CI->getValue().isIntN(16))
          return true;
    return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
    break;
  }
  return false;
}

static bool isBswapIntrinsicCall(const Value *V) {
  if (const Instruction *I = dyn_cast<Instruction>(V))
    if (auto *CI = dyn_cast<CallInst>(I))
      if (auto *F = CI->getCalledFunction())
        if (F->getIntrinsicID() == Intrinsic::bswap)
          return true;
  return false;
}

InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                                MaybeAlign Alignment,
                                                unsigned AddressSpace,
                                                TTI::TargetCostKind CostKind,
                                                TTI::OperandValueInfo OpInfo,
                                                const Instruction *I) {
  assert(!Src->isVoidTy() && "Invalid type");

  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return 1;

  if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
    // Store the load or its truncated or extended value in FoldedValue.
    const Instruction *FoldedValue = nullptr;
    if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
      const Instruction *UserI = cast<Instruction>(*FoldedValue->user_begin());
      assert(UserI->getNumOperands() == 2 && "Expected a binop.");

      // UserI can't fold two loads, so in that case return 0 cost only
      // half of the time.
      for (unsigned i = 0; i < 2; ++i) {
        if (UserI->getOperand(i) == FoldedValue)
          continue;

        if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))) {
          LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
          if (!OtherLoad &&
              (isa<TruncInst>(OtherOp) || isa<SExtInst>(OtherOp) ||
               isa<ZExtInst>(OtherOp)))
            OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
          if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue /*dummy*/))
            return i == 0; // Both operands foldable.
        }
      }

      return 0; // Only I is foldable in user.
    }
  }

  // Type legalization (via getNumberOfParts) can't handle structs.
  if (TLI->getValueType(DL, Src, true) == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind);

  // FP128 is a legal type but kept in a register pair on older CPUs.
  if (Src->isFP128Ty() && !ST->hasVectorEnhancements1())
    return 2;

  unsigned NumOps =
      (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));

  // Store/Load reversed saves one instruction.
  if (((!Src->isVectorTy() && NumOps == 1) || ST->hasVectorEnhancements2()) &&
      I != nullptr) {
    if (Opcode == Instruction::Load && I->hasOneUse()) {
      const Instruction *LdUser = cast<Instruction>(*I->user_begin());
      // In case of load -> bswap -> store, return normal cost for the load.
      if (isBswapIntrinsicCall(LdUser) &&
          (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
        return 0;
    }
    else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
      const Value *StoredVal = SI->getValueOperand();
      if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
        return 0;
    }
  }

  return NumOps;
}

// The generic implementation of getInterleavedMemoryOpCost() is based on
// adding costs of the memory operations plus all the extracts and inserts
// needed for using / defining the vector operands. The SystemZ version does
// roughly the same but bases the computations on vector permutations
// instead.
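// For example, a load group with Factor == 2 over <8 x i32> where both members
// are used is costed as two vector loads plus two vperms (one per member).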
InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);
  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
  assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
  unsigned VF = NumElts / Factor;
  unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
  unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
  unsigned NumPermutes = 0;

  if (Opcode == Instruction::Load) {
    // Loading interleave groups may have gaps, which may mean fewer
    // loads. Find out how many vectors will be loaded in total, and in how
    // many of them each value will be in.
    BitVector UsedInsts(NumVectorMemOps, false);
    std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
    for (unsigned Index : Indices)
      for (unsigned Elt = 0; Elt < VF; ++Elt) {
        unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
        UsedInsts.set(Vec);
        ValueVecs[Index].set(Vec);
      }
    NumVectorMemOps = UsedInsts.count();

    for (unsigned Index : Indices) {
      // Estimate that each loaded source vector containing this Index
      // requires one operation, except that vperm can handle two input
      // registers first time for each dst vector.
      unsigned NumSrcVecs = ValueVecs[Index].count();
      unsigned NumDstVecs = divideCeil(VF * getScalarSizeInBits(VecTy), 128U);
      assert(NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
      NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
    }
  } else {
    // Estimate the permutes for each stored vector as the smaller of the
    // number of elements and the number of source vectors. Subtract one per
    // dst vector for vperm (S.A.).
    unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
    unsigned NumDstVecs = NumVectorMemOps;
    NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
  }

  // Cost of load/store operations and the permutations needed.
  return NumVectorMemOps + NumPermutes;
}

InstructionCost getIntAddReductionCost(unsigned NumVec, unsigned ScalarBits) {
  InstructionCost Cost = 0;
  // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
  Cost += NumVec - 1;
  // For integer adds, VSUM creates shorter reductions on the final vector.
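  // (e.g. reducing <16 x i32>, i.e. four vector registers, is costed 3 + 2 = 5.)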
  Cost += (ScalarBits < 32) ? 3 : 2;
  return Cost;
}

InstructionCost getFastReductionCost(unsigned NumVec, unsigned NumElems,
                                     unsigned ScalarBits) {
  unsigned NumEltsPerVecReg = (SystemZ::VectorBits / ScalarBits);
  InstructionCost Cost = 0;
  // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
  Cost += NumVec - 1;
  // For each shuffle / arithmetic layer, we need 2 instructions, and we need
  // log2(Elements in Last Vector) layers.
  Cost += 2 * Log2_32_Ceil(std::min(NumElems, NumEltsPerVecReg));
  return Cost;
}

inline bool customCostReductions(unsigned Opcode) {
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::Add || Opcode == Instruction::Mul;
}

InstructionCost
SystemZTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                           std::optional<FastMathFlags> FMF,
                                           TTI::TargetCostKind CostKind) {
  unsigned ScalarBits = Ty->getScalarSizeInBits();
  // The following is only for subtargets with vector math, non-ordered
  // reductions, and reasonable scalar sizes for int and fp add/mul.
  if (customCostReductions(Opcode) && ST->hasVector() &&
      !TTI::requiresOrderedReduction(FMF) &&
      ScalarBits <= SystemZ::VectorBits) {
    unsigned NumVectors = getNumVectorRegs(Ty);
    unsigned NumElems = ((FixedVectorType *)Ty)->getNumElements();
    // Integer Add uses custom code gen that needs to be accounted for.
    if (Opcode == Instruction::Add)
      return getIntAddReductionCost(NumVectors, ScalarBits);
    // The base cost is the same across all other arithmetic instructions.
    InstructionCost Cost =
        getFastReductionCost(NumVectors, NumElems, ScalarBits);
    // But we need to account for the final op involving the scalar operand.
    if ((Opcode == Instruction::FAdd) || (Opcode == Instruction::FMul))
      Cost += 1;
    return Cost;
  }
  // Otherwise, fall back to the standard implementation.
  return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
}

InstructionCost
SystemZTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                       FastMathFlags FMF,
                                       TTI::TargetCostKind CostKind) {
  // Return custom costs only on subtargets with vector enhancements.
  if (ST->hasVectorEnhancements1()) {
    unsigned NumVectors = getNumVectorRegs(Ty);
    unsigned NumElems = ((FixedVectorType *)Ty)->getNumElements();
    unsigned ScalarBits = Ty->getScalarSizeInBits();
    InstructionCost Cost = 0;
    // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
    Cost += NumVectors - 1;
    // For the final vector, we need shuffle + min/max operations, and
    // we need #Elements - 1 of them.
    Cost += 2 * (std::min(NumElems, SystemZ::VectorBits / ScalarBits) - 1);
    return Cost;
  }
  // For other targets, fall back to the standard implementation.
  return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
}

static int
getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                            const SmallVectorImpl<Type *> &ParamTys) {
  if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
    return getNumVectorRegs(RetTy); // VPERM

  return -1;
}

InstructionCost
SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) {
  InstructionCost Cost = getVectorIntrinsicInstrCost(
      ICA.getID(), ICA.getReturnType(), ICA.getArgTypes());
  if (Cost != -1)
    return Cost;
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

bool SystemZTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
  // Always expand on subtargets without vector instructions.
  if (!ST->hasVector())
    return true;

  // Whether or not to expand is a per-intrinsic decision.
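  // (If expansion is chosen, the ExpandReductions pass lowers the intrinsic
  // to a shuffle-based sequence.)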
  switch (II->getIntrinsicID()) {
  default:
    return true;
  // Do not expand vector.reduce.add...
  case Intrinsic::vector_reduce_add:
    auto *VType = cast<FixedVectorType>(II->getOperand(0)->getType());
    // ...unless the scalar size is i64 or larger,
    // or the operand vector is not full, since the
    // performance benefit is dubious in those cases.
    return VType->getScalarSizeInBits() >= 64 ||
           VType->getPrimitiveSizeInBits() < SystemZ::VectorBits;
  }
}