//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits up to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx,
                          LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(
        TypeIdx,
        LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size, and
// multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough, but
  // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}
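// A load or store is kept as a single legal operation only if the type fits in
// registers, the access size is allowed for its address space, and it does not
// need the bitcast workaround above.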
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to widen
  // to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::fixed_vector(2, 8);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V4S16 = LLT::fixed_vector(4, 16);

  const LLT V2S32 = LLT::fixed_vector(2, 32);
  const LLT V3S32 = LLT::fixed_vector(3, 32);
  const LLT V4S32 = LLT::fixed_vector(4, 32);
  const LLT V5S32 = LLT::fixed_vector(5, 32);
  const LLT V6S32 = LLT::fixed_vector(6, 32);
  const LLT V7S32 = LLT::fixed_vector(7, 32);
  const LLT V8S32 = LLT::fixed_vector(8, 32);
  const LLT V9S32 = LLT::fixed_vector(9, 32);
  const LLT V10S32 = LLT::fixed_vector(10, 32);
  const LLT V11S32 = LLT::fixed_vector(11, 32);
  const LLT V12S32 = LLT::fixed_vector(12, 32);
  const LLT V13S32 = LLT::fixed_vector(13, 32);
  const LLT V14S32 = LLT::fixed_vector(14, 32);
  const LLT V15S32 = LLT::fixed_vector(15, 32);
  const LLT V16S32 = LLT::fixed_vector(16, 32);
  const LLT V32S32 = LLT::fixed_vector(32, 32);
  const LLT V2S64 = LLT::fixed_vector(2, 64);
  const LLT V3S64 = LLT::fixed_vector(3, 64);
  const LLT V4S64 = LLT::fixed_vector(4, 64);
  const LLT V5S64 = LLT::fixed_vector(5, 64);
  const LLT V6S64 = LLT::fixed_vector(6, 64);
  const LLT V7S64 = LLT::fixed_vector(7, 64);
  const LLT V8S64 = LLT::fixed_vector(8, 64);
  const LLT V16S64 = LLT::fixed_vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  // s1 for VCC branches, s32 for SCC branches.
  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);
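  // The integer arithmetic rules below are split by subtarget generation:
  // gfx9+ (packed 16-bit ops and the clamp bit available) gets s32/s16/v2s16
  // forms, targets with only scalar 16-bit instructions get s32/s16, and
  // everything else is clamped to 32-bit operations.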
  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB})
      .legalFor({S32, S16, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32);

    getActionDefinitionsBuilder(G_MUL)
      .legalFor({S32, S16, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .custom();
    assert(ST.hasMad64_32());

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB})
      .legalFor({S32, S16})
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    getActionDefinitionsBuilder(G_MUL)
      .legalFor({S32, S16})
      .scalarize(0)
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .custom();
    assert(ST.hasMad64_32());

    // Technically the saturating operations require clamp bit support, but this
    // was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB})
      .legalFor({S32})
      .widenScalarToNextMultipleOf(0, 32)
      .clampScalar(0, S32, S32)
      .scalarize(0);

    auto &Mul = getActionDefinitionsBuilder(G_MUL)
      .legalFor({S32})
      .scalarize(0)
      .minScalar(0, S32)
      .widenScalarToNextMultipleOf(0, 32);

    if (ST.hasMad64_32())
      Mul.custom();
    else
      Mul.maxScalar(0, S32);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(
    {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .maxScalar(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElementsStrict(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  auto &FSubActions = getActionDefinitionsBuilder(G_FSUB);
  if (ST.has16BitInsts()) {
    FSubActions
      // Use actual fsub instruction
      .legalFor({S32, S16})
      // Must use fadd + fneg
      .lowerFor({S64, V2S16});
  } else {
    FSubActions
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16});
  }

  FSubActions
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);
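  // Integer <-> floating-point conversions. Conversions to or from 64-bit
  // integers have no single hardware instruction, so those cases are
  // custom-lowered below; s1 integer sources go through the generic lowering.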
  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S32, S64}, {S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .widenScalarToNextPow2(0, 32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
    .customFor({S16, S32})
    .scalarize(0)
    .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);
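  // Exponential and logarithm operations. Only the base-2 exp/log forms exist
  // as hardware instructions, so G_FEXP, G_FLOG, G_FLOG10 and G_FPOW are
  // custom-lowered in terms of them.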
  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .widenScalarToNextPow2(1, 32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .custom();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
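    // The rules below are tried in order: the explicit legal cases first, then
    // the generic isLoadStoreLegal() predicate, then the custom / bitcast /
    // widen fallbacks, and finally narrowing and element splitting.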
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {V2S32, GlobalPtr, V2S32, GlobalAlign32},
                                      {V4S32, GlobalPtr, V4S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {V2S64, GlobalPtr, V2S64, GlobalAlign32},
                                      {V2S16, GlobalPtr, V2S16, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},

                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {V2S32, LocalPtr, V2S32, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {V2S16, LocalPtr, S32, 32},

                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {V2S16, PrivatePtr, S32, 32},

                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32},
                                      {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].MemoryTy);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              uint64_t Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
              if (MemSize > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                      0, LLT::scalarOrVector(
                             ElementCount::getFixed(MaxSize / EltSize), EltTy));
                }

                unsigned NumPieces = MemSize / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(
                    0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                    0, LLT::scalarOrVector(
                           ElementCount::getFixed(FloorSize / EltSize), EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32)
        .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  // FIXME: Unaligned accesses not lowered.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                               {S32, GlobalPtr, S16, 2 * 8},
                               {S32, LocalPtr, S8, 8},
                               {S32, LocalPtr, S16, 16},
                               {S32, PrivatePtr, S8, 8},
                               {S32, PrivatePtr, S16, 16},
                               {S32, ConstantPtr, S8, 8},
                               {S32, ConstantPtr, S16, 2 * 8}})
    .legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
  }
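  // Extending load results are always 32-bit here; other result widths are
  // clamped to s32 below, and any leftover cases are lowered to a plain load
  // followed by an extension.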
  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bits.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAdd()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasGFX940Insts())
      Atomic.legalFor({{V2S16, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});

  if (ST.hasGFX90AInsts()) {
    // These are legal with some caveats, and should have undergone expansion in
    // the IR in most situations
    // TODO: Move atomic expansion into legalizer
    // TODO: Also supports <2 x f16>
    Atomic.legalFor({
        {S32, GlobalPtr},
        {S64, GlobalPtr},
        {S64, FlatPtr}
      });
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                               LocalPtr, FlatPtr, PrivatePtr,
                               LLT::fixed_vector(2, LocalPtr),
                               LLT::fixed_vector(2, PrivatePtr)},
                              {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= MaxRegisterSize &&
                 IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
              VecTypeIdx,
              LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });
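  // G_EXTRACT and G_INSERT operate on bit offsets rather than whole vector
  // elements, so both opcodes share one rule set, with the roles of the wide
  // ("big") and narrow ("little") type indices swapped between them.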
  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      .lowerIf([=](const LegalityQuery &Query) {
          // Sub-vector(or single element) insert and extract.
          // TODO: verify immediate offset here since lower only works with
          // whole elements.
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.isVector();
        })
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})
    .scalarize(0)
    .lower();
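  // Funnel shifts: only the 32-bit G_FSHR form has direct hardware support;
  // packed v2s16 cases and all G_FSHL cases are broken down and lowered.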
1671 getActionDefinitionsBuilder(G_FSHR) 1672 .legalFor({{S32, S32}}) 1673 .lowerFor({{V2S16, V2S16}}) 1674 .clampMaxNumElementsStrict(0, S16, 2) 1675 .scalarize(0) 1676 .lower(); 1677 1678 if (ST.hasVOP3PInsts()) { 1679 getActionDefinitionsBuilder(G_FSHL) 1680 .lowerFor({{V2S16, V2S16}}) 1681 .clampMaxNumElementsStrict(0, S16, 2) 1682 .scalarize(0) 1683 .lower(); 1684 } else { 1685 getActionDefinitionsBuilder(G_FSHL) 1686 .scalarize(0) 1687 .lower(); 1688 } 1689 1690 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1691 .legalFor({S64}); 1692 1693 getActionDefinitionsBuilder(G_FENCE) 1694 .alwaysLegal(); 1695 1696 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1697 .scalarize(0) 1698 .minScalar(0, S32) 1699 .lower(); 1700 1701 getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 1702 .legalFor({{S32, S32}, {S64, S32}}) 1703 .clampScalar(1, S32, S32) 1704 .clampScalar(0, S32, S64) 1705 .widenScalarToNextPow2(0) 1706 .scalarize(0); 1707 1708 getActionDefinitionsBuilder({ 1709 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1710 G_FCOPYSIGN, 1711 1712 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1713 G_ATOMICRMW_NAND, 1714 G_ATOMICRMW_FSUB, 1715 G_READ_REGISTER, 1716 G_WRITE_REGISTER, 1717 1718 G_SADDO, G_SSUBO, 1719 1720 // TODO: Implement 1721 G_FMINIMUM, G_FMAXIMUM}).lower(); 1722 1723 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) 1724 .lower(); 1725 1726 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1727 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1728 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1729 .unsupported(); 1730 1731 getLegacyLegalizerInfo().computeTables(); 1732 verify(*ST.getInstrInfo()); 1733 } 1734 1735 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1736 MachineInstr &MI) const { 1737 MachineIRBuilder &B = Helper.MIRBuilder; 1738 MachineRegisterInfo &MRI = *B.getMRI(); 1739 1740 switch (MI.getOpcode()) { 1741 case TargetOpcode::G_ADDRSPACE_CAST: 1742 return legalizeAddrSpaceCast(MI, MRI, B); 1743 case TargetOpcode::G_FRINT: 1744 return legalizeFrint(MI, MRI, B); 1745 case TargetOpcode::G_FCEIL: 1746 return legalizeFceil(MI, MRI, B); 1747 case TargetOpcode::G_FREM: 1748 return legalizeFrem(MI, MRI, B); 1749 case TargetOpcode::G_INTRINSIC_TRUNC: 1750 return legalizeIntrinsicTrunc(MI, MRI, B); 1751 case TargetOpcode::G_SITOFP: 1752 return legalizeITOFP(MI, MRI, B, true); 1753 case TargetOpcode::G_UITOFP: 1754 return legalizeITOFP(MI, MRI, B, false); 1755 case TargetOpcode::G_FPTOSI: 1756 return legalizeFPTOI(MI, MRI, B, true); 1757 case TargetOpcode::G_FPTOUI: 1758 return legalizeFPTOI(MI, MRI, B, false); 1759 case TargetOpcode::G_FMINNUM: 1760 case TargetOpcode::G_FMAXNUM: 1761 case TargetOpcode::G_FMINNUM_IEEE: 1762 case TargetOpcode::G_FMAXNUM_IEEE: 1763 return legalizeMinNumMaxNum(Helper, MI); 1764 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1765 return legalizeExtractVectorElt(MI, MRI, B); 1766 case TargetOpcode::G_INSERT_VECTOR_ELT: 1767 return legalizeInsertVectorElt(MI, MRI, B); 1768 case TargetOpcode::G_SHUFFLE_VECTOR: 1769 return legalizeShuffleVector(MI, MRI, B); 1770 case TargetOpcode::G_FSIN: 1771 case TargetOpcode::G_FCOS: 1772 return legalizeSinCos(MI, MRI, B); 1773 case TargetOpcode::G_GLOBAL_VALUE: 1774 return legalizeGlobalValue(MI, MRI, B); 1775 case TargetOpcode::G_LOAD: 1776 case TargetOpcode::G_SEXTLOAD: 1777 case TargetOpcode::G_ZEXTLOAD: 1778 return legalizeLoad(Helper, MI); 1779 case TargetOpcode::G_FMAD: 1780 return legalizeFMad(MI, MRI, B); 1781 case TargetOpcode::G_FDIV: 1782 return legalizeFDIV(MI, 
MRI, B); 1783 case TargetOpcode::G_UDIV: 1784 case TargetOpcode::G_UREM: 1785 case TargetOpcode::G_UDIVREM: 1786 return legalizeUnsignedDIV_REM(MI, MRI, B); 1787 case TargetOpcode::G_SDIV: 1788 case TargetOpcode::G_SREM: 1789 case TargetOpcode::G_SDIVREM: 1790 return legalizeSignedDIV_REM(MI, MRI, B); 1791 case TargetOpcode::G_ATOMIC_CMPXCHG: 1792 return legalizeAtomicCmpXChg(MI, MRI, B); 1793 case TargetOpcode::G_FLOG: 1794 return legalizeFlog(MI, B, numbers::ln2f); 1795 case TargetOpcode::G_FLOG10: 1796 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1797 case TargetOpcode::G_FEXP: 1798 return legalizeFExp(MI, B); 1799 case TargetOpcode::G_FPOW: 1800 return legalizeFPow(MI, B); 1801 case TargetOpcode::G_FFLOOR: 1802 return legalizeFFloor(MI, MRI, B); 1803 case TargetOpcode::G_BUILD_VECTOR: 1804 return legalizeBuildVector(MI, MRI, B); 1805 case TargetOpcode::G_MUL: 1806 return legalizeMul(Helper, MI); 1807 case TargetOpcode::G_CTLZ: 1808 case TargetOpcode::G_CTTZ: 1809 return legalizeCTLZ_CTTZ(MI, MRI, B); 1810 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND: 1811 return legalizeFPTruncRound(MI, B); 1812 default: 1813 return false; 1814 } 1815 1816 llvm_unreachable("expected switch to return"); 1817 } 1818 1819 Register AMDGPULegalizerInfo::getSegmentAperture( 1820 unsigned AS, 1821 MachineRegisterInfo &MRI, 1822 MachineIRBuilder &B) const { 1823 MachineFunction &MF = B.getMF(); 1824 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1825 const LLT S32 = LLT::scalar(32); 1826 1827 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1828 1829 if (ST.hasApertureRegs()) { 1830 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1831 // getreg. 1832 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1833 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1834 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1835 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1836 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1837 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1838 unsigned Encoding = 1839 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1840 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1841 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1842 1843 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1844 1845 B.buildInstr(AMDGPU::S_GETREG_B32) 1846 .addDef(GetReg) 1847 .addImm(Encoding); 1848 MRI.setType(GetReg, S32); 1849 1850 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1851 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1852 } 1853 1854 // TODO: can we be smarter about machine pointer info? 1855 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1856 Register LoadAddr = MRI.createGenericVirtualRegister( 1857 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1858 // For code object version 5, private_base and shared_base are passed through 1859 // implicit kernargs. 1860 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { 1861 AMDGPUTargetLowering::ImplicitParameter Param = 1862 AS == AMDGPUAS::LOCAL_ADDRESS ? 
AMDGPUTargetLowering::SHARED_BASE 1863 : AMDGPUTargetLowering::PRIVATE_BASE; 1864 uint64_t Offset = 1865 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 1866 1867 Register KernargPtrReg = MRI.createGenericVirtualRegister( 1868 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1869 1870 if (!loadInputValue(KernargPtrReg, B, 1871 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 1872 return Register(); 1873 1874 MachineMemOperand *MMO = MF.getMachineMemOperand( 1875 PtrInfo, 1876 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1877 MachineMemOperand::MOInvariant, 1878 LLT::scalar(32), commonAlignment(Align(64), Offset)); 1879 1880 // Pointer address 1881 B.buildPtrAdd(LoadAddr, KernargPtrReg, 1882 B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 1883 // Load address 1884 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1885 } 1886 1887 Register QueuePtr = MRI.createGenericVirtualRegister( 1888 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1889 1890 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1891 return Register(); 1892 1893 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1894 // private_segment_aperture_base_hi. 1895 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1896 1897 MachineMemOperand *MMO = MF.getMachineMemOperand( 1898 PtrInfo, 1899 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1900 MachineMemOperand::MOInvariant, 1901 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 1902 1903 B.buildPtrAdd(LoadAddr, QueuePtr, 1904 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); 1905 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1906 } 1907 1908 /// Return true if the value is a known valid address, such that a null check is 1909 /// not necessary. 1910 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, 1911 const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 1912 MachineInstr *Def = MRI.getVRegDef(Val); 1913 switch (Def->getOpcode()) { 1914 case AMDGPU::G_FRAME_INDEX: 1915 case AMDGPU::G_GLOBAL_VALUE: 1916 case AMDGPU::G_BLOCK_ADDR: 1917 return true; 1918 case AMDGPU::G_CONSTANT: { 1919 const ConstantInt *CI = Def->getOperand(1).getCImm(); 1920 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); 1921 } 1922 default: 1923 return false; 1924 } 1925 1926 return false; 1927 } 1928 1929 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1930 MachineInstr &MI, MachineRegisterInfo &MRI, 1931 MachineIRBuilder &B) const { 1932 MachineFunction &MF = B.getMF(); 1933 1934 const LLT S32 = LLT::scalar(32); 1935 Register Dst = MI.getOperand(0).getReg(); 1936 Register Src = MI.getOperand(1).getReg(); 1937 1938 LLT DstTy = MRI.getType(Dst); 1939 LLT SrcTy = MRI.getType(Src); 1940 unsigned DestAS = DstTy.getAddressSpace(); 1941 unsigned SrcAS = SrcTy.getAddressSpace(); 1942 1943 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1944 // vector element. 1945 assert(!DstTy.isVector()); 1946 1947 const AMDGPUTargetMachine &TM 1948 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1949 1950 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1951 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1952 return true; 1953 } 1954 1955 if (SrcAS == AMDGPUAS::FLAT_ADDRESS && 1956 (DestAS == AMDGPUAS::LOCAL_ADDRESS || 1957 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) { 1958 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1959 // Extract low 32-bits of the pointer. 
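      // Illustrative summary: a flat pointer that addresses LDS or scratch
      // carries the segment offset in its low 32 bits (the high half is the
      // aperture base), so when the source is known non-null the cast reduces
      // to a plain G_EXTRACT of bits [31:0]; the compare-and-select path
      // below is only needed for possibly-null inputs.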
1960 B.buildExtract(Dst, Src, 0); 1961 MI.eraseFromParent(); 1962 return true; 1963 } 1964 1965 unsigned NullVal = TM.getNullPointerValue(DestAS); 1966 1967 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1968 auto FlatNull = B.buildConstant(SrcTy, 0); 1969 1970 // Extract low 32-bits of the pointer. 1971 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1972 1973 auto CmpRes = 1974 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1975 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1976 1977 MI.eraseFromParent(); 1978 return true; 1979 } 1980 1981 if (DestAS == AMDGPUAS::FLAT_ADDRESS && 1982 (SrcAS == AMDGPUAS::LOCAL_ADDRESS || 1983 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) { 1984 if (!ST.hasFlatAddressSpace()) 1985 return false; 1986 1987 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1988 if (!ApertureReg.isValid()) 1989 return false; 1990 1991 // Coerce the type of the low half of the result so we can use merge_values. 1992 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1993 1994 // TODO: Should we allow mismatched types but matching sizes in merges to 1995 // avoid the ptrtoint? 1996 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1997 1998 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1999 B.buildCopy(Dst, BuildPtr); 2000 MI.eraseFromParent(); 2001 return true; 2002 } 2003 2004 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 2005 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 2006 2007 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, 2008 SegmentNull.getReg(0)); 2009 2010 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 2011 2012 MI.eraseFromParent(); 2013 return true; 2014 } 2015 2016 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 2017 SrcTy.getSizeInBits() == 64) { 2018 // Truncate. 2019 B.buildExtract(Dst, Src, 0); 2020 MI.eraseFromParent(); 2021 return true; 2022 } 2023 2024 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 2025 DstTy.getSizeInBits() == 64) { 2026 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 2027 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 2028 2029 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 2030 // another. Merge operands are required to be the same type, but creating an 2031 // extra ptrtoint would be kind of pointless. 2032 auto HighAddr = B.buildConstant( 2033 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 2034 B.buildMerge(Dst, {Src, HighAddr}); 2035 MI.eraseFromParent(); 2036 return true; 2037 } 2038 2039 DiagnosticInfoUnsupported InvalidAddrSpaceCast( 2040 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc()); 2041 2042 LLVMContext &Ctx = MF.getFunction().getContext(); 2043 Ctx.diagnose(InvalidAddrSpaceCast); 2044 B.buildUndef(Dst); 2045 MI.eraseFromParent(); 2046 return true; 2047 } 2048 2049 bool AMDGPULegalizerInfo::legalizeFrint( 2050 MachineInstr &MI, MachineRegisterInfo &MRI, 2051 MachineIRBuilder &B) const { 2052 Register Src = MI.getOperand(1).getReg(); 2053 LLT Ty = MRI.getType(Src); 2054 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 2055 2056 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 2057 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 2058 2059 auto C1 = B.buildFConstant(Ty, C1Val); 2060 auto CopySign = B.buildFCopysign(Ty, C1, Src); 2061 2062 // TODO: Should this propagate fast-math-flags? 
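  // Illustrative sketch of the classic round-to-nearest trick used here:
  // adding and then subtracting copysign(2^52, Src) forces the fractional
  // bits to be rounded away, e.g. for Src = 3.7:
  //   Tmp1 = 3.7 + 2^52  ->  2^52 + 4 (rounded)
  //   Tmp2 = Tmp1 - 2^52 ->  4.0
  // Inputs with |Src| > C2 (just below 2^52) are already integral, so the
  // final select returns them unchanged.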
2063 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 2064 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 2065 2066 auto C2 = B.buildFConstant(Ty, C2Val); 2067 auto Fabs = B.buildFAbs(Ty, Src); 2068 2069 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 2070 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 2071 MI.eraseFromParent(); 2072 return true; 2073 } 2074 2075 bool AMDGPULegalizerInfo::legalizeFceil( 2076 MachineInstr &MI, MachineRegisterInfo &MRI, 2077 MachineIRBuilder &B) const { 2078 2079 const LLT S1 = LLT::scalar(1); 2080 const LLT S64 = LLT::scalar(64); 2081 2082 Register Src = MI.getOperand(1).getReg(); 2083 assert(MRI.getType(Src) == S64); 2084 2085 // result = trunc(src) 2086 // if (src > 0.0 && src != result) 2087 // result += 1.0 2088 2089 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 2090 2091 const auto Zero = B.buildFConstant(S64, 0.0); 2092 const auto One = B.buildFConstant(S64, 1.0); 2093 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 2094 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 2095 auto And = B.buildAnd(S1, Lt0, NeTrunc); 2096 auto Add = B.buildSelect(S64, And, One, Zero); 2097 2098 // TODO: Should this propagate fast-math-flags? 2099 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 2100 MI.eraseFromParent(); 2101 return true; 2102 } 2103 2104 bool AMDGPULegalizerInfo::legalizeFrem( 2105 MachineInstr &MI, MachineRegisterInfo &MRI, 2106 MachineIRBuilder &B) const { 2107 Register DstReg = MI.getOperand(0).getReg(); 2108 Register Src0Reg = MI.getOperand(1).getReg(); 2109 Register Src1Reg = MI.getOperand(2).getReg(); 2110 auto Flags = MI.getFlags(); 2111 LLT Ty = MRI.getType(DstReg); 2112 2113 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 2114 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 2115 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 2116 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 2117 MI.eraseFromParent(); 2118 return true; 2119 } 2120 2121 static MachineInstrBuilder extractF64Exponent(Register Hi, 2122 MachineIRBuilder &B) { 2123 const unsigned FractBits = 52; 2124 const unsigned ExpBits = 11; 2125 LLT S32 = LLT::scalar(32); 2126 2127 auto Const0 = B.buildConstant(S32, FractBits - 32); 2128 auto Const1 = B.buildConstant(S32, ExpBits); 2129 2130 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 2131 .addUse(Hi) 2132 .addUse(Const0.getReg(0)) 2133 .addUse(Const1.getReg(0)); 2134 2135 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 2136 } 2137 2138 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2139 MachineInstr &MI, MachineRegisterInfo &MRI, 2140 MachineIRBuilder &B) const { 2141 const LLT S1 = LLT::scalar(1); 2142 const LLT S32 = LLT::scalar(32); 2143 const LLT S64 = LLT::scalar(64); 2144 2145 Register Src = MI.getOperand(1).getReg(); 2146 assert(MRI.getType(Src) == S64); 2147 2148 // TODO: Should this use extract since the low half is unused? 2149 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2150 Register Hi = Unmerge.getReg(1); 2151 2152 // Extract the upper half, since this is where we will find the sign and 2153 // exponent. 2154 auto Exp = extractF64Exponent(Hi, B); 2155 2156 const unsigned FractBits = 52; 2157 2158 // Extract the sign bit. 2159 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2160 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2161 2162 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2163 2164 const auto Zero32 = B.buildConstant(S32, 0); 2165 2166 // Extend back to 64-bits. 
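  // The sign is widened back to 64 bits so that, when the unbiased exponent
  // is negative (|Src| < 1), the result can be a correctly signed zero.
  // Worked example (illustrative): for Src = 5.75 the unbiased exponent is 2,
  // so FractMask >> Exp leaves the low 50 fraction bits set; clearing them
  // from Src below produces 5.0. Exponents above 51 mean Src already has no
  // fractional bits and is returned as-is.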
2167 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2168 2169 auto Shr = B.buildAShr(S64, FractMask, Exp); 2170 auto Not = B.buildNot(S64, Shr); 2171 auto Tmp0 = B.buildAnd(S64, Src, Not); 2172 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2173 2174 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2175 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2176 2177 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2178 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2179 MI.eraseFromParent(); 2180 return true; 2181 } 2182 2183 bool AMDGPULegalizerInfo::legalizeITOFP( 2184 MachineInstr &MI, MachineRegisterInfo &MRI, 2185 MachineIRBuilder &B, bool Signed) const { 2186 2187 Register Dst = MI.getOperand(0).getReg(); 2188 Register Src = MI.getOperand(1).getReg(); 2189 2190 const LLT S64 = LLT::scalar(64); 2191 const LLT S32 = LLT::scalar(32); 2192 2193 assert(MRI.getType(Src) == S64); 2194 2195 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2196 auto ThirtyTwo = B.buildConstant(S32, 32); 2197 2198 if (MRI.getType(Dst) == S64) { 2199 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1)) 2200 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2201 2202 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2203 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2204 .addUse(CvtHi.getReg(0)) 2205 .addUse(ThirtyTwo.getReg(0)); 2206 2207 // TODO: Should this propagate fast-math-flags? 2208 B.buildFAdd(Dst, LdExp, CvtLo); 2209 MI.eraseFromParent(); 2210 return true; 2211 } 2212 2213 assert(MRI.getType(Dst) == S32); 2214 2215 auto One = B.buildConstant(S32, 1); 2216 2217 MachineInstrBuilder ShAmt; 2218 if (Signed) { 2219 auto ThirtyOne = B.buildConstant(S32, 31); 2220 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2221 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2222 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2223 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, 2224 /*HasSideEffects=*/false) 2225 .addUse(Unmerge.getReg(1)); 2226 auto LS2 = B.buildSub(S32, LS, One); 2227 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2228 } else 2229 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2230 auto Norm = B.buildShl(S64, Src, ShAmt); 2231 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2232 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2233 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2234 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2235 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2236 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, 2237 /*HasSideEffects=*/false) 2238 .addUse(FVal.getReg(0)) 2239 .addUse(Scale.getReg(0)); 2240 MI.eraseFromParent(); 2241 return true; 2242 } 2243 2244 // TODO: Copied from DAG implementation. Verify logic and document how this 2245 // actually works. 
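// Worked example (illustrative, f64 input): for val = 2^40 + 7,
//   tf  = 2^40 + 7
//   hif = floor(tf * 2^-32)    = 256
//   lof = fma(hif, -2^32, tf)  = 7
// so hi = 256 and lo = 7 re-merge into the original 64-bit integer.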
2246 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2247 MachineRegisterInfo &MRI, 2248 MachineIRBuilder &B, 2249 bool Signed) const { 2250 2251 Register Dst = MI.getOperand(0).getReg(); 2252 Register Src = MI.getOperand(1).getReg(); 2253 2254 const LLT S64 = LLT::scalar(64); 2255 const LLT S32 = LLT::scalar(32); 2256 2257 const LLT SrcLT = MRI.getType(Src); 2258 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2259 2260 unsigned Flags = MI.getFlags(); 2261 2262 // The basic idea of converting a floating point number into a pair of 32-bit 2263 // integers is illustrated as follows: 2264 // 2265 // tf := trunc(val); 2266 // hif := floor(tf * 2^-32); 2267 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2268 // hi := fptoi(hif); 2269 // lo := fptoi(lof); 2270 // 2271 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2272 MachineInstrBuilder Sign; 2273 if (Signed && SrcLT == S32) { 2274 // However, a 32-bit floating point number has only 23 bits mantissa and 2275 // it's not enough to hold all the significant bits of `lof` if val is 2276 // negative. To avoid the loss of precision, We need to take the absolute 2277 // value after truncating and flip the result back based on the original 2278 // signedness. 2279 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2280 Trunc = B.buildFAbs(S32, Trunc, Flags); 2281 } 2282 MachineInstrBuilder K0, K1; 2283 if (SrcLT == S64) { 2284 K0 = B.buildFConstant(S64, 2285 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2286 K1 = B.buildFConstant(S64, 2287 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2288 } else { 2289 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2290 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2291 } 2292 2293 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2294 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2295 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2296 2297 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2298 : B.buildFPTOUI(S32, FloorMul); 2299 auto Lo = B.buildFPTOUI(S32, Fma); 2300 2301 if (Signed && SrcLT == S32) { 2302 // Flip the result based on the signedness, which is either all 0s or 1s. 2303 Sign = B.buildMerge(S64, {Sign, Sign}); 2304 // r := xor({lo, hi}, sign) - sign; 2305 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2306 } else 2307 B.buildMerge(Dst, {Lo, Hi}); 2308 MI.eraseFromParent(); 2309 2310 return true; 2311 } 2312 2313 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2314 MachineInstr &MI) const { 2315 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2316 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2317 2318 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2319 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2320 2321 // With ieee_mode disabled, the instructions have the correct behavior 2322 // already for G_FMINNUM/G_FMAXNUM 2323 if (!MFI->getMode().IEEE) 2324 return !IsIEEEOp; 2325 2326 if (IsIEEEOp) 2327 return true; 2328 2329 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2330 } 2331 2332 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2333 MachineInstr &MI, MachineRegisterInfo &MRI, 2334 MachineIRBuilder &B) const { 2335 // TODO: Should move some of this into LegalizerHelper. 
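  // Rough sketch of the constant-index path implemented below:
  //   %idx:_(s32) = G_CONSTANT i32 1
  //   %elt:_(s32) = G_EXTRACT_VECTOR_ELT %vec:_(<4 x s32>), %idx:_(s32)
  // becomes
  //   %u0:_(s32), %u1:_(s32), %u2:_(s32), %u3:_(s32) = G_UNMERGE_VALUES %vec
  //   %elt:_(s32) = COPY %u1
  // and an out-of-range constant index folds to G_IMPLICIT_DEF.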
2336 2337 // TODO: Promote dynamic indexing of s16 to s32 2338 2339 // FIXME: Artifact combiner probably should have replaced the truncated 2340 // constant before this, so we shouldn't need 2341 // getIConstantVRegValWithLookThrough. 2342 Optional<ValueAndVReg> MaybeIdxVal = 2343 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2344 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2345 return true; 2346 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2347 2348 Register Dst = MI.getOperand(0).getReg(); 2349 Register Vec = MI.getOperand(1).getReg(); 2350 2351 LLT VecTy = MRI.getType(Vec); 2352 LLT EltTy = VecTy.getElementType(); 2353 assert(EltTy == MRI.getType(Dst)); 2354 2355 if (IdxVal < VecTy.getNumElements()) { 2356 auto Unmerge = B.buildUnmerge(EltTy, Vec); 2357 B.buildCopy(Dst, Unmerge.getReg(IdxVal)); 2358 } else { 2359 B.buildUndef(Dst); 2360 } 2361 2362 MI.eraseFromParent(); 2363 return true; 2364 } 2365 2366 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2367 MachineInstr &MI, MachineRegisterInfo &MRI, 2368 MachineIRBuilder &B) const { 2369 // TODO: Should move some of this into LegalizerHelper. 2370 2371 // TODO: Promote dynamic indexing of s16 to s32 2372 2373 // FIXME: Artifact combiner probably should have replaced the truncated 2374 // constant before this, so we shouldn't need 2375 // getIConstantVRegValWithLookThrough. 2376 Optional<ValueAndVReg> MaybeIdxVal = 2377 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2378 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2379 return true; 2380 2381 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2382 Register Dst = MI.getOperand(0).getReg(); 2383 Register Vec = MI.getOperand(1).getReg(); 2384 Register Ins = MI.getOperand(2).getReg(); 2385 2386 LLT VecTy = MRI.getType(Vec); 2387 LLT EltTy = VecTy.getElementType(); 2388 assert(EltTy == MRI.getType(Ins)); 2389 (void)Ins; 2390 2391 unsigned NumElts = VecTy.getNumElements(); 2392 if (IdxVal < NumElts) { 2393 SmallVector<Register, 8> SrcRegs; 2394 for (unsigned i = 0; i < NumElts; ++i) 2395 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); 2396 B.buildUnmerge(SrcRegs, Vec); 2397 2398 SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 2399 B.buildMerge(Dst, SrcRegs); 2400 } else { 2401 B.buildUndef(Dst); 2402 } 2403 2404 MI.eraseFromParent(); 2405 return true; 2406 } 2407 2408 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2409 MachineInstr &MI, MachineRegisterInfo &MRI, 2410 MachineIRBuilder &B) const { 2411 const LLT V2S16 = LLT::fixed_vector(2, 16); 2412 2413 Register Dst = MI.getOperand(0).getReg(); 2414 Register Src0 = MI.getOperand(1).getReg(); 2415 LLT DstTy = MRI.getType(Dst); 2416 LLT SrcTy = MRI.getType(Src0); 2417 2418 if (SrcTy == V2S16 && DstTy == V2S16 && 2419 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2420 return true; 2421 2422 MachineIRBuilder HelperBuilder(MI); 2423 GISelObserverWrapper DummyObserver; 2424 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2425 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2426 } 2427 2428 bool AMDGPULegalizerInfo::legalizeSinCos( 2429 MachineInstr &MI, MachineRegisterInfo &MRI, 2430 MachineIRBuilder &B) const { 2431 2432 Register DstReg = MI.getOperand(0).getReg(); 2433 Register SrcReg = MI.getOperand(1).getReg(); 2434 LLT Ty = MRI.getType(DstReg); 2435 unsigned Flags = MI.getFlags(); 2436 2437 Register TrigVal; 2438 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * 
numbers::inv_pi); 2439 if (ST.hasTrigReducedRange()) { 2440 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2441 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2442 .addUse(MulVal.getReg(0)) 2443 .setMIFlags(Flags).getReg(0); 2444 } else 2445 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2446 2447 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2448 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2449 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2450 .addUse(TrigVal) 2451 .setMIFlags(Flags); 2452 MI.eraseFromParent(); 2453 return true; 2454 } 2455 2456 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2457 MachineIRBuilder &B, 2458 const GlobalValue *GV, 2459 int64_t Offset, 2460 unsigned GAFlags) const { 2461 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2462 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2463 // to the following code sequence: 2464 // 2465 // For constant address space: 2466 // s_getpc_b64 s[0:1] 2467 // s_add_u32 s0, s0, $symbol 2468 // s_addc_u32 s1, s1, 0 2469 // 2470 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2471 // a fixup or relocation is emitted to replace $symbol with a literal 2472 // constant, which is a pc-relative offset from the encoding of the $symbol 2473 // operand to the global variable. 2474 // 2475 // For global address space: 2476 // s_getpc_b64 s[0:1] 2477 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2478 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2479 // 2480 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2481 // fixups or relocations are emitted to replace $symbol@*@lo and 2482 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2483 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2484 // operand to the global variable. 2485 // 2486 // What we want here is an offset from the value returned by s_getpc 2487 // (which is the address of the s_add_u32 instruction) to the global 2488 // variable, but since the encoding of $symbol starts 4 bytes after the start 2489 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2490 // small. This requires us to add 4 to the global variable offset in order to 2491 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2492 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2493 // instruction. 2494 2495 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2496 2497 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 2498 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2499 2500 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2501 .addDef(PCReg); 2502 2503 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2504 if (GAFlags == SIInstrInfo::MO_NONE) 2505 MIB.addImm(0); 2506 else 2507 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2508 2509 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2510 2511 if (PtrTy.getSizeInBits() == 32) 2512 B.buildExtract(DstReg, PCReg, 0); 2513 return true; 2514 } 2515 2516 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2517 MachineInstr &MI, MachineRegisterInfo &MRI, 2518 MachineIRBuilder &B) const { 2519 Register DstReg = MI.getOperand(0).getReg(); 2520 LLT Ty = MRI.getType(DstReg); 2521 unsigned AS = Ty.getAddressSpace(); 2522 2523 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2524 MachineFunction &MF = B.getMF(); 2525 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2526 2527 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2528 if (!MFI->isModuleEntryFunction() && 2529 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2530 const Function &Fn = MF.getFunction(); 2531 DiagnosticInfoUnsupported BadLDSDecl( 2532 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2533 DS_Warning); 2534 Fn.getContext().diagnose(BadLDSDecl); 2535 2536 // We currently don't have a way to correctly allocate LDS objects that 2537 // aren't directly associated with a kernel. We do force inlining of 2538 // functions that use local objects. However, if these dead functions are 2539 // not eliminated, we don't want a compile time error. Just emit a warning 2540 // and a trap, since there should be no callable path here. 2541 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2542 B.buildUndef(DstReg); 2543 MI.eraseFromParent(); 2544 return true; 2545 } 2546 2547 // TODO: We could emit code to handle the initialization somewhere. 2548 // We ignore the initializer for now and legalize it to allow selection. 2549 // The initializer will anyway get errored out during assembly emission. 2550 const SITargetLowering *TLI = ST.getTargetLowering(); 2551 if (!TLI->shouldUseLDSConstAddress(GV)) { 2552 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2553 return true; // Leave in place; 2554 } 2555 2556 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2557 Type *Ty = GV->getValueType(); 2558 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2559 // zero-sized type in other languages to declare the dynamic shared 2560 // memory which size is not known at the compile time. They will be 2561 // allocated by the runtime and placed directly after the static 2562 // allocated ones. They all share the same offset. 2563 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2564 // Adjust alignment for that dynamic shared memory array. 
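          // (Illustrative: HIP's `extern __shared__ float s[];` reaches this
          // point as a zero-sized external LDS global, and its address ends
          // up being just the size of the statically allocated LDS, queried
          // through amdgcn.groupstaticsize below.)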
2565 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2566 LLT S32 = LLT::scalar(32); 2567 auto Sz = 2568 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2569 B.buildIntToPtr(DstReg, Sz); 2570 MI.eraseFromParent(); 2571 return true; 2572 } 2573 } 2574 2575 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), 2576 *cast<GlobalVariable>(GV))); 2577 MI.eraseFromParent(); 2578 return true; 2579 } 2580 2581 const SITargetLowering *TLI = ST.getTargetLowering(); 2582 2583 if (TLI->shouldEmitFixup(GV)) { 2584 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2585 MI.eraseFromParent(); 2586 return true; 2587 } 2588 2589 if (TLI->shouldEmitPCReloc(GV)) { 2590 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2591 MI.eraseFromParent(); 2592 return true; 2593 } 2594 2595 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2596 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2597 2598 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; 2599 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2600 MachinePointerInfo::getGOT(MF), 2601 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2602 MachineMemOperand::MOInvariant, 2603 LoadTy, Align(8)); 2604 2605 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2606 2607 if (Ty.getSizeInBits() == 32) { 2608 // Truncate if this is a 32-bit constant address. 2609 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2610 B.buildExtract(DstReg, Load, 0); 2611 } else 2612 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2613 2614 MI.eraseFromParent(); 2615 return true; 2616 } 2617 2618 static LLT widenToNextPowerOf2(LLT Ty) { 2619 if (Ty.isVector()) 2620 return Ty.changeElementCount( 2621 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2622 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2623 } 2624 2625 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2626 MachineInstr &MI) const { 2627 MachineIRBuilder &B = Helper.MIRBuilder; 2628 MachineRegisterInfo &MRI = *B.getMRI(); 2629 GISelChangeObserver &Observer = Helper.Observer; 2630 2631 Register PtrReg = MI.getOperand(1).getReg(); 2632 LLT PtrTy = MRI.getType(PtrReg); 2633 unsigned AddrSpace = PtrTy.getAddressSpace(); 2634 2635 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2636 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2637 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2638 Observer.changingInstr(MI); 2639 MI.getOperand(1).setReg(Cast.getReg(0)); 2640 Observer.changedInstr(MI); 2641 return true; 2642 } 2643 2644 if (MI.getOpcode() != AMDGPU::G_LOAD) 2645 return false; 2646 2647 Register ValReg = MI.getOperand(0).getReg(); 2648 LLT ValTy = MRI.getType(ValReg); 2649 2650 MachineMemOperand *MMO = *MI.memoperands_begin(); 2651 const unsigned ValSize = ValTy.getSizeInBits(); 2652 const LLT MemTy = MMO->getMemoryType(); 2653 const Align MemAlign = MMO->getAlign(); 2654 const unsigned MemSize = MemTy.getSizeInBits(); 2655 const uint64_t AlignInBits = 8 * MemAlign.value(); 2656 2657 // Widen non-power-of-2 loads to the alignment if needed 2658 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2659 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2660 2661 // This was already the correct extending load result type, so just adjust 2662 // the memory type. 
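    // Example (illustrative): a 96-bit load with 128-bit alignment may be
    // widened to a 128-bit load. If the destination register was already the
    // wide size (an extending load), only the memory operand is updated;
    // otherwise the wide result is truncated or G_EXTRACTed back down to the
    // original value type further below.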
2663 if (WideMemSize == ValSize) { 2664 MachineFunction &MF = B.getMF(); 2665 2666 MachineMemOperand *WideMMO = 2667 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2668 Observer.changingInstr(MI); 2669 MI.setMemRefs(MF, {WideMMO}); 2670 Observer.changedInstr(MI); 2671 return true; 2672 } 2673 2674 // Don't bother handling edge case that should probably never be produced. 2675 if (ValSize > WideMemSize) 2676 return false; 2677 2678 LLT WideTy = widenToNextPowerOf2(ValTy); 2679 2680 Register WideLoad; 2681 if (!WideTy.isVector()) { 2682 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2683 B.buildTrunc(ValReg, WideLoad).getReg(0); 2684 } else { 2685 // Extract the subvector. 2686 2687 if (isRegisterType(ValTy)) { 2688 // If this a case where G_EXTRACT is legal, use it. 2689 // (e.g. <3 x s32> -> <4 x s32>) 2690 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2691 B.buildExtract(ValReg, WideLoad, 0); 2692 } else { 2693 // For cases where the widened type isn't a nice register value, unmerge 2694 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2695 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2696 B.buildDeleteTrailingVectorElements(ValReg, WideLoad); 2697 } 2698 } 2699 2700 MI.eraseFromParent(); 2701 return true; 2702 } 2703 2704 return false; 2705 } 2706 2707 bool AMDGPULegalizerInfo::legalizeFMad( 2708 MachineInstr &MI, MachineRegisterInfo &MRI, 2709 MachineIRBuilder &B) const { 2710 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2711 assert(Ty.isScalar()); 2712 2713 MachineFunction &MF = B.getMF(); 2714 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2715 2716 // TODO: Always legal with future ftz flag. 2717 // FIXME: Do we need just output? 2718 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2719 return true; 2720 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2721 return true; 2722 2723 MachineIRBuilder HelperBuilder(MI); 2724 GISelObserverWrapper DummyObserver; 2725 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2726 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2727 } 2728 2729 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2730 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2731 Register DstReg = MI.getOperand(0).getReg(); 2732 Register PtrReg = MI.getOperand(1).getReg(); 2733 Register CmpVal = MI.getOperand(2).getReg(); 2734 Register NewVal = MI.getOperand(3).getReg(); 2735 2736 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2737 "this should not have been custom lowered"); 2738 2739 LLT ValTy = MRI.getType(CmpVal); 2740 LLT VecTy = LLT::fixed_vector(2, ValTy); 2741 2742 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2743 2744 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2745 .addDef(DstReg) 2746 .addUse(PtrReg) 2747 .addUse(PackedVal) 2748 .setMemRefs(MI.memoperands()); 2749 2750 MI.eraseFromParent(); 2751 return true; 2752 } 2753 2754 bool AMDGPULegalizerInfo::legalizeFlog( 2755 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2756 Register Dst = MI.getOperand(0).getReg(); 2757 Register Src = MI.getOperand(1).getReg(); 2758 LLT Ty = B.getMRI()->getType(Dst); 2759 unsigned Flags = MI.getFlags(); 2760 2761 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2762 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2763 2764 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2765 
MI.eraseFromParent(); 2766 return true; 2767 } 2768 2769 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2770 MachineIRBuilder &B) const { 2771 Register Dst = MI.getOperand(0).getReg(); 2772 Register Src = MI.getOperand(1).getReg(); 2773 unsigned Flags = MI.getFlags(); 2774 LLT Ty = B.getMRI()->getType(Dst); 2775 2776 auto K = B.buildFConstant(Ty, numbers::log2e); 2777 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2778 B.buildFExp2(Dst, Mul, Flags); 2779 MI.eraseFromParent(); 2780 return true; 2781 } 2782 2783 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2784 MachineIRBuilder &B) const { 2785 Register Dst = MI.getOperand(0).getReg(); 2786 Register Src0 = MI.getOperand(1).getReg(); 2787 Register Src1 = MI.getOperand(2).getReg(); 2788 unsigned Flags = MI.getFlags(); 2789 LLT Ty = B.getMRI()->getType(Dst); 2790 const LLT S16 = LLT::scalar(16); 2791 const LLT S32 = LLT::scalar(32); 2792 2793 if (Ty == S32) { 2794 auto Log = B.buildFLog2(S32, Src0, Flags); 2795 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2796 .addUse(Log.getReg(0)) 2797 .addUse(Src1) 2798 .setMIFlags(Flags); 2799 B.buildFExp2(Dst, Mul, Flags); 2800 } else if (Ty == S16) { 2801 // There's no f16 fmul_legacy, so we need to convert for it. 2802 auto Log = B.buildFLog2(S16, Src0, Flags); 2803 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2804 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2805 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2806 .addUse(Ext0.getReg(0)) 2807 .addUse(Ext1.getReg(0)) 2808 .setMIFlags(Flags); 2809 2810 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2811 } else 2812 return false; 2813 2814 MI.eraseFromParent(); 2815 return true; 2816 } 2817 2818 // Find a source register, ignoring any possible source modifiers. 2819 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2820 Register ModSrc = OrigSrc; 2821 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2822 ModSrc = SrcFNeg->getOperand(1).getReg(); 2823 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2824 ModSrc = SrcFAbs->getOperand(1).getReg(); 2825 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2826 ModSrc = SrcFAbs->getOperand(1).getReg(); 2827 return ModSrc; 2828 } 2829 2830 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2831 MachineRegisterInfo &MRI, 2832 MachineIRBuilder &B) const { 2833 2834 const LLT S1 = LLT::scalar(1); 2835 const LLT S64 = LLT::scalar(64); 2836 Register Dst = MI.getOperand(0).getReg(); 2837 Register OrigSrc = MI.getOperand(1).getReg(); 2838 unsigned Flags = MI.getFlags(); 2839 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2840 "this should not have been custom lowered"); 2841 2842 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2843 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2844 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2845 // V_FRACT bug is: 2846 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2847 // 2848 // Convert floor(x) to (x - fract(x)) 2849 2850 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2851 .addUse(OrigSrc) 2852 .setMIFlags(Flags); 2853 2854 // Give source modifier matching some assistance before obscuring a foldable 2855 // pattern. 2856 2857 // TODO: We can avoid the neg on the fract? The input sign to fract 2858 // shouldn't matter? 
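  // Note (illustrative): the 0x3fefffffffffffff constant used below is the
  // largest double strictly less than 1.0, so the min keeps the buggy V_FRACT
  // from ever reporting a fractional part of exactly 1.0, which would make
  // x - fract(x) miss the exact floor value.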
2859 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2860 2861 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2862 2863 Register Min = MRI.createGenericVirtualRegister(S64); 2864 2865 // We don't need to concern ourselves with the snan handling difference, so 2866 // use the one which will directly select. 2867 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2868 if (MFI->getMode().IEEE) 2869 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2870 else 2871 B.buildFMinNum(Min, Fract, Const, Flags); 2872 2873 Register CorrectedFract = Min; 2874 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2875 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2876 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2877 } 2878 2879 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2880 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2881 2882 MI.eraseFromParent(); 2883 return true; 2884 } 2885 2886 // Turn an illegal packed v2s16 build vector into bit operations. 2887 // TODO: This should probably be a bitcast action in LegalizerHelper. 2888 bool AMDGPULegalizerInfo::legalizeBuildVector( 2889 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2890 Register Dst = MI.getOperand(0).getReg(); 2891 const LLT S32 = LLT::scalar(32); 2892 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2893 2894 Register Src0 = MI.getOperand(1).getReg(); 2895 Register Src1 = MI.getOperand(2).getReg(); 2896 assert(MRI.getType(Src0) == LLT::scalar(16)); 2897 2898 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2899 B.buildBitcast(Dst, Merge); 2900 2901 MI.eraseFromParent(); 2902 return true; 2903 } 2904 2905 // Build a big integer multiply or multiply-add using MAD_64_32 instructions. 2906 // 2907 // Source and accumulation registers must all be 32-bits. 2908 // 2909 // TODO: When the multiply is uniform, we should produce a code sequence 2910 // that is better suited to instruction selection on the SALU. Instead of 2911 // the outer loop going over parts of the result, the outer loop should go 2912 // over parts of one of the factors. This should result in instruction 2913 // selection that makes full use of S_ADDC_U32 instructions. 2914 void AMDGPULegalizerInfo::buildMultiply( 2915 LegalizerHelper &Helper, MutableArrayRef<Register> Accum, 2916 ArrayRef<Register> Src0, ArrayRef<Register> Src1, 2917 bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const { 2918 // Use (possibly empty) vectors of S1 registers to represent the set of 2919 // carries from one pair of positions to the next. 2920 using Carry = SmallVector<Register, 2>; 2921 2922 MachineIRBuilder &B = Helper.MIRBuilder; 2923 2924 const LLT S1 = LLT::scalar(1); 2925 const LLT S32 = LLT::scalar(32); 2926 const LLT S64 = LLT::scalar(64); 2927 2928 Register Zero32; 2929 Register Zero64; 2930 2931 auto getZero32 = [&]() -> Register { 2932 if (!Zero32) 2933 Zero32 = B.buildConstant(S32, 0).getReg(0); 2934 return Zero32; 2935 }; 2936 auto getZero64 = [&]() -> Register { 2937 if (!Zero64) 2938 Zero64 = B.buildConstant(S64, 0).getReg(0); 2939 return Zero64; 2940 }; 2941 2942 // Merge the given carries into the 32-bit LocalAccum, which is modified 2943 // in-place. 2944 // 2945 // Returns the carry-out, which is a single S1 register or null. 
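  // Illustrative two-carry case: CarryAccum starts as zext(CarryIn[0]) and
  // the final G_UADDE below adds it to LocalAccum with CarryIn[1] as the
  // carry-in, yielding the new accumulator value plus a single carry-out.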
2946 auto mergeCarry = 2947 [&](Register &LocalAccum, const Carry &CarryIn) -> Register { 2948 if (CarryIn.empty()) 2949 return Register(); 2950 2951 bool HaveCarryOut = true; 2952 Register CarryAccum; 2953 if (CarryIn.size() == 1) { 2954 if (!LocalAccum) { 2955 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); 2956 return Register(); 2957 } 2958 2959 CarryAccum = getZero32(); 2960 } else { 2961 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); 2962 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) { 2963 CarryAccum = 2964 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i]) 2965 .getReg(0); 2966 } 2967 2968 if (!LocalAccum) { 2969 LocalAccum = getZero32(); 2970 HaveCarryOut = false; 2971 } 2972 } 2973 2974 auto Add = 2975 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back()); 2976 LocalAccum = Add.getReg(0); 2977 return HaveCarryOut ? Add.getReg(1) : Register(); 2978 }; 2979 2980 // Build a multiply-add chain to compute 2981 // 2982 // LocalAccum + (partial products at DstIndex) 2983 // + (opportunistic subset of CarryIn) 2984 // 2985 // LocalAccum is an array of one or two 32-bit registers that are updated 2986 // in-place. The incoming registers may be null. 2987 // 2988 // In some edge cases, carry-ins can be consumed "for free". In that case, 2989 // the consumed carry bits are removed from CarryIn in-place. 2990 auto buildMadChain = 2991 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn) 2992 -> Carry { 2993 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) || 2994 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1)); 2995 2996 Carry CarryOut; 2997 unsigned j0 = 0; 2998 2999 // Use plain 32-bit multiplication for the most significant part of the 3000 // result by default. 3001 if (LocalAccum.size() == 1 && 3002 (!UsePartialMad64_32 || !CarryIn.empty())) { 3003 do { 3004 unsigned j1 = DstIndex - j0; 3005 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]); 3006 if (!LocalAccum[0]) { 3007 LocalAccum[0] = Mul.getReg(0); 3008 } else { 3009 if (CarryIn.empty()) { 3010 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0); 3011 } else { 3012 LocalAccum[0] = 3013 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back()) 3014 .getReg(0); 3015 CarryIn.pop_back(); 3016 } 3017 } 3018 ++j0; 3019 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty())); 3020 } 3021 3022 // Build full 64-bit multiplies. 
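    // Sketch of the loop below (illustrative): each step folds one 32x32
    // partial product into a 64-bit accumulator,
    //   Tmp = Src0[j0] * Src1[j1] + Tmp   (G_AMDGPU_MAD_U64_U32)
    // and once the accumulator can no longer be proven to fit without
    // overflow, the instruction's carry-out bit is collected into CarryOut.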
3023 if (j0 <= DstIndex) { 3024 bool HaveSmallAccum = false; 3025 Register Tmp; 3026 3027 if (LocalAccum[0]) { 3028 if (LocalAccum.size() == 1) { 3029 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0); 3030 HaveSmallAccum = true; 3031 } else if (LocalAccum[1]) { 3032 Tmp = B.buildMerge(S64, LocalAccum).getReg(0); 3033 HaveSmallAccum = false; 3034 } else { 3035 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0); 3036 HaveSmallAccum = true; 3037 } 3038 } else { 3039 assert(LocalAccum.size() == 1 || !LocalAccum[1]); 3040 Tmp = getZero64(); 3041 HaveSmallAccum = true; 3042 } 3043 3044 do { 3045 unsigned j1 = DstIndex - j0; 3046 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1}, 3047 {Src0[j0], Src1[j1], Tmp}); 3048 Tmp = Mad.getReg(0); 3049 if (!HaveSmallAccum) 3050 CarryOut.push_back(Mad.getReg(1)); 3051 HaveSmallAccum = false; 3052 ++j0; 3053 } while (j0 <= DstIndex); 3054 3055 auto Unmerge = B.buildUnmerge(S32, Tmp); 3056 LocalAccum[0] = Unmerge.getReg(0); 3057 if (LocalAccum.size() > 1) 3058 LocalAccum[1] = Unmerge.getReg(1); 3059 } 3060 3061 return CarryOut; 3062 }; 3063 3064 // Outer multiply loop, iterating over destination parts from least 3065 // significant to most significant parts. 3066 // 3067 // The columns of the following diagram correspond to the destination parts 3068 // affected by one iteration of the outer loop (ignoring boundary 3069 // conditions). 3070 // 3071 // Dest index relative to 2 * i: 1 0 -1 3072 // ------ 3073 // Carries from previous iteration: e o 3074 // Even-aligned partial product sum: E E . 3075 // Odd-aligned partial product sum: O O 3076 // 3077 // 'o' is OddCarry, 'e' is EvenCarry. 3078 // EE and OO are computed from partial products via buildMadChain and use 3079 // accumulation where possible and appropriate. 3080 // 3081 Register SeparateOddCarry; 3082 Carry EvenCarry; 3083 Carry OddCarry; 3084 3085 for (unsigned i = 0; i <= Accum.size() / 2; ++i) { 3086 Carry OddCarryIn = std::move(OddCarry); 3087 Carry EvenCarryIn = std::move(EvenCarry); 3088 OddCarry.clear(); 3089 EvenCarry.clear(); 3090 3091 // Partial products at offset 2 * i. 3092 if (2 * i < Accum.size()) { 3093 auto LocalAccum = Accum.drop_front(2 * i).take_front(2); 3094 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn); 3095 } 3096 3097 // Partial products at offset 2 * i - 1. 3098 if (i > 0) { 3099 if (!SeparateOddAlignedProducts) { 3100 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2); 3101 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); 3102 } else { 3103 bool IsHighest = 2 * i >= Accum.size(); 3104 Register SeparateOddOut[2]; 3105 auto LocalAccum = makeMutableArrayRef(SeparateOddOut) 3106 .take_front(IsHighest ? 
1 : 2); 3107 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); 3108 3109 MachineInstr *Lo; 3110 3111 if (i == 1) { 3112 if (!IsHighest) 3113 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]); 3114 else 3115 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]); 3116 } else { 3117 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0], 3118 SeparateOddCarry); 3119 } 3120 Accum[2 * i - 1] = Lo->getOperand(0).getReg(); 3121 3122 if (!IsHighest) { 3123 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1], 3124 Lo->getOperand(1).getReg()); 3125 Accum[2 * i] = Hi.getReg(0); 3126 SeparateOddCarry = Hi.getReg(1); 3127 } 3128 } 3129 } 3130 3131 // Add in the carries from the previous iteration 3132 if (i > 0) { 3133 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn)) 3134 EvenCarryIn.push_back(CarryOut); 3135 3136 if (2 * i < Accum.size()) { 3137 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn)) 3138 OddCarry.push_back(CarryOut); 3139 } 3140 } 3141 } 3142 } 3143 3144 // Custom narrowing of wide multiplies using wide multiply-add instructions. 3145 // 3146 // TODO: If the multiply is followed by an addition, we should attempt to 3147 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities. 3148 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper, 3149 MachineInstr &MI) const { 3150 assert(ST.hasMad64_32()); 3151 assert(MI.getOpcode() == TargetOpcode::G_MUL); 3152 3153 MachineIRBuilder &B = Helper.MIRBuilder; 3154 MachineRegisterInfo &MRI = *B.getMRI(); 3155 3156 Register DstReg = MI.getOperand(0).getReg(); 3157 Register Src0 = MI.getOperand(1).getReg(); 3158 Register Src1 = MI.getOperand(2).getReg(); 3159 3160 LLT Ty = MRI.getType(DstReg); 3161 assert(Ty.isScalar()); 3162 3163 unsigned Size = Ty.getSizeInBits(); 3164 unsigned NumParts = Size / 32; 3165 assert((Size % 32) == 0); 3166 assert(NumParts >= 2); 3167 3168 // Whether to use MAD_64_32 for partial products whose high half is 3169 // discarded. This avoids some ADD instructions but risks false dependency 3170 // stalls on some subtargets in some cases. 3171 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10; 3172 3173 // Whether to compute odd-aligned partial products separately. This is 3174 // advisable on subtargets where the accumulator of MAD_64_32 must be placed 3175 // in an even-aligned VGPR. 3176 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops(); 3177 3178 LLT S32 = LLT::scalar(32); 3179 SmallVector<Register, 2> Src0Parts, Src1Parts; 3180 for (unsigned i = 0; i < NumParts; ++i) { 3181 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32)); 3182 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32)); 3183 } 3184 B.buildUnmerge(Src0Parts, Src0); 3185 B.buildUnmerge(Src1Parts, Src1); 3186 3187 SmallVector<Register, 2> AccumRegs(NumParts); 3188 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32, 3189 SeparateOddAlignedProducts); 3190 3191 B.buildMerge(DstReg, AccumRegs); 3192 MI.eraseFromParent(); 3193 return true; 3194 3195 } 3196 3197 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 3198 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 3199 // case with a single min instruction instead of a compare+select. 
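// For example (illustrative), a 32-bit G_CTLZ becomes
//   %tmp:_(s32) = G_AMDGPU_FFBH_U32 %src
//   %dst:_(s32) = G_UMIN %tmp, 32
// relying on FFBH/FFBL returning -1 (all ones) for a zero input, which the
// unsigned min then clamps to the bit width.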
3200 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 3201 MachineRegisterInfo &MRI, 3202 MachineIRBuilder &B) const { 3203 Register Dst = MI.getOperand(0).getReg(); 3204 Register Src = MI.getOperand(1).getReg(); 3205 LLT DstTy = MRI.getType(Dst); 3206 LLT SrcTy = MRI.getType(Src); 3207 3208 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 3209 ? AMDGPU::G_AMDGPU_FFBH_U32 3210 : AMDGPU::G_AMDGPU_FFBL_B32; 3211 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 3212 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 3213 3214 MI.eraseFromParent(); 3215 return true; 3216 } 3217 3218 // Check that this is a G_XOR x, -1 3219 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 3220 if (MI.getOpcode() != TargetOpcode::G_XOR) 3221 return false; 3222 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 3223 return ConstVal && *ConstVal == -1; 3224 } 3225 3226 // Return the use branch instruction, otherwise null if the usage is invalid. 3227 static MachineInstr * 3228 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 3229 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 3230 Register CondDef = MI.getOperand(0).getReg(); 3231 if (!MRI.hasOneNonDBGUse(CondDef)) 3232 return nullptr; 3233 3234 MachineBasicBlock *Parent = MI.getParent(); 3235 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 3236 3237 if (isNot(MRI, *UseMI)) { 3238 Register NegatedCond = UseMI->getOperand(0).getReg(); 3239 if (!MRI.hasOneNonDBGUse(NegatedCond)) 3240 return nullptr; 3241 3242 // We're deleting the def of this value, so we need to remove it. 3243 eraseInstr(*UseMI, MRI); 3244 3245 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 3246 Negated = true; 3247 } 3248 3249 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 3250 return nullptr; 3251 3252 // Make sure the cond br is followed by a G_BR, or is the last instruction. 3253 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 3254 if (Next == Parent->end()) { 3255 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 3256 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 3257 return nullptr; 3258 UncondBrTarget = &*NextMBB; 3259 } else { 3260 if (Next->getOpcode() != AMDGPU::G_BR) 3261 return nullptr; 3262 Br = &*Next; 3263 UncondBrTarget = Br->getOperand(0).getMBB(); 3264 } 3265 3266 return UseMI; 3267 } 3268 3269 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 3270 const ArgDescriptor *Arg, 3271 const TargetRegisterClass *ArgRC, 3272 LLT ArgTy) const { 3273 MCRegister SrcReg = Arg->getRegister(); 3274 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 3275 assert(DstReg.isVirtual() && "Virtual register expected"); 3276 3277 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, 3278 *ArgRC, B.getDebugLoc(), ArgTy); 3279 if (Arg->isMasked()) { 3280 // TODO: Should we try to emit this once in the entry block? 3281 const LLT S32 = LLT::scalar(32); 3282 const unsigned Mask = Arg->getMask(); 3283 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 3284 3285 Register AndMaskSrc = LiveIn; 3286 3287 // TODO: Avoid clearing the high bits if we know workitem id y/z are always 3288 // 0. 
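    // Example (illustrative): with packed workitem IDs, the Y component might
    // live at mask 0x000ffc00 of the input register, so it is recovered as
    //   (LiveIn >> 10) & 0x3ff
    // i.e. shift by countTrailingZeros(Mask), then AND with the shifted mask.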
3289 if (Shift != 0) { 3290 auto ShiftAmt = B.buildConstant(S32, Shift); 3291 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 3292 } 3293 3294 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 3295 } else { 3296 B.buildCopy(DstReg, LiveIn); 3297 } 3298 3299 return true; 3300 } 3301 3302 bool AMDGPULegalizerInfo::loadInputValue( 3303 Register DstReg, MachineIRBuilder &B, 3304 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 3305 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3306 const ArgDescriptor *Arg; 3307 const TargetRegisterClass *ArgRC; 3308 LLT ArgTy; 3309 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 3310 3311 if (!Arg) { 3312 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 3313 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 3314 // case the pointer argument may be missing and we use null. 3315 B.buildConstant(DstReg, 0); 3316 return true; 3317 } 3318 3319 // It's undefined behavior if a function marked with the amdgpu-no-* 3320 // attributes uses the corresponding intrinsic. 3321 B.buildUndef(DstReg); 3322 return true; 3323 } 3324 3325 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 3326 return false; // TODO: Handle these 3327 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 3328 } 3329 3330 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 3331 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 3332 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 3333 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 3334 return false; 3335 3336 MI.eraseFromParent(); 3337 return true; 3338 } 3339 3340 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, 3341 int64_t C) { 3342 B.buildConstant(MI.getOperand(0).getReg(), C); 3343 MI.eraseFromParent(); 3344 return true; 3345 } 3346 3347 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic( 3348 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 3349 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 3350 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim); 3351 if (MaxID == 0) 3352 return replaceWithConstant(B, MI, 0); 3353 3354 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3355 const ArgDescriptor *Arg; 3356 const TargetRegisterClass *ArgRC; 3357 LLT ArgTy; 3358 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 3359 3360 Register DstReg = MI.getOperand(0).getReg(); 3361 if (!Arg) { 3362 // It's undefined behavior if a function marked with the amdgpu-no-* 3363 // attributes uses the corresponding intrinsic. 3364 B.buildUndef(DstReg); 3365 MI.eraseFromParent(); 3366 return true; 3367 } 3368 3369 if (Arg->isMasked()) { 3370 // Don't bother inserting AssertZext for packed IDs since we're emitting the 3371 // masking operations anyway. 3372 // 3373 // TODO: We could assert the top bit is 0 for the source copy. 
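// For the unmasked path in the else branch below, the known-bits hint width
// is 32 - countLeadingZeros(MaxID), i.e. the number of bits needed to hold
// MaxID. For example (a worked case, not taken from any particular target):
// MaxID == 1023 has 22 leading zeros, so the G_ASSERT_ZEXT width is 10 bits.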
3374 if (!loadInputValue(DstReg, B, ArgType)) 3375 return false; 3376 } else { 3377 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); 3378 if (!loadInputValue(TmpReg, B, ArgType)) 3379 return false; 3380 B.buildAssertZExt(DstReg, TmpReg, 32 - countLeadingZeros(MaxID)); 3381 } 3382 3383 MI.eraseFromParent(); 3384 return true; 3385 } 3386 3387 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B, 3388 int64_t Offset) const { 3389 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 3390 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy); 3391 3392 // TODO: If we passed in the base kernel offset we could have a better 3393 // alignment than 4, but we don't really need it. 3394 if (!loadInputValue(KernArgReg, B, 3395 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3396 llvm_unreachable("failed to find kernarg segment ptr"); 3397 3398 auto COffset = B.buildConstant(LLT::scalar(64), Offset); 3399 // TODO: Should get nuw 3400 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0); 3401 } 3402 3403 /// Legalize a value that's loaded from kernel arguments. This is only used by 3404 /// legacy intrinsics. 3405 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI, 3406 MachineIRBuilder &B, 3407 uint64_t Offset, 3408 Align Alignment) const { 3409 Register DstReg = MI.getOperand(0).getReg(); 3410 3411 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) && 3412 "unexpected kernarg parameter type"); 3413 3414 Register Ptr = getKernargParameterPtr(B, Offset); 3415 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 3416 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4), 3417 MachineMemOperand::MODereferenceable | 3418 MachineMemOperand::MOInvariant); 3419 MI.eraseFromParent(); 3420 return true; 3421 } 3422 3423 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 3424 MachineRegisterInfo &MRI, 3425 MachineIRBuilder &B) const { 3426 Register Dst = MI.getOperand(0).getReg(); 3427 LLT DstTy = MRI.getType(Dst); 3428 LLT S16 = LLT::scalar(16); 3429 LLT S32 = LLT::scalar(32); 3430 LLT S64 = LLT::scalar(64); 3431 3432 if (DstTy == S16) 3433 return legalizeFDIV16(MI, MRI, B); 3434 if (DstTy == S32) 3435 return legalizeFDIV32(MI, MRI, B); 3436 if (DstTy == S64) 3437 return legalizeFDIV64(MI, MRI, B); 3438 3439 return false; 3440 } 3441 3442 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 3443 Register DstDivReg, 3444 Register DstRemReg, 3445 Register X, 3446 Register Y) const { 3447 const LLT S1 = LLT::scalar(1); 3448 const LLT S32 = LLT::scalar(32); 3449 3450 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 3451 // algorithm used here. 3452 3453 // Initial estimate of inv(y). 3454 auto FloatY = B.buildUITOFP(S32, Y); 3455 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 3456 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 3457 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 3458 auto Z = B.buildFPTOUI(S32, ScaledY); 3459 3460 // One round of UNR. 3461 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 3462 auto NegYZ = B.buildMul(S32, NegY, Z); 3463 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 3464 3465 // Quotient/remainder estimate. 3466 auto Q = B.buildUMulH(S32, X, Z); 3467 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 3468 3469 // First quotient/remainder refinement. 
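// In scalar terms, the estimate built above plus the two refinement steps
// below compute the following (a sketch mirroring the expansion referenced
// from AMDGPUCodeGenPrepare::expandDivRem32; 'rcp' is the hardware reciprocal
// approximation and 'umulh' the high half of a 32x32 multiply):
//
//   uint32_t Z = (uint32_t)(rcp((float)Y) * BitsToFloat(0x4f7ffffe)); // ~2^32/Y
//   Z += umulh(Z, (0u - Y) * Z);        // one Newton-Raphson step
//   uint32_t Q = umulh(X, Z);           // quotient estimate
//   uint32_t R = X - Q * Y;             // remainder estimate
//   if (R >= Y) { ++Q; R -= Y; }        // first refinement
//   if (R >= Y) { ++Q; R -= Y; }        // second refinement
//
// Two conditional corrections suffice because the estimate can only
// undershoot the exact quotient by at most two.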
3470 auto One = B.buildConstant(S32, 1); 3471 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 3472 if (DstDivReg) 3473 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 3474 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 3475 3476 // Second quotient/remainder refinement. 3477 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 3478 if (DstDivReg) 3479 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 3480 3481 if (DstRemReg) 3482 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 3483 } 3484 3485 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 3486 // 3487 // Return lo, hi of result 3488 // 3489 // %cvt.lo = G_UITOFP Val.lo 3490 // %cvt.hi = G_UITOFP Val.hi 3491 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 3492 // %rcp = G_AMDGPU_RCP_IFLAG %mad 3493 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 3494 // %mul2 = G_FMUL %mul1, 2**(-32) 3495 // %trunc = G_INTRINSIC_TRUNC %mul2 3496 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 3497 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 3498 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 3499 Register Val) { 3500 const LLT S32 = LLT::scalar(32); 3501 auto Unmerge = B.buildUnmerge(S32, Val); 3502 3503 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 3504 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 3505 3506 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 3507 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 3508 3509 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 3510 auto Mul1 = 3511 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 3512 3513 // 2**(-32) 3514 auto Mul2 = 3515 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 3516 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 3517 3518 // -(2**32) 3519 auto Mad2 = B.buildFMAD(S32, Trunc, 3520 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 3521 3522 auto ResultLo = B.buildFPTOUI(S32, Mad2); 3523 auto ResultHi = B.buildFPTOUI(S32, Trunc); 3524 3525 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 3526 } 3527 3528 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 3529 Register DstDivReg, 3530 Register DstRemReg, 3531 Register Numer, 3532 Register Denom) const { 3533 const LLT S32 = LLT::scalar(32); 3534 const LLT S64 = LLT::scalar(64); 3535 const LLT S1 = LLT::scalar(1); 3536 Register RcpLo, RcpHi; 3537 3538 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 3539 3540 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 3541 3542 auto Zero64 = B.buildConstant(S64, 0); 3543 auto NegDenom = B.buildSub(S64, Zero64, Denom); 3544 3545 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 3546 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 3547 3548 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 3549 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 3550 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 3551 3552 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 3553 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 3554 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 3555 3556 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 3557 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 3558 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 3559 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 3560 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 3561 3562 auto Zero32 = B.buildConstant(S32, 0); 3563 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 3564 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 3565 auto 
Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 3566 3567 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 3568 Register NumerLo = UnmergeNumer.getReg(0); 3569 Register NumerHi = UnmergeNumer.getReg(1); 3570 3571 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3572 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 3573 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3574 Register Mul3_Lo = UnmergeMul3.getReg(0); 3575 Register Mul3_Hi = UnmergeMul3.getReg(1); 3576 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3577 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3578 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3579 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3580 3581 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3582 Register DenomLo = UnmergeDenom.getReg(0); 3583 Register DenomHi = UnmergeDenom.getReg(1); 3584 3585 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3586 auto C1 = B.buildSExt(S32, CmpHi); 3587 3588 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3589 auto C2 = B.buildSExt(S32, CmpLo); 3590 3591 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3592 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3593 3594 // TODO: Here and below portions of the code can be enclosed into if/endif. 3595 // Currently control flow is unconditional and we have 4 selects after 3596 // potential endif to substitute PHIs. 3597 3598 // if C3 != 0 ... 3599 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3600 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3601 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3602 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3603 3604 auto One64 = B.buildConstant(S64, 1); 3605 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3606 3607 auto C4 = 3608 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3609 auto C5 = 3610 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3611 auto C6 = B.buildSelect( 3612 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3613 3614 // if (C6 != 0) 3615 auto Add4 = B.buildAdd(S64, Add3, One64); 3616 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3617 3618 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3619 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3620 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3621 3622 // endif C6 3623 // endif C3 3624 3625 if (DstDivReg) { 3626 auto Sel1 = B.buildSelect( 3627 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3628 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3629 Sel1, MulHi3); 3630 } 3631 3632 if (DstRemReg) { 3633 auto Sel2 = B.buildSelect( 3634 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3635 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3636 Sel2, Sub1); 3637 } 3638 } 3639 3640 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3641 MachineRegisterInfo &MRI, 3642 MachineIRBuilder &B) const { 3643 Register DstDivReg, DstRemReg; 3644 switch (MI.getOpcode()) { 3645 default: 3646 llvm_unreachable("Unexpected opcode!"); 3647 case AMDGPU::G_UDIV: { 3648 DstDivReg = MI.getOperand(0).getReg(); 3649 break; 3650 } 3651 case AMDGPU::G_UREM: { 3652 DstRemReg = MI.getOperand(0).getReg(); 3653 break; 3654 } 3655 case AMDGPU::G_UDIVREM: { 3656 DstDivReg = MI.getOperand(0).getReg(); 3657 DstRemReg = MI.getOperand(1).getReg(); 3658 
break; 3659 } 3660 } 3661 3662 const LLT S64 = LLT::scalar(64); 3663 const LLT S32 = LLT::scalar(32); 3664 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3665 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3666 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3667 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3668 3669 if (Ty == S32) 3670 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3671 else if (Ty == S64) 3672 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3673 else 3674 return false; 3675 3676 MI.eraseFromParent(); 3677 return true; 3678 } 3679 3680 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3681 MachineRegisterInfo &MRI, 3682 MachineIRBuilder &B) const { 3683 const LLT S64 = LLT::scalar(64); 3684 const LLT S32 = LLT::scalar(32); 3685 3686 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3687 if (Ty != S32 && Ty != S64) 3688 return false; 3689 3690 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3691 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3692 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3693 3694 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3695 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3696 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3697 3698 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3699 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3700 3701 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3702 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3703 3704 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3705 switch (MI.getOpcode()) { 3706 default: 3707 llvm_unreachable("Unexpected opcode!"); 3708 case AMDGPU::G_SDIV: { 3709 DstDivReg = MI.getOperand(0).getReg(); 3710 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3711 break; 3712 } 3713 case AMDGPU::G_SREM: { 3714 DstRemReg = MI.getOperand(0).getReg(); 3715 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3716 break; 3717 } 3718 case AMDGPU::G_SDIVREM: { 3719 DstDivReg = MI.getOperand(0).getReg(); 3720 DstRemReg = MI.getOperand(1).getReg(); 3721 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3722 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3723 break; 3724 } 3725 } 3726 3727 if (Ty == S32) 3728 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3729 else 3730 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3731 3732 if (DstDivReg) { 3733 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3734 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3735 B.buildSub(DstDivReg, SignXor, Sign); 3736 } 3737 3738 if (DstRemReg) { 3739 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3740 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3741 B.buildSub(DstRemReg, SignXor, Sign); 3742 } 3743 3744 MI.eraseFromParent(); 3745 return true; 3746 } 3747 3748 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3749 MachineRegisterInfo &MRI, 3750 MachineIRBuilder &B) const { 3751 Register Res = MI.getOperand(0).getReg(); 3752 Register LHS = MI.getOperand(1).getReg(); 3753 Register RHS = MI.getOperand(2).getReg(); 3754 uint16_t Flags = MI.getFlags(); 3755 LLT ResTy = MRI.getType(Res); 3756 3757 const MachineFunction &MF = B.getMF(); 3758 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3759 MI.getFlag(MachineInstr::FmAfn); 3760 3761 if (!AllowInaccurateRcp) 3762 return false; 3763 3764 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3765 // 1 / x -> RCP(x) 3766 if 
(CLHS->isExactlyValue(1.0)) { 3767 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3768 .addUse(RHS) 3769 .setMIFlags(Flags); 3770 3771 MI.eraseFromParent(); 3772 return true; 3773 } 3774 3775 // -1 / x -> RCP( FNEG(x) ) 3776 if (CLHS->isExactlyValue(-1.0)) { 3777 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3778 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3779 .addUse(FNeg.getReg(0)) 3780 .setMIFlags(Flags); 3781 3782 MI.eraseFromParent(); 3783 return true; 3784 } 3785 } 3786 3787 // x / y -> x * (1.0 / y) 3788 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3789 .addUse(RHS) 3790 .setMIFlags(Flags); 3791 B.buildFMul(Res, LHS, RCP, Flags); 3792 3793 MI.eraseFromParent(); 3794 return true; 3795 } 3796 3797 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3798 MachineRegisterInfo &MRI, 3799 MachineIRBuilder &B) const { 3800 Register Res = MI.getOperand(0).getReg(); 3801 Register X = MI.getOperand(1).getReg(); 3802 Register Y = MI.getOperand(2).getReg(); 3803 uint16_t Flags = MI.getFlags(); 3804 LLT ResTy = MRI.getType(Res); 3805 3806 const MachineFunction &MF = B.getMF(); 3807 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3808 MI.getFlag(MachineInstr::FmAfn); 3809 3810 if (!AllowInaccurateRcp) 3811 return false; 3812 3813 auto NegY = B.buildFNeg(ResTy, Y); 3814 auto One = B.buildFConstant(ResTy, 1.0); 3815 3816 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3817 .addUse(Y) 3818 .setMIFlags(Flags); 3819 3820 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3821 R = B.buildFMA(ResTy, Tmp0, R, R); 3822 3823 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3824 R = B.buildFMA(ResTy, Tmp1, R, R); 3825 3826 auto Ret = B.buildFMul(ResTy, X, R); 3827 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3828 3829 B.buildFMA(Res, Tmp2, R, Ret); 3830 MI.eraseFromParent(); 3831 return true; 3832 } 3833 3834 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3835 MachineRegisterInfo &MRI, 3836 MachineIRBuilder &B) const { 3837 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3838 return true; 3839 3840 Register Res = MI.getOperand(0).getReg(); 3841 Register LHS = MI.getOperand(1).getReg(); 3842 Register RHS = MI.getOperand(2).getReg(); 3843 3844 uint16_t Flags = MI.getFlags(); 3845 3846 LLT S16 = LLT::scalar(16); 3847 LLT S32 = LLT::scalar(32); 3848 3849 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3850 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3851 3852 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3853 .addUse(RHSExt.getReg(0)) 3854 .setMIFlags(Flags); 3855 3856 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3857 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3858 3859 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3860 .addUse(RDst.getReg(0)) 3861 .addUse(RHS) 3862 .addUse(LHS) 3863 .setMIFlags(Flags); 3864 3865 MI.eraseFromParent(); 3866 return true; 3867 } 3868 3869 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3870 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3871 static void toggleSPDenormMode(bool Enable, 3872 MachineIRBuilder &B, 3873 const GCNSubtarget &ST, 3874 AMDGPU::SIModeRegisterDefaults Mode) { 3875 // Set SP denorm mode to this value. 3876 unsigned SPDenormMode = 3877 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3878 3879 if (ST.hasDenormModeInst()) { 3880 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 
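// The 4-bit S_DENORM_MODE immediate assembled below keeps the FP32 denorm
// control in bits [1:0] and the FP64/FP16 control in bits [3:2]. For example
// (a worked case, assuming FP_DENORM_FLUSH_NONE == 3): enabling FP32
// denormals while the default FP64/FP16 value is also FP_DENORM_FLUSH_NONE
// yields an immediate of 0b1111.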
3881 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3882 3883 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3884 B.buildInstr(AMDGPU::S_DENORM_MODE) 3885 .addImm(NewDenormModeValue); 3886 3887 } else { 3888 // Select FP32 bit field in mode register. 3889 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3890 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3891 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3892 3893 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3894 .addImm(SPDenormMode) 3895 .addImm(SPDenormModeBitField); 3896 } 3897 } 3898 3899 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3900 MachineRegisterInfo &MRI, 3901 MachineIRBuilder &B) const { 3902 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3903 return true; 3904 3905 Register Res = MI.getOperand(0).getReg(); 3906 Register LHS = MI.getOperand(1).getReg(); 3907 Register RHS = MI.getOperand(2).getReg(); 3908 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3909 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3910 3911 uint16_t Flags = MI.getFlags(); 3912 3913 LLT S32 = LLT::scalar(32); 3914 LLT S1 = LLT::scalar(1); 3915 3916 auto One = B.buildFConstant(S32, 1.0f); 3917 3918 auto DenominatorScaled = 3919 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3920 .addUse(LHS) 3921 .addUse(RHS) 3922 .addImm(0) 3923 .setMIFlags(Flags); 3924 auto NumeratorScaled = 3925 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3926 .addUse(LHS) 3927 .addUse(RHS) 3928 .addImm(1) 3929 .setMIFlags(Flags); 3930 3931 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3932 .addUse(DenominatorScaled.getReg(0)) 3933 .setMIFlags(Flags); 3934 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3935 3936 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3937 // aren't modeled as reading it. 
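// In scalar terms, the FMA chain below refines the reciprocal and quotient as
// follows (a sketch; 'rcp' is the hardware approximation, num/den are the
// div_scale results built above):
//
//   r0 = rcp(den);
//   e0 = 1.0 - den * r0;   r1 = r0 + e0 * r0;   // Fma0, Fma1
//   q0 = num * r1;                              // Mul
//   e1 = num - den * q0;   q1 = q0 + e1 * r1;   // Fma2, Fma3
//   e2 = num - den * q1;                        // Fma4
//
// div_fmas then applies the final fused step (e2 * r1 + q1, with extra
// exponent scaling driven by the div_scale condition bit), and div_fixup
// restores the result for the original, unscaled operands.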
3938 if (!Mode.allFP32Denormals()) 3939 toggleSPDenormMode(true, B, ST, Mode); 3940 3941 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3942 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3943 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3944 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3945 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3946 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3947 3948 if (!Mode.allFP32Denormals()) 3949 toggleSPDenormMode(false, B, ST, Mode); 3950 3951 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3952 .addUse(Fma4.getReg(0)) 3953 .addUse(Fma1.getReg(0)) 3954 .addUse(Fma3.getReg(0)) 3955 .addUse(NumeratorScaled.getReg(1)) 3956 .setMIFlags(Flags); 3957 3958 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3959 .addUse(Fmas.getReg(0)) 3960 .addUse(RHS) 3961 .addUse(LHS) 3962 .setMIFlags(Flags); 3963 3964 MI.eraseFromParent(); 3965 return true; 3966 } 3967 3968 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3969 MachineRegisterInfo &MRI, 3970 MachineIRBuilder &B) const { 3971 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3972 return true; 3973 3974 Register Res = MI.getOperand(0).getReg(); 3975 Register LHS = MI.getOperand(1).getReg(); 3976 Register RHS = MI.getOperand(2).getReg(); 3977 3978 uint16_t Flags = MI.getFlags(); 3979 3980 LLT S64 = LLT::scalar(64); 3981 LLT S1 = LLT::scalar(1); 3982 3983 auto One = B.buildFConstant(S64, 1.0); 3984 3985 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3986 .addUse(LHS) 3987 .addUse(RHS) 3988 .addImm(0) 3989 .setMIFlags(Flags); 3990 3991 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3992 3993 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3994 .addUse(DivScale0.getReg(0)) 3995 .setMIFlags(Flags); 3996 3997 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3998 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3999 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 4000 4001 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 4002 .addUse(LHS) 4003 .addUse(RHS) 4004 .addImm(1) 4005 .setMIFlags(Flags); 4006 4007 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 4008 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 4009 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 4010 4011 Register Scale; 4012 if (!ST.hasUsableDivScaleConditionOutput()) { 4013 // Workaround a hardware bug on SI where the condition output from div_scale 4014 // is not usable. 
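// In scalar terms the workaround recomputes the missing condition bit from
// the high halves only (a sketch of the compares built below):
//
//   Scale = (hi32(LHS) == hi32(DivScale1)) ^ (hi32(RHS) == hi32(DivScale0))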
4015 4016 LLT S32 = LLT::scalar(32); 4017 4018 auto NumUnmerge = B.buildUnmerge(S32, LHS); 4019 auto DenUnmerge = B.buildUnmerge(S32, RHS); 4020 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 4021 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 4022 4023 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 4024 Scale1Unmerge.getReg(1)); 4025 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 4026 Scale0Unmerge.getReg(1)); 4027 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 4028 } else { 4029 Scale = DivScale1.getReg(1); 4030 } 4031 4032 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 4033 .addUse(Fma4.getReg(0)) 4034 .addUse(Fma3.getReg(0)) 4035 .addUse(Mul.getReg(0)) 4036 .addUse(Scale) 4037 .setMIFlags(Flags); 4038 4039 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 4040 .addUse(Fmas.getReg(0)) 4041 .addUse(RHS) 4042 .addUse(LHS) 4043 .setMIFlags(Flags); 4044 4045 MI.eraseFromParent(); 4046 return true; 4047 } 4048 4049 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 4050 MachineRegisterInfo &MRI, 4051 MachineIRBuilder &B) const { 4052 Register Res = MI.getOperand(0).getReg(); 4053 Register LHS = MI.getOperand(2).getReg(); 4054 Register RHS = MI.getOperand(3).getReg(); 4055 uint16_t Flags = MI.getFlags(); 4056 4057 LLT S32 = LLT::scalar(32); 4058 LLT S1 = LLT::scalar(1); 4059 4060 auto Abs = B.buildFAbs(S32, RHS, Flags); 4061 const APFloat C0Val(1.0f); 4062 4063 auto C0 = B.buildConstant(S32, 0x6f800000); 4064 auto C1 = B.buildConstant(S32, 0x2f800000); 4065 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 4066 4067 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 4068 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 4069 4070 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 4071 4072 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 4073 .addUse(Mul0.getReg(0)) 4074 .setMIFlags(Flags); 4075 4076 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 4077 4078 B.buildFMul(Res, Sel, Mul1, Flags); 4079 4080 MI.eraseFromParent(); 4081 return true; 4082 } 4083 4084 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 4085 // FIXME: Why do we handle this one but not other removed instructions? 4086 // 4087 // Reciprocal square root. The clamp prevents infinite results, clamping 4088 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 4089 // +-max_float. 4090 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 4091 MachineRegisterInfo &MRI, 4092 MachineIRBuilder &B) const { 4093 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 4094 return true; 4095 4096 Register Dst = MI.getOperand(0).getReg(); 4097 Register Src = MI.getOperand(2).getReg(); 4098 auto Flags = MI.getFlags(); 4099 4100 LLT Ty = MRI.getType(Dst); 4101 4102 const fltSemantics *FltSemantics; 4103 if (Ty == LLT::scalar(32)) 4104 FltSemantics = &APFloat::IEEEsingle(); 4105 else if (Ty == LLT::scalar(64)) 4106 FltSemantics = &APFloat::IEEEdouble(); 4107 else 4108 return false; 4109 4110 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 4111 .addUse(Src) 4112 .setMIFlags(Flags); 4113 4114 // We don't need to concern ourselves with the snan handling difference, since 4115 // the rsq quieted (or not) so use the one which will directly select. 
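// In scalar terms the clamp emitted below is (a sketch; max_float is the
// largest finite value of the result type):
//
//   clamp(x) = fmax(fmin(rsq(x), +max_float), -max_float)
//
// using the IEEE or non-IEEE min/max variants to match the function's FP
// mode.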
4116 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4117 const bool UseIEEE = MFI->getMode().IEEE; 4118 4119 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 4120 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 4121 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 4122 4123 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 4124 4125 if (UseIEEE) 4126 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 4127 else 4128 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 4129 MI.eraseFromParent(); 4130 return true; 4131 } 4132 4133 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 4134 switch (IID) { 4135 case Intrinsic::amdgcn_ds_fadd: 4136 return AMDGPU::G_ATOMICRMW_FADD; 4137 case Intrinsic::amdgcn_ds_fmin: 4138 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 4139 case Intrinsic::amdgcn_ds_fmax: 4140 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 4141 default: 4142 llvm_unreachable("not a DS FP intrinsic"); 4143 } 4144 } 4145 4146 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 4147 MachineInstr &MI, 4148 Intrinsic::ID IID) const { 4149 GISelChangeObserver &Observer = Helper.Observer; 4150 Observer.changingInstr(MI); 4151 4152 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 4153 4154 // The remaining operands were used to set fields in the MemOperand on 4155 // construction. 4156 for (int I = 6; I > 3; --I) 4157 MI.removeOperand(I); 4158 4159 MI.removeOperand(1); // Remove the intrinsic ID. 4160 Observer.changedInstr(MI); 4161 return true; 4162 } 4163 4164 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 4165 MachineRegisterInfo &MRI, 4166 MachineIRBuilder &B) const { 4167 uint64_t Offset = 4168 ST.getTargetLowering()->getImplicitParameterOffset( 4169 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 4170 LLT DstTy = MRI.getType(DstReg); 4171 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 4172 4173 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 4174 if (!loadInputValue(KernargPtrReg, B, 4175 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 4176 return false; 4177 4178 // FIXME: This should be nuw 4179 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 4180 return true; 4181 } 4182 4183 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 4184 MachineRegisterInfo &MRI, 4185 MachineIRBuilder &B) const { 4186 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4187 if (!MFI->isEntryFunction()) { 4188 return legalizePreloadedArgIntrin(MI, MRI, B, 4189 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 4190 } 4191 4192 Register DstReg = MI.getOperand(0).getReg(); 4193 if (!getImplicitArgPtr(DstReg, MRI, B)) 4194 return false; 4195 4196 MI.eraseFromParent(); 4197 return true; 4198 } 4199 4200 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 4201 MachineRegisterInfo &MRI, 4202 MachineIRBuilder &B, 4203 unsigned AddrSpace) const { 4204 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 4205 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 4206 Register Hi32 = Unmerge.getReg(1); 4207 4208 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 4209 MI.eraseFromParent(); 4210 return true; 4211 } 4212 4213 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 4214 // offset (the offset that is included in bounds checking and swizzling, to be 4215 // split between the instruction's voffset and immoffset fields) 
and soffset 4216 // (the offset that is excluded from bounds checking and swizzling, to go in 4217 // the instruction's soffset field). This function takes the first kind of 4218 // offset and figures out how to split it between voffset and immoffset. 4219 std::pair<Register, unsigned> 4220 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 4221 Register OrigOffset) const { 4222 const unsigned MaxImm = 4095; 4223 Register BaseReg; 4224 unsigned ImmOffset; 4225 const LLT S32 = LLT::scalar(32); 4226 MachineRegisterInfo &MRI = *B.getMRI(); 4227 4228 std::tie(BaseReg, ImmOffset) = 4229 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 4230 4231 // If BaseReg is a pointer, convert it to int. 4232 if (MRI.getType(BaseReg).isPointer()) 4233 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 4234 4235 // If the immediate value is too big for the immoffset field, put the value 4236 // and -4096 into the immoffset field so that the value that is copied/added 4237 // for the voffset field is a multiple of 4096, and it stands more chance 4238 // of being CSEd with the copy/add for another similar load/store. 4239 // However, do not do that rounding down to a multiple of 4096 if that is a 4240 // negative number, as it appears to be illegal to have a negative offset 4241 // in the vgpr, even if adding the immediate offset makes it positive. 4242 unsigned Overflow = ImmOffset & ~MaxImm; 4243 ImmOffset -= Overflow; 4244 if ((int32_t)Overflow < 0) { 4245 Overflow += ImmOffset; 4246 ImmOffset = 0; 4247 } 4248 4249 if (Overflow != 0) { 4250 if (!BaseReg) { 4251 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 4252 } else { 4253 auto OverflowVal = B.buildConstant(S32, Overflow); 4254 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 4255 } 4256 } 4257 4258 if (!BaseReg) 4259 BaseReg = B.buildConstant(S32, 0).getReg(0); 4260 4261 return std::make_pair(BaseReg, ImmOffset); 4262 } 4263 4264 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. 4265 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, 4266 Register VOffset, Register SOffset, 4267 unsigned ImmOffset, Register VIndex, 4268 MachineRegisterInfo &MRI) const { 4269 Optional<ValueAndVReg> MaybeVOffsetVal = 4270 getIConstantVRegValWithLookThrough(VOffset, MRI); 4271 Optional<ValueAndVReg> MaybeSOffsetVal = 4272 getIConstantVRegValWithLookThrough(SOffset, MRI); 4273 Optional<ValueAndVReg> MaybeVIndexVal = 4274 getIConstantVRegValWithLookThrough(VIndex, MRI); 4275 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, 4276 // update the MMO with that offset. The stride is unknown so we can only do 4277 // this if VIndex is constant 0. 4278 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && 4279 MaybeVIndexVal->Value == 0) { 4280 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + 4281 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; 4282 MMO->setOffset(TotalOffset); 4283 } else { 4284 // We don't have a constant combined offset to use in the MMO. Give up. 4285 MMO->setValue((Value *)nullptr); 4286 } 4287 } 4288 4289 /// Handle register layout difference for f16 images for some subtargets. 
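/// For example (a sketch of the cases handled below): with unpacked D16 VMEM,
/// a <4 x s16> source becomes a <4 x s32> build_vector of any-extended
/// elements; with the image-store D16 bug, a <3 x s16> source is padded with
/// undef to <6 x s16> and bitcast to <3 x s32>; otherwise a <3 x s16> source
/// is only padded with undef elements to the legal <4 x s16>.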
4290 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 4291 MachineRegisterInfo &MRI, 4292 Register Reg, 4293 bool ImageStore) const { 4294 const LLT S16 = LLT::scalar(16); 4295 const LLT S32 = LLT::scalar(32); 4296 LLT StoreVT = MRI.getType(Reg); 4297 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 4298 4299 if (ST.hasUnpackedD16VMem()) { 4300 auto Unmerge = B.buildUnmerge(S16, Reg); 4301 4302 SmallVector<Register, 4> WideRegs; 4303 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 4304 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 4305 4306 int NumElts = StoreVT.getNumElements(); 4307 4308 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 4309 .getReg(0); 4310 } 4311 4312 if (ImageStore && ST.hasImageStoreD16Bug()) { 4313 if (StoreVT.getNumElements() == 2) { 4314 SmallVector<Register, 4> PackedRegs; 4315 Reg = B.buildBitcast(S32, Reg).getReg(0); 4316 PackedRegs.push_back(Reg); 4317 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 4318 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 4319 .getReg(0); 4320 } 4321 4322 if (StoreVT.getNumElements() == 3) { 4323 SmallVector<Register, 4> PackedRegs; 4324 auto Unmerge = B.buildUnmerge(S16, Reg); 4325 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 4326 PackedRegs.push_back(Unmerge.getReg(I)); 4327 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 4328 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 4329 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 4330 } 4331 4332 if (StoreVT.getNumElements() == 4) { 4333 SmallVector<Register, 4> PackedRegs; 4334 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 4335 auto Unmerge = B.buildUnmerge(S32, Reg); 4336 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 4337 PackedRegs.push_back(Unmerge.getReg(I)); 4338 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 4339 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 4340 .getReg(0); 4341 } 4342 4343 llvm_unreachable("invalid data type"); 4344 } 4345 4346 if (StoreVT == LLT::fixed_vector(3, S16)) { 4347 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 4348 .getReg(0); 4349 } 4350 return Reg; 4351 } 4352 4353 Register AMDGPULegalizerInfo::fixStoreSourceType( 4354 MachineIRBuilder &B, Register VData, bool IsFormat) const { 4355 MachineRegisterInfo *MRI = B.getMRI(); 4356 LLT Ty = MRI->getType(VData); 4357 4358 const LLT S16 = LLT::scalar(16); 4359 4360 // Fixup illegal register types for i8 stores. 
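// e.g. an s8 or s16 vdata value is any-extended to s32 below; the byte/short
// store opcode chosen later from the memory size still writes only the low
// bits, so the extension only legalizes the register type.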
4361 if (Ty == LLT::scalar(8) || Ty == S16) { 4362 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 4363 return AnyExt; 4364 } 4365 4366 if (Ty.isVector()) { 4367 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 4368 if (IsFormat) 4369 return handleD16VData(B, *MRI, VData); 4370 } 4371 } 4372 4373 return VData; 4374 } 4375 4376 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 4377 MachineRegisterInfo &MRI, 4378 MachineIRBuilder &B, 4379 bool IsTyped, 4380 bool IsFormat) const { 4381 Register VData = MI.getOperand(1).getReg(); 4382 LLT Ty = MRI.getType(VData); 4383 LLT EltTy = Ty.getScalarType(); 4384 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 4385 const LLT S32 = LLT::scalar(32); 4386 4387 VData = fixStoreSourceType(B, VData, IsFormat); 4388 Register RSrc = MI.getOperand(2).getReg(); 4389 4390 MachineMemOperand *MMO = *MI.memoperands_begin(); 4391 const int MemSize = MMO->getSize(); 4392 4393 unsigned ImmOffset; 4394 4395 // The typed intrinsics add an immediate after the registers. 4396 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 4397 4398 // The struct intrinsic variants add one additional operand over raw. 4399 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4400 Register VIndex; 4401 int OpOffset = 0; 4402 if (HasVIndex) { 4403 VIndex = MI.getOperand(3).getReg(); 4404 OpOffset = 1; 4405 } else { 4406 VIndex = B.buildConstant(S32, 0).getReg(0); 4407 } 4408 4409 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 4410 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 4411 4412 unsigned Format = 0; 4413 if (IsTyped) { 4414 Format = MI.getOperand(5 + OpOffset).getImm(); 4415 ++OpOffset; 4416 } 4417 4418 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 4419 4420 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4421 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 4422 4423 unsigned Opc; 4424 if (IsTyped) { 4425 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 4426 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 4427 } else if (IsFormat) { 4428 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 4429 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 4430 } else { 4431 switch (MemSize) { 4432 case 1: 4433 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 4434 break; 4435 case 2: 4436 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 4437 break; 4438 default: 4439 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 4440 break; 4441 } 4442 } 4443 4444 auto MIB = B.buildInstr(Opc) 4445 .addUse(VData) // vdata 4446 .addUse(RSrc) // rsrc 4447 .addUse(VIndex) // vindex 4448 .addUse(VOffset) // voffset 4449 .addUse(SOffset) // soffset 4450 .addImm(ImmOffset); // offset(imm) 4451 4452 if (IsTyped) 4453 MIB.addImm(Format); 4454 4455 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4456 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4457 .addMemOperand(MMO); 4458 4459 MI.eraseFromParent(); 4460 return true; 4461 } 4462 4463 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 4464 MachineRegisterInfo &MRI, 4465 MachineIRBuilder &B, 4466 bool IsFormat, 4467 bool IsTyped) const { 4468 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 4469 MachineMemOperand *MMO = *MI.memoperands_begin(); 4470 const LLT MemTy = MMO->getMemoryType(); 4471 const LLT S32 = LLT::scalar(32); 4472 4473 Register Dst = MI.getOperand(0).getReg(); 4474 Register RSrc = MI.getOperand(2).getReg(); 4475 4476 // The typed intrinsics add an immediate after the registers. 
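// Operand layout at this point, counting the intrinsic ID (a sketch):
//   raw:    dst, id, rsrc, voffset, soffset, [format,] aux         -> 6 (7 typed)
//   struct: dst, id, rsrc, vindex, voffset, soffset, [format,] aux -> 7 (8 typed)
// so comparing getNumOperands() against NumVIndexOps identifies the struct
// variants, and OpOffset keeps the later operand indices in sync.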
4477 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 4478 4479 // The struct intrinsic variants add one additional operand over raw. 4480 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4481 Register VIndex; 4482 int OpOffset = 0; 4483 if (HasVIndex) { 4484 VIndex = MI.getOperand(3).getReg(); 4485 OpOffset = 1; 4486 } else { 4487 VIndex = B.buildConstant(S32, 0).getReg(0); 4488 } 4489 4490 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 4491 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 4492 4493 unsigned Format = 0; 4494 if (IsTyped) { 4495 Format = MI.getOperand(5 + OpOffset).getImm(); 4496 ++OpOffset; 4497 } 4498 4499 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 4500 unsigned ImmOffset; 4501 4502 LLT Ty = MRI.getType(Dst); 4503 LLT EltTy = Ty.getScalarType(); 4504 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 4505 const bool Unpacked = ST.hasUnpackedD16VMem(); 4506 4507 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4508 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 4509 4510 unsigned Opc; 4511 4512 if (IsTyped) { 4513 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 4514 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 4515 } else if (IsFormat) { 4516 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 4517 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 4518 } else { 4519 switch (MemTy.getSizeInBits()) { 4520 case 8: 4521 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 4522 break; 4523 case 16: 4524 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 4525 break; 4526 default: 4527 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 4528 break; 4529 } 4530 } 4531 4532 Register LoadDstReg; 4533 4534 bool IsExtLoad = 4535 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector()); 4536 LLT UnpackedTy = Ty.changeElementSize(32); 4537 4538 if (IsExtLoad) 4539 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 4540 else if (Unpacked && IsD16 && Ty.isVector()) 4541 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 4542 else 4543 LoadDstReg = Dst; 4544 4545 auto MIB = B.buildInstr(Opc) 4546 .addDef(LoadDstReg) // vdata 4547 .addUse(RSrc) // rsrc 4548 .addUse(VIndex) // vindex 4549 .addUse(VOffset) // voffset 4550 .addUse(SOffset) // soffset 4551 .addImm(ImmOffset); // offset(imm) 4552 4553 if (IsTyped) 4554 MIB.addImm(Format); 4555 4556 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4557 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4558 .addMemOperand(MMO); 4559 4560 if (LoadDstReg != Dst) { 4561 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 4562 4563 // Widen result for extending loads was widened. 4564 if (IsExtLoad) 4565 B.buildTrunc(Dst, LoadDstReg); 4566 else { 4567 // Repack to original 16-bit vector result 4568 // FIXME: G_TRUNC should work, but legalization currently fails 4569 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 4570 SmallVector<Register, 4> Repack; 4571 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 4572 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 4573 B.buildMerge(Dst, Repack); 4574 } 4575 } 4576 4577 MI.eraseFromParent(); 4578 return true; 4579 } 4580 4581 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 4582 MachineIRBuilder &B, 4583 bool IsInc) const { 4584 unsigned Opc = IsInc ? 
AMDGPU::G_AMDGPU_ATOMIC_INC : 4585 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4586 B.buildInstr(Opc) 4587 .addDef(MI.getOperand(0).getReg()) 4588 .addUse(MI.getOperand(2).getReg()) 4589 .addUse(MI.getOperand(3).getReg()) 4590 .cloneMemRefs(MI); 4591 MI.eraseFromParent(); 4592 return true; 4593 } 4594 4595 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4596 switch (IntrID) { 4597 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4598 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4599 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4600 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4601 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4602 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4603 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4604 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4605 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4606 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4607 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4608 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4609 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4610 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4611 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4612 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4613 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4614 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4615 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4616 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4617 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4618 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4619 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4620 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4621 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4622 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4623 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4624 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4625 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4626 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4627 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4628 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4629 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4630 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4631 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4632 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4633 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4634 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4635 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4636 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4637 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4638 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4639 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4640 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4641 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4642 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4643 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4644 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4645 default: 4646 llvm_unreachable("unhandled atomic opcode"); 4647 } 4648 } 4649 4650 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4651 MachineIRBuilder &B, 4652 Intrinsic::ID IID) const { 4653 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4654 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4655 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4656 4657 Register Dst; 4658 4659 int OpOffset = 0; 4660 if (HasReturn) { 4661 // A few FP atomics do not support return values. 
4662 Dst = MI.getOperand(0).getReg(); 4663 } else { 4664 OpOffset = -1; 4665 } 4666 4667 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4668 Register CmpVal; 4669 4670 if (IsCmpSwap) { 4671 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4672 ++OpOffset; 4673 } 4674 4675 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4676 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4677 4678 // The struct intrinsic variants add one additional operand over raw. 4679 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4680 Register VIndex; 4681 if (HasVIndex) { 4682 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4683 ++OpOffset; 4684 } else { 4685 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4686 } 4687 4688 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4689 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4690 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4691 4692 MachineMemOperand *MMO = *MI.memoperands_begin(); 4693 4694 unsigned ImmOffset; 4695 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4696 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4697 4698 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4699 4700 if (HasReturn) 4701 MIB.addDef(Dst); 4702 4703 MIB.addUse(VData); // vdata 4704 4705 if (IsCmpSwap) 4706 MIB.addReg(CmpVal); 4707 4708 MIB.addUse(RSrc) // rsrc 4709 .addUse(VIndex) // vindex 4710 .addUse(VOffset) // voffset 4711 .addUse(SOffset) // soffset 4712 .addImm(ImmOffset) // offset(imm) 4713 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4714 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4715 .addMemOperand(MMO); 4716 4717 MI.eraseFromParent(); 4718 return true; 4719 } 4720 4721 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4722 /// vector with s16 typed elements. 4723 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4724 SmallVectorImpl<Register> &PackedAddrs, 4725 unsigned ArgOffset, 4726 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4727 bool IsA16, bool IsG16) { 4728 const LLT S16 = LLT::scalar(16); 4729 const LLT V2S16 = LLT::fixed_vector(2, 16); 4730 auto EndIdx = Intr->VAddrEnd; 4731 4732 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4733 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4734 if (!SrcOp.isReg()) 4735 continue; // _L to _LZ may have eliminated this. 4736 4737 Register AddrReg = SrcOp.getReg(); 4738 4739 if ((I < Intr->GradientStart) || 4740 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4741 (I >= Intr->CoordStart && !IsA16)) { 4742 if ((I < Intr->GradientStart) && IsA16 && 4743 (B.getMRI()->getType(AddrReg) == S16)) { 4744 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument"); 4745 // Special handling of bias when A16 is on. Bias is of type half but 4746 // occupies full 32-bit. 4747 PackedAddrs.push_back( 4748 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4749 .getReg(0)); 4750 } else { 4751 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && 4752 "Bias needs to be converted to 16 bit in A16 mode"); 4753 // Handle any gradient or coordinate operands that should not be packed 4754 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4755 PackedAddrs.push_back(AddrReg); 4756 } 4757 } else { 4758 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4759 // derivatives dx/dh and dx/dv are packed with undef. 
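// For example (a sketch): a 2D A16 sample packs its (u, v) coordinates into
// one v2s16 here, while a 1D G16 sample packs each of dx/dh and dx/dv with an
// undef high half since each derivative has an odd number of components.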
4760 if (((I + 1) >= EndIdx) || 4761 ((Intr->NumGradients / 2) % 2 == 1 && 4762 (I == static_cast<unsigned>(Intr->GradientStart + 4763 (Intr->NumGradients / 2) - 1) || 4764 I == static_cast<unsigned>(Intr->GradientStart + 4765 Intr->NumGradients - 1))) || 4766 // Check for _L to _LZ optimization 4767 !MI.getOperand(ArgOffset + I + 1).isReg()) { 4768 PackedAddrs.push_back( 4769 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4770 .getReg(0)); 4771 } else { 4772 PackedAddrs.push_back( 4773 B.buildBuildVector( 4774 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 4775 .getReg(0)); 4776 ++I; 4777 } 4778 } 4779 } 4780 } 4781 4782 /// Convert from separate vaddr components to a single vector address register, 4783 /// and replace the remaining operands with $noreg. 4784 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 4785 int DimIdx, int NumVAddrs) { 4786 const LLT S32 = LLT::scalar(32); 4787 4788 SmallVector<Register, 8> AddrRegs; 4789 for (int I = 0; I != NumVAddrs; ++I) { 4790 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4791 if (SrcOp.isReg()) { 4792 AddrRegs.push_back(SrcOp.getReg()); 4793 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 4794 } 4795 } 4796 4797 int NumAddrRegs = AddrRegs.size(); 4798 if (NumAddrRegs != 1) { 4799 // Above 8 elements round up to next power of 2 (i.e. 16). 4800 if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) { 4801 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 4802 auto Undef = B.buildUndef(S32); 4803 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 4804 NumAddrRegs = RoundedNumRegs; 4805 } 4806 4807 auto VAddr = 4808 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs); 4809 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 4810 } 4811 4812 for (int I = 1; I != NumVAddrs; ++I) { 4813 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4814 if (SrcOp.isReg()) 4815 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 4816 } 4817 } 4818 4819 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 4820 /// 4821 /// Depending on the subtarget, load/store with 16-bit element data need to be 4822 /// rewritten to use the low half of 32-bit registers, or directly use a packed 4823 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 4824 /// registers. 4825 /// 4826 /// We don't want to directly select image instructions just yet, but also want 4827 /// to exposes all register repacking to the legalizer/combiners. We also don't 4828 /// want a selected instruction entering RegBankSelect. In order to avoid 4829 /// defining a multitude of intermediate image instructions, directly hack on 4830 /// the intrinsic's arguments. In cases like a16 addresses, this requires 4831 /// padding now unnecessary arguments with $noreg. 4832 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4833 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4834 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4835 4836 const unsigned NumDefs = MI.getNumExplicitDefs(); 4837 const unsigned ArgOffset = NumDefs + 1; 4838 bool IsTFE = NumDefs == 2; 4839 // We are only processing the operands of d16 image operations on subtargets 4840 // that use the unpacked register layout, or need to repack the TFE result. 4841 4842 // TODO: Do we need to guard against already legalized intrinsics? 
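// For the dmask handling below, DMaskLanes is a population count of the
// enabled channels; e.g. (a worked case) dmask = 0b0101 enables two channels,
// so DMaskLanes = 2, gather4 always implies 4, and a dmask of 0 on a plain
// load folds to an undef result.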
4843 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4844 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4845 4846 MachineRegisterInfo *MRI = B.getMRI(); 4847 const LLT S32 = LLT::scalar(32); 4848 const LLT S16 = LLT::scalar(16); 4849 const LLT V2S16 = LLT::fixed_vector(2, 16); 4850 4851 unsigned DMask = 0; 4852 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); 4853 LLT Ty = MRI->getType(VData); 4854 4855 // Check for 16 bit addresses and pack if true. 4856 LLT GradTy = 4857 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4858 LLT AddrTy = 4859 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4860 const bool IsG16 = GradTy == S16; 4861 const bool IsA16 = AddrTy == S16; 4862 const bool IsD16 = Ty.getScalarType() == S16; 4863 4864 int DMaskLanes = 0; 4865 if (!BaseOpcode->Atomic) { 4866 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4867 if (BaseOpcode->Gather4) { 4868 DMaskLanes = 4; 4869 } else if (DMask != 0) { 4870 DMaskLanes = countPopulation(DMask); 4871 } else if (!IsTFE && !BaseOpcode->Store) { 4872 // If dmask is 0, this is a no-op load. This can be eliminated. 4873 B.buildUndef(MI.getOperand(0)); 4874 MI.eraseFromParent(); 4875 return true; 4876 } 4877 } 4878 4879 Observer.changingInstr(MI); 4880 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4881 4882 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16 4883 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE; 4884 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 4885 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4886 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode; 4887 4888 // Track that we legalized this 4889 MI.setDesc(B.getTII().get(NewOpcode)); 4890 4891 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4892 // dmask to be at least 1 otherwise the instruction will fail 4893 if (IsTFE && DMask == 0) { 4894 DMask = 0x1; 4895 DMaskLanes = 1; 4896 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4897 } 4898 4899 if (BaseOpcode->Atomic) { 4900 Register VData0 = MI.getOperand(2).getReg(); 4901 LLT Ty = MRI->getType(VData0); 4902 4903 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4904 if (Ty.isVector()) 4905 return false; 4906 4907 if (BaseOpcode->AtomicX2) { 4908 Register VData1 = MI.getOperand(3).getReg(); 4909 // The two values are packed in one register. 4910 LLT PackedTy = LLT::fixed_vector(2, Ty); 4911 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4912 MI.getOperand(2).setReg(Concat.getReg(0)); 4913 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4914 } 4915 } 4916 4917 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4918 4919 // Rewrite the addressing register layout before doing anything else. 
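// For example (a sketch of the packed-address path below): an a16 sample with
// three packed vaddr dwords keeps them as separate registers when NSA is
// available and three fits ST.getNSAMaxSize(); with only two packed dwords
// they are concatenated into a single v4s16 operand and the now-unused vaddr
// slots are set to $noreg.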
4920 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4921 // 16 bit gradients are supported, but are tied to the A16 control 4922 // so both gradients and addresses must be 16 bit 4923 return false; 4924 } 4925 4926 if (IsA16 && !ST.hasA16()) { 4927 // A16 not supported 4928 return false; 4929 } 4930 4931 if (IsA16 || IsG16) { 4932 if (Intr->NumVAddrs > 1) { 4933 SmallVector<Register, 4> PackedRegs; 4934 4935 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4936 IsG16); 4937 4938 // See also below in the non-a16 branch 4939 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 && 4940 PackedRegs.size() <= ST.getNSAMaxSize(); 4941 4942 if (!UseNSA && PackedRegs.size() > 1) { 4943 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 4944 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4945 PackedRegs[0] = Concat.getReg(0); 4946 PackedRegs.resize(1); 4947 } 4948 4949 const unsigned NumPacked = PackedRegs.size(); 4950 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4951 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4952 if (!SrcOp.isReg()) { 4953 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4954 continue; 4955 } 4956 4957 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4958 4959 if (I - Intr->VAddrStart < NumPacked) 4960 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4961 else 4962 SrcOp.setReg(AMDGPU::NoRegister); 4963 } 4964 } 4965 } else { 4966 // If the register allocator cannot place the address registers contiguously 4967 // without introducing moves, then using the non-sequential address encoding 4968 // is always preferable, since it saves VALU instructions and is usually a 4969 // wash in terms of code size or even better. 4970 // 4971 // However, we currently have no way of hinting to the register allocator 4972 // that MIMG addresses should be placed contiguously when it is possible to 4973 // do so, so force non-NSA for the common 2-address case as a heuristic. 4974 // 4975 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4976 // allocation when possible. 4977 // 4978 // TODO: we can actually allow partial NSA where the final register is a 4979 // contiguous set of the remaining addresses. 4980 // This could help where there are more addresses than supported. 4981 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && 4982 CorrectedNumVAddrs <= ST.getNSAMaxSize(); 4983 4984 if (!UseNSA && Intr->NumVAddrs > 1) 4985 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4986 Intr->NumVAddrs); 4987 } 4988 4989 int Flags = 0; 4990 if (IsA16) 4991 Flags |= 1; 4992 if (IsG16) 4993 Flags |= 2; 4994 MI.addOperand(MachineOperand::CreateImm(Flags)); 4995 4996 if (BaseOpcode->Store) { // No TFE for stores? 4997 // TODO: Handle dmask trim 4998 if (!Ty.isVector() || !IsD16) 4999 return true; 5000 5001 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 5002 if (RepackedReg != VData) { 5003 MI.getOperand(1).setReg(RepackedReg); 5004 } 5005 5006 return true; 5007 } 5008 5009 Register DstReg = MI.getOperand(0).getReg(); 5010 const LLT EltTy = Ty.getScalarType(); 5011 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 5012 5013 // Confirm that the return type is large enough for the dmask specified 5014 if (NumElts < DMaskLanes) 5015 return false; 5016 5017 if (NumElts > 4 || DMaskLanes > 4) 5018 return false; 5019 5020 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 
1 : DMaskLanes; 5021 const LLT AdjustedTy = 5022 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); 5023 5024 // The raw dword aligned data component of the load. The only legal cases 5025 // where this matters should be when using the packed D16 format, for 5026 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>. 5027 LLT RoundedTy; 5028 5029 // S32 vector to cover all data, plus TFE result element. 5030 LLT TFETy; 5031 5032 // Register type to use for each loaded component. Will be S32 or V2S16. 5033 LLT RegTy; 5034 5035 if (IsD16 && ST.hasUnpackedD16VMem()) { 5036 RoundedTy = 5037 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); 5038 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 5039 RegTy = S32; 5040 } else { 5041 unsigned EltSize = EltTy.getSizeInBits(); 5042 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 5043 unsigned RoundedSize = 32 * RoundedElts; 5044 RoundedTy = LLT::scalarOrVector( 5045 ElementCount::getFixed(RoundedSize / EltSize), EltSize); 5046 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 5047 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 5048 } 5049 5050 // The return type does not need adjustment. 5051 // TODO: Should we change s16 case to s32 or <2 x s16>? 5052 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 5053 return true; 5054 5055 Register Dst1Reg; 5056 5057 // Insert after the instruction. 5058 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 5059 5060 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 5061 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 5062 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 5063 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 5064 5065 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 5066 5067 MI.getOperand(0).setReg(NewResultReg); 5068 5069 // In the IR, TFE is supposed to be used with a 2 element struct return 5070 // type. The instruction really returns these two values in one contiguous 5071 // register, with one additional dword beyond the loaded data. Rewrite the 5072 // return type to use a single register result. 5073 5074 if (IsTFE) { 5075 Dst1Reg = MI.getOperand(1).getReg(); 5076 if (MRI->getType(Dst1Reg) != S32) 5077 return false; 5078 5079 // TODO: Make sure the TFE operand bit is set. 5080 MI.removeOperand(1); 5081 5082 // Handle the easy case that requires no repack instructions. 5083 if (Ty == S32) { 5084 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 5085 return true; 5086 } 5087 } 5088 5089 // Now figure out how to copy the new result register back into the old 5090 // result. 5091 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 5092 5093 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 5094 5095 if (ResultNumRegs == 1) { 5096 assert(!IsTFE); 5097 ResultRegs[0] = NewResultReg; 5098 } else { 5099 // We have to repack into a new vector of some kind. 5100 for (int I = 0; I != NumDataRegs; ++I) 5101 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 5102 B.buildUnmerge(ResultRegs, NewResultReg); 5103 5104 // Drop the final TFE element to get the data part. The TFE result is 5105 // directly written to the right place already. 5106 if (IsTFE) 5107 ResultRegs.resize(NumDataRegs); 5108 } 5109 5110 // For an s16 scalar result, we form an s32 result with a truncate regardless 5111 // of packed vs. unpacked.
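  // For example (illustrative): an image load whose IR result is s16 gets an
  // s32-based result register here, and the G_TRUNC emitted below rebuilds
  // the original s16 destination.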
5112 if (IsD16 && !Ty.isVector()) { 5113 B.buildTrunc(DstReg, ResultRegs[0]); 5114 return true; 5115 } 5116 5117 // Avoid a build/concat_vector of 1 entry. 5118 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 5119 B.buildBitcast(DstReg, ResultRegs[0]); 5120 return true; 5121 } 5122 5123 assert(Ty.isVector()); 5124 5125 if (IsD16) { 5126 // For packed D16 results with TFE enabled, all the data components are 5127 // S32. Cast back to the expected type. 5128 // 5129 // TODO: We don't really need to use load s32 elements. We would only need one 5130 // cast for the TFE result if a multiple of v2s16 was used. 5131 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 5132 for (Register &Reg : ResultRegs) 5133 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 5134 } else if (ST.hasUnpackedD16VMem()) { 5135 for (Register &Reg : ResultRegs) 5136 Reg = B.buildTrunc(S16, Reg).getReg(0); 5137 } 5138 } 5139 5140 auto padWithUndef = [&](LLT Ty, int NumElts) { 5141 if (NumElts == 0) 5142 return; 5143 Register Undef = B.buildUndef(Ty).getReg(0); 5144 for (int I = 0; I != NumElts; ++I) 5145 ResultRegs.push_back(Undef); 5146 }; 5147 5148 // Pad out any elements eliminated due to the dmask. 5149 LLT ResTy = MRI->getType(ResultRegs[0]); 5150 if (!ResTy.isVector()) { 5151 padWithUndef(ResTy, NumElts - ResultRegs.size()); 5152 B.buildBuildVector(DstReg, ResultRegs); 5153 return true; 5154 } 5155 5156 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 5157 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 5158 5159 // Deal with the one annoying legal case. 5160 const LLT V3S16 = LLT::fixed_vector(3, 16); 5161 if (Ty == V3S16) { 5162 if (IsTFE) { 5163 if (ResultRegs.size() == 1) { 5164 NewResultReg = ResultRegs[0]; 5165 } else if (ResultRegs.size() == 2) { 5166 LLT V4S16 = LLT::fixed_vector(4, 16); 5167 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); 5168 } else { 5169 return false; 5170 } 5171 } 5172 5173 if (MRI->getType(DstReg).getNumElements() < 5174 MRI->getType(NewResultReg).getNumElements()) { 5175 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); 5176 } else { 5177 B.buildPadVectorWithUndefElements(DstReg, NewResultReg); 5178 } 5179 return true; 5180 } 5181 5182 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 5183 B.buildConcatVectors(DstReg, ResultRegs); 5184 return true; 5185 } 5186 5187 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 5188 LegalizerHelper &Helper, MachineInstr &MI) const { 5189 MachineIRBuilder &B = Helper.MIRBuilder; 5190 GISelChangeObserver &Observer = Helper.Observer; 5191 5192 Register Dst = MI.getOperand(0).getReg(); 5193 LLT Ty = B.getMRI()->getType(Dst); 5194 unsigned Size = Ty.getSizeInBits(); 5195 MachineFunction &MF = B.getMF(); 5196 5197 Observer.changingInstr(MI); 5198 5199 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 5200 Ty = getBitcastRegisterType(Ty); 5201 Helper.bitcastDst(MI, Ty, 0); 5202 Dst = MI.getOperand(0).getReg(); 5203 B.setInsertPt(B.getMBB(), MI); 5204 } 5205 5206 // FIXME: We don't really need this intermediate instruction. The intrinsic 5207 // should be fixed to have a memory operand. Since it's readnone, we're not 5208 // allowed to add one. 5209 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 5210 MI.removeOperand(1); // Remove intrinsic ID 5211 5212 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 5213 // TODO: Should this use datalayout alignment? 
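  // Worked example (assumed values): for a <3 x s32> result, Size is 96, so
  // MemSize below is (96 + 7) / 8 = 12 bytes with a fixed 4-byte alignment,
  // and the non-power-of-2 size is later widened to a 128-bit result.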
5214 const unsigned MemSize = (Size + 7) / 8; 5215 const Align MemAlign(4); 5216 MachineMemOperand *MMO = MF.getMachineMemOperand( 5217 MachinePointerInfo(), 5218 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 5219 MachineMemOperand::MOInvariant, 5220 MemSize, MemAlign); 5221 MI.addMemOperand(MF, MMO); 5222 5223 // There are no 96-bit result scalar loads, but widening to 128-bit should 5224 // always be legal. We may need to restore this to a 96-bit result if it turns 5225 // out this needs to be converted to a vector load during RegBankSelect. 5226 if (!isPowerOf2_32(Size)) { 5227 if (Ty.isVector()) 5228 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 5229 else 5230 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 5231 } 5232 5233 Observer.changedInstr(MI); 5234 return true; 5235 } 5236 5237 // TODO: Move to selection 5238 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 5239 MachineRegisterInfo &MRI, 5240 MachineIRBuilder &B) const { 5241 if (!ST.isTrapHandlerEnabled() || 5242 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 5243 return legalizeTrapEndpgm(MI, MRI, B); 5244 5245 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 5246 switch (*HsaAbiVer) { 5247 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 5248 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 5249 return legalizeTrapHsaQueuePtr(MI, MRI, B); 5250 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 5251 case ELF::ELFABIVERSION_AMDGPU_HSA_V5: 5252 return ST.supportsGetDoorbellID() ? 5253 legalizeTrapHsa(MI, MRI, B) : 5254 legalizeTrapHsaQueuePtr(MI, MRI, B); 5255 } 5256 } 5257 5258 llvm_unreachable("Unknown trap handler"); 5259 } 5260 5261 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 5262 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 5263 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 5264 MI.eraseFromParent(); 5265 return true; 5266 } 5267 5268 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 5269 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 5270 MachineFunction &MF = B.getMF(); 5271 const LLT S64 = LLT::scalar(64); 5272 5273 Register SGPR01(AMDGPU::SGPR0_SGPR1); 5274 // For code object version 5, queue_ptr is passed through implicit kernarg. 5275 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { 5276 AMDGPUTargetLowering::ImplicitParameter Param = 5277 AMDGPUTargetLowering::QUEUE_PTR; 5278 uint64_t Offset = 5279 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 5280 5281 Register KernargPtrReg = MRI.createGenericVirtualRegister( 5282 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 5283 5284 if (!loadInputValue(KernargPtrReg, B, 5285 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 5286 return false; 5287 5288 // TODO: can we be smarter about machine pointer info? 
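  // Sketch of the sequence emitted below (descriptive only): compute the
  // implicit kernarg address of the queue pointer, load the 64-bit value,
  // copy it into SGPR0_SGPR1, and issue S_TRAP with the HSA trap ID; the trap
  // handler expects the queue pointer in that SGPR pair.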
5289 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 5290 MachineMemOperand *MMO = MF.getMachineMemOperand( 5291 PtrInfo, 5292 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 5293 MachineMemOperand::MOInvariant, 5294 LLT::scalar(64), commonAlignment(Align(64), Offset)); 5295 5296 // Pointer address 5297 Register LoadAddr = MRI.createGenericVirtualRegister( 5298 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 5299 B.buildPtrAdd(LoadAddr, KernargPtrReg, 5300 B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 5301 // Load address 5302 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0); 5303 B.buildCopy(SGPR01, Temp); 5304 B.buildInstr(AMDGPU::S_TRAP) 5305 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 5306 .addReg(SGPR01, RegState::Implicit); 5307 MI.eraseFromParent(); 5308 return true; 5309 } 5310 5311 // Pass queue pointer to trap handler as input, and insert trap instruction 5312 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 5313 Register LiveIn = 5314 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 5315 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 5316 return false; 5317 5318 B.buildCopy(SGPR01, LiveIn); 5319 B.buildInstr(AMDGPU::S_TRAP) 5320 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 5321 .addReg(SGPR01, RegState::Implicit); 5322 5323 MI.eraseFromParent(); 5324 return true; 5325 } 5326 5327 bool AMDGPULegalizerInfo::legalizeTrapHsa( 5328 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 5329 B.buildInstr(AMDGPU::S_TRAP) 5330 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 5331 MI.eraseFromParent(); 5332 return true; 5333 } 5334 5335 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 5336 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 5337 // Is non-HSA path or trap-handler disabled? 
Then, report a warning 5338 // accordingly 5339 if (!ST.isTrapHandlerEnabled() || 5340 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 5341 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 5342 "debugtrap handler not supported", 5343 MI.getDebugLoc(), DS_Warning); 5344 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 5345 Ctx.diagnose(NoTrap); 5346 } else { 5347 // Insert debug-trap instruction 5348 B.buildInstr(AMDGPU::S_TRAP) 5349 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 5350 } 5351 5352 MI.eraseFromParent(); 5353 return true; 5354 } 5355 5356 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 5357 MachineIRBuilder &B) const { 5358 MachineRegisterInfo &MRI = *B.getMRI(); 5359 const LLT S16 = LLT::scalar(16); 5360 const LLT S32 = LLT::scalar(32); 5361 const LLT V2S16 = LLT::fixed_vector(2, 16); 5362 const LLT V3S32 = LLT::fixed_vector(3, 32); 5363 5364 Register DstReg = MI.getOperand(0).getReg(); 5365 Register NodePtr = MI.getOperand(2).getReg(); 5366 Register RayExtent = MI.getOperand(3).getReg(); 5367 Register RayOrigin = MI.getOperand(4).getReg(); 5368 Register RayDir = MI.getOperand(5).getReg(); 5369 Register RayInvDir = MI.getOperand(6).getReg(); 5370 Register TDescr = MI.getOperand(7).getReg(); 5371 5372 if (!ST.hasGFX10_AEncoding()) { 5373 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 5374 "intrinsic not supported on subtarget", 5375 MI.getDebugLoc()); 5376 B.getMF().getFunction().getContext().diagnose(BadIntrin); 5377 return false; 5378 } 5379 5380 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST); 5381 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 5382 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 5383 const unsigned NumVDataDwords = 4; 5384 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); 5385 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords; 5386 const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize(); 5387 const unsigned BaseOpcodes[2][2] = { 5388 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 5389 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 5390 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 5391 int Opcode; 5392 if (UseNSA) { 5393 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 5394 IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA 5395 : AMDGPU::MIMGEncGfx10NSA, 5396 NumVDataDwords, NumVAddrDwords); 5397 } else { 5398 Opcode = AMDGPU::getMIMGOpcode( 5399 BaseOpcodes[Is64][IsA16], 5400 IsGFX11Plus ? 
AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default, 5401 NumVDataDwords, PowerOf2Ceil(NumVAddrDwords)); 5402 } 5403 assert(Opcode != -1); 5404 5405 SmallVector<Register, 12> Ops; 5406 if (UseNSA && IsGFX11Plus) { 5407 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) { 5408 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 5409 auto Merged = B.buildMerge( 5410 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)}); 5411 Ops.push_back(Merged.getReg(0)); 5412 }; 5413 5414 Ops.push_back(NodePtr); 5415 Ops.push_back(RayExtent); 5416 packLanes(RayOrigin); 5417 5418 if (IsA16) { 5419 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 5420 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 5421 auto MergedDir = B.buildMerge( 5422 V3S32, 5423 {B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(0), 5424 UnmergeRayDir.getReg(0)})) 5425 .getReg(0), 5426 B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(1), 5427 UnmergeRayDir.getReg(1)})) 5428 .getReg(0), 5429 B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(2), 5430 UnmergeRayDir.getReg(2)})) 5431 .getReg(0)}); 5432 Ops.push_back(MergedDir.getReg(0)); 5433 } else { 5434 packLanes(RayDir); 5435 packLanes(RayInvDir); 5436 } 5437 } else { 5438 if (Is64) { 5439 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 5440 Ops.push_back(Unmerge.getReg(0)); 5441 Ops.push_back(Unmerge.getReg(1)); 5442 } else { 5443 Ops.push_back(NodePtr); 5444 } 5445 Ops.push_back(RayExtent); 5446 5447 auto packLanes = [&Ops, &S32, &B](Register Src) { 5448 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 5449 Ops.push_back(Unmerge.getReg(0)); 5450 Ops.push_back(Unmerge.getReg(1)); 5451 Ops.push_back(Unmerge.getReg(2)); 5452 }; 5453 5454 packLanes(RayOrigin); 5455 if (IsA16) { 5456 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 5457 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 5458 Register R1 = MRI.createGenericVirtualRegister(S32); 5459 Register R2 = MRI.createGenericVirtualRegister(S32); 5460 Register R3 = MRI.createGenericVirtualRegister(S32); 5461 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 5462 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 5463 B.buildMerge(R3, 5464 {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 5465 Ops.push_back(R1); 5466 Ops.push_back(R2); 5467 Ops.push_back(R3); 5468 } else { 5469 packLanes(RayDir); 5470 packLanes(RayInvDir); 5471 } 5472 } 5473 5474 if (!UseNSA) { 5475 // Build a single vector containing all the operands so far prepared. 5476 LLT OpTy = LLT::fixed_vector(Ops.size(), 32); 5477 Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0); 5478 Ops.clear(); 5479 Ops.push_back(MergedOps); 5480 } 5481 5482 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 5483 .addDef(DstReg) 5484 .addImm(Opcode); 5485 5486 for (Register R : Ops) { 5487 MIB.addUse(R); 5488 } 5489 5490 MIB.addUse(TDescr) 5491 .addImm(IsA16 ? 
1 : 0) 5492 .cloneMemRefs(MI); 5493 5494 MI.eraseFromParent(); 5495 return true; 5496 } 5497 5498 bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI, 5499 MachineIRBuilder &B) const { 5500 unsigned Opc; 5501 int RoundMode = MI.getOperand(2).getImm(); 5502 5503 if (RoundMode == (int)RoundingMode::TowardPositive) 5504 Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD; 5505 else if (RoundMode == (int)RoundingMode::TowardNegative) 5506 Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD; 5507 else 5508 return false; 5509 5510 B.buildInstr(Opc) 5511 .addDef(MI.getOperand(0).getReg()) 5512 .addUse(MI.getOperand(1).getReg()); 5513 5514 MI.eraseFromParent(); 5515 5516 return true; 5517 } 5518 5519 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 5520 MachineInstr &MI) const { 5521 MachineIRBuilder &B = Helper.MIRBuilder; 5522 MachineRegisterInfo &MRI = *B.getMRI(); 5523 5524 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 5525 auto IntrID = MI.getIntrinsicID(); 5526 switch (IntrID) { 5527 case Intrinsic::amdgcn_if: 5528 case Intrinsic::amdgcn_else: { 5529 MachineInstr *Br = nullptr; 5530 MachineBasicBlock *UncondBrTarget = nullptr; 5531 bool Negated = false; 5532 if (MachineInstr *BrCond = 5533 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 5534 const SIRegisterInfo *TRI 5535 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 5536 5537 Register Def = MI.getOperand(1).getReg(); 5538 Register Use = MI.getOperand(3).getReg(); 5539 5540 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 5541 5542 if (Negated) 5543 std::swap(CondBrTarget, UncondBrTarget); 5544 5545 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 5546 if (IntrID == Intrinsic::amdgcn_if) { 5547 B.buildInstr(AMDGPU::SI_IF) 5548 .addDef(Def) 5549 .addUse(Use) 5550 .addMBB(UncondBrTarget); 5551 } else { 5552 B.buildInstr(AMDGPU::SI_ELSE) 5553 .addDef(Def) 5554 .addUse(Use) 5555 .addMBB(UncondBrTarget); 5556 } 5557 5558 if (Br) { 5559 Br->getOperand(0).setMBB(CondBrTarget); 5560 } else { 5561 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 5562 // since we're swapping branch targets it needs to be reinserted. 
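  // (E.g. the unconditional branch may have been omitted because its target
  // was the fallthrough block; after swapping, the block we must branch to
  // may no longer be the fallthrough, so an explicit G_BR is needed.)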
5563 // FIXME: IRTranslator should probably not do this 5564 B.buildBr(*CondBrTarget); 5565 } 5566 5567 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 5568 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 5569 MI.eraseFromParent(); 5570 BrCond->eraseFromParent(); 5571 return true; 5572 } 5573 5574 return false; 5575 } 5576 case Intrinsic::amdgcn_loop: { 5577 MachineInstr *Br = nullptr; 5578 MachineBasicBlock *UncondBrTarget = nullptr; 5579 bool Negated = false; 5580 if (MachineInstr *BrCond = 5581 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 5582 const SIRegisterInfo *TRI 5583 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 5584 5585 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 5586 Register Reg = MI.getOperand(2).getReg(); 5587 5588 if (Negated) 5589 std::swap(CondBrTarget, UncondBrTarget); 5590 5591 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 5592 B.buildInstr(AMDGPU::SI_LOOP) 5593 .addUse(Reg) 5594 .addMBB(UncondBrTarget); 5595 5596 if (Br) 5597 Br->getOperand(0).setMBB(CondBrTarget); 5598 else 5599 B.buildBr(*CondBrTarget); 5600 5601 MI.eraseFromParent(); 5602 BrCond->eraseFromParent(); 5603 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 5604 return true; 5605 } 5606 5607 return false; 5608 } 5609 case Intrinsic::amdgcn_kernarg_segment_ptr: 5610 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 5611 // This only makes sense to call in a kernel, so just lower to null. 5612 B.buildConstant(MI.getOperand(0).getReg(), 0); 5613 MI.eraseFromParent(); 5614 return true; 5615 } 5616 5617 return legalizePreloadedArgIntrin( 5618 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 5619 case Intrinsic::amdgcn_implicitarg_ptr: 5620 return legalizeImplicitArgPtr(MI, MRI, B); 5621 case Intrinsic::amdgcn_workitem_id_x: 5622 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0, 5623 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 5624 case Intrinsic::amdgcn_workitem_id_y: 5625 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1, 5626 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 5627 case Intrinsic::amdgcn_workitem_id_z: 5628 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2, 5629 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 5630 case Intrinsic::amdgcn_workgroup_id_x: 5631 return legalizePreloadedArgIntrin(MI, MRI, B, 5632 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 5633 case Intrinsic::amdgcn_workgroup_id_y: 5634 return legalizePreloadedArgIntrin(MI, MRI, B, 5635 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 5636 case Intrinsic::amdgcn_workgroup_id_z: 5637 return legalizePreloadedArgIntrin(MI, MRI, B, 5638 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 5639 case Intrinsic::amdgcn_dispatch_ptr: 5640 return legalizePreloadedArgIntrin(MI, MRI, B, 5641 AMDGPUFunctionArgInfo::DISPATCH_PTR); 5642 case Intrinsic::amdgcn_queue_ptr: 5643 return legalizePreloadedArgIntrin(MI, MRI, B, 5644 AMDGPUFunctionArgInfo::QUEUE_PTR); 5645 case Intrinsic::amdgcn_implicit_buffer_ptr: 5646 return legalizePreloadedArgIntrin( 5647 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 5648 case Intrinsic::amdgcn_dispatch_id: 5649 return legalizePreloadedArgIntrin(MI, MRI, B, 5650 AMDGPUFunctionArgInfo::DISPATCH_ID); 5651 case Intrinsic::r600_read_ngroups_x: 5652 // TODO: Emit error for hsa 5653 return legalizeKernargMemParameter(MI, B, 5654 SI::KernelInputOffsets::NGROUPS_X); 5655 case Intrinsic::r600_read_ngroups_y: 5656 return legalizeKernargMemParameter(MI, B, 5657 SI::KernelInputOffsets::NGROUPS_Y); 5658 case Intrinsic::r600_read_ngroups_z: 5659 return 
legalizeKernargMemParameter(MI, B, 5660 SI::KernelInputOffsets::NGROUPS_Z); 5661 case Intrinsic::r600_read_local_size_x: 5662 // TODO: Could insert G_ASSERT_ZEXT from s16 5663 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X); 5664 case Intrinsic::r600_read_local_size_y: 5665 // TODO: Could insert G_ASSERT_ZEXT from s16 5666 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y); 5667 // TODO: Could insert G_ASSERT_ZEXT from s16 5668 case Intrinsic::r600_read_local_size_z: 5669 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z); 5670 case Intrinsic::r600_read_global_size_x: 5671 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X); 5672 case Intrinsic::r600_read_global_size_y: 5673 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y); 5674 case Intrinsic::r600_read_global_size_z: 5675 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z); 5676 case Intrinsic::amdgcn_fdiv_fast: 5677 return legalizeFDIVFastIntrin(MI, MRI, B); 5678 case Intrinsic::amdgcn_is_shared: 5679 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 5680 case Intrinsic::amdgcn_is_private: 5681 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 5682 case Intrinsic::amdgcn_wavefrontsize: { 5683 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 5684 MI.eraseFromParent(); 5685 return true; 5686 } 5687 case Intrinsic::amdgcn_s_buffer_load: 5688 return legalizeSBufferLoad(Helper, MI); 5689 case Intrinsic::amdgcn_raw_buffer_store: 5690 case Intrinsic::amdgcn_struct_buffer_store: 5691 return legalizeBufferStore(MI, MRI, B, false, false); 5692 case Intrinsic::amdgcn_raw_buffer_store_format: 5693 case Intrinsic::amdgcn_struct_buffer_store_format: 5694 return legalizeBufferStore(MI, MRI, B, false, true); 5695 case Intrinsic::amdgcn_raw_tbuffer_store: 5696 case Intrinsic::amdgcn_struct_tbuffer_store: 5697 return legalizeBufferStore(MI, MRI, B, true, true); 5698 case Intrinsic::amdgcn_raw_buffer_load: 5699 case Intrinsic::amdgcn_struct_buffer_load: 5700 return legalizeBufferLoad(MI, MRI, B, false, false); 5701 case Intrinsic::amdgcn_raw_buffer_load_format: 5702 case Intrinsic::amdgcn_struct_buffer_load_format: 5703 return legalizeBufferLoad(MI, MRI, B, true, false); 5704 case Intrinsic::amdgcn_raw_tbuffer_load: 5705 case Intrinsic::amdgcn_struct_tbuffer_load: 5706 return legalizeBufferLoad(MI, MRI, B, true, true); 5707 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 5708 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 5709 case Intrinsic::amdgcn_raw_buffer_atomic_add: 5710 case Intrinsic::amdgcn_struct_buffer_atomic_add: 5711 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 5712 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 5713 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 5714 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 5715 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 5716 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 5717 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 5718 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 5719 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 5720 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 5721 case Intrinsic::amdgcn_raw_buffer_atomic_and: 5722 case Intrinsic::amdgcn_struct_buffer_atomic_and: 5723 case Intrinsic::amdgcn_raw_buffer_atomic_or: 5724 case Intrinsic::amdgcn_struct_buffer_atomic_or: 5725 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 5726 case 
Intrinsic::amdgcn_struct_buffer_atomic_xor: 5727 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 5728 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 5729 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 5730 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 5731 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 5732 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 5733 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 5734 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 5735 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 5736 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 5737 return legalizeBufferAtomic(MI, B, IntrID); 5738 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 5739 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: { 5740 Register DstReg = MI.getOperand(0).getReg(); 5741 if (!MRI.use_empty(DstReg) && 5742 !AMDGPU::hasAtomicFaddRtnForTy(ST, MRI.getType(DstReg))) { 5743 Function &F = B.getMF().getFunction(); 5744 DiagnosticInfoUnsupported NoFpRet( 5745 F, "return versions of fp atomics not supported", B.getDebugLoc(), 5746 DS_Error); 5747 F.getContext().diagnose(NoFpRet); 5748 B.buildUndef(DstReg); 5749 MI.eraseFromParent(); 5750 return true; 5751 } 5752 5753 return legalizeBufferAtomic(MI, B, IntrID); 5754 } 5755 case Intrinsic::amdgcn_atomic_inc: 5756 return legalizeAtomicIncDec(MI, B, true); 5757 case Intrinsic::amdgcn_atomic_dec: 5758 return legalizeAtomicIncDec(MI, B, false); 5759 case Intrinsic::trap: 5760 return legalizeTrapIntrinsic(MI, MRI, B); 5761 case Intrinsic::debugtrap: 5762 return legalizeDebugTrapIntrinsic(MI, MRI, B); 5763 case Intrinsic::amdgcn_rsq_clamp: 5764 return legalizeRsqClampIntrinsic(MI, MRI, B); 5765 case Intrinsic::amdgcn_ds_fadd: 5766 case Intrinsic::amdgcn_ds_fmin: 5767 case Intrinsic::amdgcn_ds_fmax: 5768 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 5769 case Intrinsic::amdgcn_image_bvh_intersect_ray: 5770 return legalizeBVHIntrinsic(MI, B); 5771 default: { 5772 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 5773 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 5774 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 5775 return true; 5776 } 5777 } 5778 5779 return true; 5780 } 5781