1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUMemoryUtils.h"
20 #include "AMDGPUTargetMachine.h"
21 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
22 #include "SIInstrInfo.h"
23 #include "SIMachineFunctionInfo.h"
24 #include "SIRegisterInfo.h"
25 #include "Utils/AMDGPUBaseInfo.h"
26 #include "llvm/ADT/ScopeExit.h"
27 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
28 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
31 #include "llvm/CodeGen/GlobalISel/Utils.h"
32 #include "llvm/CodeGen/TargetOpcodes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/IntrinsicsAMDGPU.h"
35 #include "llvm/IR/IntrinsicsR600.h"
36 
37 #define DEBUG_TYPE "amdgpu-legalinfo"
38 
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
44 
45 // Hack until load/store selection patterns support any tuple of legal types.
46 static cl::opt<bool> EnableNewLegality(
47     "amdgpu-global-isel-new-legality",
48     cl::desc("Use GlobalISel desired legality, rather than try to use "
49              "rules compatible with selection patterns"),
50     cl::init(false),
51     cl::ReallyHidden);
52 
53 static constexpr unsigned MaxRegisterSize = 1024;
54 
55 // Round the number of elements to the next power of two elements
56 static LLT getPow2VectorType(LLT Ty) {
57   unsigned NElts = Ty.getNumElements();
58   unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
59   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
60 }
61 
62 // Round the number of bits to the next power of two bits
63 static LLT getPow2ScalarType(LLT Ty) {
64   unsigned Bits = Ty.getSizeInBits();
65   unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
66   return LLT::scalar(Pow2Bits);
67 }
68 
69 /// \returns true if this is an odd-sized vector which should be widened by
70 /// adding an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
71 /// This excludes s1 vectors, which should always be scalarized.
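/// For example, <3 x s16> qualifies (odd element count, 16-bit elements, and
/// 48 bits is not a multiple of 32) and is widened to <4 x s16>, while
/// <4 x s16>, <3 x s32> and <3 x s1> do not.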
72 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 73 return [=](const LegalityQuery &Query) { 74 const LLT Ty = Query.Types[TypeIdx]; 75 if (!Ty.isVector()) 76 return false; 77 78 const LLT EltTy = Ty.getElementType(); 79 const unsigned EltSize = EltTy.getSizeInBits(); 80 return Ty.getNumElements() % 2 != 0 && 81 EltSize > 1 && EltSize < 32 && 82 Ty.getSizeInBits() % 32 != 0; 83 }; 84 } 85 86 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 87 return [=](const LegalityQuery &Query) { 88 const LLT Ty = Query.Types[TypeIdx]; 89 return Ty.getSizeInBits() % 32 == 0; 90 }; 91 } 92 93 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 94 return [=](const LegalityQuery &Query) { 95 const LLT Ty = Query.Types[TypeIdx]; 96 const LLT EltTy = Ty.getScalarType(); 97 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 98 }; 99 } 100 101 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 102 return [=](const LegalityQuery &Query) { 103 const LLT Ty = Query.Types[TypeIdx]; 104 const LLT EltTy = Ty.getElementType(); 105 return std::pair(TypeIdx, 106 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); 107 }; 108 } 109 110 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 111 return [=](const LegalityQuery &Query) { 112 const LLT Ty = Query.Types[TypeIdx]; 113 const LLT EltTy = Ty.getElementType(); 114 unsigned Size = Ty.getSizeInBits(); 115 unsigned Pieces = (Size + 63) / 64; 116 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 117 return std::pair(TypeIdx, LLT::scalarOrVector( 118 ElementCount::getFixed(NewNumElts), EltTy)); 119 }; 120 } 121 122 // Increase the number of vector elements to reach the next multiple of 32-bit 123 // type. 124 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 125 return [=](const LegalityQuery &Query) { 126 const LLT Ty = Query.Types[TypeIdx]; 127 128 const LLT EltTy = Ty.getElementType(); 129 const int Size = Ty.getSizeInBits(); 130 const int EltSize = EltTy.getSizeInBits(); 131 const int NextMul32 = (Size + 31) / 32; 132 133 assert(EltSize < 32); 134 135 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 136 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); 137 }; 138 } 139 140 // Increase the number of vector elements to reach the next legal RegClass. 141 static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) { 142 return [=](const LegalityQuery &Query) { 143 const LLT Ty = Query.Types[TypeIdx]; 144 const unsigned NumElts = Ty.getNumElements(); 145 const unsigned EltSize = Ty.getElementType().getSizeInBits(); 146 const unsigned MaxNumElts = MaxRegisterSize / EltSize; 147 148 assert(EltSize == 32 || EltSize == 64); 149 assert(Ty.getSizeInBits() < MaxRegisterSize); 150 151 unsigned NewNumElts; 152 // Find the nearest legal RegClass that is larger than the current type. 
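    // For example, if a <9 x s32> (288 bits) had no matching SGPR class, the
    // loop would keep adding 32-bit elements until getSGPRClassForBitWidth
    // succeeds (illustrative; the exact stopping point depends on which
    // register class widths exist).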
153     for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
154       if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
155         break;
156     }
157     return std::pair(TypeIdx,
158                      LLT::fixed_vector(NewNumElts, Ty.getElementType()));
159   };
160 }
161 
162 static LLT getBufferRsrcScalarType(const LLT Ty) {
163   if (!Ty.isVector())
164     return LLT::scalar(128);
165   const ElementCount NumElems = Ty.getElementCount();
166   return LLT::vector(NumElems, LLT::scalar(128));
167 }
168 
169 static LLT getBufferRsrcRegisterType(const LLT Ty) {
170   if (!Ty.isVector())
171     return LLT::fixed_vector(4, LLT::scalar(32));
172   const unsigned NumElems = Ty.getElementCount().getFixedValue();
173   return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
174 }
175 
176 static LLT getBitcastRegisterType(const LLT Ty) {
177   const unsigned Size = Ty.getSizeInBits();
178 
179   if (Size <= 32) {
180     // <2 x s8> -> s16
181     // <4 x s8> -> s32
182     return LLT::scalar(Size);
183   }
184 
185   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
186 }
187 
188 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189   return [=](const LegalityQuery &Query) {
190     const LLT Ty = Query.Types[TypeIdx];
191     return std::pair(TypeIdx, getBitcastRegisterType(Ty));
192   };
193 }
194 
195 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
196   return [=](const LegalityQuery &Query) {
197     const LLT Ty = Query.Types[TypeIdx];
198     unsigned Size = Ty.getSizeInBits();
199     assert(Size % 32 == 0);
200     return std::pair(
201         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
202   };
203 }
204 
205 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
206   return [=](const LegalityQuery &Query) {
207     const LLT QueryTy = Query.Types[TypeIdx];
208     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
209   };
210 }
211 
212 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
213   return [=](const LegalityQuery &Query) {
214     const LLT QueryTy = Query.Types[TypeIdx];
215     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
216   };
217 }
218 
219 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
223   };
224 }
225 
226 static bool isRegisterSize(unsigned Size) {
227   return Size % 32 == 0 && Size <= MaxRegisterSize;
228 }
229 
230 static bool isRegisterVectorElementType(LLT EltTy) {
231   const int EltSize = EltTy.getSizeInBits();
232   return EltSize == 16 || EltSize % 32 == 0;
233 }
234 
235 static bool isRegisterVectorType(LLT Ty) {
236   const int EltSize = Ty.getElementType().getSizeInBits();
237   return EltSize == 32 || EltSize == 64 ||
238          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
239          EltSize == 128 || EltSize == 256;
240 }
241 
242 // TODO: replace all uses of isRegisterType with isRegisterClassType
243 static bool isRegisterType(LLT Ty) {
244   if (!isRegisterSize(Ty.getSizeInBits()))
245     return false;
246 
247   if (Ty.isVector())
248     return isRegisterVectorType(Ty);
249 
250   return true;
251 }
252 
253 // Any combination of 32 or 64-bit elements up to the maximum register size,
254 // and multiples of v2s16.
255 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
256   return [=](const LegalityQuery &Query) {
257     return isRegisterType(Query.Types[TypeIdx]);
258   };
259 }
260 
261 // RegisterType that doesn't have a corresponding RegClass.
262 // TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this 263 // should be removed. 264 static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) { 265 return [=](const LegalityQuery &Query) { 266 LLT Ty = Query.Types[TypeIdx]; 267 return isRegisterType(Ty) && 268 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits()); 269 }; 270 } 271 272 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 273 return [=](const LegalityQuery &Query) { 274 const LLT QueryTy = Query.Types[TypeIdx]; 275 if (!QueryTy.isVector()) 276 return false; 277 const LLT EltTy = QueryTy.getElementType(); 278 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 279 }; 280 } 281 282 static const LLT S1 = LLT::scalar(1); 283 static const LLT S8 = LLT::scalar(8); 284 static const LLT S16 = LLT::scalar(16); 285 static const LLT S32 = LLT::scalar(32); 286 static const LLT F32 = LLT::float32(); 287 static const LLT S64 = LLT::scalar(64); 288 static const LLT F64 = LLT::float64(); 289 static const LLT S96 = LLT::scalar(96); 290 static const LLT S128 = LLT::scalar(128); 291 static const LLT S160 = LLT::scalar(160); 292 static const LLT S192 = LLT::scalar(192); 293 static const LLT S224 = LLT::scalar(224); 294 static const LLT S256 = LLT::scalar(256); 295 static const LLT S512 = LLT::scalar(512); 296 static const LLT S1024 = LLT::scalar(1024); 297 static const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 298 299 static const LLT V2S8 = LLT::fixed_vector(2, 8); 300 static const LLT V2S16 = LLT::fixed_vector(2, 16); 301 static const LLT V4S16 = LLT::fixed_vector(4, 16); 302 static const LLT V6S16 = LLT::fixed_vector(6, 16); 303 static const LLT V8S16 = LLT::fixed_vector(8, 16); 304 static const LLT V10S16 = LLT::fixed_vector(10, 16); 305 static const LLT V12S16 = LLT::fixed_vector(12, 16); 306 static const LLT V16S16 = LLT::fixed_vector(16, 16); 307 308 static const LLT V2F16 = LLT::fixed_vector(2, LLT::float16()); 309 static const LLT V2BF16 = V2F16; // FIXME 310 311 static const LLT V2S32 = LLT::fixed_vector(2, 32); 312 static const LLT V3S32 = LLT::fixed_vector(3, 32); 313 static const LLT V4S32 = LLT::fixed_vector(4, 32); 314 static const LLT V5S32 = LLT::fixed_vector(5, 32); 315 static const LLT V6S32 = LLT::fixed_vector(6, 32); 316 static const LLT V7S32 = LLT::fixed_vector(7, 32); 317 static const LLT V8S32 = LLT::fixed_vector(8, 32); 318 static const LLT V9S32 = LLT::fixed_vector(9, 32); 319 static const LLT V10S32 = LLT::fixed_vector(10, 32); 320 static const LLT V11S32 = LLT::fixed_vector(11, 32); 321 static const LLT V12S32 = LLT::fixed_vector(12, 32); 322 static const LLT V16S32 = LLT::fixed_vector(16, 32); 323 static const LLT V32S32 = LLT::fixed_vector(32, 32); 324 325 static const LLT V2S64 = LLT::fixed_vector(2, 64); 326 static const LLT V3S64 = LLT::fixed_vector(3, 64); 327 static const LLT V4S64 = LLT::fixed_vector(4, 64); 328 static const LLT V5S64 = LLT::fixed_vector(5, 64); 329 static const LLT V6S64 = LLT::fixed_vector(6, 64); 330 static const LLT V7S64 = LLT::fixed_vector(7, 64); 331 static const LLT V8S64 = LLT::fixed_vector(8, 64); 332 static const LLT V16S64 = LLT::fixed_vector(16, 64); 333 334 static const LLT V2S128 = LLT::fixed_vector(2, 128); 335 static const LLT V4S128 = LLT::fixed_vector(4, 128); 336 337 static std::initializer_list<LLT> AllScalarTypes = { 338 S32, S64, S96, S128, S160, S192, S224, S256, S512, S1024}; 339 340 static std::initializer_list<LLT> AllS16Vectors{ 341 V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, 
V4S128}; 342 343 static std::initializer_list<LLT> AllS32Vectors = { 344 V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 345 V9S32, V10S32, V11S32, V12S32, V16S32, V32S32}; 346 347 static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64, 348 V6S64, V7S64, V8S64, V16S64}; 349 350 // Checks whether a type is in the list of legal register types. 351 static bool isRegisterClassType(LLT Ty) { 352 if (Ty.isPointerOrPointerVector()) 353 Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits())); 354 355 return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) || 356 is_contained(AllScalarTypes, Ty) || is_contained(AllS16Vectors, Ty); 357 } 358 359 static LegalityPredicate isRegisterClassType(unsigned TypeIdx) { 360 return [TypeIdx](const LegalityQuery &Query) { 361 return isRegisterClassType(Query.Types[TypeIdx]); 362 }; 363 } 364 365 // If we have a truncating store or an extending load with a data size larger 366 // than 32-bits, we need to reduce to a 32-bit type. 367 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { 368 return [=](const LegalityQuery &Query) { 369 const LLT Ty = Query.Types[TypeIdx]; 370 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 371 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits(); 372 }; 373 } 374 375 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 376 // handle some operations by just promoting the register during 377 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 378 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 379 bool IsLoad, bool IsAtomic) { 380 switch (AS) { 381 case AMDGPUAS::PRIVATE_ADDRESS: 382 // FIXME: Private element size. 383 return ST.enableFlatScratch() ? 128 : 32; 384 case AMDGPUAS::LOCAL_ADDRESS: 385 return ST.useDS128() ? 128 : 64; 386 case AMDGPUAS::GLOBAL_ADDRESS: 387 case AMDGPUAS::CONSTANT_ADDRESS: 388 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 389 case AMDGPUAS::BUFFER_RESOURCE: 390 // Treat constant and global as identical. SMRD loads are sometimes usable for 391 // global loads (ideally constant address space should be eliminated) 392 // depending on the context. Legality cannot be context dependent, but 393 // RegBankSelect can split the load as necessary depending on the pointer 394 // register bank/uniformity and if the memory is invariant or not written in a 395 // kernel. 396 return IsLoad ? 512 : 128; 397 default: 398 // FIXME: Flat addresses may contextually need to be split to 32-bit parts 399 // if they may alias scratch depending on the subtarget. This needs to be 400 // moved to custom handling to use addressMayBeAccessedAsPrivate 401 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32; 402 } 403 } 404 405 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 406 const LegalityQuery &Query) { 407 const LLT Ty = Query.Types[0]; 408 409 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 410 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; 411 412 unsigned RegSize = Ty.getSizeInBits(); 413 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 414 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits; 415 unsigned AS = Query.Types[1].getAddressSpace(); 416 417 // All of these need to be custom lowered to cast the pointer operand. 418 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 419 return false; 420 421 // Do not handle extending vector loads. 
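  // (e.g. a <4 x s16> register filled from only 32 bits of memory is rejected
  // here; scalar extloads are still accepted further down.)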
422   if (Ty.isVector() && MemSize != RegSize)
423     return false;
424 
425   // TODO: We should be able to widen loads if the alignment is high enough, but
426   // we also need to modify the memory access size.
427 #if 0
428   // Accept widening loads based on alignment.
429   if (IsLoad && MemSize < Size)
430     MemSize = std::max(MemSize, Align);
431 #endif
432 
433   // Only 1-byte and 2-byte to 32-bit extloads are valid.
434   if (MemSize != RegSize && RegSize != 32)
435     return false;
436 
437   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
438                                     Query.MMODescrs[0].Ordering !=
439                                         AtomicOrdering::NotAtomic))
440     return false;
441 
442   switch (MemSize) {
443   case 8:
444   case 16:
445   case 32:
446   case 64:
447   case 128:
448     break;
449   case 96:
450     if (!ST.hasDwordx3LoadStores())
451       return false;
452     break;
453   case 256:
454   case 512:
455     // These may contextually need to be broken down.
456     break;
457   default:
458     return false;
459   }
460 
461   assert(RegSize >= MemSize);
462 
463   if (AlignBits < MemSize) {
464     const SITargetLowering *TLI = ST.getTargetLowering();
465     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
466                                                  Align(AlignBits / 8)))
467       return false;
468   }
469 
470   return true;
471 }
472 
473 // The newer buffer intrinsic forms take their resource arguments as
474 // pointers in address space 8, aka s128 values. However, in order to not break
475 // SelectionDAG, the underlying operations have to continue to take v4i32
476 // arguments. Therefore, we convert resource pointers - or vectors of them -
477 // to integer values here.
478 static bool hasBufferRsrcWorkaround(const LLT Ty) {
479   if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
480     return true;
481   if (Ty.isVector()) {
482     const LLT ElemTy = Ty.getElementType();
483     return hasBufferRsrcWorkaround(ElemTy);
484   }
485   return false;
486 }
487 
488 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
489 // work around this. Eventually it should ignore the type for loads and only
490 // care about the size. Return true in cases where we will work around this for
491 // now by bitcasting.
492 static bool loadStoreBitcastWorkaround(const LLT Ty) {
493   if (EnableNewLegality)
494     return false;
495 
496   const unsigned Size = Ty.getSizeInBits();
497   if (Ty.isPointerVector())
498     return true;
499   if (Size <= 64)
500     return false;
501   // Address space 8 pointers get their own workaround.
502   if (hasBufferRsrcWorkaround(Ty))
503     return false;
504   if (!Ty.isVector())
505     return true;
506 
507   unsigned EltSize = Ty.getScalarSizeInBits();
508   return EltSize != 32 && EltSize != 64;
509 }
510 
511 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
512   const LLT Ty = Query.Types[0];
513   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
514          !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
515 }
516 
517 /// Return true if a load or store of the type should be lowered with a bitcast
518 /// to a different type.
519 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
520                                        const LLT MemTy) {
521   const unsigned MemSizeInBits = MemTy.getSizeInBits();
522   const unsigned Size = Ty.getSizeInBits();
523   if (Size != MemSizeInBits)
524     return Size <= 32 && Ty.isVector();
525 
526   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
527     return true;
528 
529   // Don't try to handle bitcasting vector ext loads for now.
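  // For example, a <4 x s8> (32 bits) reports true and is later bitcast to
  // s32, while <2 x s16> already has a register-friendly element type and
  // reports false.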
530   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
531          (Size <= 32 || isRegisterSize(Size)) &&
532          !isRegisterVectorElementType(Ty.getElementType());
533 }
534 
535 /// Return true if we should legalize a load by widening an odd-sized memory
536 /// access up to the alignment. Note this is the case when the memory access
537 /// itself changes, not the size of the result register.
538 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
539                             uint64_t AlignInBits, unsigned AddrSpace,
540                             unsigned Opcode) {
541   unsigned SizeInBits = MemoryTy.getSizeInBits();
542   // We don't want to widen cases that are naturally legal.
543   if (isPowerOf2_32(SizeInBits))
544     return false;
545 
546   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
547   // end up widening these for a scalar load during RegBankSelect, if we don't
548   // have 96-bit scalar loads.
549   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
550     return false;
551 
552   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
553     return false;
554 
555   // A load is known dereferenceable up to the alignment, so it's legal to widen
556   // to it.
557   //
558   // TODO: Could check dereferenceable for less aligned cases.
559   unsigned RoundedSize = NextPowerOf2(SizeInBits);
560   if (AlignInBits < RoundedSize)
561     return false;
562 
563   // Do not widen if it would introduce a slow unaligned load.
564   const SITargetLowering *TLI = ST.getTargetLowering();
565   unsigned Fast = 0;
566   return TLI->allowsMisalignedMemoryAccessesImpl(
567              RoundedSize, AddrSpace, Align(AlignInBits / 8),
568              MachineMemOperand::MOLoad, &Fast) &&
569          Fast;
570 }
571 
572 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
573                             unsigned Opcode) {
574   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
575     return false;
576 
577   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
578                          Query.MMODescrs[0].AlignInBits,
579                          Query.Types[1].getAddressSpace(), Opcode);
580 }
581 
582 /// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
583 /// type of the operand `Idx` and then to transform it to a `p8` via bitcasts
584 /// and inttoptr. In addition, handle vectors of p8. Returns the new type.
585 static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
586                                    MachineRegisterInfo &MRI, unsigned Idx) {
587   MachineOperand &MO = MI.getOperand(Idx);
588 
589   const LLT PointerTy = MRI.getType(MO.getReg());
590 
591   // Paranoidly prevent us from doing this multiple times.
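  // For a scalar p8 result this rewrites MI to produce a <4 x s32> and then
  // reassembles the original p8 value from the extracted lanes just after MI;
  // the vector-of-p8 case below goes through a bitcast plus G_INTTOPTR
  // instead.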
592 if (!hasBufferRsrcWorkaround(PointerTy)) 593 return PointerTy; 594 595 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy); 596 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy); 597 if (!PointerTy.isVector()) { 598 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8) 599 const unsigned NumParts = PointerTy.getSizeInBits() / 32; 600 const LLT S32 = LLT::scalar(32); 601 602 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy); 603 std::array<Register, 4> VectorElems; 604 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 605 for (unsigned I = 0; I < NumParts; ++I) 606 VectorElems[I] = 607 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0); 608 B.buildMergeValues(MO, VectorElems); 609 MO.setReg(VectorReg); 610 return VectorTy; 611 } 612 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy); 613 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 614 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg); 615 B.buildIntToPtr(MO, Scalar); 616 MO.setReg(BitcastReg); 617 618 return VectorTy; 619 } 620 621 /// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is 622 /// the form in which the value must be in order to be passed to the low-level 623 /// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is 624 /// needed in order to account for the fact that we can't define a register 625 /// class for s128 without breaking SelectionDAG. 626 static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) { 627 MachineRegisterInfo &MRI = *B.getMRI(); 628 const LLT PointerTy = MRI.getType(Pointer); 629 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy); 630 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy); 631 632 if (!PointerTy.isVector()) { 633 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32) 634 SmallVector<Register, 4> PointerParts; 635 const unsigned NumParts = PointerTy.getSizeInBits() / 32; 636 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer); 637 for (unsigned I = 0; I < NumParts; ++I) 638 PointerParts.push_back(Unmerged.getReg(I)); 639 return B.buildBuildVector(VectorTy, PointerParts).getReg(0); 640 } 641 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0); 642 return B.buildBitcast(VectorTy, Scalar).getReg(0); 643 } 644 645 static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, 646 unsigned Idx) { 647 MachineOperand &MO = MI.getOperand(Idx); 648 649 const LLT PointerTy = B.getMRI()->getType(MO.getReg()); 650 // Paranoidly prevent us from doing this multiple times. 
651 if (!hasBufferRsrcWorkaround(PointerTy)) 652 return; 653 MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B)); 654 } 655 656 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 657 const GCNTargetMachine &TM) 658 : ST(ST_) { 659 using namespace TargetOpcode; 660 661 auto GetAddrSpacePtr = [&TM](unsigned AS) { 662 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 663 }; 664 665 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 666 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 667 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 668 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 669 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 670 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 671 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 672 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER); 673 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE); 674 const LLT BufferStridedPtr = 675 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER); 676 677 const LLT CodePtr = FlatPtr; 678 679 const std::initializer_list<LLT> AddrSpaces64 = { 680 GlobalPtr, ConstantPtr, FlatPtr 681 }; 682 683 const std::initializer_list<LLT> AddrSpaces32 = { 684 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 685 }; 686 687 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr}; 688 689 const std::initializer_list<LLT> FPTypesBase = { 690 S32, S64 691 }; 692 693 const std::initializer_list<LLT> FPTypes16 = { 694 S32, S64, S16 695 }; 696 697 const std::initializer_list<LLT> FPTypesPK16 = { 698 S32, S64, S16, V2S16 699 }; 700 701 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 702 703 // s1 for VCC branches, s32 for SCC branches. 704 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); 705 706 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 707 // elements for v3s16 708 getActionDefinitionsBuilder(G_PHI) 709 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 710 .legalFor(AllS32Vectors) 711 .legalFor(AllS64Vectors) 712 .legalFor(AddrSpaces64) 713 .legalFor(AddrSpaces32) 714 .legalFor(AddrSpaces128) 715 .legalIf(isPointer(0)) 716 .clampScalar(0, S16, S256) 717 .widenScalarToNextPow2(0, 32) 718 .clampMaxNumElements(0, S32, 16) 719 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 720 .scalarize(0); 721 722 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 723 // Full set of gfx9 features. 
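    // The only difference between the two G_ADD/G_SUB branches below is
    // whether s64 is directly legal (scalar 64-bit add/sub support);
    // everything else is clamped to at most s32 scalars or <2 x s16>.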
724 if (ST.hasScalarAddSub64()) { 725 getActionDefinitionsBuilder({G_ADD, G_SUB}) 726 .legalFor({S64, S32, S16, V2S16}) 727 .clampMaxNumElementsStrict(0, S16, 2) 728 .scalarize(0) 729 .minScalar(0, S16) 730 .widenScalarToNextMultipleOf(0, 32) 731 .maxScalar(0, S32); 732 } else { 733 getActionDefinitionsBuilder({G_ADD, G_SUB}) 734 .legalFor({S32, S16, V2S16}) 735 .clampMaxNumElementsStrict(0, S16, 2) 736 .scalarize(0) 737 .minScalar(0, S16) 738 .widenScalarToNextMultipleOf(0, 32) 739 .maxScalar(0, S32); 740 } 741 742 if (ST.hasScalarSMulU64()) { 743 getActionDefinitionsBuilder(G_MUL) 744 .legalFor({S64, S32, S16, V2S16}) 745 .clampMaxNumElementsStrict(0, S16, 2) 746 .scalarize(0) 747 .minScalar(0, S16) 748 .widenScalarToNextMultipleOf(0, 32) 749 .custom(); 750 } else { 751 getActionDefinitionsBuilder(G_MUL) 752 .legalFor({S32, S16, V2S16}) 753 .clampMaxNumElementsStrict(0, S16, 2) 754 .scalarize(0) 755 .minScalar(0, S16) 756 .widenScalarToNextMultipleOf(0, 32) 757 .custom(); 758 } 759 assert(ST.hasMad64_32()); 760 761 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 762 .legalFor({S32, S16, V2S16}) // Clamp modifier 763 .minScalarOrElt(0, S16) 764 .clampMaxNumElementsStrict(0, S16, 2) 765 .scalarize(0) 766 .widenScalarToNextPow2(0, 32) 767 .lower(); 768 } else if (ST.has16BitInsts()) { 769 getActionDefinitionsBuilder({G_ADD, G_SUB}) 770 .legalFor({S32, S16}) 771 .minScalar(0, S16) 772 .widenScalarToNextMultipleOf(0, 32) 773 .maxScalar(0, S32) 774 .scalarize(0); 775 776 getActionDefinitionsBuilder(G_MUL) 777 .legalFor({S32, S16}) 778 .scalarize(0) 779 .minScalar(0, S16) 780 .widenScalarToNextMultipleOf(0, 32) 781 .custom(); 782 assert(ST.hasMad64_32()); 783 784 // Technically the saturating operations require clamp bit support, but this 785 // was introduced at the same time as 16-bit operations. 786 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 787 .legalFor({S32, S16}) // Clamp modifier 788 .minScalar(0, S16) 789 .scalarize(0) 790 .widenScalarToNextPow2(0, 16) 791 .lower(); 792 793 // We're just lowering this, but it helps get a better result to try to 794 // coerce to the desired type first. 795 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 796 .minScalar(0, S16) 797 .scalarize(0) 798 .lower(); 799 } else { 800 getActionDefinitionsBuilder({G_ADD, G_SUB}) 801 .legalFor({S32}) 802 .widenScalarToNextMultipleOf(0, 32) 803 .clampScalar(0, S32, S32) 804 .scalarize(0); 805 806 auto &Mul = getActionDefinitionsBuilder(G_MUL) 807 .legalFor({S32}) 808 .scalarize(0) 809 .minScalar(0, S32) 810 .widenScalarToNextMultipleOf(0, 32); 811 812 if (ST.hasMad64_32()) 813 Mul.custom(); 814 else 815 Mul.maxScalar(0, S32); 816 817 if (ST.hasIntClamp()) { 818 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 819 .legalFor({S32}) // Clamp modifier. 820 .scalarize(0) 821 .minScalarOrElt(0, S32) 822 .lower(); 823 } else { 824 // Clamp bit support was added in VI, along with 16-bit operations. 825 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 826 .minScalar(0, S32) 827 .scalarize(0) 828 .lower(); 829 } 830 831 // FIXME: DAG expansion gets better results. The widening uses the smaller 832 // range values and goes for the min/max lowering directly. 
833 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 834 .minScalar(0, S32) 835 .scalarize(0) 836 .lower(); 837 } 838 839 getActionDefinitionsBuilder( 840 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 841 .customFor({S32, S64}) 842 .clampScalar(0, S32, S64) 843 .widenScalarToNextPow2(0, 32) 844 .scalarize(0); 845 846 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 847 .legalFor({S32}) 848 .maxScalar(0, S32); 849 850 if (ST.hasVOP3PInsts()) { 851 Mulh 852 .clampMaxNumElements(0, S8, 2) 853 .lowerFor({V2S8}); 854 } 855 856 Mulh 857 .scalarize(0) 858 .lower(); 859 860 // Report legal for any types we can handle anywhere. For the cases only legal 861 // on the SALU, RegBankSelect will be able to re-legalize. 862 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 863 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 864 .clampScalar(0, S32, S64) 865 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 866 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 867 .widenScalarToNextPow2(0) 868 .scalarize(0); 869 870 getActionDefinitionsBuilder( 871 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 872 .legalFor({{S32, S1}, {S32, S32}}) 873 .clampScalar(0, S32, S32) 874 .scalarize(0); 875 876 getActionDefinitionsBuilder(G_BITCAST) 877 // Don't worry about the size constraint. 878 .legalIf(all(isRegisterClassType(0), isRegisterClassType(1))) 879 .lower(); 880 881 getActionDefinitionsBuilder(G_CONSTANT) 882 .legalFor({S1, S32, S64, S16, GlobalPtr, 883 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 884 .legalIf(isPointer(0)) 885 .clampScalar(0, S32, S64) 886 .widenScalarToNextPow2(0); 887 888 getActionDefinitionsBuilder(G_FCONSTANT) 889 .legalFor({S32, S64, S16}) 890 .clampScalar(0, S16, S64); 891 892 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 893 .legalIf(isRegisterClassType(0)) 894 // s1 and s16 are special cases because they have legal operations on 895 // them, but don't really occupy registers in the normal way. 896 .legalFor({S1, S16}) 897 .clampNumElements(0, V16S32, V32S32) 898 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 899 .clampScalarOrElt(0, S32, MaxScalar) 900 .widenScalarToNextPow2(0, 32) 901 .clampMaxNumElements(0, S32, 16); 902 903 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); 904 905 // If the amount is divergent, we have to do a wave reduction to get the 906 // maximum value, so this is expanded during RegBankSelect. 
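  // (The stack pointer is a wave-wide scalar, so a divergent allocation size
  // must be reduced to a single per-wave maximum before bumping it.)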
907 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 908 .legalFor({{PrivatePtr, S32}}); 909 910 getActionDefinitionsBuilder(G_STACKSAVE) 911 .customFor({PrivatePtr}); 912 getActionDefinitionsBuilder(G_STACKRESTORE) 913 .legalFor({PrivatePtr}); 914 915 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64}); 916 917 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 918 .customIf(typeIsNot(0, PrivatePtr)); 919 920 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); 921 922 auto &FPOpActions = getActionDefinitionsBuilder( 923 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE, 924 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA}) 925 .legalFor({S32, S64}); 926 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 927 .customFor({S32, S64}); 928 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 929 .customFor({S32, S64}); 930 931 if (ST.has16BitInsts()) { 932 if (ST.hasVOP3PInsts()) 933 FPOpActions.legalFor({S16, V2S16}); 934 else 935 FPOpActions.legalFor({S16}); 936 937 TrigActions.customFor({S16}); 938 FDIVActions.customFor({S16}); 939 } 940 941 if (ST.hasPackedFP32Ops()) { 942 FPOpActions.legalFor({V2S32}); 943 FPOpActions.clampMaxNumElementsStrict(0, S32, 2); 944 } 945 946 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 947 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 948 949 if (ST.hasVOP3PInsts()) { 950 MinNumMaxNum.customFor(FPTypesPK16) 951 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 952 .clampMaxNumElements(0, S16, 2) 953 .clampScalar(0, S16, S64) 954 .scalarize(0); 955 } else if (ST.has16BitInsts()) { 956 MinNumMaxNum.customFor(FPTypes16) 957 .clampScalar(0, S16, S64) 958 .scalarize(0); 959 } else { 960 MinNumMaxNum.customFor(FPTypesBase) 961 .clampScalar(0, S32, S64) 962 .scalarize(0); 963 } 964 965 if (ST.hasVOP3PInsts()) 966 FPOpActions.clampMaxNumElementsStrict(0, S16, 2); 967 968 FPOpActions 969 .scalarize(0) 970 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 971 972 TrigActions 973 .scalarize(0) 974 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 975 976 FDIVActions 977 .scalarize(0) 978 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 979 980 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 981 .legalFor(FPTypesPK16) 982 .clampMaxNumElementsStrict(0, S16, 2) 983 .scalarize(0) 984 .clampScalar(0, S16, S64); 985 986 if (ST.has16BitInsts()) { 987 getActionDefinitionsBuilder(G_FSQRT) 988 .legalFor({S16}) 989 .customFor({S32, S64}) 990 .scalarize(0) 991 .unsupported(); 992 getActionDefinitionsBuilder(G_FFLOOR) 993 .legalFor({S32, S64, S16}) 994 .scalarize(0) 995 .clampScalar(0, S16, S64); 996 997 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP}) 998 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}}) 999 .scalarize(0) 1000 .maxScalarIf(typeIs(0, S16), 1, S16) 1001 .clampScalar(1, S32, S32) 1002 .lower(); 1003 1004 getActionDefinitionsBuilder(G_FFREXP) 1005 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}}) 1006 .scalarize(0) 1007 .lower(); 1008 } else { 1009 getActionDefinitionsBuilder(G_FSQRT) 1010 .customFor({S32, S64, S16}) 1011 .scalarize(0) 1012 .unsupported(); 1013 1014 1015 if (ST.hasFractBug()) { 1016 getActionDefinitionsBuilder(G_FFLOOR) 1017 .customFor({S64}) 1018 .legalFor({S32, S64}) 1019 .scalarize(0) 1020 .clampScalar(0, S32, S64); 1021 } else { 1022 getActionDefinitionsBuilder(G_FFLOOR) 1023 .legalFor({S32, S64}) 1024 .scalarize(0) 1025 .clampScalar(0, S32, S64); 1026 } 1027 1028 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP}) 1029 .legalFor({{S32, S32}, {S64, S32}}) 1030 .scalarize(0) 1031 .clampScalar(0, S32, S64) 1032 .clampScalar(1, S32, S32) 1033 .lower(); 1034 1035 getActionDefinitionsBuilder(G_FFREXP) 1036 .customFor({{S32, S32}, {S64, S32}}) 1037 .scalarize(0) 1038 .minScalar(0, S32) 1039 .clampScalar(1, S32, S32) 1040 .lower(); 1041 } 1042 1043 auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC); 1044 if (ST.hasCvtPkF16F32Inst()) 1045 FPTruncActions.legalFor( 1046 {{S32, S64}, {S16, S32}, {V2S16, V2S32}, {V2S16, V2S64}}); 1047 else 1048 FPTruncActions.legalFor({{S32, S64}, {S16, S32}}); 1049 FPTruncActions.scalarize(0).lower(); 1050 1051 getActionDefinitionsBuilder(G_FPEXT) 1052 .legalFor({{S64, S32}, {S32, S16}}) 1053 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 1054 .scalarize(0); 1055 1056 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB}); 1057 if (ST.has16BitInsts()) { 1058 FSubActions 1059 // Use actual fsub instruction 1060 .legalFor({S32, S16}) 1061 // Must use fadd + fneg 1062 .lowerFor({S64, V2S16}); 1063 } else { 1064 FSubActions 1065 // Use actual fsub instruction 1066 .legalFor({S32}) 1067 // Must use fadd + fneg 1068 .lowerFor({S64, S16, V2S16}); 1069 } 1070 1071 FSubActions 1072 .scalarize(0) 1073 .clampScalar(0, S32, S64); 1074 1075 // Whether this is legal depends on the floating point mode for the function. 1076 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 1077 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 1078 FMad.customFor({S32, S16}); 1079 else if (ST.hasMadMacF32Insts()) 1080 FMad.customFor({S32}); 1081 else if (ST.hasMadF16()) 1082 FMad.customFor({S16}); 1083 FMad.scalarize(0) 1084 .lower(); 1085 1086 auto &FRem = getActionDefinitionsBuilder(G_FREM); 1087 if (ST.has16BitInsts()) { 1088 FRem.customFor({S16, S32, S64}); 1089 } else { 1090 FRem.minScalar(0, S32) 1091 .customFor({S32, S64}); 1092 } 1093 FRem.scalarize(0); 1094 1095 // TODO: Do we need to clamp maximum bitwidth? 1096 getActionDefinitionsBuilder(G_TRUNC) 1097 .legalIf(isScalar(0)) 1098 .legalFor({{V2S16, V2S32}}) 1099 .clampMaxNumElements(0, S16, 2) 1100 // Avoid scalarizing in cases that should be truly illegal. 
In unresolvable 1101 // situations (like an invalid implicit use), we don't want to infinite loop 1102 // in the legalizer. 1103 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 1104 .alwaysLegal(); 1105 1106 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 1107 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 1108 {S32, S1}, {S64, S1}, {S16, S1}}) 1109 .scalarize(0) 1110 .clampScalar(0, S32, S64) 1111 .widenScalarToNextPow2(1, 32); 1112 1113 // TODO: Split s1->s64 during regbankselect for VALU. 1114 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 1115 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 1116 .lowerIf(typeIs(1, S1)) 1117 .customFor({{S32, S64}, {S64, S64}}); 1118 if (ST.has16BitInsts()) 1119 IToFP.legalFor({{S16, S16}}); 1120 IToFP.clampScalar(1, S32, S64) 1121 .minScalar(0, S32) 1122 .scalarize(0) 1123 .widenScalarToNextPow2(1); 1124 1125 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 1126 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 1127 .customFor({{S64, S32}, {S64, S64}}) 1128 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 1129 if (ST.has16BitInsts()) 1130 FPToI.legalFor({{S16, S16}}); 1131 else 1132 FPToI.minScalar(1, S32); 1133 1134 FPToI.minScalar(0, S32) 1135 .widenScalarToNextPow2(0, 32) 1136 .scalarize(0) 1137 .lower(); 1138 1139 getActionDefinitionsBuilder({G_LROUND, G_LLROUND}) 1140 .clampScalar(0, S16, S64) 1141 .scalarize(0) 1142 .lower(); 1143 1144 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND) 1145 .legalFor({S16, S32}) 1146 .scalarize(0) 1147 .lower(); 1148 1149 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN 1150 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT}) 1151 .scalarize(0) 1152 .lower(); 1153 1154 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT}) 1155 .clampScalar(0, S16, S64) 1156 .scalarize(0) 1157 .lower(); 1158 1159 if (ST.has16BitInsts()) { 1160 getActionDefinitionsBuilder( 1161 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN}) 1162 .legalFor({S16, S32, S64}) 1163 .clampScalar(0, S16, S64) 1164 .scalarize(0); 1165 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 1166 getActionDefinitionsBuilder( 1167 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN}) 1168 .legalFor({S32, S64}) 1169 .clampScalar(0, S32, S64) 1170 .scalarize(0); 1171 } else { 1172 getActionDefinitionsBuilder( 1173 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN}) 1174 .legalFor({S32}) 1175 .customFor({S64}) 1176 .clampScalar(0, S32, S64) 1177 .scalarize(0); 1178 } 1179 1180 getActionDefinitionsBuilder(G_PTR_ADD) 1181 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr}) 1182 .legalIf(all(isPointer(0), sameSize(0, 1))) 1183 .scalarize(0) 1184 .scalarSameSizeAs(1, 0); 1185 1186 getActionDefinitionsBuilder(G_PTRMASK) 1187 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 1188 .scalarSameSizeAs(1, 0) 1189 .scalarize(0); 1190 1191 auto &CmpBuilder = 1192 getActionDefinitionsBuilder(G_ICMP) 1193 // The compare output type differs based on the register bank of the output, 1194 // so make both s1 and s32 legal. 1195 // 1196 // Scalar compares producing output in scc will be promoted to s32, as that 1197 // is the allocatable register type that will be needed for the copy from 1198 // scc. This will be promoted during RegBankSelect, and we assume something 1199 // before that won't try to use s32 result types. 1200 // 1201 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 1202 // bank. 
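      // For example, an s32 == s32 compare may produce either an s1 (VCC) or
      // an s32 (the copy from SCC) result, so both rows are listed below.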
1203 .legalForCartesianProduct( 1204 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 1205 .legalForCartesianProduct( 1206 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 1207 if (ST.has16BitInsts()) { 1208 CmpBuilder.legalFor({{S1, S16}}); 1209 } 1210 1211 CmpBuilder 1212 .widenScalarToNextPow2(1) 1213 .clampScalar(1, S32, S64) 1214 .scalarize(0) 1215 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 1216 1217 auto &FCmpBuilder = 1218 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct( 1219 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase); 1220 1221 if (ST.hasSALUFloatInsts()) 1222 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32}); 1223 1224 FCmpBuilder 1225 .widenScalarToNextPow2(1) 1226 .clampScalar(1, S32, S64) 1227 .scalarize(0); 1228 1229 // FIXME: fpow has a selection pattern that should move to custom lowering. 1230 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW); 1231 if (ST.has16BitInsts()) 1232 ExpOps.customFor({{S32}, {S16}}); 1233 else 1234 ExpOps.customFor({S32}); 1235 ExpOps.clampScalar(0, MinScalarFPTy, S32) 1236 .scalarize(0); 1237 1238 getActionDefinitionsBuilder(G_FPOWI) 1239 .clampScalar(0, MinScalarFPTy, S32) 1240 .lower(); 1241 1242 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2}); 1243 Log2Ops.customFor({S32}); 1244 if (ST.has16BitInsts()) 1245 Log2Ops.legalFor({S16}); 1246 else 1247 Log2Ops.customFor({S16}); 1248 Log2Ops.scalarize(0) 1249 .lower(); 1250 1251 auto &LogOps = 1252 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10}); 1253 LogOps.customFor({S32, S16}); 1254 LogOps.clampScalar(0, MinScalarFPTy, S32) 1255 .scalarize(0); 1256 1257 // The 64-bit versions produce 32-bit results, but only on the SALU. 1258 getActionDefinitionsBuilder(G_CTPOP) 1259 .legalFor({{S32, S32}, {S32, S64}}) 1260 .clampScalar(0, S32, S32) 1261 .widenScalarToNextPow2(1, 32) 1262 .clampScalar(1, S32, S64) 1263 .scalarize(0) 1264 .widenScalarToNextPow2(0, 32); 1265 1266 // If no 16 bit instr is available, lower into different instructions. 1267 if (ST.has16BitInsts()) 1268 getActionDefinitionsBuilder(G_IS_FPCLASS) 1269 .legalForCartesianProduct({S1}, FPTypes16) 1270 .widenScalarToNextPow2(1) 1271 .scalarize(0) 1272 .lower(); 1273 else 1274 getActionDefinitionsBuilder(G_IS_FPCLASS) 1275 .legalForCartesianProduct({S1}, FPTypesBase) 1276 .lowerFor({S1, S16}) 1277 .widenScalarToNextPow2(1) 1278 .scalarize(0) 1279 .lower(); 1280 1281 // The hardware instructions return a different result on 0 than the generic 1282 // instructions expect. The hardware produces -1, but these produce the 1283 // bitwidth. 1284 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 1285 .scalarize(0) 1286 .clampScalar(0, S32, S32) 1287 .clampScalar(1, S32, S64) 1288 .widenScalarToNextPow2(0, 32) 1289 .widenScalarToNextPow2(1, 32) 1290 .custom(); 1291 1292 // The 64-bit versions produce 32-bit results, but only on the SALU. 
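  // For example, G_CTLZ_ZERO_UNDEF of an s64 source stays legal with an s32
  // result; only sub-32-bit sources take the custom path below.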
1293 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF) 1294 .legalFor({{S32, S32}, {S32, S64}}) 1295 .customIf(scalarNarrowerThan(1, 32)) 1296 .clampScalar(0, S32, S32) 1297 .clampScalar(1, S32, S64) 1298 .scalarize(0) 1299 .widenScalarToNextPow2(0, 32) 1300 .widenScalarToNextPow2(1, 32); 1301 1302 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF) 1303 .legalFor({{S32, S32}, {S32, S64}}) 1304 .clampScalar(0, S32, S32) 1305 .clampScalar(1, S32, S64) 1306 .scalarize(0) 1307 .widenScalarToNextPow2(0, 32) 1308 .widenScalarToNextPow2(1, 32); 1309 1310 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1311 // RegBankSelect. 1312 getActionDefinitionsBuilder(G_BITREVERSE) 1313 .legalFor({S32, S64}) 1314 .clampScalar(0, S32, S64) 1315 .scalarize(0) 1316 .widenScalarToNextPow2(0); 1317 1318 if (ST.has16BitInsts()) { 1319 getActionDefinitionsBuilder(G_BSWAP) 1320 .legalFor({S16, S32, V2S16}) 1321 .clampMaxNumElementsStrict(0, S16, 2) 1322 // FIXME: Fixing non-power-of-2 before clamp is workaround for 1323 // narrowScalar limitation. 1324 .widenScalarToNextPow2(0) 1325 .clampScalar(0, S16, S32) 1326 .scalarize(0); 1327 1328 if (ST.hasVOP3PInsts()) { 1329 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1330 .legalFor({S32, S16, V2S16}) 1331 .clampMaxNumElements(0, S16, 2) 1332 .minScalar(0, S16) 1333 .widenScalarToNextPow2(0) 1334 .scalarize(0) 1335 .lower(); 1336 } else { 1337 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1338 .legalFor({S32, S16}) 1339 .widenScalarToNextPow2(0) 1340 .minScalar(0, S16) 1341 .scalarize(0) 1342 .lower(); 1343 } 1344 } else { 1345 // TODO: Should have same legality without v_perm_b32 1346 getActionDefinitionsBuilder(G_BSWAP) 1347 .legalFor({S32}) 1348 .lowerIf(scalarNarrowerThan(0, 32)) 1349 // FIXME: Fixing non-power-of-2 before clamp is workaround for 1350 // narrowScalar limitation. 
1351 .widenScalarToNextPow2(0) 1352 .maxScalar(0, S32) 1353 .scalarize(0) 1354 .lower(); 1355 1356 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1357 .legalFor({S32}) 1358 .minScalar(0, S32) 1359 .widenScalarToNextPow2(0) 1360 .scalarize(0) 1361 .lower(); 1362 } 1363 1364 getActionDefinitionsBuilder(G_INTTOPTR) 1365 // List the common cases 1366 .legalForCartesianProduct(AddrSpaces64, {S64}) 1367 .legalForCartesianProduct(AddrSpaces32, {S32}) 1368 .scalarize(0) 1369 // Accept any address space as long as the size matches 1370 .legalIf(sameSize(0, 1)) 1371 .widenScalarIf(smallerThan(1, 0), 1372 [](const LegalityQuery &Query) { 1373 return std::pair( 1374 1, LLT::scalar(Query.Types[0].getSizeInBits())); 1375 }) 1376 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) { 1377 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1378 }); 1379 1380 getActionDefinitionsBuilder(G_PTRTOINT) 1381 // List the common cases 1382 .legalForCartesianProduct(AddrSpaces64, {S64}) 1383 .legalForCartesianProduct(AddrSpaces32, {S32}) 1384 .scalarize(0) 1385 // Accept any address space as long as the size matches 1386 .legalIf(sameSize(0, 1)) 1387 .widenScalarIf(smallerThan(0, 1), 1388 [](const LegalityQuery &Query) { 1389 return std::pair( 1390 0, LLT::scalar(Query.Types[1].getSizeInBits())); 1391 }) 1392 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) { 1393 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1394 }); 1395 1396 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1397 .scalarize(0) 1398 .custom(); 1399 1400 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1401 bool IsLoad) -> bool { 1402 const LLT DstTy = Query.Types[0]; 1403 1404 // Split vector extloads. 1405 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1406 1407 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1408 return true; 1409 1410 const LLT PtrTy = Query.Types[1]; 1411 unsigned AS = PtrTy.getAddressSpace(); 1412 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad, 1413 Query.MMODescrs[0].Ordering != 1414 AtomicOrdering::NotAtomic)) 1415 return true; 1416 1417 // Catch weird sized loads that don't evenly divide into the access sizes 1418 // TODO: May be able to widen depending on alignment etc. 1419 unsigned NumRegs = (MemSize + 31) / 32; 1420 if (NumRegs == 3) { 1421 if (!ST.hasDwordx3LoadStores()) 1422 return true; 1423 } else { 1424 // If the alignment allows, these should have been widened. 1425 if (!isPowerOf2_32(NumRegs)) 1426 return true; 1427 } 1428 1429 return false; 1430 }; 1431 1432 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1433 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1434 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1435 1436 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1437 // LDS 1438 // TODO: Unsupported flat for SI. 1439 1440 for (unsigned Op : {G_LOAD, G_STORE}) { 1441 const bool IsStore = Op == G_STORE; 1442 1443 auto &Actions = getActionDefinitionsBuilder(Op); 1444 // Explicitly list some common cases. 1445 // TODO: Does this help compile time at all? 
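    // Each entry below is {register type, pointer type, memory type, minimum
    // alignment in bits}; the GlobalAlign* values collapse to 0 (no extra
    // requirement) when unaligned buffer access is enabled.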
1446 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, 1447 {V2S32, GlobalPtr, V2S32, GlobalAlign32}, 1448 {V4S32, GlobalPtr, V4S32, GlobalAlign32}, 1449 {S64, GlobalPtr, S64, GlobalAlign32}, 1450 {V2S64, GlobalPtr, V2S64, GlobalAlign32}, 1451 {V2S16, GlobalPtr, V2S16, GlobalAlign32}, 1452 {S32, GlobalPtr, S8, GlobalAlign8}, 1453 {S32, GlobalPtr, S16, GlobalAlign16}, 1454 1455 {S32, LocalPtr, S32, 32}, 1456 {S64, LocalPtr, S64, 32}, 1457 {V2S32, LocalPtr, V2S32, 32}, 1458 {S32, LocalPtr, S8, 8}, 1459 {S32, LocalPtr, S16, 16}, 1460 {V2S16, LocalPtr, S32, 32}, 1461 1462 {S32, PrivatePtr, S32, 32}, 1463 {S32, PrivatePtr, S8, 8}, 1464 {S32, PrivatePtr, S16, 16}, 1465 {V2S16, PrivatePtr, S32, 32}, 1466 1467 {S32, ConstantPtr, S32, GlobalAlign32}, 1468 {V2S32, ConstantPtr, V2S32, GlobalAlign32}, 1469 {V4S32, ConstantPtr, V4S32, GlobalAlign32}, 1470 {S64, ConstantPtr, S64, GlobalAlign32}, 1471 {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); 1472 Actions.legalIf( 1473 [=](const LegalityQuery &Query) -> bool { 1474 return isLoadStoreLegal(ST, Query); 1475 }); 1476 1477 // The custom pointers (fat pointers, buffer resources) don't work with load 1478 // and store at this level. Fat pointers should have been lowered to 1479 // intrinsics before the translation to MIR. 1480 Actions.unsupportedIf( 1481 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr})); 1482 1483 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and 1484 // ptrtoint. This is needed to account for the fact that we can't have i128 1485 // as a register class for SelectionDAG reasons. 1486 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1487 return hasBufferRsrcWorkaround(Query.Types[0]); 1488 }); 1489 1490 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1491 // 64-bits. 1492 // 1493 // TODO: Should generalize bitcast action into coerce, which will also cover 1494 // inserting addrspacecasts. 1495 Actions.customIf(typeIs(1, Constant32Ptr)); 1496 1497 // Turn any illegal element vectors into something easier to deal 1498 // with. These will ultimately produce 32-bit scalar shifts to extract the 1499 // parts anyway. 1500 // 1501 // For odd 16-bit element vectors, prefer to split those into pieces with 1502 // 16-bit vector parts. 1503 Actions.bitcastIf( 1504 [=](const LegalityQuery &Query) -> bool { 1505 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1506 Query.MMODescrs[0].MemoryTy); 1507 }, bitcastToRegisterType(0)); 1508 1509 if (!IsStore) { 1510 // Widen suitably aligned loads by loading extra bytes. The standard 1511 // legalization actions can't properly express widening memory operands. 1512 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1513 return shouldWidenLoad(ST, Query, G_LOAD); 1514 }); 1515 } 1516 1517 // FIXME: load/store narrowing should be moved to lower action 1518 Actions 1519 .narrowScalarIf( 1520 [=](const LegalityQuery &Query) -> bool { 1521 return !Query.Types[0].isVector() && 1522 needToSplitMemOp(Query, Op == G_LOAD); 1523 }, 1524 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1525 const LLT DstTy = Query.Types[0]; 1526 const LLT PtrTy = Query.Types[1]; 1527 1528 const unsigned DstSize = DstTy.getSizeInBits(); 1529 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1530 1531 // Split extloads. 
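              // e.g. an s64 result backed by a 32-bit memory access is
              // narrowed to s32 here; the extension back to s64 is recreated
              // and legalized separately.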
1532 if (DstSize > MemSize) 1533 return std::pair(0, LLT::scalar(MemSize)); 1534 1535 unsigned MaxSize = maxSizeForAddrSpace( 1536 ST, PtrTy.getAddressSpace(), Op == G_LOAD, 1537 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic); 1538 if (MemSize > MaxSize) 1539 return std::pair(0, LLT::scalar(MaxSize)); 1540 1541 uint64_t Align = Query.MMODescrs[0].AlignInBits; 1542 return std::pair(0, LLT::scalar(Align)); 1543 }) 1544 .fewerElementsIf( 1545 [=](const LegalityQuery &Query) -> bool { 1546 return Query.Types[0].isVector() && 1547 needToSplitMemOp(Query, Op == G_LOAD); 1548 }, 1549 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1550 const LLT DstTy = Query.Types[0]; 1551 const LLT PtrTy = Query.Types[1]; 1552 1553 LLT EltTy = DstTy.getElementType(); 1554 unsigned MaxSize = maxSizeForAddrSpace( 1555 ST, PtrTy.getAddressSpace(), Op == G_LOAD, 1556 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic); 1557 1558 // FIXME: Handle widened to power of 2 results better. This ends 1559 // up scalarizing. 1560 // FIXME: 3 element stores scalarized on SI 1561 1562 // Split if it's too large for the address space. 1563 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1564 if (MemSize > MaxSize) { 1565 unsigned NumElts = DstTy.getNumElements(); 1566 unsigned EltSize = EltTy.getSizeInBits(); 1567 1568 if (MaxSize % EltSize == 0) { 1569 return std::pair( 1570 0, LLT::scalarOrVector( 1571 ElementCount::getFixed(MaxSize / EltSize), EltTy)); 1572 } 1573 1574 unsigned NumPieces = MemSize / MaxSize; 1575 1576 // FIXME: Refine when odd breakdowns handled 1577 // The scalars will need to be re-legalized. 1578 if (NumPieces == 1 || NumPieces >= NumElts || 1579 NumElts % NumPieces != 0) 1580 return std::pair(0, EltTy); 1581 1582 return std::pair(0, 1583 LLT::fixed_vector(NumElts / NumPieces, EltTy)); 1584 } 1585 1586 // FIXME: We could probably handle weird extending loads better. 1587 if (DstTy.getSizeInBits() > MemSize) 1588 return std::pair(0, EltTy); 1589 1590 unsigned EltSize = EltTy.getSizeInBits(); 1591 unsigned DstSize = DstTy.getSizeInBits(); 1592 if (!isPowerOf2_32(DstSize)) { 1593 // We're probably decomposing an odd sized store. Try to split 1594 // to the widest type. TODO: Account for alignment. As-is it 1595 // should be OK, since the new parts will be further legalized. 1596 unsigned FloorSize = llvm::bit_floor(DstSize); 1597 return std::pair( 1598 0, LLT::scalarOrVector( 1599 ElementCount::getFixed(FloorSize / EltSize), EltTy)); 1600 } 1601 1602 // May need relegalization for the scalars. 1603 return std::pair(0, EltTy); 1604 }) 1605 .minScalar(0, S32) 1606 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 1607 .widenScalarToNextPow2(0) 1608 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1609 .lower(); 1610 } 1611 1612 // FIXME: Unaligned accesses not lowered. 
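  // For example, {S32, GlobalPtr, S8, 8} makes an s32 G_SEXTLOAD/G_ZEXTLOAD of
  // a single global byte directly legal; wider results are clamped back to s32
  // below.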
1613 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1614 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, 1615 {S32, GlobalPtr, S16, 2 * 8}, 1616 {S32, LocalPtr, S8, 8}, 1617 {S32, LocalPtr, S16, 16}, 1618 {S32, PrivatePtr, S8, 8}, 1619 {S32, PrivatePtr, S16, 16}, 1620 {S32, ConstantPtr, S8, 8}, 1621 {S32, ConstantPtr, S16, 2 * 8}}) 1622 .legalIf( 1623 [=](const LegalityQuery &Query) -> bool { 1624 return isLoadStoreLegal(ST, Query); 1625 }); 1626 1627 if (ST.hasFlatAddressSpace()) { 1628 ExtLoads.legalForTypesWithMemDesc( 1629 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); 1630 } 1631 1632 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1633 // 64-bits. 1634 // 1635 // TODO: Should generalize bitcast action into coerce, which will also cover 1636 // inserting addrspacecasts. 1637 ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1638 1639 ExtLoads.clampScalar(0, S32, S32) 1640 .widenScalarToNextPow2(0) 1641 .lower(); 1642 1643 auto &Atomics = getActionDefinitionsBuilder( 1644 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1645 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1646 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1647 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP}) 1648 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1649 {S64, GlobalPtr}, {S64, LocalPtr}, 1650 {S32, RegionPtr}, {S64, RegionPtr}}); 1651 if (ST.hasFlatAddressSpace()) { 1652 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1653 } 1654 1655 // TODO: v2bf16 operations, and fat buffer pointer support. 1656 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1657 if (ST.hasLDSFPAtomicAddF32()) { 1658 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1659 if (ST.hasLdsAtomicAddF64()) 1660 Atomic.legalFor({{S64, LocalPtr}}); 1661 if (ST.hasAtomicDsPkAdd16Insts()) 1662 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}}); 1663 } 1664 if (ST.hasAtomicFaddInsts()) 1665 Atomic.legalFor({{S32, GlobalPtr}}); 1666 if (ST.hasFlatAtomicFaddF32Inst()) 1667 Atomic.legalFor({{S32, FlatPtr}}); 1668 1669 if (ST.hasGFX90AInsts()) { 1670 // These are legal with some caveats, and should have undergone expansion in 1671 // the IR in most situations 1672 // TODO: Move atomic expansion into legalizer 1673 Atomic.legalFor({ 1674 {S32, GlobalPtr}, 1675 {S64, GlobalPtr}, 1676 {S64, FlatPtr} 1677 }); 1678 } 1679 1680 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() || 1681 ST.hasAtomicBufferGlobalPkAddF16Insts()) 1682 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}}); 1683 if (ST.hasAtomicGlobalPkAddBF16Inst()) 1684 Atomic.legalFor({{V2BF16, GlobalPtr}}); 1685 if (ST.hasAtomicFlatPkAdd16Insts()) 1686 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}}); 1687 1688 1689 // Most of the legalization work here is done by AtomicExpand. We could 1690 // probably use a simpler legality rule that just assumes anything is OK. 
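  // Baseline: FP min/max atomics on LDS are legal for both f32 and f64; the
  // global, buffer-fat-pointer and flat variants are gated on the subtarget
  // checks below.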
1691 auto &AtomicFMinFMax = 1692 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX}) 1693 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}}); 1694 1695 if (ST.hasAtomicFMinFMaxF32GlobalInsts()) 1696 AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}}); 1697 if (ST.hasAtomicFMinFMaxF64GlobalInsts()) 1698 AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}}); 1699 if (ST.hasAtomicFMinFMaxF32FlatInsts()) 1700 AtomicFMinFMax.legalFor({F32, FlatPtr}); 1701 if (ST.hasAtomicFMinFMaxF64FlatInsts()) 1702 AtomicFMinFMax.legalFor({F64, FlatPtr}); 1703 1704 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1705 // demarshalling 1706 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1707 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1708 {S32, FlatPtr}, {S64, FlatPtr}}) 1709 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1710 {S32, RegionPtr}, {S64, RegionPtr}}); 1711 // TODO: Pointer types, any 32-bit or 64-bit vector 1712 1713 // Condition should be s32 for scalar, s1 for vector. 1714 getActionDefinitionsBuilder(G_SELECT) 1715 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, 1716 LocalPtr, FlatPtr, PrivatePtr, 1717 LLT::fixed_vector(2, LocalPtr), 1718 LLT::fixed_vector(2, PrivatePtr)}, 1719 {S1, S32}) 1720 .clampScalar(0, S16, S64) 1721 .scalarize(1) 1722 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1723 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1724 .clampMaxNumElements(0, S32, 2) 1725 .clampMaxNumElements(0, LocalPtr, 2) 1726 .clampMaxNumElements(0, PrivatePtr, 2) 1727 .scalarize(0) 1728 .widenScalarToNextPow2(0) 1729 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1730 1731 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1732 // be more flexible with the shift amount type. 1733 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1734 .legalFor({{S32, S32}, {S64, S32}}); 1735 if (ST.has16BitInsts()) { 1736 if (ST.hasVOP3PInsts()) { 1737 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1738 .clampMaxNumElements(0, S16, 2); 1739 } else 1740 Shifts.legalFor({{S16, S16}}); 1741 1742 // TODO: Support 16-bit shift amounts for all types 1743 Shifts.widenScalarIf( 1744 [=](const LegalityQuery &Query) { 1745 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1746 // 32-bit amount. 1747 const LLT ValTy = Query.Types[0]; 1748 const LLT AmountTy = Query.Types[1]; 1749 return ValTy.getSizeInBits() <= 16 && 1750 AmountTy.getSizeInBits() < 16; 1751 }, changeTo(1, S16)); 1752 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1753 Shifts.clampScalar(1, S32, S32); 1754 Shifts.widenScalarToNextPow2(0, 16); 1755 Shifts.clampScalar(0, S16, S64); 1756 1757 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1758 .minScalar(0, S16) 1759 .scalarize(0) 1760 .lower(); 1761 } else { 1762 // Make sure we legalize the shift amount type first, as the general 1763 // expansion for the shifted type will produce much worse code if it hasn't 1764 // been truncated already. 1765 Shifts.clampScalar(1, S32, S32); 1766 Shifts.widenScalarToNextPow2(0, 32); 1767 Shifts.clampScalar(0, S32, S64); 1768 1769 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1770 .minScalar(0, S32) 1771 .scalarize(0) 1772 .lower(); 1773 } 1774 Shifts.scalarize(0); 1775 1776 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1777 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1778 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 
0 : 1; 1779 unsigned IdxTypeIdx = 2; 1780 1781 getActionDefinitionsBuilder(Op) 1782 .customIf([=](const LegalityQuery &Query) { 1783 const LLT EltTy = Query.Types[EltTypeIdx]; 1784 const LLT VecTy = Query.Types[VecTypeIdx]; 1785 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1786 const unsigned EltSize = EltTy.getSizeInBits(); 1787 const bool isLegalVecType = 1788 !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits()); 1789 // Address space 8 pointers are 128-bit wide values, but the logic 1790 // below will try to bitcast them to 2N x s64, which will fail. 1791 // Therefore, as an intermediate step, wrap extracts/insertions from a 1792 // ptrtoint-ing the vector and scalar arguments (or inttoptring the 1793 // extraction result) in order to produce a vector operation that can 1794 // be handled by the logic below. 1795 if (EltTy.isPointer() && EltSize > 64) 1796 return true; 1797 return (EltSize == 32 || EltSize == 64) && 1798 VecTy.getSizeInBits() % 32 == 0 && 1799 VecTy.getSizeInBits() <= MaxRegisterSize && 1800 IdxTy.getSizeInBits() == 32 && 1801 isLegalVecType; 1802 }) 1803 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1804 bitcastToVectorElement32(VecTypeIdx)) 1805 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1806 .bitcastIf( 1807 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1808 [=](const LegalityQuery &Query) { 1809 // For > 64-bit element types, try to turn this into a 64-bit 1810 // element vector since we may be able to do better indexing 1811 // if this is scalar. If not, fall back to 32. 1812 const LLT EltTy = Query.Types[EltTypeIdx]; 1813 const LLT VecTy = Query.Types[VecTypeIdx]; 1814 const unsigned DstEltSize = EltTy.getSizeInBits(); 1815 const unsigned VecSize = VecTy.getSizeInBits(); 1816 1817 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1818 return std::pair( 1819 VecTypeIdx, 1820 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize)); 1821 }) 1822 .clampScalar(EltTypeIdx, S32, S64) 1823 .clampScalar(VecTypeIdx, S32, S64) 1824 .clampScalar(IdxTypeIdx, S32, S32) 1825 .clampMaxNumElements(VecTypeIdx, S32, 32) 1826 // TODO: Clamp elements for 64-bit vectors? 1827 .moreElementsIf( 1828 isIllegalRegisterType(VecTypeIdx), 1829 moreElementsToNextExistingRegClass(VecTypeIdx)) 1830 // It should only be necessary with variable indexes. 1831 // As a last resort, lower to the stack 1832 .lower(); 1833 } 1834 1835 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1836 .unsupportedIf([=](const LegalityQuery &Query) { 1837 const LLT &EltTy = Query.Types[1].getElementType(); 1838 return Query.Types[0] != EltTy; 1839 }); 1840 1841 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1842 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1843 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1844 1845 // FIXME: Doesn't handle extract of illegal sizes. 1846 getActionDefinitionsBuilder(Op) 1847 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1848 .lowerIf([=](const LegalityQuery &Query) { 1849 // Sub-vector(or single element) insert and extract. 1850 // TODO: verify immediate offset here since lower only works with 1851 // whole elements. 1852 const LLT BigTy = Query.Types[BigTyIdx]; 1853 return BigTy.isVector(); 1854 }) 1855 // FIXME: Multiples of 16 should not be legal. 
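        // For example, the rule below keeps a G_EXTRACT of s16 from s64 as-is,
        // since the big type is a multiple of 32 bits and the small type a
        // multiple of 16 bits; narrower scalars are widened to 16 bits first.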
1856 .legalIf([=](const LegalityQuery &Query) { 1857 const LLT BigTy = Query.Types[BigTyIdx]; 1858 const LLT LitTy = Query.Types[LitTyIdx]; 1859 return (BigTy.getSizeInBits() % 32 == 0) && 1860 (LitTy.getSizeInBits() % 16 == 0); 1861 }) 1862 .widenScalarIf( 1863 [=](const LegalityQuery &Query) { 1864 const LLT BigTy = Query.Types[BigTyIdx]; 1865 return (BigTy.getScalarSizeInBits() < 16); 1866 }, 1867 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1868 .widenScalarIf( 1869 [=](const LegalityQuery &Query) { 1870 const LLT LitTy = Query.Types[LitTyIdx]; 1871 return (LitTy.getScalarSizeInBits() < 16); 1872 }, 1873 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1874 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1875 .widenScalarToNextPow2(BigTyIdx, 32); 1876 1877 } 1878 1879 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1880 .legalForCartesianProduct(AllS32Vectors, {S32}) 1881 .legalForCartesianProduct(AllS64Vectors, {S64}) 1882 .clampNumElements(0, V16S32, V32S32) 1883 .clampNumElements(0, V2S64, V16S64) 1884 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)) 1885 .moreElementsIf( 1886 isIllegalRegisterType(0), 1887 moreElementsToNextExistingRegClass(0)); 1888 1889 if (ST.hasScalarPackInsts()) { 1890 BuildVector 1891 // FIXME: Should probably widen s1 vectors straight to s32 1892 .minScalarOrElt(0, S16) 1893 .minScalar(1, S16); 1894 1895 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1896 .legalFor({V2S16, S32}) 1897 .lower(); 1898 } else { 1899 BuildVector.customFor({V2S16, S16}); 1900 BuildVector.minScalarOrElt(0, S32); 1901 1902 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1903 .customFor({V2S16, S32}) 1904 .lower(); 1905 } 1906 1907 BuildVector.legalIf(isRegisterType(0)); 1908 1909 // FIXME: Clamp maximum size 1910 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1911 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1912 .clampMaxNumElements(0, S32, 32) 1913 .clampMaxNumElements(1, S16, 2) // TODO: Make 4? 1914 .clampMaxNumElements(0, S16, 64); 1915 1916 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1917 1918 // Merge/Unmerge 1919 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1920 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1921 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1922 1923 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1924 const LLT Ty = Query.Types[TypeIdx]; 1925 if (Ty.isVector()) { 1926 const LLT &EltTy = Ty.getElementType(); 1927 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1928 return true; 1929 if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits())) 1930 return true; 1931 } 1932 return false; 1933 }; 1934 1935 auto &Builder = getActionDefinitionsBuilder(Op) 1936 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1937 .lowerFor({{S16, V2S16}}) 1938 .lowerIf([=](const LegalityQuery &Query) { 1939 const LLT BigTy = Query.Types[BigTyIdx]; 1940 return BigTy.getSizeInBits() == 32; 1941 }) 1942 // Try to widen to s16 first for small types. 1943 // TODO: Only do this on targets with legal s16 shifts 1944 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1945 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1946 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1947 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1948 elementTypeIs(1, S16)), 1949 changeTo(1, V2S16)) 1950 // Clamp the little scalar to s8-s256 and make it a power of 2. 
It's not 1951 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1952 // valid. 1953 .clampScalar(LitTyIdx, S32, S512) 1954 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1955 // Break up vectors with weird elements into scalars 1956 .fewerElementsIf( 1957 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1958 scalarize(0)) 1959 .fewerElementsIf( 1960 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1961 scalarize(1)) 1962 .clampScalar(BigTyIdx, S32, MaxScalar); 1963 1964 if (Op == G_MERGE_VALUES) { 1965 Builder.widenScalarIf( 1966 // TODO: Use 16-bit shifts if legal for 8-bit values? 1967 [=](const LegalityQuery &Query) { 1968 const LLT Ty = Query.Types[LitTyIdx]; 1969 return Ty.getSizeInBits() < 32; 1970 }, 1971 changeTo(LitTyIdx, S32)); 1972 } 1973 1974 Builder.widenScalarIf( 1975 [=](const LegalityQuery &Query) { 1976 const LLT Ty = Query.Types[BigTyIdx]; 1977 return Ty.getSizeInBits() % 16 != 0; 1978 }, 1979 [=](const LegalityQuery &Query) { 1980 // Pick the next power of 2, or a multiple of 64 over 128. 1981 // Whichever is smaller. 1982 const LLT &Ty = Query.Types[BigTyIdx]; 1983 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1984 if (NewSizeInBits >= 256) { 1985 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1986 if (RoundedTo < NewSizeInBits) 1987 NewSizeInBits = RoundedTo; 1988 } 1989 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1990 }) 1991 // Any vectors left are the wrong size. Scalarize them. 1992 .scalarize(0) 1993 .scalarize(1); 1994 } 1995 1996 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1997 // RegBankSelect. 1998 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1999 .legalFor({{S32}, {S64}}); 2000 2001 if (ST.hasVOP3PInsts()) { 2002 SextInReg.lowerFor({{V2S16}}) 2003 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 2004 // get more vector shift opportunities, since we'll get those when 2005 // expanded. 2006 .clampMaxNumElementsStrict(0, S16, 2); 2007 } else if (ST.has16BitInsts()) { 2008 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 2009 } else { 2010 // Prefer to promote to s32 before lowering if we don't have 16-bit 2011 // shifts. This avoid a lot of intermediate truncate and extend operations. 2012 SextInReg.lowerFor({{S32}, {S64}}); 2013 } 2014 2015 SextInReg 2016 .scalarize(0) 2017 .clampScalar(0, S32, S64) 2018 .lower(); 2019 2020 getActionDefinitionsBuilder({G_ROTR, G_ROTL}) 2021 .scalarize(0) 2022 .lower(); 2023 2024 // TODO: Only Try to form v2s16 with legal packed instructions. 
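  // For s32, G_FSHR returns the 32 contiguous bits starting at bit (amt % 32)
  // of the 64-bit concatenation hi:lo, which is what the hardware's 32-bit
  // alignbit operation computes, hence the single legal {S32, S32} form below;
  // v2s16 and other widths are scalarized and expanded.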
2025 getActionDefinitionsBuilder(G_FSHR) 2026 .legalFor({{S32, S32}}) 2027 .lowerFor({{V2S16, V2S16}}) 2028 .clampMaxNumElementsStrict(0, S16, 2) 2029 .scalarize(0) 2030 .lower(); 2031 2032 if (ST.hasVOP3PInsts()) { 2033 getActionDefinitionsBuilder(G_FSHL) 2034 .lowerFor({{V2S16, V2S16}}) 2035 .clampMaxNumElementsStrict(0, S16, 2) 2036 .scalarize(0) 2037 .lower(); 2038 } else { 2039 getActionDefinitionsBuilder(G_FSHL) 2040 .scalarize(0) 2041 .lower(); 2042 } 2043 2044 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 2045 .legalFor({S64}); 2046 2047 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64}); 2048 2049 getActionDefinitionsBuilder(G_FENCE) 2050 .alwaysLegal(); 2051 2052 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 2053 .scalarize(0) 2054 .minScalar(0, S32) 2055 .lower(); 2056 2057 getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 2058 .legalFor({{S32, S32}, {S64, S32}}) 2059 .clampScalar(1, S32, S32) 2060 .clampScalar(0, S32, S64) 2061 .widenScalarToNextPow2(0) 2062 .scalarize(0); 2063 2064 getActionDefinitionsBuilder( 2065 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops 2066 G_FCOPYSIGN, 2067 2068 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB, 2069 G_READ_REGISTER, G_WRITE_REGISTER, 2070 2071 G_SADDO, G_SSUBO}) 2072 .lower(); 2073 2074 if (ST.hasIEEEMinMax()) { 2075 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}) 2076 .legalFor(FPTypesPK16) 2077 .clampMaxNumElements(0, S16, 2) 2078 .scalarize(0); 2079 } else { 2080 // TODO: Implement 2081 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); 2082 } 2083 2084 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) 2085 .lower(); 2086 2087 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom(); 2088 2089 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 2090 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 2091 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 2092 .unsupported(); 2093 2094 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal(); 2095 2096 getLegacyLegalizerInfo().computeTables(); 2097 verify(*ST.getInstrInfo()); 2098 } 2099 2100 bool AMDGPULegalizerInfo::legalizeCustom( 2101 LegalizerHelper &Helper, MachineInstr &MI, 2102 LostDebugLocObserver &LocObserver) const { 2103 MachineIRBuilder &B = Helper.MIRBuilder; 2104 MachineRegisterInfo &MRI = *B.getMRI(); 2105 2106 switch (MI.getOpcode()) { 2107 case TargetOpcode::G_ADDRSPACE_CAST: 2108 return legalizeAddrSpaceCast(MI, MRI, B); 2109 case TargetOpcode::G_INTRINSIC_ROUNDEVEN: 2110 return legalizeFroundeven(MI, MRI, B); 2111 case TargetOpcode::G_FCEIL: 2112 return legalizeFceil(MI, MRI, B); 2113 case TargetOpcode::G_FREM: 2114 return legalizeFrem(MI, MRI, B); 2115 case TargetOpcode::G_INTRINSIC_TRUNC: 2116 return legalizeIntrinsicTrunc(MI, MRI, B); 2117 case TargetOpcode::G_SITOFP: 2118 return legalizeITOFP(MI, MRI, B, true); 2119 case TargetOpcode::G_UITOFP: 2120 return legalizeITOFP(MI, MRI, B, false); 2121 case TargetOpcode::G_FPTOSI: 2122 return legalizeFPTOI(MI, MRI, B, true); 2123 case TargetOpcode::G_FPTOUI: 2124 return legalizeFPTOI(MI, MRI, B, false); 2125 case TargetOpcode::G_FMINNUM: 2126 case TargetOpcode::G_FMAXNUM: 2127 case TargetOpcode::G_FMINNUM_IEEE: 2128 case TargetOpcode::G_FMAXNUM_IEEE: 2129 return legalizeMinNumMaxNum(Helper, MI); 2130 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 2131 return legalizeExtractVectorElt(MI, MRI, B); 2132 case TargetOpcode::G_INSERT_VECTOR_ELT: 2133 return legalizeInsertVectorElt(MI, MRI, B); 2134 case TargetOpcode::G_FSIN: 2135 case 
TargetOpcode::G_FCOS: 2136 return legalizeSinCos(MI, MRI, B); 2137 case TargetOpcode::G_GLOBAL_VALUE: 2138 return legalizeGlobalValue(MI, MRI, B); 2139 case TargetOpcode::G_LOAD: 2140 case TargetOpcode::G_SEXTLOAD: 2141 case TargetOpcode::G_ZEXTLOAD: 2142 return legalizeLoad(Helper, MI); 2143 case TargetOpcode::G_STORE: 2144 return legalizeStore(Helper, MI); 2145 case TargetOpcode::G_FMAD: 2146 return legalizeFMad(MI, MRI, B); 2147 case TargetOpcode::G_FDIV: 2148 return legalizeFDIV(MI, MRI, B); 2149 case TargetOpcode::G_FFREXP: 2150 return legalizeFFREXP(MI, MRI, B); 2151 case TargetOpcode::G_FSQRT: 2152 return legalizeFSQRT(MI, MRI, B); 2153 case TargetOpcode::G_UDIV: 2154 case TargetOpcode::G_UREM: 2155 case TargetOpcode::G_UDIVREM: 2156 return legalizeUnsignedDIV_REM(MI, MRI, B); 2157 case TargetOpcode::G_SDIV: 2158 case TargetOpcode::G_SREM: 2159 case TargetOpcode::G_SDIVREM: 2160 return legalizeSignedDIV_REM(MI, MRI, B); 2161 case TargetOpcode::G_ATOMIC_CMPXCHG: 2162 return legalizeAtomicCmpXChg(MI, MRI, B); 2163 case TargetOpcode::G_FLOG2: 2164 return legalizeFlog2(MI, B); 2165 case TargetOpcode::G_FLOG: 2166 case TargetOpcode::G_FLOG10: 2167 return legalizeFlogCommon(MI, B); 2168 case TargetOpcode::G_FEXP2: 2169 return legalizeFExp2(MI, B); 2170 case TargetOpcode::G_FEXP: 2171 case TargetOpcode::G_FEXP10: 2172 return legalizeFExp(MI, B); 2173 case TargetOpcode::G_FPOW: 2174 return legalizeFPow(MI, B); 2175 case TargetOpcode::G_FFLOOR: 2176 return legalizeFFloor(MI, MRI, B); 2177 case TargetOpcode::G_BUILD_VECTOR: 2178 case TargetOpcode::G_BUILD_VECTOR_TRUNC: 2179 return legalizeBuildVector(MI, MRI, B); 2180 case TargetOpcode::G_MUL: 2181 return legalizeMul(Helper, MI); 2182 case TargetOpcode::G_CTLZ: 2183 case TargetOpcode::G_CTTZ: 2184 return legalizeCTLZ_CTTZ(MI, MRI, B); 2185 case TargetOpcode::G_CTLZ_ZERO_UNDEF: 2186 return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B); 2187 case TargetOpcode::G_STACKSAVE: 2188 return legalizeStackSave(MI, B); 2189 case TargetOpcode::G_GET_FPENV: 2190 return legalizeGetFPEnv(MI, MRI, B); 2191 case TargetOpcode::G_SET_FPENV: 2192 return legalizeSetFPEnv(MI, MRI, B); 2193 case TargetOpcode::G_TRAP: 2194 return legalizeTrap(MI, MRI, B); 2195 case TargetOpcode::G_DEBUGTRAP: 2196 return legalizeDebugTrap(MI, MRI, B); 2197 default: 2198 return false; 2199 } 2200 2201 llvm_unreachable("expected switch to return"); 2202 } 2203 2204 Register AMDGPULegalizerInfo::getSegmentAperture( 2205 unsigned AS, 2206 MachineRegisterInfo &MRI, 2207 MachineIRBuilder &B) const { 2208 MachineFunction &MF = B.getMF(); 2209 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 2210 const LLT S32 = LLT::scalar(32); 2211 const LLT S64 = LLT::scalar(64); 2212 2213 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 2214 2215 if (ST.hasApertureRegs()) { 2216 // Note: this register is somewhat broken. When used as a 32-bit operand, 2217 // it only returns zeroes. The real value is in the upper 32 bits. 2218 // Thus, we must emit extract the high 32 bits. 2219 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS) 2220 ? AMDGPU::SRC_SHARED_BASE 2221 : AMDGPU::SRC_PRIVATE_BASE; 2222 // FIXME: It would be more natural to emit a COPY here, but then copy 2223 // coalescing would kick in and it would think it's okay to use the "HI" 2224 // subregister (instead of extracting the HI 32 bits) which is an artificial 2225 // (unusable) register. 
2226 // Register TableGen definitions would need an overhaul to get rid of the 2227 // artificial "HI" aperture registers and prevent this kind of issue from 2228 // happening. 2229 Register Dst = MRI.createGenericVirtualRegister(S64); 2230 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass); 2231 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)}); 2232 return B.buildUnmerge(S32, Dst).getReg(1); 2233 } 2234 2235 // TODO: can we be smarter about machine pointer info? 2236 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 2237 Register LoadAddr = MRI.createGenericVirtualRegister( 2238 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 2239 // For code object version 5, private_base and shared_base are passed through 2240 // implicit kernargs. 2241 if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >= 2242 AMDGPU::AMDHSA_COV5) { 2243 AMDGPUTargetLowering::ImplicitParameter Param = 2244 AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE 2245 : AMDGPUTargetLowering::PRIVATE_BASE; 2246 uint64_t Offset = 2247 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 2248 2249 Register KernargPtrReg = MRI.createGenericVirtualRegister( 2250 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 2251 2252 if (!loadInputValue(KernargPtrReg, B, 2253 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 2254 return Register(); 2255 2256 MachineMemOperand *MMO = MF.getMachineMemOperand( 2257 PtrInfo, 2258 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2259 MachineMemOperand::MOInvariant, 2260 LLT::scalar(32), commonAlignment(Align(64), Offset)); 2261 2262 // Pointer address 2263 B.buildPtrAdd(LoadAddr, KernargPtrReg, 2264 B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 2265 // Load address 2266 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 2267 } 2268 2269 Register QueuePtr = MRI.createGenericVirtualRegister( 2270 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 2271 2272 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 2273 return Register(); 2274 2275 // Offset into amd_queue_t for group_segment_aperture_base_hi / 2276 // private_segment_aperture_base_hi. 2277 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 2278 2279 MachineMemOperand *MMO = MF.getMachineMemOperand( 2280 PtrInfo, 2281 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2282 MachineMemOperand::MOInvariant, 2283 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 2284 2285 B.buildPtrAdd(LoadAddr, QueuePtr, 2286 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); 2287 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 2288 } 2289 2290 /// Return true if the value is a known valid address, such that a null check is 2291 /// not necessary. 
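/// Frame indexes, global values and block addresses are never null; a generic
/// constant is known non-null when it differs from the target's null value for
/// \p AddrSpace. Everything else conservatively returns false.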
2292 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, 2293 const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 2294 MachineInstr *Def = MRI.getVRegDef(Val); 2295 switch (Def->getOpcode()) { 2296 case AMDGPU::G_FRAME_INDEX: 2297 case AMDGPU::G_GLOBAL_VALUE: 2298 case AMDGPU::G_BLOCK_ADDR: 2299 return true; 2300 case AMDGPU::G_CONSTANT: { 2301 const ConstantInt *CI = Def->getOperand(1).getCImm(); 2302 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); 2303 } 2304 default: 2305 return false; 2306 } 2307 2308 return false; 2309 } 2310 2311 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 2312 MachineInstr &MI, MachineRegisterInfo &MRI, 2313 MachineIRBuilder &B) const { 2314 MachineFunction &MF = B.getMF(); 2315 2316 // MI can either be a G_ADDRSPACE_CAST or a 2317 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull 2318 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST || 2319 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() == 2320 Intrinsic::amdgcn_addrspacecast_nonnull)); 2321 2322 const LLT S32 = LLT::scalar(32); 2323 Register Dst = MI.getOperand(0).getReg(); 2324 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg() 2325 : MI.getOperand(1).getReg(); 2326 LLT DstTy = MRI.getType(Dst); 2327 LLT SrcTy = MRI.getType(Src); 2328 unsigned DestAS = DstTy.getAddressSpace(); 2329 unsigned SrcAS = SrcTy.getAddressSpace(); 2330 2331 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 2332 // vector element. 2333 assert(!DstTy.isVector()); 2334 2335 const AMDGPUTargetMachine &TM 2336 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 2337 2338 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 2339 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 2340 return true; 2341 } 2342 2343 if (SrcAS == AMDGPUAS::FLAT_ADDRESS && 2344 (DestAS == AMDGPUAS::LOCAL_ADDRESS || 2345 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) { 2346 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for 2347 // G_ADDRSPACE_CAST we need to guess. 2348 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) { 2349 // Extract low 32-bits of the pointer. 2350 B.buildExtract(Dst, Src, 0); 2351 MI.eraseFromParent(); 2352 return true; 2353 } 2354 2355 unsigned NullVal = TM.getNullPointerValue(DestAS); 2356 2357 auto SegmentNull = B.buildConstant(DstTy, NullVal); 2358 auto FlatNull = B.buildConstant(SrcTy, 0); 2359 2360 // Extract low 32-bits of the pointer. 2361 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 2362 2363 auto CmpRes = 2364 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 2365 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 2366 2367 MI.eraseFromParent(); 2368 return true; 2369 } 2370 2371 if (DestAS == AMDGPUAS::FLAT_ADDRESS && 2372 (SrcAS == AMDGPUAS::LOCAL_ADDRESS || 2373 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) { 2374 auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register { 2375 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 2376 if (!ApertureReg.isValid()) 2377 return false; 2378 2379 // Coerce the type of the low half of the result so we can use 2380 // merge_values. 2381 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 2382 2383 // TODO: Should we allow mismatched types but matching sizes in merges to 2384 // avoid the ptrtoint? 2385 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0); 2386 }; 2387 2388 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for 2389 // G_ADDRSPACE_CAST we need to guess. 
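    // When the source may be null, the cast below is expanded to a guarded
    // select, conceptually:
    //   dst = (src != segment_null) ? {lo: src, hi: aperture} : flat_null
    // where the aperture register provides the high 32 bits of the flat
    // address.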
2390 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) { 2391 castLocalOrPrivateToFlat(Dst); 2392 MI.eraseFromParent(); 2393 return true; 2394 } 2395 2396 Register BuildPtr = castLocalOrPrivateToFlat(DstTy); 2397 2398 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 2399 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 2400 2401 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, 2402 SegmentNull.getReg(0)); 2403 2404 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 2405 2406 MI.eraseFromParent(); 2407 return true; 2408 } 2409 2410 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 2411 SrcTy.getSizeInBits() == 64) { 2412 // Truncate. 2413 B.buildExtract(Dst, Src, 0); 2414 MI.eraseFromParent(); 2415 return true; 2416 } 2417 2418 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 2419 DstTy.getSizeInBits() == 64) { 2420 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 2421 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 2422 auto PtrLo = B.buildPtrToInt(S32, Src); 2423 auto HighAddr = B.buildConstant(S32, AddrHiVal); 2424 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr}); 2425 MI.eraseFromParent(); 2426 return true; 2427 } 2428 2429 DiagnosticInfoUnsupported InvalidAddrSpaceCast( 2430 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc()); 2431 2432 LLVMContext &Ctx = MF.getFunction().getContext(); 2433 Ctx.diagnose(InvalidAddrSpaceCast); 2434 B.buildUndef(Dst); 2435 MI.eraseFromParent(); 2436 return true; 2437 } 2438 2439 bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI, 2440 MachineRegisterInfo &MRI, 2441 MachineIRBuilder &B) const { 2442 Register Src = MI.getOperand(1).getReg(); 2443 LLT Ty = MRI.getType(Src); 2444 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 2445 2446 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 2447 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 2448 2449 auto C1 = B.buildFConstant(Ty, C1Val); 2450 auto CopySign = B.buildFCopysign(Ty, C1, Src); 2451 2452 // TODO: Should this propagate fast-math-flags? 2453 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 2454 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 2455 2456 auto C2 = B.buildFConstant(Ty, C2Val); 2457 auto Fabs = B.buildFAbs(Ty, Src); 2458 2459 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 2460 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 2461 MI.eraseFromParent(); 2462 return true; 2463 } 2464 2465 bool AMDGPULegalizerInfo::legalizeFceil( 2466 MachineInstr &MI, MachineRegisterInfo &MRI, 2467 MachineIRBuilder &B) const { 2468 2469 const LLT S1 = LLT::scalar(1); 2470 const LLT S64 = LLT::scalar(64); 2471 2472 Register Src = MI.getOperand(1).getReg(); 2473 assert(MRI.getType(Src) == S64); 2474 2475 // result = trunc(src) 2476 // if (src > 0.0 && src != result) 2477 // result += 1.0 2478 2479 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 2480 2481 const auto Zero = B.buildFConstant(S64, 0.0); 2482 const auto One = B.buildFConstant(S64, 1.0); 2483 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 2484 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 2485 auto And = B.buildAnd(S1, Lt0, NeTrunc); 2486 auto Add = B.buildSelect(S64, And, One, Zero); 2487 2488 // TODO: Should this propagate fast-math-flags? 
2489 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 2490 MI.eraseFromParent(); 2491 return true; 2492 } 2493 2494 bool AMDGPULegalizerInfo::legalizeFrem( 2495 MachineInstr &MI, MachineRegisterInfo &MRI, 2496 MachineIRBuilder &B) const { 2497 Register DstReg = MI.getOperand(0).getReg(); 2498 Register Src0Reg = MI.getOperand(1).getReg(); 2499 Register Src1Reg = MI.getOperand(2).getReg(); 2500 auto Flags = MI.getFlags(); 2501 LLT Ty = MRI.getType(DstReg); 2502 2503 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 2504 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 2505 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 2506 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 2507 MI.eraseFromParent(); 2508 return true; 2509 } 2510 2511 static MachineInstrBuilder extractF64Exponent(Register Hi, 2512 MachineIRBuilder &B) { 2513 const unsigned FractBits = 52; 2514 const unsigned ExpBits = 11; 2515 LLT S32 = LLT::scalar(32); 2516 2517 auto Const0 = B.buildConstant(S32, FractBits - 32); 2518 auto Const1 = B.buildConstant(S32, ExpBits); 2519 2520 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}) 2521 .addUse(Hi) 2522 .addUse(Const0.getReg(0)) 2523 .addUse(Const1.getReg(0)); 2524 2525 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 2526 } 2527 2528 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2529 MachineInstr &MI, MachineRegisterInfo &MRI, 2530 MachineIRBuilder &B) const { 2531 const LLT S1 = LLT::scalar(1); 2532 const LLT S32 = LLT::scalar(32); 2533 const LLT S64 = LLT::scalar(64); 2534 2535 Register Src = MI.getOperand(1).getReg(); 2536 assert(MRI.getType(Src) == S64); 2537 2538 // TODO: Should this use extract since the low half is unused? 2539 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2540 Register Hi = Unmerge.getReg(1); 2541 2542 // Extract the upper half, since this is where we will find the sign and 2543 // exponent. 2544 auto Exp = extractF64Exponent(Hi, B); 2545 2546 const unsigned FractBits = 52; 2547 2548 // Extract the sign bit. 2549 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2550 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2551 2552 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2553 2554 const auto Zero32 = B.buildConstant(S32, 0); 2555 2556 // Extend back to 64-bits. 2557 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit}); 2558 2559 auto Shr = B.buildAShr(S64, FractMask, Exp); 2560 auto Not = B.buildNot(S64, Shr); 2561 auto Tmp0 = B.buildAnd(S64, Src, Not); 2562 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2563 2564 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2565 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2566 2567 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2568 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2569 MI.eraseFromParent(); 2570 return true; 2571 } 2572 2573 bool AMDGPULegalizerInfo::legalizeITOFP( 2574 MachineInstr &MI, MachineRegisterInfo &MRI, 2575 MachineIRBuilder &B, bool Signed) const { 2576 2577 Register Dst = MI.getOperand(0).getReg(); 2578 Register Src = MI.getOperand(1).getReg(); 2579 2580 const LLT S64 = LLT::scalar(64); 2581 const LLT S32 = LLT::scalar(32); 2582 2583 assert(MRI.getType(Src) == S64); 2584 2585 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2586 auto ThirtyTwo = B.buildConstant(S32, 32); 2587 2588 if (MRI.getType(Dst) == S64) { 2589 auto CvtHi = Signed ? 
B.buildSITOFP(S64, Unmerge.getReg(1)) 2590 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2591 2592 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2593 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo); 2594 2595 // TODO: Should this propagate fast-math-flags? 2596 B.buildFAdd(Dst, LdExp, CvtLo); 2597 MI.eraseFromParent(); 2598 return true; 2599 } 2600 2601 assert(MRI.getType(Dst) == S32); 2602 2603 auto One = B.buildConstant(S32, 1); 2604 2605 MachineInstrBuilder ShAmt; 2606 if (Signed) { 2607 auto ThirtyOne = B.buildConstant(S32, 31); 2608 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2609 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2610 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2611 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}) 2612 .addUse(Unmerge.getReg(1)); 2613 auto LS2 = B.buildSub(S32, LS, One); 2614 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2615 } else 2616 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2617 auto Norm = B.buildShl(S64, Src, ShAmt); 2618 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2619 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2620 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2621 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2622 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2623 B.buildFLdexp(Dst, FVal, Scale); 2624 MI.eraseFromParent(); 2625 return true; 2626 } 2627 2628 // TODO: Copied from DAG implementation. Verify logic and document how this 2629 // actually works. 2630 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2631 MachineRegisterInfo &MRI, 2632 MachineIRBuilder &B, 2633 bool Signed) const { 2634 2635 Register Dst = MI.getOperand(0).getReg(); 2636 Register Src = MI.getOperand(1).getReg(); 2637 2638 const LLT S64 = LLT::scalar(64); 2639 const LLT S32 = LLT::scalar(32); 2640 2641 const LLT SrcLT = MRI.getType(Src); 2642 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2643 2644 unsigned Flags = MI.getFlags(); 2645 2646 // The basic idea of converting a floating point number into a pair of 32-bit 2647 // integers is illustrated as follows: 2648 // 2649 // tf := trunc(val); 2650 // hif := floor(tf * 2^-32); 2651 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2652 // hi := fptoi(hif); 2653 // lo := fptoi(lof); 2654 // 2655 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2656 MachineInstrBuilder Sign; 2657 if (Signed && SrcLT == S32) { 2658 // However, a 32-bit floating point number has only 23 bits mantissa and 2659 // it's not enough to hold all the significant bits of `lof` if val is 2660 // negative. To avoid the loss of precision, We need to take the absolute 2661 // value after truncating and flip the result back based on the original 2662 // signedness. 
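    // For example, with val = -3.5f: trunc is -3.0, its absolute value 3.0
    // converts to {lo = 3, hi = 0}, and sign is all ones, so the final
    // ({lo, hi} ^ sign) - sign step below restores the expected -3.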
2663 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2664 Trunc = B.buildFAbs(S32, Trunc, Flags); 2665 } 2666 MachineInstrBuilder K0, K1; 2667 if (SrcLT == S64) { 2668 K0 = B.buildFConstant( 2669 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2670 K1 = B.buildFConstant( 2671 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2672 } else { 2673 K0 = B.buildFConstant( 2674 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000))); 2675 K1 = B.buildFConstant( 2676 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000))); 2677 } 2678 2679 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2680 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2681 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2682 2683 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2684 : B.buildFPTOUI(S32, FloorMul); 2685 auto Lo = B.buildFPTOUI(S32, Fma); 2686 2687 if (Signed && SrcLT == S32) { 2688 // Flip the result based on the signedness, which is either all 0s or 1s. 2689 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign}); 2690 // r := xor({lo, hi}, sign) - sign; 2691 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign), 2692 Sign); 2693 } else 2694 B.buildMergeLikeInstr(Dst, {Lo, Hi}); 2695 MI.eraseFromParent(); 2696 2697 return true; 2698 } 2699 2700 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2701 MachineInstr &MI) const { 2702 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2703 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2704 2705 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2706 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2707 2708 // With ieee_mode disabled, the instructions have the correct behavior 2709 // already for G_FMINNUM/G_FMAXNUM 2710 if (!MFI->getMode().IEEE) 2711 return !IsIEEEOp; 2712 2713 if (IsIEEEOp) 2714 return true; 2715 2716 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2717 } 2718 2719 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2720 MachineInstr &MI, MachineRegisterInfo &MRI, 2721 MachineIRBuilder &B) const { 2722 // TODO: Should move some of this into LegalizerHelper. 2723 2724 // TODO: Promote dynamic indexing of s16 to s32 2725 2726 Register Dst = MI.getOperand(0).getReg(); 2727 Register Vec = MI.getOperand(1).getReg(); 2728 2729 LLT VecTy = MRI.getType(Vec); 2730 LLT EltTy = VecTy.getElementType(); 2731 assert(EltTy == MRI.getType(Dst)); 2732 2733 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts 2734 // but we can't go directly to that logic becasue you can't bitcast a vector 2735 // of pointers to a vector of integers. Therefore, introduce an intermediate 2736 // vector of integers using ptrtoint (and inttoptr on the output) in order to 2737 // drive the legalization forward. 2738 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) { 2739 LLT IntTy = LLT::scalar(EltTy.getSizeInBits()); 2740 LLT IntVecTy = VecTy.changeElementType(IntTy); 2741 2742 auto IntVec = B.buildPtrToInt(IntVecTy, Vec); 2743 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2)); 2744 B.buildIntToPtr(Dst, IntElt); 2745 2746 MI.eraseFromParent(); 2747 return true; 2748 } 2749 2750 // FIXME: Artifact combiner probably should have replaced the truncated 2751 // constant before this, so we shouldn't need 2752 // getIConstantVRegValWithLookThrough. 
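  // With an in-bounds constant index the extract below becomes an unmerge of
  // the vector plus a copy of the selected piece, e.g. for element 1 of
  // <4 x s32>, roughly:
  //   %e0, %e1, %e2, %e3 = G_UNMERGE_VALUES %vec
  //   %dst = COPY %e1
  // An out-of-bounds constant index is folded to G_IMPLICIT_DEF.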
2753   std::optional<ValueAndVReg> MaybeIdxVal =
2754       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2755   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2756     return true;
2757   const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2758
2759   if (IdxVal < VecTy.getNumElements()) {
2760     auto Unmerge = B.buildUnmerge(EltTy, Vec);
2761     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2762   } else {
2763     B.buildUndef(Dst);
2764   }
2765
2766   MI.eraseFromParent();
2767   return true;
2768 }
2769
2770 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2771   MachineInstr &MI, MachineRegisterInfo &MRI,
2772   MachineIRBuilder &B) const {
2773   // TODO: Should move some of this into LegalizerHelper.
2774
2775   // TODO: Promote dynamic indexing of s16 to s32
2776
2777   Register Dst = MI.getOperand(0).getReg();
2778   Register Vec = MI.getOperand(1).getReg();
2779   Register Ins = MI.getOperand(2).getReg();
2780
2781   LLT VecTy = MRI.getType(Vec);
2782   LLT EltTy = VecTy.getElementType();
2783   assert(EltTy == MRI.getType(Ins));
2784
2785   // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2786   // but we can't go directly to that logic because you can't bitcast a vector
2787   // of pointers to a vector of integers. Therefore, make the pointer vector
2788   // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2789   // new value, and then inttoptr the result vector back. This will then allow
2790   // the rest of legalization to take over.
2791   if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2792     LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2793     LLT IntVecTy = VecTy.changeElementType(IntTy);
2794
2795     auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2796     auto IntIns = B.buildPtrToInt(IntTy, Ins);
2797     auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2798                                                  MI.getOperand(3));
2799     B.buildIntToPtr(Dst, IntVecDest);
2800     MI.eraseFromParent();
2801     return true;
2802   }
2803
2804   // FIXME: Artifact combiner probably should have replaced the truncated
2805   // constant before this, so we shouldn't need
2806   // getIConstantVRegValWithLookThrough.
2807   std::optional<ValueAndVReg> MaybeIdxVal =
2808       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2809   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2810 return true; 2811 2812 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue(); 2813 2814 unsigned NumElts = VecTy.getNumElements(); 2815 if (IdxVal < NumElts) { 2816 SmallVector<Register, 8> SrcRegs; 2817 for (unsigned i = 0; i < NumElts; ++i) 2818 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); 2819 B.buildUnmerge(SrcRegs, Vec); 2820 2821 SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 2822 B.buildMergeLikeInstr(Dst, SrcRegs); 2823 } else { 2824 B.buildUndef(Dst); 2825 } 2826 2827 MI.eraseFromParent(); 2828 return true; 2829 } 2830 2831 bool AMDGPULegalizerInfo::legalizeSinCos( 2832 MachineInstr &MI, MachineRegisterInfo &MRI, 2833 MachineIRBuilder &B) const { 2834 2835 Register DstReg = MI.getOperand(0).getReg(); 2836 Register SrcReg = MI.getOperand(1).getReg(); 2837 LLT Ty = MRI.getType(DstReg); 2838 unsigned Flags = MI.getFlags(); 2839 2840 Register TrigVal; 2841 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2842 if (ST.hasTrigReducedRange()) { 2843 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2844 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}) 2845 .addUse(MulVal.getReg(0)) 2846 .setMIFlags(Flags) 2847 .getReg(0); 2848 } else 2849 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2850 2851 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2852 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2853 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg)) 2854 .addUse(TrigVal) 2855 .setMIFlags(Flags); 2856 MI.eraseFromParent(); 2857 return true; 2858 } 2859 2860 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2861 MachineIRBuilder &B, 2862 const GlobalValue *GV, 2863 int64_t Offset, 2864 unsigned GAFlags) const { 2865 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2866 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2867 // to the following code sequence: 2868 // 2869 // For constant address space: 2870 // s_getpc_b64 s[0:1] 2871 // s_add_u32 s0, s0, $symbol 2872 // s_addc_u32 s1, s1, 0 2873 // 2874 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2875 // a fixup or relocation is emitted to replace $symbol with a literal 2876 // constant, which is a pc-relative offset from the encoding of the $symbol 2877 // operand to the global variable. 2878 // 2879 // For global address space: 2880 // s_getpc_b64 s[0:1] 2881 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2882 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2883 // 2884 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2885 // fixups or relocations are emitted to replace $symbol@*@lo and 2886 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2887 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2888 // operand to the global variable. 2889 2890 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2891 2892 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 2893 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2894 2895 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2896 .addDef(PCReg); 2897 2898 MIB.addGlobalAddress(GV, Offset, GAFlags); 2899 if (GAFlags == SIInstrInfo::MO_NONE) 2900 MIB.addImm(0); 2901 else 2902 MIB.addGlobalAddress(GV, Offset, GAFlags + 1); 2903 2904 if (!B.getMRI()->getRegClassOrNull(PCReg)) 2905 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2906 2907 if (PtrTy.getSizeInBits() == 32) 2908 B.buildExtract(DstReg, PCReg, 0); 2909 return true; 2910 } 2911 2912 // Emit a ABS32_LO / ABS32_HI relocation stub. 2913 void AMDGPULegalizerInfo::buildAbsGlobalAddress( 2914 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, 2915 MachineRegisterInfo &MRI) const { 2916 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32; 2917 2918 LLT S32 = LLT::scalar(32); 2919 2920 // Use the destination directly, if and only if we store the lower address 2921 // part only and we don't have a register class being set. 2922 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg) 2923 ? DstReg 2924 : MRI.createGenericVirtualRegister(S32); 2925 2926 if (!MRI.getRegClassOrNull(AddrLo)) 2927 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass); 2928 2929 // Write the lower half. 2930 B.buildInstr(AMDGPU::S_MOV_B32) 2931 .addDef(AddrLo) 2932 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO); 2933 2934 // If required, write the upper half as well. 2935 if (RequiresHighHalf) { 2936 assert(PtrTy.getSizeInBits() == 64 && 2937 "Must provide a 64-bit pointer type!"); 2938 2939 Register AddrHi = MRI.createGenericVirtualRegister(S32); 2940 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass); 2941 2942 B.buildInstr(AMDGPU::S_MOV_B32) 2943 .addDef(AddrHi) 2944 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI); 2945 2946 // Use the destination directly, if and only if we don't have a register 2947 // class being set. 2948 Register AddrDst = !MRI.getRegClassOrNull(DstReg) 2949 ? DstReg 2950 : MRI.createGenericVirtualRegister(LLT::scalar(64)); 2951 2952 if (!MRI.getRegClassOrNull(AddrDst)) 2953 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass); 2954 2955 B.buildMergeValues(AddrDst, {AddrLo, AddrHi}); 2956 2957 // If we created a new register for the destination, cast the result into 2958 // the final output. 2959 if (AddrDst != DstReg) 2960 B.buildCast(DstReg, AddrDst); 2961 } else if (AddrLo != DstReg) { 2962 // If we created a new register for the destination, cast the result into 2963 // the final output. 
2964     B.buildCast(DstReg, AddrLo);
2965   }
2966 }
2967
2968 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2969   MachineInstr &MI, MachineRegisterInfo &MRI,
2970   MachineIRBuilder &B) const {
2971   Register DstReg = MI.getOperand(0).getReg();
2972   LLT Ty = MRI.getType(DstReg);
2973   unsigned AS = Ty.getAddressSpace();
2974
2975   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2976   MachineFunction &MF = B.getMF();
2977   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2978
2979   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2980     if (!MFI->isModuleEntryFunction() &&
2981         GV->getName() != "llvm.amdgcn.module.lds" &&
2982         !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) {
2983       const Function &Fn = MF.getFunction();
2984       DiagnosticInfoUnsupported BadLDSDecl(
2985           Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2986           DS_Warning);
2987       Fn.getContext().diagnose(BadLDSDecl);
2988
2989       // We currently don't have a way to correctly allocate LDS objects that
2990       // aren't directly associated with a kernel. We do force inlining of
2991       // functions that use local objects. However, if these dead functions are
2992       // not eliminated, we don't want a compile time error. Just emit a warning
2993       // and a trap, since there should be no callable path here.
2994       B.buildTrap();
2995       B.buildUndef(DstReg);
2996       MI.eraseFromParent();
2997       return true;
2998     }
2999
3000     // TODO: We could emit code to handle the initialization somewhere.
3001     // We ignore the initializer for now and legalize it to allow selection.
3002     // The initializer will be rejected during assembly emission anyway.
3003     const SITargetLowering *TLI = ST.getTargetLowering();
3004     if (!TLI->shouldUseLDSConstAddress(GV)) {
3005       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3006       return true; // Leave in place.
3007     }
3008
3009     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3010       Type *Ty = GV->getValueType();
3011       // HIP uses an unsized array `extern __shared__ T s[]`, or a similar
3012       // zero-sized type in other languages, to declare dynamic shared
3013       // memory whose size is not known at compile time. These arrays are
3014       // allocated by the runtime and placed directly after the statically
3015       // allocated ones, so they all share the same offset.
3016       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
3017         // Adjust alignment for that dynamic shared memory array.
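        // The recorded dynamic LDS alignment is raised to at least this
        // global's alignment, so the runtime-provided block placed after the
        // static allocations is suitably aligned for it.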
3018 MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV)); 3019 LLT S32 = LLT::scalar(32); 3020 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}); 3021 B.buildIntToPtr(DstReg, Sz); 3022 MI.eraseFromParent(); 3023 return true; 3024 } 3025 } 3026 3027 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), 3028 *cast<GlobalVariable>(GV))); 3029 MI.eraseFromParent(); 3030 return true; 3031 } 3032 3033 if (ST.isAmdPalOS() || ST.isMesa3DOS()) { 3034 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI); 3035 MI.eraseFromParent(); 3036 return true; 3037 } 3038 3039 const SITargetLowering *TLI = ST.getTargetLowering(); 3040 3041 if (TLI->shouldEmitFixup(GV)) { 3042 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 3043 MI.eraseFromParent(); 3044 return true; 3045 } 3046 3047 if (TLI->shouldEmitPCReloc(GV)) { 3048 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 3049 MI.eraseFromParent(); 3050 return true; 3051 } 3052 3053 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 3054 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 3055 3056 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; 3057 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 3058 MachinePointerInfo::getGOT(MF), 3059 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 3060 MachineMemOperand::MOInvariant, 3061 LoadTy, Align(8)); 3062 3063 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 3064 3065 if (Ty.getSizeInBits() == 32) { 3066 // Truncate if this is a 32-bit constant address. 3067 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 3068 B.buildExtract(DstReg, Load, 0); 3069 } else 3070 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 3071 3072 MI.eraseFromParent(); 3073 return true; 3074 } 3075 3076 static LLT widenToNextPowerOf2(LLT Ty) { 3077 if (Ty.isVector()) 3078 return Ty.changeElementCount( 3079 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 3080 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 3081 } 3082 3083 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 3084 MachineInstr &MI) const { 3085 MachineIRBuilder &B = Helper.MIRBuilder; 3086 MachineRegisterInfo &MRI = *B.getMRI(); 3087 GISelChangeObserver &Observer = Helper.Observer; 3088 3089 Register PtrReg = MI.getOperand(1).getReg(); 3090 LLT PtrTy = MRI.getType(PtrReg); 3091 unsigned AddrSpace = PtrTy.getAddressSpace(); 3092 3093 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 3094 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 3095 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 3096 Observer.changingInstr(MI); 3097 MI.getOperand(1).setReg(Cast.getReg(0)); 3098 Observer.changedInstr(MI); 3099 return true; 3100 } 3101 3102 if (MI.getOpcode() != AMDGPU::G_LOAD) 3103 return false; 3104 3105 Register ValReg = MI.getOperand(0).getReg(); 3106 LLT ValTy = MRI.getType(ValReg); 3107 3108 if (hasBufferRsrcWorkaround(ValTy)) { 3109 Observer.changingInstr(MI); 3110 castBufferRsrcFromV4I32(MI, B, MRI, 0); 3111 Observer.changedInstr(MI); 3112 return true; 3113 } 3114 3115 MachineMemOperand *MMO = *MI.memoperands_begin(); 3116 const unsigned ValSize = ValTy.getSizeInBits(); 3117 const LLT MemTy = MMO->getMemoryType(); 3118 const Align MemAlign = MMO->getAlign(); 3119 const unsigned MemSize = MemTy.getSizeInBits(); 3120 const uint64_t AlignInBits = 8 * MemAlign.value(); 3121 3122 // Widen non-power-of-2 loads to the alignment if needed 3123 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 3124 const 
unsigned WideMemSize = PowerOf2Ceil(MemSize); 3125 3126 // This was already the correct extending load result type, so just adjust 3127 // the memory type. 3128 if (WideMemSize == ValSize) { 3129 MachineFunction &MF = B.getMF(); 3130 3131 MachineMemOperand *WideMMO = 3132 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 3133 Observer.changingInstr(MI); 3134 MI.setMemRefs(MF, {WideMMO}); 3135 Observer.changedInstr(MI); 3136 return true; 3137 } 3138 3139 // Don't bother handling edge case that should probably never be produced. 3140 if (ValSize > WideMemSize) 3141 return false; 3142 3143 LLT WideTy = widenToNextPowerOf2(ValTy); 3144 3145 Register WideLoad; 3146 if (!WideTy.isVector()) { 3147 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 3148 B.buildTrunc(ValReg, WideLoad).getReg(0); 3149 } else { 3150 // Extract the subvector. 3151 3152 if (isRegisterType(ValTy)) { 3153 // If this a case where G_EXTRACT is legal, use it. 3154 // (e.g. <3 x s32> -> <4 x s32>) 3155 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 3156 B.buildExtract(ValReg, WideLoad, 0); 3157 } else { 3158 // For cases where the widened type isn't a nice register value, unmerge 3159 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 3160 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 3161 B.buildDeleteTrailingVectorElements(ValReg, WideLoad); 3162 } 3163 } 3164 3165 MI.eraseFromParent(); 3166 return true; 3167 } 3168 3169 return false; 3170 } 3171 3172 bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper, 3173 MachineInstr &MI) const { 3174 MachineIRBuilder &B = Helper.MIRBuilder; 3175 MachineRegisterInfo &MRI = *B.getMRI(); 3176 GISelChangeObserver &Observer = Helper.Observer; 3177 3178 Register DataReg = MI.getOperand(0).getReg(); 3179 LLT DataTy = MRI.getType(DataReg); 3180 3181 if (hasBufferRsrcWorkaround(DataTy)) { 3182 Observer.changingInstr(MI); 3183 castBufferRsrcArgToV4I32(MI, B, 0); 3184 Observer.changedInstr(MI); 3185 return true; 3186 } 3187 return false; 3188 } 3189 3190 bool AMDGPULegalizerInfo::legalizeFMad( 3191 MachineInstr &MI, MachineRegisterInfo &MRI, 3192 MachineIRBuilder &B) const { 3193 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3194 assert(Ty.isScalar()); 3195 3196 MachineFunction &MF = B.getMF(); 3197 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 3198 3199 // TODO: Always legal with future ftz flag. 3200 // FIXME: Do we need just output? 
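  // G_FMAD is only kept when denormals for the result type are flushed
  // (PreserveSign), since the mad/mac instructions it would be selected to do
  // not produce denormal results; otherwise it is expanded below into a
  // separate G_FMUL and G_FADD.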
3201 if (Ty == LLT::float32() && 3202 MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()) 3203 return true; 3204 if (Ty == LLT::float16() && 3205 MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()) 3206 return true; 3207 3208 MachineIRBuilder HelperBuilder(MI); 3209 GISelObserverWrapper DummyObserver; 3210 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 3211 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 3212 } 3213 3214 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 3215 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 3216 Register DstReg = MI.getOperand(0).getReg(); 3217 Register PtrReg = MI.getOperand(1).getReg(); 3218 Register CmpVal = MI.getOperand(2).getReg(); 3219 Register NewVal = MI.getOperand(3).getReg(); 3220 3221 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 3222 "this should not have been custom lowered"); 3223 3224 LLT ValTy = MRI.getType(CmpVal); 3225 LLT VecTy = LLT::fixed_vector(2, ValTy); 3226 3227 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 3228 3229 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 3230 .addDef(DstReg) 3231 .addUse(PtrReg) 3232 .addUse(PackedVal) 3233 .setMemRefs(MI.memoperands()); 3234 3235 MI.eraseFromParent(); 3236 return true; 3237 } 3238 3239 /// Return true if it's known that \p Src can never be an f32 denormal value. 3240 static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI, 3241 Register Src) { 3242 const MachineInstr *DefMI = MRI.getVRegDef(Src); 3243 switch (DefMI->getOpcode()) { 3244 case TargetOpcode::G_INTRINSIC: { 3245 switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) { 3246 case Intrinsic::amdgcn_frexp_mant: 3247 return true; 3248 default: 3249 break; 3250 } 3251 3252 break; 3253 } 3254 case TargetOpcode::G_FFREXP: { 3255 if (DefMI->getOperand(0).getReg() == Src) 3256 return true; 3257 break; 3258 } 3259 case TargetOpcode::G_FPEXT: { 3260 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16); 3261 } 3262 default: 3263 return false; 3264 } 3265 3266 return false; 3267 } 3268 3269 static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) { 3270 if (Flags & MachineInstr::FmAfn) 3271 return true; 3272 const auto &Options = MF.getTarget().Options; 3273 return Options.UnsafeFPMath || Options.ApproxFuncFPMath; 3274 } 3275 3276 static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, 3277 unsigned Flags) { 3278 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) && 3279 MF.getDenormalMode(APFloat::IEEEsingle()).Input != 3280 DenormalMode::PreserveSign; 3281 } 3282 3283 std::pair<Register, Register> 3284 AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src, 3285 unsigned Flags) const { 3286 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) 3287 return {}; 3288 3289 const LLT F32 = LLT::scalar(32); 3290 auto SmallestNormal = B.buildFConstant( 3291 F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle())); 3292 auto IsLtSmallestNormal = 3293 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal); 3294 3295 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32); 3296 auto One = B.buildFConstant(F32, 1.0); 3297 auto ScaleFactor = 3298 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags); 3299 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags); 3300 3301 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)}; 3302 } 3303 3304 bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI, 3305 
MachineIRBuilder &B) const { 3306 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals. 3307 // If we have to handle denormals, scale up the input and adjust the result. 3308 3309 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0) 3310 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0) 3311 3312 Register Dst = MI.getOperand(0).getReg(); 3313 Register Src = MI.getOperand(1).getReg(); 3314 LLT Ty = B.getMRI()->getType(Dst); 3315 unsigned Flags = MI.getFlags(); 3316 3317 if (Ty == LLT::scalar(16)) { 3318 const LLT F32 = LLT::scalar(32); 3319 // Nothing in half is a denormal when promoted to f32. 3320 auto Ext = B.buildFPExt(F32, Src, Flags); 3321 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32}) 3322 .addUse(Ext.getReg(0)) 3323 .setMIFlags(Flags); 3324 B.buildFPTrunc(Dst, Log2, Flags); 3325 MI.eraseFromParent(); 3326 return true; 3327 } 3328 3329 assert(Ty == LLT::scalar(32)); 3330 3331 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags); 3332 if (!ScaledInput) { 3333 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)}) 3334 .addUse(Src) 3335 .setMIFlags(Flags); 3336 MI.eraseFromParent(); 3337 return true; 3338 } 3339 3340 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}) 3341 .addUse(ScaledInput) 3342 .setMIFlags(Flags); 3343 3344 auto ThirtyTwo = B.buildFConstant(Ty, 32.0); 3345 auto Zero = B.buildFConstant(Ty, 0.0); 3346 auto ResultOffset = 3347 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags); 3348 B.buildFSub(Dst, Log2, ResultOffset, Flags); 3349 3350 MI.eraseFromParent(); 3351 return true; 3352 } 3353 3354 static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y, 3355 Register Z, unsigned Flags) { 3356 auto FMul = B.buildFMul(Ty, X, Y, Flags); 3357 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0); 3358 } 3359 3360 bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, 3361 MachineIRBuilder &B) const { 3362 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10; 3363 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG); 3364 3365 MachineRegisterInfo &MRI = *B.getMRI(); 3366 Register Dst = MI.getOperand(0).getReg(); 3367 Register X = MI.getOperand(1).getReg(); 3368 unsigned Flags = MI.getFlags(); 3369 const LLT Ty = MRI.getType(X); 3370 MachineFunction &MF = B.getMF(); 3371 3372 const LLT F32 = LLT::scalar(32); 3373 const LLT F16 = LLT::scalar(16); 3374 3375 const AMDGPUTargetMachine &TM = 3376 static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 3377 3378 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) || 3379 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) { 3380 if (Ty == F16 && !ST.has16BitInsts()) { 3381 Register LogVal = MRI.createGenericVirtualRegister(F32); 3382 auto PromoteSrc = B.buildFPExt(F32, X); 3383 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags); 3384 B.buildFPTrunc(Dst, LogVal); 3385 } else { 3386 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags); 3387 } 3388 3389 MI.eraseFromParent(); 3390 return true; 3391 } 3392 3393 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags); 3394 if (ScaledInput) 3395 X = ScaledInput; 3396 3397 auto Y = 3398 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags); 3399 3400 Register R; 3401 if (ST.hasFastFMAF32()) { 3402 // c+cc are ln(2)/ln(10) to more than 49 bits 3403 const float c_log10 = 0x1.344134p-2f; 3404 const float cc_log10 = 0x1.09f79ep-26f; 3405 3406 // c + cc is ln(2) to more than 49 bits 3407 const float c_log = 0x1.62e42ep-1f; 3408 const float cc_log = 
0x1.efa39ep-25f; 3409 3410 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log); 3411 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log); 3412 3413 R = B.buildFMul(Ty, Y, C, Flags).getReg(0); 3414 auto NegR = B.buildFNeg(Ty, R, Flags); 3415 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags); 3416 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags); 3417 R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0); 3418 } else { 3419 // ch+ct is ln(2)/ln(10) to more than 36 bits 3420 const float ch_log10 = 0x1.344000p-2f; 3421 const float ct_log10 = 0x1.3509f6p-18f; 3422 3423 // ch + ct is ln(2) to more than 36 bits 3424 const float ch_log = 0x1.62e000p-1f; 3425 const float ct_log = 0x1.0bfbe8p-15f; 3426 3427 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log); 3428 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log); 3429 3430 auto MaskConst = B.buildConstant(Ty, 0xfffff000); 3431 auto YH = B.buildAnd(Ty, Y, MaskConst); 3432 auto YT = B.buildFSub(Ty, Y, YH, Flags); 3433 auto YTCT = B.buildFMul(Ty, YT, CT, Flags); 3434 3435 Register Mad0 = 3436 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags); 3437 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags); 3438 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags); 3439 } 3440 3441 const bool IsFiniteOnly = 3442 (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) && 3443 (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath); 3444 3445 if (!IsFiniteOnly) { 3446 // Expand isfinite(x) => fabs(x) < inf 3447 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle())); 3448 auto Fabs = B.buildFAbs(Ty, Y); 3449 auto IsFinite = 3450 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags); 3451 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0); 3452 } 3453 3454 if (ScaledInput) { 3455 auto Zero = B.buildFConstant(Ty, 0.0); 3456 auto ShiftK = 3457 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f); 3458 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags); 3459 B.buildFSub(Dst, R, Shift, Flags); 3460 } else { 3461 B.buildCopy(Dst, R); 3462 } 3463 3464 MI.eraseFromParent(); 3465 return true; 3466 } 3467 3468 bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, 3469 Register Src, bool IsLog10, 3470 unsigned Flags) const { 3471 const double Log2BaseInverted = 3472 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2; 3473 3474 LLT Ty = B.getMRI()->getType(Dst); 3475 3476 if (Ty == LLT::scalar(32)) { 3477 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags); 3478 if (ScaledInput) { 3479 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}) 3480 .addUse(Src) 3481 .setMIFlags(Flags); 3482 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted); 3483 auto Zero = B.buildFConstant(Ty, 0.0); 3484 auto ResultOffset = 3485 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags); 3486 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted); 3487 3488 if (ST.hasFastFMAF32()) 3489 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags); 3490 else { 3491 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags); 3492 B.buildFAdd(Dst, Mul, ResultOffset, Flags); 3493 } 3494 3495 return true; 3496 } 3497 } 3498 3499 auto Log2Operand = Ty == LLT::scalar(16) 3500 ? 
B.buildFLog2(Ty, Src, Flags) 3501 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}) 3502 .addUse(Src) 3503 .setMIFlags(Flags); 3504 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 3505 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 3506 return true; 3507 } 3508 3509 bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI, 3510 MachineIRBuilder &B) const { 3511 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals. 3512 // If we have to handle denormals, scale up the input and adjust the result. 3513 3514 Register Dst = MI.getOperand(0).getReg(); 3515 Register Src = MI.getOperand(1).getReg(); 3516 unsigned Flags = MI.getFlags(); 3517 LLT Ty = B.getMRI()->getType(Dst); 3518 const LLT F16 = LLT::scalar(16); 3519 const LLT F32 = LLT::scalar(32); 3520 3521 if (Ty == F16) { 3522 // Nothing in half is a denormal when promoted to f32. 3523 auto Ext = B.buildFPExt(F32, Src, Flags); 3524 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32}) 3525 .addUse(Ext.getReg(0)) 3526 .setMIFlags(Flags); 3527 B.buildFPTrunc(Dst, Log2, Flags); 3528 MI.eraseFromParent(); 3529 return true; 3530 } 3531 3532 assert(Ty == F32); 3533 3534 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) { 3535 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}) 3536 .addUse(Src) 3537 .setMIFlags(Flags); 3538 MI.eraseFromParent(); 3539 return true; 3540 } 3541 3542 // bool needs_scaling = x < -0x1.f80000p+6f; 3543 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f); 3544 3545 // -nextafter(128.0, -1) 3546 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f); 3547 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, 3548 RangeCheckConst, Flags); 3549 3550 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f); 3551 auto Zero = B.buildFConstant(Ty, 0.0); 3552 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags); 3553 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags); 3554 3555 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}) 3556 .addUse(AddInput.getReg(0)) 3557 .setMIFlags(Flags); 3558 3559 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f); 3560 auto One = B.buildFConstant(Ty, 1.0); 3561 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags); 3562 B.buildFMul(Dst, Exp2, ResultScale, Flags); 3563 MI.eraseFromParent(); 3564 return true; 3565 } 3566 3567 bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, 3568 Register X, unsigned Flags) const { 3569 LLT Ty = B.getMRI()->getType(Dst); 3570 LLT F32 = LLT::scalar(32); 3571 3572 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) { 3573 auto Log2E = B.buildFConstant(Ty, numbers::log2e); 3574 auto Mul = B.buildFMul(Ty, X, Log2E, Flags); 3575 3576 if (Ty == F32) { 3577 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}) 3578 .addUse(Mul.getReg(0)) 3579 .setMIFlags(Flags); 3580 } else { 3581 B.buildFExp2(Dst, Mul.getReg(0), Flags); 3582 } 3583 3584 return true; 3585 } 3586 3587 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f); 3588 auto NeedsScaling = 3589 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags); 3590 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f); 3591 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags); 3592 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags); 3593 3594 auto Log2E = B.buildFConstant(Ty, numbers::log2e); 3595 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags); 3596 3597 auto Exp2 = 
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}) 3598 .addUse(ExpInput.getReg(0)) 3599 .setMIFlags(Flags); 3600 3601 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f); 3602 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags); 3603 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags); 3604 return true; 3605 } 3606 3607 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 3608 MachineIRBuilder &B) const { 3609 Register Dst = MI.getOperand(0).getReg(); 3610 Register X = MI.getOperand(1).getReg(); 3611 const unsigned Flags = MI.getFlags(); 3612 MachineFunction &MF = B.getMF(); 3613 MachineRegisterInfo &MRI = *B.getMRI(); 3614 LLT Ty = MRI.getType(Dst); 3615 const LLT F16 = LLT::scalar(16); 3616 const LLT F32 = LLT::scalar(32); 3617 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10; 3618 3619 if (Ty == F16) { 3620 // v_exp_f16 (fmul x, log2e) 3621 if (allowApproxFunc(MF, Flags)) { 3622 // TODO: Does this really require fast? 3623 legalizeFExpUnsafe(B, Dst, X, Flags); 3624 MI.eraseFromParent(); 3625 return true; 3626 } 3627 3628 // exp(f16 x) -> 3629 // fptrunc (v_exp_f32 (fmul (fpext x), log2e)) 3630 3631 // Nothing in half is a denormal when promoted to f32. 3632 auto Ext = B.buildFPExt(F32, X, Flags); 3633 Register Lowered = MRI.createGenericVirtualRegister(F32); 3634 legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags); 3635 B.buildFPTrunc(Dst, Lowered, Flags); 3636 MI.eraseFromParent(); 3637 return true; 3638 } 3639 3640 assert(Ty == F32); 3641 3642 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying 3643 // library behavior. Also, is known-not-daz source sufficient? 3644 if (allowApproxFunc(MF, Flags)) { 3645 legalizeFExpUnsafe(B, Dst, X, Flags); 3646 MI.eraseFromParent(); 3647 return true; 3648 } 3649 3650 // Algorithm: 3651 // 3652 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64) 3653 // 3654 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer 3655 // n = 64*m + j, 0 <= j < 64 3656 // 3657 // e^x = 2^((64*m + j + f)/64) 3658 // = (2^m) * (2^(j/64)) * 2^(f/64) 3659 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64)) 3660 // 3661 // f = x*(64/ln(2)) - n 3662 // r = f*(ln(2)/64) = x - n*(ln(2)/64) 3663 // 3664 // e^x = (2^m) * (2^(j/64)) * e^r 3665 // 3666 // (2^(j/64)) is precomputed 3667 // 3668 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! 3669 // e^r = 1 + q 3670 // 3671 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! 3672 // 3673 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) ) 3674 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract; 3675 Register PH, PL; 3676 3677 if (ST.hasFastFMAF32()) { 3678 const float c_exp = numbers::log2ef; 3679 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits 3680 const float c_exp10 = 0x1.a934f0p+1f; 3681 const float cc_exp10 = 0x1.2f346ep-24f; 3682 3683 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp); 3684 PH = B.buildFMul(Ty, X, C, Flags).getReg(0); 3685 auto NegPH = B.buildFNeg(Ty, PH, Flags); 3686 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags); 3687 3688 auto CC = B.buildFConstant(Ty, IsExp10 ? 
cc_exp10 : cc_exp); 3689 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0); 3690 } else { 3691 const float ch_exp = 0x1.714000p+0f; 3692 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits 3693 3694 const float ch_exp10 = 0x1.a92000p+1f; 3695 const float cl_exp10 = 0x1.4f0978p-11f; 3696 3697 auto MaskConst = B.buildConstant(Ty, 0xfffff000); 3698 auto XH = B.buildAnd(Ty, X, MaskConst); 3699 auto XL = B.buildFSub(Ty, X, XH, Flags); 3700 3701 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp); 3702 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0); 3703 3704 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp); 3705 auto XLCL = B.buildFMul(Ty, XL, CL, Flags); 3706 3707 Register Mad0 = 3708 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags); 3709 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags); 3710 } 3711 3712 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags); 3713 3714 // It is unsafe to contract this fsub into the PH multiply. 3715 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract); 3716 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags); 3717 auto IntE = B.buildFPTOSI(LLT::scalar(32), E); 3718 3719 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}) 3720 .addUse(A.getReg(0)) 3721 .setMIFlags(Flags); 3722 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags); 3723 3724 auto UnderflowCheckConst = 3725 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f); 3726 auto Zero = B.buildFConstant(Ty, 0.0); 3727 auto Underflow = 3728 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst); 3729 3730 R = B.buildSelect(Ty, Underflow, Zero, R); 3731 3732 const auto &Options = MF.getTarget().Options; 3733 3734 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) { 3735 auto OverflowCheckConst = 3736 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f); 3737 3738 auto Overflow = 3739 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst); 3740 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle())); 3741 R = B.buildSelect(Ty, Overflow, Inf, R, Flags); 3742 } 3743 3744 B.buildCopy(Dst, R); 3745 MI.eraseFromParent(); 3746 return true; 3747 } 3748 3749 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 3750 MachineIRBuilder &B) const { 3751 Register Dst = MI.getOperand(0).getReg(); 3752 Register Src0 = MI.getOperand(1).getReg(); 3753 Register Src1 = MI.getOperand(2).getReg(); 3754 unsigned Flags = MI.getFlags(); 3755 LLT Ty = B.getMRI()->getType(Dst); 3756 const LLT F16 = LLT::float16(); 3757 const LLT F32 = LLT::float32(); 3758 3759 if (Ty == F32) { 3760 auto Log = B.buildFLog2(F32, Src0, Flags); 3761 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32}) 3762 .addUse(Log.getReg(0)) 3763 .addUse(Src1) 3764 .setMIFlags(Flags); 3765 B.buildFExp2(Dst, Mul, Flags); 3766 } else if (Ty == F16) { 3767 // There's no f16 fmul_legacy, so we need to convert for it. 3768 auto Log = B.buildFLog2(F16, Src0, Flags); 3769 auto Ext0 = B.buildFPExt(F32, Log, Flags); 3770 auto Ext1 = B.buildFPExt(F32, Src1, Flags); 3771 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32}) 3772 .addUse(Ext0.getReg(0)) 3773 .addUse(Ext1.getReg(0)) 3774 .setMIFlags(Flags); 3775 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags); 3776 } else 3777 return false; 3778 3779 MI.eraseFromParent(); 3780 return true; 3781 } 3782 3783 // Find a source register, ignoring any possible source modifiers. 
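// Looks through a single G_FNEG, a single G_FABS, or a G_FNEG of a G_FABS,
// i.e. the patterns that source-modifier folding can absorb.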
3784 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 3785 Register ModSrc = OrigSrc; 3786 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 3787 ModSrc = SrcFNeg->getOperand(1).getReg(); 3788 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 3789 ModSrc = SrcFAbs->getOperand(1).getReg(); 3790 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 3791 ModSrc = SrcFAbs->getOperand(1).getReg(); 3792 return ModSrc; 3793 } 3794 3795 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 3796 MachineRegisterInfo &MRI, 3797 MachineIRBuilder &B) const { 3798 3799 const LLT S1 = LLT::scalar(1); 3800 const LLT F64 = LLT::float64(); 3801 Register Dst = MI.getOperand(0).getReg(); 3802 Register OrigSrc = MI.getOperand(1).getReg(); 3803 unsigned Flags = MI.getFlags(); 3804 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 && 3805 "this should not have been custom lowered"); 3806 3807 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 3808 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 3809 // efficient way to implement it is using V_FRACT_F64. The workaround for the 3810 // V_FRACT bug is: 3811 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 3812 // 3813 // Convert floor(x) to (x - fract(x)) 3814 3815 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64}) 3816 .addUse(OrigSrc) 3817 .setMIFlags(Flags); 3818 3819 // Give source modifier matching some assistance before obscuring a foldable 3820 // pattern. 3821 3822 // TODO: We can avoid the neg on the fract? The input sign to fract 3823 // shouldn't matter? 3824 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 3825 3826 auto Const = 3827 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff)); 3828 3829 Register Min = MRI.createGenericVirtualRegister(F64); 3830 3831 // We don't need to concern ourselves with the snan handling difference, so 3832 // use the one which will directly select. 3833 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3834 if (MFI->getMode().IEEE) 3835 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 3836 else 3837 B.buildFMinNum(Min, Fract, Const, Flags); 3838 3839 Register CorrectedFract = Min; 3840 if (!MI.getFlag(MachineInstr::FmNoNans)) { 3841 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 3842 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0); 3843 } 3844 3845 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags); 3846 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 3847 3848 MI.eraseFromParent(); 3849 return true; 3850 } 3851 3852 // Turn an illegal packed v2s16 build vector into bit operations. 3853 // TODO: This should probably be a bitcast action in LegalizerHelper. 
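// The lowering below merges the two 16-bit sources into one 32-bit scalar and
// bitcasts it back to the packed vector, roughly:
//   %merge:_(s32) = G_MERGE_VALUES %lo:_(s16), %hi:_(s16)
//   %dst:_(<2 x s16>) = G_BITCAST %merge:_(s32)
// For G_BUILD_VECTOR_TRUNC the 32-bit sources are truncated to s16 first.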
3854 bool AMDGPULegalizerInfo::legalizeBuildVector( 3855 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 3856 Register Dst = MI.getOperand(0).getReg(); 3857 const LLT S32 = LLT::scalar(32); 3858 const LLT S16 = LLT::scalar(16); 3859 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 3860 3861 Register Src0 = MI.getOperand(1).getReg(); 3862 Register Src1 = MI.getOperand(2).getReg(); 3863 3864 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) { 3865 assert(MRI.getType(Src0) == S32); 3866 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0); 3867 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0); 3868 } 3869 3870 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1}); 3871 B.buildBitcast(Dst, Merge); 3872 3873 MI.eraseFromParent(); 3874 return true; 3875 } 3876 3877 // Build a big integer multiply or multiply-add using MAD_64_32 instructions. 3878 // 3879 // Source and accumulation registers must all be 32-bits. 3880 // 3881 // TODO: When the multiply is uniform, we should produce a code sequence 3882 // that is better suited to instruction selection on the SALU. Instead of 3883 // the outer loop going over parts of the result, the outer loop should go 3884 // over parts of one of the factors. This should result in instruction 3885 // selection that makes full use of S_ADDC_U32 instructions. 3886 void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper, 3887 MutableArrayRef<Register> Accum, 3888 ArrayRef<Register> Src0, 3889 ArrayRef<Register> Src1, 3890 bool UsePartialMad64_32, 3891 bool SeparateOddAlignedProducts) const { 3892 // Use (possibly empty) vectors of S1 registers to represent the set of 3893 // carries from one pair of positions to the next. 3894 using Carry = SmallVector<Register, 2>; 3895 3896 MachineIRBuilder &B = Helper.MIRBuilder; 3897 GISelKnownBits &KB = *Helper.getKnownBits(); 3898 3899 const LLT S1 = LLT::scalar(1); 3900 const LLT S32 = LLT::scalar(32); 3901 const LLT S64 = LLT::scalar(64); 3902 3903 Register Zero32; 3904 Register Zero64; 3905 3906 auto getZero32 = [&]() -> Register { 3907 if (!Zero32) 3908 Zero32 = B.buildConstant(S32, 0).getReg(0); 3909 return Zero32; 3910 }; 3911 auto getZero64 = [&]() -> Register { 3912 if (!Zero64) 3913 Zero64 = B.buildConstant(S64, 0).getReg(0); 3914 return Zero64; 3915 }; 3916 3917 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros; 3918 for (unsigned i = 0; i < Src0.size(); ++i) { 3919 Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero()); 3920 Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero()); 3921 } 3922 3923 // Merge the given carries into the 32-bit LocalAccum, which is modified 3924 // in-place. 3925 // 3926 // Returns the carry-out, which is a single S1 register or null. 
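  //
  // If there is a single carry-in and no accumulator yet, the carry is simply
  // zero-extended into LocalAccum and no carry-out is produced.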
3927 auto mergeCarry = 3928 [&](Register &LocalAccum, const Carry &CarryIn) -> Register { 3929 if (CarryIn.empty()) 3930 return Register(); 3931 3932 bool HaveCarryOut = true; 3933 Register CarryAccum; 3934 if (CarryIn.size() == 1) { 3935 if (!LocalAccum) { 3936 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); 3937 return Register(); 3938 } 3939 3940 CarryAccum = getZero32(); 3941 } else { 3942 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); 3943 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) { 3944 CarryAccum = 3945 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i]) 3946 .getReg(0); 3947 } 3948 3949 if (!LocalAccum) { 3950 LocalAccum = getZero32(); 3951 HaveCarryOut = false; 3952 } 3953 } 3954 3955 auto Add = 3956 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back()); 3957 LocalAccum = Add.getReg(0); 3958 return HaveCarryOut ? Add.getReg(1) : Register(); 3959 }; 3960 3961 // Build a multiply-add chain to compute 3962 // 3963 // LocalAccum + (partial products at DstIndex) 3964 // + (opportunistic subset of CarryIn) 3965 // 3966 // LocalAccum is an array of one or two 32-bit registers that are updated 3967 // in-place. The incoming registers may be null. 3968 // 3969 // In some edge cases, carry-ins can be consumed "for free". In that case, 3970 // the consumed carry bits are removed from CarryIn in-place. 3971 auto buildMadChain = 3972 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn) 3973 -> Carry { 3974 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) || 3975 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1)); 3976 3977 Carry CarryOut; 3978 unsigned j0 = 0; 3979 3980 // Use plain 32-bit multiplication for the most significant part of the 3981 // result by default. 3982 if (LocalAccum.size() == 1 && 3983 (!UsePartialMad64_32 || !CarryIn.empty())) { 3984 do { 3985 // Skip multiplication if one of the operands is 0 3986 unsigned j1 = DstIndex - j0; 3987 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) { 3988 ++j0; 3989 continue; 3990 } 3991 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]); 3992 if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) { 3993 LocalAccum[0] = Mul.getReg(0); 3994 } else { 3995 if (CarryIn.empty()) { 3996 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0); 3997 } else { 3998 LocalAccum[0] = 3999 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back()) 4000 .getReg(0); 4001 CarryIn.pop_back(); 4002 } 4003 } 4004 ++j0; 4005 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty())); 4006 } 4007 4008 // Build full 64-bit multiplies. 
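        // Tmp accumulates the column in a single 64-bit register via
        // G_AMDGPU_MAD_U64_U32 (Tmp = Src0[j0] * Src1[j1] + Tmp). Once the
        // accumulator is no longer known to be small enough to rule out a
        // carry out of 64 bits, each mad's carry-out is collected in CarryOut.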
4009 if (j0 <= DstIndex) { 4010 bool HaveSmallAccum = false; 4011 Register Tmp; 4012 4013 if (LocalAccum[0]) { 4014 if (LocalAccum.size() == 1) { 4015 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0); 4016 HaveSmallAccum = true; 4017 } else if (LocalAccum[1]) { 4018 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0); 4019 HaveSmallAccum = false; 4020 } else { 4021 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0); 4022 HaveSmallAccum = true; 4023 } 4024 } else { 4025 assert(LocalAccum.size() == 1 || !LocalAccum[1]); 4026 Tmp = getZero64(); 4027 HaveSmallAccum = true; 4028 } 4029 4030 do { 4031 unsigned j1 = DstIndex - j0; 4032 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) { 4033 ++j0; 4034 continue; 4035 } 4036 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1}, 4037 {Src0[j0], Src1[j1], Tmp}); 4038 Tmp = Mad.getReg(0); 4039 if (!HaveSmallAccum) 4040 CarryOut.push_back(Mad.getReg(1)); 4041 HaveSmallAccum = false; 4042 4043 ++j0; 4044 } while (j0 <= DstIndex); 4045 4046 auto Unmerge = B.buildUnmerge(S32, Tmp); 4047 LocalAccum[0] = Unmerge.getReg(0); 4048 if (LocalAccum.size() > 1) 4049 LocalAccum[1] = Unmerge.getReg(1); 4050 } 4051 4052 return CarryOut; 4053 }; 4054 4055 // Outer multiply loop, iterating over destination parts from least 4056 // significant to most significant parts. 4057 // 4058 // The columns of the following diagram correspond to the destination parts 4059 // affected by one iteration of the outer loop (ignoring boundary 4060 // conditions). 4061 // 4062 // Dest index relative to 2 * i: 1 0 -1 4063 // ------ 4064 // Carries from previous iteration: e o 4065 // Even-aligned partial product sum: E E . 4066 // Odd-aligned partial product sum: O O 4067 // 4068 // 'o' is OddCarry, 'e' is EvenCarry. 4069 // EE and OO are computed from partial products via buildMadChain and use 4070 // accumulation where possible and appropriate. 4071 // 4072 Register SeparateOddCarry; 4073 Carry EvenCarry; 4074 Carry OddCarry; 4075 4076 for (unsigned i = 0; i <= Accum.size() / 2; ++i) { 4077 Carry OddCarryIn = std::move(OddCarry); 4078 Carry EvenCarryIn = std::move(EvenCarry); 4079 OddCarry.clear(); 4080 EvenCarry.clear(); 4081 4082 // Partial products at offset 2 * i. 4083 if (2 * i < Accum.size()) { 4084 auto LocalAccum = Accum.drop_front(2 * i).take_front(2); 4085 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn); 4086 } 4087 4088 // Partial products at offset 2 * i - 1. 4089 if (i > 0) { 4090 if (!SeparateOddAlignedProducts) { 4091 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2); 4092 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); 4093 } else { 4094 bool IsHighest = 2 * i >= Accum.size(); 4095 Register SeparateOddOut[2]; 4096 auto LocalAccum = MutableArrayRef(SeparateOddOut) 4097 .take_front(IsHighest ? 
1 : 2); 4098 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); 4099 4100 MachineInstr *Lo; 4101 4102 if (i == 1) { 4103 if (!IsHighest) 4104 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]); 4105 else 4106 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]); 4107 } else { 4108 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0], 4109 SeparateOddCarry); 4110 } 4111 Accum[2 * i - 1] = Lo->getOperand(0).getReg(); 4112 4113 if (!IsHighest) { 4114 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1], 4115 Lo->getOperand(1).getReg()); 4116 Accum[2 * i] = Hi.getReg(0); 4117 SeparateOddCarry = Hi.getReg(1); 4118 } 4119 } 4120 } 4121 4122 // Add in the carries from the previous iteration 4123 if (i > 0) { 4124 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn)) 4125 EvenCarryIn.push_back(CarryOut); 4126 4127 if (2 * i < Accum.size()) { 4128 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn)) 4129 OddCarry.push_back(CarryOut); 4130 } 4131 } 4132 } 4133 } 4134 4135 // Custom narrowing of wide multiplies using wide multiply-add instructions. 4136 // 4137 // TODO: If the multiply is followed by an addition, we should attempt to 4138 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities. 4139 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper, 4140 MachineInstr &MI) const { 4141 assert(ST.hasMad64_32()); 4142 assert(MI.getOpcode() == TargetOpcode::G_MUL); 4143 4144 MachineIRBuilder &B = Helper.MIRBuilder; 4145 MachineRegisterInfo &MRI = *B.getMRI(); 4146 4147 Register DstReg = MI.getOperand(0).getReg(); 4148 Register Src0 = MI.getOperand(1).getReg(); 4149 Register Src1 = MI.getOperand(2).getReg(); 4150 4151 LLT Ty = MRI.getType(DstReg); 4152 assert(Ty.isScalar()); 4153 4154 unsigned Size = Ty.getSizeInBits(); 4155 unsigned NumParts = Size / 32; 4156 assert((Size % 32) == 0); 4157 assert(NumParts >= 2); 4158 4159 // Whether to use MAD_64_32 for partial products whose high half is 4160 // discarded. This avoids some ADD instructions but risks false dependency 4161 // stalls on some subtargets in some cases. 4162 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10; 4163 4164 // Whether to compute odd-aligned partial products separately. This is 4165 // advisable on subtargets where the accumulator of MAD_64_32 must be placed 4166 // in an even-aligned VGPR. 4167 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops(); 4168 4169 LLT S32 = LLT::scalar(32); 4170 SmallVector<Register, 2> Src0Parts, Src1Parts; 4171 for (unsigned i = 0; i < NumParts; ++i) { 4172 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32)); 4173 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32)); 4174 } 4175 B.buildUnmerge(Src0Parts, Src0); 4176 B.buildUnmerge(Src1Parts, Src1); 4177 4178 SmallVector<Register, 2> AccumRegs(NumParts); 4179 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32, 4180 SeparateOddAlignedProducts); 4181 4182 B.buildMergeLikeInstr(DstReg, AccumRegs); 4183 MI.eraseFromParent(); 4184 return true; 4185 } 4186 4187 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 4188 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 4189 // case with a single min instruction instead of a compare+select. 
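// For a 32-bit G_CTLZ of zero, for example, FFBH returns -1 (all ones) and the
// unsigned min against the bit width clamps that to the expected 32:
//   %ffbh = G_AMDGPU_FFBH_U32 %src
//   %dst  = G_UMIN %ffbh, 32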
4190 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 4191 MachineRegisterInfo &MRI, 4192 MachineIRBuilder &B) const { 4193 Register Dst = MI.getOperand(0).getReg(); 4194 Register Src = MI.getOperand(1).getReg(); 4195 LLT DstTy = MRI.getType(Dst); 4196 LLT SrcTy = MRI.getType(Src); 4197 4198 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 4199 ? AMDGPU::G_AMDGPU_FFBH_U32 4200 : AMDGPU::G_AMDGPU_FFBL_B32; 4201 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 4202 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 4203 4204 MI.eraseFromParent(); 4205 return true; 4206 } 4207 4208 bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, 4209 MachineRegisterInfo &MRI, 4210 MachineIRBuilder &B) const { 4211 Register Dst = MI.getOperand(0).getReg(); 4212 Register Src = MI.getOperand(1).getReg(); 4213 LLT SrcTy = MRI.getType(Src); 4214 TypeSize NumBits = SrcTy.getSizeInBits(); 4215 4216 assert(NumBits < 32u); 4217 4218 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits); 4219 auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u); 4220 auto Shift = B.buildShl(S32, Extend, ShiftAmt); 4221 auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift}); 4222 B.buildTrunc(Dst, Ctlz); 4223 MI.eraseFromParent(); 4224 return true; 4225 } 4226 4227 // Check that this is a G_XOR x, -1 4228 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 4229 if (MI.getOpcode() != TargetOpcode::G_XOR) 4230 return false; 4231 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 4232 return ConstVal && *ConstVal == -1; 4233 } 4234 4235 // Return the use branch instruction, otherwise null if the usage is invalid. 4236 static MachineInstr * 4237 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 4238 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 4239 Register CondDef = MI.getOperand(0).getReg(); 4240 if (!MRI.hasOneNonDBGUse(CondDef)) 4241 return nullptr; 4242 4243 MachineBasicBlock *Parent = MI.getParent(); 4244 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 4245 4246 if (isNot(MRI, *UseMI)) { 4247 Register NegatedCond = UseMI->getOperand(0).getReg(); 4248 if (!MRI.hasOneNonDBGUse(NegatedCond)) 4249 return nullptr; 4250 4251 // We're deleting the def of this value, so we need to remove it. 4252 eraseInstr(*UseMI, MRI); 4253 4254 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 4255 Negated = true; 4256 } 4257 4258 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 4259 return nullptr; 4260 4261 // Make sure the cond br is followed by a G_BR, or is the last instruction. 4262 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 4263 if (Next == Parent->end()) { 4264 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 4265 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 
4266 return nullptr; 4267 UncondBrTarget = &*NextMBB; 4268 } else { 4269 if (Next->getOpcode() != AMDGPU::G_BR) 4270 return nullptr; 4271 Br = &*Next; 4272 UncondBrTarget = Br->getOperand(0).getMBB(); 4273 } 4274 4275 return UseMI; 4276 } 4277 4278 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 4279 const ArgDescriptor *Arg, 4280 const TargetRegisterClass *ArgRC, 4281 LLT ArgTy) const { 4282 MCRegister SrcReg = Arg->getRegister(); 4283 assert(SrcReg.isPhysical() && "Physical register expected"); 4284 assert(DstReg.isVirtual() && "Virtual register expected"); 4285 4286 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, 4287 *ArgRC, B.getDebugLoc(), ArgTy); 4288 if (Arg->isMasked()) { 4289 // TODO: Should we try to emit this once in the entry block? 4290 const LLT S32 = LLT::scalar(32); 4291 const unsigned Mask = Arg->getMask(); 4292 const unsigned Shift = llvm::countr_zero<unsigned>(Mask); 4293 4294 Register AndMaskSrc = LiveIn; 4295 4296 // TODO: Avoid clearing the high bits if we know workitem id y/z are always 4297 // 0. 4298 if (Shift != 0) { 4299 auto ShiftAmt = B.buildConstant(S32, Shift); 4300 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 4301 } 4302 4303 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 4304 } else { 4305 B.buildCopy(DstReg, LiveIn); 4306 } 4307 4308 return true; 4309 } 4310 4311 bool AMDGPULegalizerInfo::loadInputValue( 4312 Register DstReg, MachineIRBuilder &B, 4313 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 4314 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4315 const ArgDescriptor *Arg = nullptr; 4316 const TargetRegisterClass *ArgRC; 4317 LLT ArgTy; 4318 4319 CallingConv::ID CC = B.getMF().getFunction().getCallingConv(); 4320 const ArgDescriptor WorkGroupIDX = 4321 ArgDescriptor::createRegister(AMDGPU::TTMP9); 4322 // If GridZ is not programmed in an entry function then the hardware will set 4323 // it to all zeros, so there is no need to mask the GridY value in the low 4324 // order bits. 4325 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister( 4326 AMDGPU::TTMP7, 4327 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu); 4328 const ArgDescriptor WorkGroupIDZ = 4329 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u); 4330 if (ST.hasArchitectedSGPRs() && 4331 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) { 4332 switch (ArgType) { 4333 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X: 4334 Arg = &WorkGroupIDX; 4335 ArgRC = &AMDGPU::SReg_32RegClass; 4336 ArgTy = LLT::scalar(32); 4337 break; 4338 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y: 4339 Arg = &WorkGroupIDY; 4340 ArgRC = &AMDGPU::SReg_32RegClass; 4341 ArgTy = LLT::scalar(32); 4342 break; 4343 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z: 4344 Arg = &WorkGroupIDZ; 4345 ArgRC = &AMDGPU::SReg_32RegClass; 4346 ArgTy = LLT::scalar(32); 4347 break; 4348 default: 4349 break; 4350 } 4351 } 4352 4353 if (!Arg) 4354 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 4355 4356 if (!Arg) { 4357 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 4358 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 4359 // case the pointer argument may be missing and we use null. 4360 B.buildConstant(DstReg, 0); 4361 return true; 4362 } 4363 4364 // It's undefined behavior if a function marked with the amdgpu-no-* 4365 // attributes uses the corresponding intrinsic. 
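    // Emitting G_IMPLICIT_DEF keeps the MIR well-formed without requiring an
    // input register the attribute promised is not needed.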
4366 B.buildUndef(DstReg); 4367 return true; 4368 } 4369 4370 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 4371 return false; // TODO: Handle these 4372 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 4373 } 4374 4375 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 4376 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 4377 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 4378 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 4379 return false; 4380 4381 MI.eraseFromParent(); 4382 return true; 4383 } 4384 4385 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, 4386 int64_t C) { 4387 B.buildConstant(MI.getOperand(0).getReg(), C); 4388 MI.eraseFromParent(); 4389 return true; 4390 } 4391 4392 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic( 4393 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 4394 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 4395 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim); 4396 if (MaxID == 0) 4397 return replaceWithConstant(B, MI, 0); 4398 4399 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4400 const ArgDescriptor *Arg; 4401 const TargetRegisterClass *ArgRC; 4402 LLT ArgTy; 4403 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 4404 4405 Register DstReg = MI.getOperand(0).getReg(); 4406 if (!Arg) { 4407 // It's undefined behavior if a function marked with the amdgpu-no-* 4408 // attributes uses the corresponding intrinsic. 4409 B.buildUndef(DstReg); 4410 MI.eraseFromParent(); 4411 return true; 4412 } 4413 4414 if (Arg->isMasked()) { 4415 // Don't bother inserting AssertZext for packed IDs since we're emitting the 4416 // masking operations anyway. 4417 // 4418 // TODO: We could assert the top bit is 0 for the source copy. 4419 if (!loadInputValue(DstReg, B, ArgType)) 4420 return false; 4421 } else { 4422 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); 4423 if (!loadInputValue(TmpReg, B, ArgType)) 4424 return false; 4425 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID)); 4426 } 4427 4428 MI.eraseFromParent(); 4429 return true; 4430 } 4431 4432 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B, 4433 int64_t Offset) const { 4434 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 4435 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy); 4436 4437 // TODO: If we passed in the base kernel offset we could have a better 4438 // alignment than 4, but we don't really need it. 4439 if (!loadInputValue(KernArgReg, B, 4440 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 4441 llvm_unreachable("failed to find kernarg segment ptr"); 4442 4443 auto COffset = B.buildConstant(LLT::scalar(64), Offset); 4444 // TODO: Should get nuw 4445 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0); 4446 } 4447 4448 /// Legalize a value that's loaded from kernel arguments. This is only used by 4449 /// legacy intrinsics. 
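/// The value is materialized as a 32-bit dereferenceable, invariant load from
/// the kernarg segment pointer at \p Offset.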
4450 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI, 4451 MachineIRBuilder &B, 4452 uint64_t Offset, 4453 Align Alignment) const { 4454 Register DstReg = MI.getOperand(0).getReg(); 4455 4456 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) && 4457 "unexpected kernarg parameter type"); 4458 4459 Register Ptr = getKernargParameterPtr(B, Offset); 4460 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 4461 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4), 4462 MachineMemOperand::MODereferenceable | 4463 MachineMemOperand::MOInvariant); 4464 MI.eraseFromParent(); 4465 return true; 4466 } 4467 4468 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 4469 MachineRegisterInfo &MRI, 4470 MachineIRBuilder &B) const { 4471 Register Dst = MI.getOperand(0).getReg(); 4472 LLT DstTy = MRI.getType(Dst); 4473 LLT S16 = LLT::scalar(16); 4474 LLT S32 = LLT::scalar(32); 4475 LLT S64 = LLT::scalar(64); 4476 4477 if (DstTy == S16) 4478 return legalizeFDIV16(MI, MRI, B); 4479 if (DstTy == S32) 4480 return legalizeFDIV32(MI, MRI, B); 4481 if (DstTy == S64) 4482 return legalizeFDIV64(MI, MRI, B); 4483 4484 return false; 4485 } 4486 4487 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 4488 Register DstDivReg, 4489 Register DstRemReg, 4490 Register X, 4491 Register Y) const { 4492 const LLT S1 = LLT::scalar(1); 4493 const LLT S32 = LLT::scalar(32); 4494 4495 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 4496 // algorithm used here. 4497 4498 // Initial estimate of inv(y). 4499 auto FloatY = B.buildUITOFP(S32, Y); 4500 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 4501 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe)); 4502 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 4503 auto Z = B.buildFPTOUI(S32, ScaledY); 4504 4505 // One round of UNR. 4506 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 4507 auto NegYZ = B.buildMul(S32, NegY, Z); 4508 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 4509 4510 // Quotient/remainder estimate. 4511 auto Q = B.buildUMulH(S32, X, Z); 4512 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 4513 4514 // First quotient/remainder refinement. 4515 auto One = B.buildConstant(S32, 1); 4516 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 4517 if (DstDivReg) 4518 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 4519 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 4520 4521 // Second quotient/remainder refinement. 
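  // With a single Newton-Raphson refinement of the reciprocal, the quotient
  // estimate is expected to undershoot by at most two, so two of these
  // conditional corrections are sufficient.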
4522 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 4523 if (DstDivReg) 4524 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 4525 4526 if (DstRemReg) 4527 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 4528 } 4529 4530 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 4531 // 4532 // Return lo, hi of result 4533 // 4534 // %cvt.lo = G_UITOFP Val.lo 4535 // %cvt.hi = G_UITOFP Val.hi 4536 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 4537 // %rcp = G_AMDGPU_RCP_IFLAG %mad 4538 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 4539 // %mul2 = G_FMUL %mul1, 2**(-32) 4540 // %trunc = G_INTRINSIC_TRUNC %mul2 4541 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 4542 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 4543 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 4544 Register Val) { 4545 const LLT S32 = LLT::scalar(32); 4546 auto Unmerge = B.buildUnmerge(S32, Val); 4547 4548 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 4549 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 4550 4551 auto Mad = B.buildFMAD( 4552 S32, CvtHi, // 2**32 4553 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo); 4554 4555 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 4556 auto Mul1 = B.buildFMul( 4557 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc))); 4558 4559 // 2**(-32) 4560 auto Mul2 = B.buildFMul( 4561 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000))); 4562 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 4563 4564 // -(2**32) 4565 auto Mad2 = B.buildFMAD( 4566 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)), 4567 Mul1); 4568 4569 auto ResultLo = B.buildFPTOUI(S32, Mad2); 4570 auto ResultHi = B.buildFPTOUI(S32, Trunc); 4571 4572 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 4573 } 4574 4575 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 4576 Register DstDivReg, 4577 Register DstRemReg, 4578 Register Numer, 4579 Register Denom) const { 4580 const LLT S32 = LLT::scalar(32); 4581 const LLT S64 = LLT::scalar(64); 4582 const LLT S1 = LLT::scalar(1); 4583 Register RcpLo, RcpHi; 4584 4585 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 4586 4587 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi}); 4588 4589 auto Zero64 = B.buildConstant(S64, 0); 4590 auto NegDenom = B.buildSub(S64, Zero64, Denom); 4591 4592 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 4593 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 4594 4595 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 4596 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 4597 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 4598 4599 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 4600 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 4601 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi}); 4602 4603 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 4604 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 4605 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 4606 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 4607 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 4608 4609 auto Zero32 = B.buildConstant(S32, 0); 4610 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 4611 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 4612 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi}); 4613 4614 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 4615 Register NumerLo = UnmergeNumer.getReg(0); 4616 Register NumerHi = UnmergeNumer.getReg(1); 4617 4618 auto 
MulHi3 = B.buildUMulH(S64, Numer, Add2); 4619 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 4620 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 4621 Register Mul3_Lo = UnmergeMul3.getReg(0); 4622 Register Mul3_Hi = UnmergeMul3.getReg(1); 4623 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 4624 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 4625 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 4626 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi}); 4627 4628 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 4629 Register DenomLo = UnmergeDenom.getReg(0); 4630 Register DenomHi = UnmergeDenom.getReg(1); 4631 4632 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 4633 auto C1 = B.buildSExt(S32, CmpHi); 4634 4635 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 4636 auto C2 = B.buildSExt(S32, CmpLo); 4637 4638 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 4639 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 4640 4641 // TODO: Here and below portions of the code can be enclosed into if/endif. 4642 // Currently control flow is unconditional and we have 4 selects after 4643 // potential endif to substitute PHIs. 4644 4645 // if C3 != 0 ... 4646 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 4647 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 4648 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 4649 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi}); 4650 4651 auto One64 = B.buildConstant(S64, 1); 4652 auto Add3 = B.buildAdd(S64, MulHi3, One64); 4653 4654 auto C4 = 4655 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 4656 auto C5 = 4657 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 4658 auto C6 = B.buildSelect( 4659 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 4660 4661 // if (C6 != 0) 4662 auto Add4 = B.buildAdd(S64, Add3, One64); 4663 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 4664 4665 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 4666 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 4667 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi}); 4668 4669 // endif C6 4670 // endif C3 4671 4672 if (DstDivReg) { 4673 auto Sel1 = B.buildSelect( 4674 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 4675 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 4676 Sel1, MulHi3); 4677 } 4678 4679 if (DstRemReg) { 4680 auto Sel2 = B.buildSelect( 4681 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 4682 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 4683 Sel2, Sub1); 4684 } 4685 } 4686 4687 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 4688 MachineRegisterInfo &MRI, 4689 MachineIRBuilder &B) const { 4690 Register DstDivReg, DstRemReg; 4691 switch (MI.getOpcode()) { 4692 default: 4693 llvm_unreachable("Unexpected opcode!"); 4694 case AMDGPU::G_UDIV: { 4695 DstDivReg = MI.getOperand(0).getReg(); 4696 break; 4697 } 4698 case AMDGPU::G_UREM: { 4699 DstRemReg = MI.getOperand(0).getReg(); 4700 break; 4701 } 4702 case AMDGPU::G_UDIVREM: { 4703 DstDivReg = MI.getOperand(0).getReg(); 4704 DstRemReg = MI.getOperand(1).getReg(); 4705 break; 4706 } 4707 } 4708 4709 const LLT S64 = LLT::scalar(64); 4710 const LLT S32 = LLT::scalar(32); 4711 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 4712 Register Num = 
MI.getOperand(FirstSrcOpIdx).getReg(); 4713 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 4714 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 4715 4716 if (Ty == S32) 4717 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 4718 else if (Ty == S64) 4719 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 4720 else 4721 return false; 4722 4723 MI.eraseFromParent(); 4724 return true; 4725 } 4726 4727 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 4728 MachineRegisterInfo &MRI, 4729 MachineIRBuilder &B) const { 4730 const LLT S64 = LLT::scalar(64); 4731 const LLT S32 = LLT::scalar(32); 4732 4733 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 4734 if (Ty != S32 && Ty != S64) 4735 return false; 4736 4737 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 4738 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 4739 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 4740 4741 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 4742 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 4743 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 4744 4745 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 4746 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 4747 4748 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 4749 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 4750 4751 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 4752 switch (MI.getOpcode()) { 4753 default: 4754 llvm_unreachable("Unexpected opcode!"); 4755 case AMDGPU::G_SDIV: { 4756 DstDivReg = MI.getOperand(0).getReg(); 4757 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 4758 break; 4759 } 4760 case AMDGPU::G_SREM: { 4761 DstRemReg = MI.getOperand(0).getReg(); 4762 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 4763 break; 4764 } 4765 case AMDGPU::G_SDIVREM: { 4766 DstDivReg = MI.getOperand(0).getReg(); 4767 DstRemReg = MI.getOperand(1).getReg(); 4768 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 4769 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 4770 break; 4771 } 4772 } 4773 4774 if (Ty == S32) 4775 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 4776 else 4777 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 4778 4779 if (DstDivReg) { 4780 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 4781 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 4782 B.buildSub(DstDivReg, SignXor, Sign); 4783 } 4784 4785 if (DstRemReg) { 4786 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 4787 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 4788 B.buildSub(DstRemReg, SignXor, Sign); 4789 } 4790 4791 MI.eraseFromParent(); 4792 return true; 4793 } 4794 4795 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 4796 MachineRegisterInfo &MRI, 4797 MachineIRBuilder &B) const { 4798 Register Res = MI.getOperand(0).getReg(); 4799 Register LHS = MI.getOperand(1).getReg(); 4800 Register RHS = MI.getOperand(2).getReg(); 4801 uint16_t Flags = MI.getFlags(); 4802 LLT ResTy = MRI.getType(Res); 4803 4804 const MachineFunction &MF = B.getMF(); 4805 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) || 4806 MF.getTarget().Options.UnsafeFPMath; 4807 4808 if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) { 4809 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16)) 4810 return false; 4811 4812 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to 4813 // the CI documentation has a worst case error of 1 ulp. 
4814 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to 4815 // use it as long as we aren't trying to use denormals. 4816 // 4817 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp. 4818 4819 // 1 / x -> RCP(x) 4820 if (CLHS->isExactlyValue(1.0)) { 4821 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res) 4822 .addUse(RHS) 4823 .setMIFlags(Flags); 4824 4825 MI.eraseFromParent(); 4826 return true; 4827 } 4828 4829 // -1 / x -> RCP( FNEG(x) ) 4830 if (CLHS->isExactlyValue(-1.0)) { 4831 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 4832 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res) 4833 .addUse(FNeg.getReg(0)) 4834 .setMIFlags(Flags); 4835 4836 MI.eraseFromParent(); 4837 return true; 4838 } 4839 } 4840 4841 // For f16 require afn or arcp. 4842 // For f32 require afn. 4843 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) || 4844 !MI.getFlag(MachineInstr::FmArcp))) 4845 return false; 4846 4847 // x / y -> x * (1.0 / y) 4848 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}) 4849 .addUse(RHS) 4850 .setMIFlags(Flags); 4851 B.buildFMul(Res, LHS, RCP, Flags); 4852 4853 MI.eraseFromParent(); 4854 return true; 4855 } 4856 4857 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 4858 MachineRegisterInfo &MRI, 4859 MachineIRBuilder &B) const { 4860 Register Res = MI.getOperand(0).getReg(); 4861 Register X = MI.getOperand(1).getReg(); 4862 Register Y = MI.getOperand(2).getReg(); 4863 uint16_t Flags = MI.getFlags(); 4864 LLT ResTy = MRI.getType(Res); 4865 4866 const MachineFunction &MF = B.getMF(); 4867 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 4868 MI.getFlag(MachineInstr::FmAfn); 4869 4870 if (!AllowInaccurateRcp) 4871 return false; 4872 4873 auto NegY = B.buildFNeg(ResTy, Y); 4874 auto One = B.buildFConstant(ResTy, 1.0); 4875 4876 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}) 4877 .addUse(Y) 4878 .setMIFlags(Flags); 4879 4880 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 4881 R = B.buildFMA(ResTy, Tmp0, R, R); 4882 4883 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 4884 R = B.buildFMA(ResTy, Tmp1, R, R); 4885 4886 auto Ret = B.buildFMul(ResTy, X, R); 4887 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 4888 4889 B.buildFMA(Res, Tmp2, R, Ret); 4890 MI.eraseFromParent(); 4891 return true; 4892 } 4893 4894 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 4895 MachineRegisterInfo &MRI, 4896 MachineIRBuilder &B) const { 4897 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 4898 return true; 4899 4900 Register Res = MI.getOperand(0).getReg(); 4901 Register LHS = MI.getOperand(1).getReg(); 4902 Register RHS = MI.getOperand(2).getReg(); 4903 4904 uint16_t Flags = MI.getFlags(); 4905 4906 LLT S16 = LLT::scalar(16); 4907 LLT S32 = LLT::scalar(32); 4908 4909 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32 4910 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32 4911 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d 4912 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp 4913 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n 4914 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp 4915 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n 4916 // tmp.u = opx(V_MUL_F32, e32.u, r32.u); 4917 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000) 4918 // q32.u = opx(V_ADD_F32, tmp.u, q32.u); 4919 // q16.u = opx(V_CVT_F16_F32, q32.u); 4920 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n) 4921 4922 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 4923 auto RHSExt = 
B.buildFPExt(S32, RHS, Flags); 4924 auto NegRHSExt = B.buildFNeg(S32, RHSExt); 4925 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}) 4926 .addUse(RHSExt.getReg(0)) 4927 .setMIFlags(Flags); 4928 auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags); 4929 MachineInstrBuilder Err; 4930 if (ST.hasMadMacF32Insts()) { 4931 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags); 4932 Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags); 4933 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags); 4934 } else { 4935 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags); 4936 Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags); 4937 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags); 4938 } 4939 auto Tmp = B.buildFMul(S32, Err, Rcp, Flags); 4940 Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000)); 4941 Quot = B.buildFAdd(S32, Tmp, Quot, Flags); 4942 auto RDst = B.buildFPTrunc(S16, Quot, Flags); 4943 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res) 4944 .addUse(RDst.getReg(0)) 4945 .addUse(RHS) 4946 .addUse(LHS) 4947 .setMIFlags(Flags); 4948 4949 MI.eraseFromParent(); 4950 return true; 4951 } 4952 4953 static constexpr unsigned SPDenormModeBitField = 4954 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2); 4955 4956 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 4957 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 4958 static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, 4959 const GCNSubtarget &ST, 4960 SIModeRegisterDefaults Mode) { 4961 // Set SP denorm mode to this value. 4962 unsigned SPDenormMode = 4963 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 4964 4965 if (ST.hasDenormModeInst()) { 4966 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 
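    // The 4-bit S_DENORM_MODE immediate packs the FP32 denorm control in
    // bits [1:0] and the FP64/FP16 control in bits [3:2], hence the << 2
    // below; the S_SETREG fallback instead writes only the 2-bit FP32 field
    // described by SPDenormModeBitField (offset 4, width 2 of the MODE
    // register).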
4967 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 4968 4969 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 4970 B.buildInstr(AMDGPU::S_DENORM_MODE) 4971 .addImm(NewDenormModeValue); 4972 4973 } else { 4974 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 4975 .addImm(SPDenormMode) 4976 .addImm(SPDenormModeBitField); 4977 } 4978 } 4979 4980 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 4981 MachineRegisterInfo &MRI, 4982 MachineIRBuilder &B) const { 4983 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 4984 return true; 4985 4986 Register Res = MI.getOperand(0).getReg(); 4987 Register LHS = MI.getOperand(1).getReg(); 4988 Register RHS = MI.getOperand(2).getReg(); 4989 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4990 SIModeRegisterDefaults Mode = MFI->getMode(); 4991 4992 uint16_t Flags = MI.getFlags(); 4993 4994 LLT S32 = LLT::scalar(32); 4995 LLT S1 = LLT::scalar(1); 4996 4997 auto One = B.buildFConstant(S32, 1.0f); 4998 4999 auto DenominatorScaled = 5000 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}) 5001 .addUse(LHS) 5002 .addUse(RHS) 5003 .addImm(0) 5004 .setMIFlags(Flags); 5005 auto NumeratorScaled = 5006 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}) 5007 .addUse(LHS) 5008 .addUse(RHS) 5009 .addImm(1) 5010 .setMIFlags(Flags); 5011 5012 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}) 5013 .addUse(DenominatorScaled.getReg(0)) 5014 .setMIFlags(Flags); 5015 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 5016 5017 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE(); 5018 const bool HasDynamicDenormals = 5019 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) || 5020 (Mode.FP32Denormals.Output == DenormalMode::Dynamic); 5021 5022 Register SavedSPDenormMode; 5023 if (!PreservesDenormals) { 5024 if (HasDynamicDenormals) { 5025 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5026 B.buildInstr(AMDGPU::S_GETREG_B32) 5027 .addDef(SavedSPDenormMode) 5028 .addImm(SPDenormModeBitField); 5029 } 5030 toggleSPDenormMode(true, B, ST, Mode); 5031 } 5032 5033 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 5034 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 5035 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 5036 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 5037 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 5038 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 5039 5040 if (!PreservesDenormals) { 5041 if (HasDynamicDenormals) { 5042 assert(SavedSPDenormMode); 5043 B.buildInstr(AMDGPU::S_SETREG_B32) 5044 .addReg(SavedSPDenormMode) 5045 .addImm(SPDenormModeBitField); 5046 } else 5047 toggleSPDenormMode(false, B, ST, Mode); 5048 } 5049 5050 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}) 5051 .addUse(Fma4.getReg(0)) 5052 .addUse(Fma1.getReg(0)) 5053 .addUse(Fma3.getReg(0)) 5054 .addUse(NumeratorScaled.getReg(1)) 5055 .setMIFlags(Flags); 5056 5057 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res) 5058 .addUse(Fmas.getReg(0)) 5059 .addUse(RHS) 5060 .addUse(LHS) 5061 .setMIFlags(Flags); 5062 5063 MI.eraseFromParent(); 5064 return true; 5065 } 5066 5067 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 5068 MachineRegisterInfo &MRI, 5069 MachineIRBuilder &B) const { 5070 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 5071 return true; 5072 5073 Register Res = MI.getOperand(0).getReg(); 5074 Register LHS = 
MI.getOperand(1).getReg(); 5075 Register RHS = MI.getOperand(2).getReg(); 5076 5077 uint16_t Flags = MI.getFlags(); 5078 5079 LLT S64 = LLT::scalar(64); 5080 LLT S1 = LLT::scalar(1); 5081 5082 auto One = B.buildFConstant(S64, 1.0); 5083 5084 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}) 5085 .addUse(LHS) 5086 .addUse(RHS) 5087 .addImm(0) 5088 .setMIFlags(Flags); 5089 5090 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 5091 5092 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}) 5093 .addUse(DivScale0.getReg(0)) 5094 .setMIFlags(Flags); 5095 5096 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 5097 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 5098 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 5099 5100 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}) 5101 .addUse(LHS) 5102 .addUse(RHS) 5103 .addImm(1) 5104 .setMIFlags(Flags); 5105 5106 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 5107 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 5108 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 5109 5110 Register Scale; 5111 if (!ST.hasUsableDivScaleConditionOutput()) { 5112 // Workaround a hardware bug on SI where the condition output from div_scale 5113 // is not usable. 5114 5115 LLT S32 = LLT::scalar(32); 5116 5117 auto NumUnmerge = B.buildUnmerge(S32, LHS); 5118 auto DenUnmerge = B.buildUnmerge(S32, RHS); 5119 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 5120 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 5121 5122 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 5123 Scale1Unmerge.getReg(1)); 5124 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 5125 Scale0Unmerge.getReg(1)); 5126 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 5127 } else { 5128 Scale = DivScale1.getReg(1); 5129 } 5130 5131 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}) 5132 .addUse(Fma4.getReg(0)) 5133 .addUse(Fma3.getReg(0)) 5134 .addUse(Mul.getReg(0)) 5135 .addUse(Scale) 5136 .setMIFlags(Flags); 5137 5138 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res)) 5139 .addUse(Fmas.getReg(0)) 5140 .addUse(RHS) 5141 .addUse(LHS) 5142 .setMIFlags(Flags); 5143 5144 MI.eraseFromParent(); 5145 return true; 5146 } 5147 5148 bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI, 5149 MachineRegisterInfo &MRI, 5150 MachineIRBuilder &B) const { 5151 Register Res0 = MI.getOperand(0).getReg(); 5152 Register Res1 = MI.getOperand(1).getReg(); 5153 Register Val = MI.getOperand(2).getReg(); 5154 uint16_t Flags = MI.getFlags(); 5155 5156 LLT Ty = MRI.getType(Res0); 5157 LLT InstrExpTy = Ty == LLT::scalar(16) ? 
LLT::scalar(16) : LLT::scalar(32); 5158 5159 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty}) 5160 .addUse(Val) 5161 .setMIFlags(Flags); 5162 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy}) 5163 .addUse(Val) 5164 .setMIFlags(Flags); 5165 5166 if (ST.hasFractBug()) { 5167 auto Fabs = B.buildFAbs(Ty, Val); 5168 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty))); 5169 auto IsFinite = 5170 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags); 5171 auto Zero = B.buildConstant(InstrExpTy, 0); 5172 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero); 5173 Mant = B.buildSelect(Ty, IsFinite, Mant, Val); 5174 } 5175 5176 B.buildCopy(Res0, Mant); 5177 B.buildSExtOrTrunc(Res1, Exp); 5178 5179 MI.eraseFromParent(); 5180 return true; 5181 } 5182 5183 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 5184 MachineRegisterInfo &MRI, 5185 MachineIRBuilder &B) const { 5186 Register Res = MI.getOperand(0).getReg(); 5187 Register LHS = MI.getOperand(2).getReg(); 5188 Register RHS = MI.getOperand(3).getReg(); 5189 uint16_t Flags = MI.getFlags(); 5190 5191 LLT S32 = LLT::scalar(32); 5192 LLT S1 = LLT::scalar(1); 5193 5194 auto Abs = B.buildFAbs(S32, RHS, Flags); 5195 const APFloat C0Val(1.0f); 5196 5197 auto C0 = B.buildFConstant(S32, 0x1p+96f); 5198 auto C1 = B.buildFConstant(S32, 0x1p-32f); 5199 auto C2 = B.buildFConstant(S32, 1.0f); 5200 5201 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 5202 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 5203 5204 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 5205 5206 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}) 5207 .addUse(Mul0.getReg(0)) 5208 .setMIFlags(Flags); 5209 5210 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 5211 5212 B.buildFMul(Res, Sel, Mul1, Flags); 5213 5214 MI.eraseFromParent(); 5215 return true; 5216 } 5217 5218 bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI, 5219 MachineRegisterInfo &MRI, 5220 MachineIRBuilder &B) const { 5221 // Bypass the correct expansion that a standard promotion through G_FSQRT would 5222 // get. The f32 op is accurate enough for the f16 case.
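  // In IR terms the expansion below is roughly:
  //   %ext  = fpext half %x to float
  //   %sqrt = call float @llvm.amdgcn.sqrt.f32(float %ext)
  //   %res  = fptrunc float %sqrt to half
  // This path is only reached when the subtarget has no 16-bit instructions
  // (see the assert below), so a native f16 sqrt is not available anyway.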
5223 unsigned Flags = MI.getFlags(); 5224 assert(!ST.has16BitInsts()); 5225 const LLT F32 = LLT::scalar(32); 5226 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags); 5227 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32}) 5228 .addUse(Ext.getReg(0)) 5229 .setMIFlags(Flags); 5230 B.buildFPTrunc(MI.getOperand(0), Log2, Flags); 5231 MI.eraseFromParent(); 5232 return true; 5233 } 5234 5235 bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI, 5236 MachineRegisterInfo &MRI, 5237 MachineIRBuilder &B) const { 5238 MachineFunction &MF = B.getMF(); 5239 Register Dst = MI.getOperand(0).getReg(); 5240 Register X = MI.getOperand(1).getReg(); 5241 const unsigned Flags = MI.getFlags(); 5242 const LLT S1 = LLT::scalar(1); 5243 const LLT F32 = LLT::scalar(32); 5244 const LLT I32 = LLT::scalar(32); 5245 5246 if (allowApproxFunc(MF, Flags)) { 5247 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst})) 5248 .addUse(X) 5249 .setMIFlags(Flags); 5250 MI.eraseFromParent(); 5251 return true; 5252 } 5253 5254 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f); 5255 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags); 5256 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f); 5257 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags); 5258 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags); 5259 5260 Register SqrtS = MRI.createGenericVirtualRegister(F32); 5261 if (needsDenormHandlingF32(MF, X, Flags)) { 5262 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS})) 5263 .addUse(SqrtX.getReg(0)) 5264 .setMIFlags(Flags); 5265 5266 auto NegOne = B.buildConstant(I32, -1); 5267 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne); 5268 5269 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags); 5270 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags); 5271 5272 auto PosOne = B.buildConstant(I32, 1); 5273 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne); 5274 5275 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags); 5276 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags); 5277 5278 auto Zero = B.buildFConstant(F32, 0.0f); 5279 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags); 5280 5281 SqrtS = 5282 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0); 5283 5284 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags); 5285 SqrtS = 5286 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0); 5287 } else { 5288 auto SqrtR = 5289 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0)); 5290 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags); 5291 5292 auto Half = B.buildFConstant(F32, 0.5f); 5293 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags); 5294 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags); 5295 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags); 5296 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags); 5297 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0); 5298 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags); 5299 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags); 5300 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0); 5301 } 5302 5303 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f); 5304 5305 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags); 5306 5307 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0); 5308 5309 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf); 5310 B.buildSelect(Dst, 
IsZeroOrInf, SqrtX, SqrtS, Flags); 5311 5312 MI.eraseFromParent(); 5313 return true; 5314 } 5315 5316 bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI, 5317 MachineRegisterInfo &MRI, 5318 MachineIRBuilder &B) const { 5319 // For double type, the SQRT and RSQ instructions don't have required 5320 // precision, we apply Goldschmidt's algorithm to improve the result: 5321 // 5322 // y0 = rsq(x) 5323 // g0 = x * y0 5324 // h0 = 0.5 * y0 5325 // 5326 // r0 = 0.5 - h0 * g0 5327 // g1 = g0 * r0 + g0 5328 // h1 = h0 * r0 + h0 5329 // 5330 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1 5331 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1 5332 // h2 = h1 * r1 + h1 5333 // 5334 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2 5335 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2 5336 // 5337 // sqrt(x) = g3 5338 5339 const LLT S1 = LLT::scalar(1); 5340 const LLT S32 = LLT::scalar(32); 5341 const LLT F64 = LLT::scalar(64); 5342 5343 Register Dst = MI.getOperand(0).getReg(); 5344 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt"); 5345 5346 Register X = MI.getOperand(1).getReg(); 5347 unsigned Flags = MI.getFlags(); 5348 5349 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767); 5350 5351 auto ZeroInt = B.buildConstant(S32, 0); 5352 auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant); 5353 5354 // Scale up input if it is too small. 5355 auto ScaleUpFactor = B.buildConstant(S32, 256); 5356 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt); 5357 auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags); 5358 5359 auto SqrtY = 5360 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0)); 5361 5362 auto Half = B.buildFConstant(F64, 0.5); 5363 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half); 5364 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY); 5365 5366 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0); 5367 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half); 5368 5369 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0); 5370 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0); 5371 5372 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1); 5373 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX); 5374 5375 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1); 5376 5377 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2); 5378 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX); 5379 5380 auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2); 5381 5382 // Scale down the result. 5383 auto ScaleDownFactor = B.buildConstant(S32, -128); 5384 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt); 5385 SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags); 5386 5387 // TODO: Switch to fcmp oeq 0 for finite only. 
Can't fully remove this check 5388 // with finite only or nsz because rsq(+/-0) = +/-inf 5389 5390 // TODO: Check for DAZ and expand to subnormals 5391 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf); 5392 5393 // If x is +INF, +0, or -0, use its original value 5394 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags); 5395 5396 MI.eraseFromParent(); 5397 return true; 5398 } 5399 5400 bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI, 5401 MachineRegisterInfo &MRI, 5402 MachineIRBuilder &B) const { 5403 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 5404 if (Ty == LLT::scalar(32)) 5405 return legalizeFSQRTF32(MI, MRI, B); 5406 if (Ty == LLT::scalar(64)) 5407 return legalizeFSQRTF64(MI, MRI, B); 5408 if (Ty == LLT::scalar(16)) 5409 return legalizeFSQRTF16(MI, MRI, B); 5410 return false; 5411 } 5412 5413 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 5414 // FIXME: Why do we handle this one but not other removed instructions? 5415 // 5416 // Reciprocal square root. The clamp prevents infinite results, clamping 5417 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 5418 // +-max_float. 5419 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 5420 MachineRegisterInfo &MRI, 5421 MachineIRBuilder &B) const { 5422 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 5423 return true; 5424 5425 Register Dst = MI.getOperand(0).getReg(); 5426 Register Src = MI.getOperand(2).getReg(); 5427 auto Flags = MI.getFlags(); 5428 5429 LLT Ty = MRI.getType(Dst); 5430 5431 const fltSemantics *FltSemantics; 5432 if (Ty == LLT::scalar(32)) 5433 FltSemantics = &APFloat::IEEEsingle(); 5434 else if (Ty == LLT::scalar(64)) 5435 FltSemantics = &APFloat::IEEEdouble(); 5436 else 5437 return false; 5438 5439 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}) 5440 .addUse(Src) 5441 .setMIFlags(Flags); 5442 5443 // We don't need to concern ourselves with the snan handling difference, since 5444 // the rsq quieted (or not) so use the one which will directly select. 5445 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 5446 const bool UseIEEE = MFI->getMode().IEEE; 5447 5448 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 5449 auto ClampMax = UseIEEE ? 
B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 5450 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 5451 5452 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 5453 5454 if (UseIEEE) 5455 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 5456 else 5457 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 5458 MI.eraseFromParent(); 5459 return true; 5460 } 5461 5462 // TODO: Fix pointer type handling 5463 bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, 5464 MachineInstr &MI, 5465 Intrinsic::ID IID) const { 5466 5467 MachineIRBuilder &B = Helper.MIRBuilder; 5468 MachineRegisterInfo &MRI = *B.getMRI(); 5469 5470 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 || 5471 IID == Intrinsic::amdgcn_permlanex16; 5472 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive || 5473 IID == Intrinsic::amdgcn_set_inactive_chain_arg; 5474 5475 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1, 5476 Register Src2, LLT VT) -> Register { 5477 auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0); 5478 switch (IID) { 5479 case Intrinsic::amdgcn_readfirstlane: 5480 case Intrinsic::amdgcn_permlane64: 5481 return LaneOp.getReg(0); 5482 case Intrinsic::amdgcn_readlane: 5483 case Intrinsic::amdgcn_set_inactive: 5484 case Intrinsic::amdgcn_set_inactive_chain_arg: 5485 return LaneOp.addUse(Src1).getReg(0); 5486 case Intrinsic::amdgcn_writelane: 5487 return LaneOp.addUse(Src1).addUse(Src2).getReg(0); 5488 case Intrinsic::amdgcn_permlane16: 5489 case Intrinsic::amdgcn_permlanex16: { 5490 Register Src3 = MI.getOperand(5).getReg(); 5491 Register Src4 = MI.getOperand(6).getImm(); 5492 Register Src5 = MI.getOperand(7).getImm(); 5493 return LaneOp.addUse(Src1) 5494 .addUse(Src2) 5495 .addUse(Src3) 5496 .addImm(Src4) 5497 .addImm(Src5) 5498 .getReg(0); 5499 } 5500 case Intrinsic::amdgcn_mov_dpp8: 5501 return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0); 5502 case Intrinsic::amdgcn_update_dpp: 5503 return LaneOp.addUse(Src1) 5504 .addImm(MI.getOperand(4).getImm()) 5505 .addImm(MI.getOperand(5).getImm()) 5506 .addImm(MI.getOperand(6).getImm()) 5507 .addImm(MI.getOperand(7).getImm()) 5508 .getReg(0); 5509 default: 5510 llvm_unreachable("unhandled lane op"); 5511 } 5512 }; 5513 5514 Register DstReg = MI.getOperand(0).getReg(); 5515 Register Src0 = MI.getOperand(2).getReg(); 5516 Register Src1, Src2; 5517 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane || 5518 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) { 5519 Src1 = MI.getOperand(3).getReg(); 5520 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) { 5521 Src2 = MI.getOperand(4).getReg(); 5522 } 5523 } 5524 5525 LLT Ty = MRI.getType(DstReg); 5526 unsigned Size = Ty.getSizeInBits(); 5527 5528 unsigned SplitSize = 32; 5529 if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) && 5530 ST.hasDPALU_DPP() && 5531 AMDGPU::isLegalDPALU_DPPControl(MI.getOperand(4).getImm())) 5532 SplitSize = 64; 5533 5534 if (Size == SplitSize) { 5535 // Already legal 5536 return true; 5537 } 5538 5539 if (Size < 32) { 5540 Src0 = B.buildAnyExt(S32, Src0).getReg(0); 5541 5542 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) 5543 Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0); 5544 5545 if (IID == Intrinsic::amdgcn_writelane) 5546 Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0); 5547 5548 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32); 5549 B.buildTrunc(DstReg, LaneOpDst); 5550 MI.eraseFromParent(); 5551 return true; 5552 } 5553 5554 
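  // For wider values, the code below unmerges the inputs into SplitSize-wide
  // pieces (32-bit, or 64-bit for update.dpp when the subtarget has a 64-bit
  // DPP ALU), applies the lane op to each piece, and remerges the partial
  // results into the destination register.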
if (Size % SplitSize != 0) 5555 return false; 5556 5557 LLT PartialResTy = LLT::scalar(SplitSize); 5558 if (Ty.isVector()) { 5559 LLT EltTy = Ty.getElementType(); 5560 unsigned EltSize = EltTy.getSizeInBits(); 5561 if (EltSize == SplitSize) { 5562 PartialResTy = EltTy; 5563 } else if (EltSize == 16 || EltSize == 32) { 5564 unsigned NElem = SplitSize / EltSize; 5565 PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem)); 5566 } 5567 // Handle all other cases via S32/S64 pieces; 5568 } 5569 5570 SmallVector<Register, 4> PartialRes; 5571 unsigned NumParts = Size / SplitSize; 5572 MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0); 5573 MachineInstrBuilder Src1Parts, Src2Parts; 5574 5575 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) 5576 Src1Parts = B.buildUnmerge(PartialResTy, Src1); 5577 5578 if (IID == Intrinsic::amdgcn_writelane) 5579 Src2Parts = B.buildUnmerge(PartialResTy, Src2); 5580 5581 for (unsigned i = 0; i < NumParts; ++i) { 5582 Src0 = Src0Parts.getReg(i); 5583 5584 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) 5585 Src1 = Src1Parts.getReg(i); 5586 5587 if (IID == Intrinsic::amdgcn_writelane) 5588 Src2 = Src2Parts.getReg(i); 5589 5590 PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy)); 5591 } 5592 5593 B.buildMergeLikeInstr(DstReg, PartialRes); 5594 MI.eraseFromParent(); 5595 return true; 5596 } 5597 5598 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 5599 MachineRegisterInfo &MRI, 5600 MachineIRBuilder &B) const { 5601 uint64_t Offset = 5602 ST.getTargetLowering()->getImplicitParameterOffset( 5603 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 5604 LLT DstTy = MRI.getType(DstReg); 5605 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 5606 5607 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 5608 if (!loadInputValue(KernargPtrReg, B, 5609 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 5610 return false; 5611 5612 // FIXME: This should be nuw 5613 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 5614 return true; 5615 } 5616 5617 /// To create a buffer resource from a 64-bit pointer, mask off the upper 32 5618 /// bits of the pointer and replace them with the stride argument, then 5619 /// merge_values everything together. In the common case of a raw buffer (the 5620 /// stride component is 0), we can just AND off the upper half. 
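/// The dword layout of the <4 x s32> resource built below is:
///   word0 = pointer[31:0]
///   word1 = pointer[47:32] | (stride << 16)
///   word2 = NumRecords
///   word3 = Flags
/// (an informal sketch of what this helper emits; the full descriptor
/// semantics are defined by the hardware).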
5621 bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin( 5622 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 5623 Register Result = MI.getOperand(0).getReg(); 5624 Register Pointer = MI.getOperand(2).getReg(); 5625 Register Stride = MI.getOperand(3).getReg(); 5626 Register NumRecords = MI.getOperand(4).getReg(); 5627 Register Flags = MI.getOperand(5).getReg(); 5628 5629 LLT S32 = LLT::scalar(32); 5630 5631 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 5632 auto Unmerge = B.buildUnmerge(S32, Pointer); 5633 Register LowHalf = Unmerge.getReg(0); 5634 Register HighHalf = Unmerge.getReg(1); 5635 5636 auto AndMask = B.buildConstant(S32, 0x0000ffff); 5637 auto Masked = B.buildAnd(S32, HighHalf, AndMask); 5638 5639 MachineInstrBuilder NewHighHalf = Masked; 5640 std::optional<ValueAndVReg> StrideConst = 5641 getIConstantVRegValWithLookThrough(Stride, MRI); 5642 if (!StrideConst || !StrideConst->Value.isZero()) { 5643 MachineInstrBuilder ShiftedStride; 5644 if (StrideConst) { 5645 uint32_t StrideVal = StrideConst->Value.getZExtValue(); 5646 uint32_t ShiftedStrideVal = StrideVal << 16; 5647 ShiftedStride = B.buildConstant(S32, ShiftedStrideVal); 5648 } else { 5649 auto ExtStride = B.buildAnyExt(S32, Stride); 5650 auto ShiftConst = B.buildConstant(S32, 16); 5651 ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst); 5652 } 5653 NewHighHalf = B.buildOr(S32, Masked, ShiftedStride); 5654 } 5655 Register NewHighHalfReg = NewHighHalf.getReg(0); 5656 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags}); 5657 MI.eraseFromParent(); 5658 return true; 5659 } 5660 5661 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 5662 MachineRegisterInfo &MRI, 5663 MachineIRBuilder &B) const { 5664 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 5665 if (!MFI->isEntryFunction()) { 5666 return legalizePreloadedArgIntrin(MI, MRI, B, 5667 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 5668 } 5669 5670 Register DstReg = MI.getOperand(0).getReg(); 5671 if (!getImplicitArgPtr(DstReg, MRI, B)) 5672 return false; 5673 5674 MI.eraseFromParent(); 5675 return true; 5676 } 5677 5678 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg, 5679 MachineRegisterInfo &MRI, 5680 MachineIRBuilder &B) const { 5681 Function &F = B.getMF().getFunction(); 5682 std::optional<uint32_t> KnownSize = 5683 AMDGPUMachineFunction::getLDSKernelIdMetadata(F); 5684 if (KnownSize.has_value()) 5685 B.buildConstant(DstReg, *KnownSize); 5686 return false; 5687 } 5688 5689 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI, 5690 MachineRegisterInfo &MRI, 5691 MachineIRBuilder &B) const { 5692 5693 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 5694 if (!MFI->isEntryFunction()) { 5695 return legalizePreloadedArgIntrin(MI, MRI, B, 5696 AMDGPUFunctionArgInfo::LDS_KERNEL_ID); 5697 } 5698 5699 Register DstReg = MI.getOperand(0).getReg(); 5700 if (!getLDSKernelId(DstReg, MRI, B)) 5701 return false; 5702 5703 MI.eraseFromParent(); 5704 return true; 5705 } 5706 5707 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 5708 MachineRegisterInfo &MRI, 5709 MachineIRBuilder &B, 5710 unsigned AddrSpace) const { 5711 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 5712 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 5713 Register Hi32 = Unmerge.getReg(1); 5714 5715 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 5716 MI.eraseFromParent(); 5717 return true; 5718 } 5719 
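// A small worked example for splitBufferOffsets below, assuming a 12-bit
// immediate field (MaxImm == 4095; the exact limit comes from
// SIInstrInfo::getMaxMUBUFImmOffset and varies by subtarget): a constant
// offset of 4100 becomes ImmOffset = 4 plus a voffset contribution of 4096,
// keeping the voffset part a power of two so it can be CSEd with neighbouring
// accesses; if the leftover would be negative, the whole constant goes into
// the voffset instead.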
5720 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 5721 // offset (the offset that is included in bounds checking and swizzling, to be 5722 // split between the instruction's voffset and immoffset fields) and soffset 5723 // (the offset that is excluded from bounds checking and swizzling, to go in 5724 // the instruction's soffset field). This function takes the first kind of 5725 // offset and figures out how to split it between voffset and immoffset. 5726 std::pair<Register, unsigned> 5727 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 5728 Register OrigOffset) const { 5729 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST); 5730 Register BaseReg; 5731 unsigned ImmOffset; 5732 const LLT S32 = LLT::scalar(32); 5733 MachineRegisterInfo &MRI = *B.getMRI(); 5734 5735 std::tie(BaseReg, ImmOffset) = 5736 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 5737 5738 // If BaseReg is a pointer, convert it to int. 5739 if (MRI.getType(BaseReg).isPointer()) 5740 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 5741 5742 // If the immediate value is too big for the immoffset field, put only bits 5743 // that would normally fit in the immoffset field. The remaining value that 5744 // is copied/added for the voffset field is a large power of 2, and it 5745 // stands more chance of being CSEd with the copy/add for another similar 5746 // load/store. 5747 // However, do not do that rounding down if that is a negative 5748 // number, as it appears to be illegal to have a negative offset in the 5749 // vgpr, even if adding the immediate offset makes it positive. 5750 unsigned Overflow = ImmOffset & ~MaxImm; 5751 ImmOffset -= Overflow; 5752 if ((int32_t)Overflow < 0) { 5753 Overflow += ImmOffset; 5754 ImmOffset = 0; 5755 } 5756 5757 if (Overflow != 0) { 5758 if (!BaseReg) { 5759 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 5760 } else { 5761 auto OverflowVal = B.buildConstant(S32, Overflow); 5762 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 5763 } 5764 } 5765 5766 if (!BaseReg) 5767 BaseReg = B.buildConstant(S32, 0).getReg(0); 5768 5769 return std::pair(BaseReg, ImmOffset); 5770 } 5771 5772 /// Handle register layout difference for f16 images for some subtargets. 
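/// Three layouts are produced below: subtargets with unpacked d16 VMEM widen
/// each s16 element into its own s32 register; subtargets with the d16 image
/// store bug repack the data into whole dwords padded with undef; and
/// <3 x s16> is padded to <4 x s16> so the value has a dword-aligned size.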
5773 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 5774 MachineRegisterInfo &MRI, 5775 Register Reg, 5776 bool ImageStore) const { 5777 const LLT S16 = LLT::scalar(16); 5778 const LLT S32 = LLT::scalar(32); 5779 LLT StoreVT = MRI.getType(Reg); 5780 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 5781 5782 if (ST.hasUnpackedD16VMem()) { 5783 auto Unmerge = B.buildUnmerge(S16, Reg); 5784 5785 SmallVector<Register, 4> WideRegs; 5786 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 5787 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 5788 5789 int NumElts = StoreVT.getNumElements(); 5790 5791 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 5792 .getReg(0); 5793 } 5794 5795 if (ImageStore && ST.hasImageStoreD16Bug()) { 5796 if (StoreVT.getNumElements() == 2) { 5797 SmallVector<Register, 4> PackedRegs; 5798 Reg = B.buildBitcast(S32, Reg).getReg(0); 5799 PackedRegs.push_back(Reg); 5800 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 5801 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 5802 .getReg(0); 5803 } 5804 5805 if (StoreVT.getNumElements() == 3) { 5806 SmallVector<Register, 4> PackedRegs; 5807 auto Unmerge = B.buildUnmerge(S16, Reg); 5808 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 5809 PackedRegs.push_back(Unmerge.getReg(I)); 5810 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 5811 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 5812 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 5813 } 5814 5815 if (StoreVT.getNumElements() == 4) { 5816 SmallVector<Register, 4> PackedRegs; 5817 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 5818 auto Unmerge = B.buildUnmerge(S32, Reg); 5819 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 5820 PackedRegs.push_back(Unmerge.getReg(I)); 5821 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 5822 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 5823 .getReg(0); 5824 } 5825 5826 llvm_unreachable("invalid data type"); 5827 } 5828 5829 if (StoreVT == LLT::fixed_vector(3, S16)) { 5830 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 5831 .getReg(0); 5832 } 5833 return Reg; 5834 } 5835 5836 Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B, 5837 Register VData, LLT MemTy, 5838 bool IsFormat) const { 5839 MachineRegisterInfo *MRI = B.getMRI(); 5840 LLT Ty = MRI->getType(VData); 5841 5842 const LLT S16 = LLT::scalar(16); 5843 5844 // Fixup buffer resources themselves needing to be v4i128. 5845 if (hasBufferRsrcWorkaround(Ty)) 5846 return castBufferRsrcToV4I32(VData, B); 5847 5848 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) { 5849 Ty = getBitcastRegisterType(Ty); 5850 VData = B.buildBitcast(Ty, VData).getReg(0); 5851 } 5852 // Fixup illegal register types for i8 stores. 
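  // (scalar s16 takes the same path): widen the value to s32 with G_ANYEXT,
  // since the byte/short buffer store variants expect their data in a 32-bit
  // register.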
5853 if (Ty == LLT::scalar(8) || Ty == S16) { 5854 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 5855 return AnyExt; 5856 } 5857 5858 if (Ty.isVector()) { 5859 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 5860 if (IsFormat) 5861 return handleD16VData(B, *MRI, VData); 5862 } 5863 } 5864 5865 return VData; 5866 } 5867 5868 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 5869 LegalizerHelper &Helper, 5870 bool IsTyped, 5871 bool IsFormat) const { 5872 MachineIRBuilder &B = Helper.MIRBuilder; 5873 MachineRegisterInfo &MRI = *B.getMRI(); 5874 5875 Register VData = MI.getOperand(1).getReg(); 5876 LLT Ty = MRI.getType(VData); 5877 LLT EltTy = Ty.getScalarType(); 5878 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 5879 const LLT S32 = LLT::scalar(32); 5880 5881 MachineMemOperand *MMO = *MI.memoperands_begin(); 5882 const int MemSize = MMO->getSize().getValue(); 5883 LLT MemTy = MMO->getMemoryType(); 5884 5885 VData = fixStoreSourceType(B, VData, MemTy, IsFormat); 5886 5887 castBufferRsrcArgToV4I32(MI, B, 2); 5888 Register RSrc = MI.getOperand(2).getReg(); 5889 5890 unsigned ImmOffset; 5891 5892 // The typed intrinsics add an immediate after the registers. 5893 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 5894 5895 // The struct intrinsic variants add one additional operand over raw. 5896 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 5897 Register VIndex; 5898 int OpOffset = 0; 5899 if (HasVIndex) { 5900 VIndex = MI.getOperand(3).getReg(); 5901 OpOffset = 1; 5902 } else { 5903 VIndex = B.buildConstant(S32, 0).getReg(0); 5904 } 5905 5906 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 5907 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 5908 5909 unsigned Format = 0; 5910 if (IsTyped) { 5911 Format = MI.getOperand(5 + OpOffset).getImm(); 5912 ++OpOffset; 5913 } 5914 5915 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 5916 5917 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 5918 5919 unsigned Opc; 5920 if (IsTyped) { 5921 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 5922 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 5923 } else if (IsFormat) { 5924 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 5925 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 5926 } else { 5927 switch (MemSize) { 5928 case 1: 5929 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 5930 break; 5931 case 2: 5932 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 5933 break; 5934 default: 5935 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 5936 break; 5937 } 5938 } 5939 5940 auto MIB = B.buildInstr(Opc) 5941 .addUse(VData) // vdata 5942 .addUse(RSrc) // rsrc 5943 .addUse(VIndex) // vindex 5944 .addUse(VOffset) // voffset 5945 .addUse(SOffset) // soffset 5946 .addImm(ImmOffset); // offset(imm) 5947 5948 if (IsTyped) 5949 MIB.addImm(Format); 5950 5951 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 5952 .addImm(HasVIndex ? 
-1 : 0) // idxen(imm) 5953 .addMemOperand(MMO); 5954 5955 MI.eraseFromParent(); 5956 return true; 5957 } 5958 5959 static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, 5960 Register VIndex, Register VOffset, Register SOffset, 5961 unsigned ImmOffset, unsigned Format, 5962 unsigned AuxiliaryData, MachineMemOperand *MMO, 5963 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) { 5964 auto MIB = B.buildInstr(Opc) 5965 .addDef(LoadDstReg) // vdata 5966 .addUse(RSrc) // rsrc 5967 .addUse(VIndex) // vindex 5968 .addUse(VOffset) // voffset 5969 .addUse(SOffset) // soffset 5970 .addImm(ImmOffset); // offset(imm) 5971 5972 if (IsTyped) 5973 MIB.addImm(Format); 5974 5975 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 5976 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 5977 .addMemOperand(MMO); 5978 } 5979 5980 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 5981 LegalizerHelper &Helper, 5982 bool IsFormat, 5983 bool IsTyped) const { 5984 MachineIRBuilder &B = Helper.MIRBuilder; 5985 MachineRegisterInfo &MRI = *B.getMRI(); 5986 GISelChangeObserver &Observer = Helper.Observer; 5987 5988 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 5989 MachineMemOperand *MMO = *MI.memoperands_begin(); 5990 const LLT MemTy = MMO->getMemoryType(); 5991 const LLT S32 = LLT::scalar(32); 5992 5993 Register Dst = MI.getOperand(0).getReg(); 5994 5995 Register StatusDst; 5996 int OpOffset = 0; 5997 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2); 5998 bool IsTFE = MI.getNumExplicitDefs() == 2; 5999 if (IsTFE) { 6000 StatusDst = MI.getOperand(1).getReg(); 6001 ++OpOffset; 6002 } 6003 6004 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset); 6005 Register RSrc = MI.getOperand(2 + OpOffset).getReg(); 6006 6007 // The typed intrinsics add an immediate after the registers. 6008 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 6009 6010 // The struct intrinsic variants add one additional operand over raw. 6011 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset; 6012 Register VIndex; 6013 if (HasVIndex) { 6014 VIndex = MI.getOperand(3 + OpOffset).getReg(); 6015 ++OpOffset; 6016 } else { 6017 VIndex = B.buildConstant(S32, 0).getReg(0); 6018 } 6019 6020 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 6021 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 6022 6023 unsigned Format = 0; 6024 if (IsTyped) { 6025 Format = MI.getOperand(5 + OpOffset).getImm(); 6026 ++OpOffset; 6027 } 6028 6029 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 6030 unsigned ImmOffset; 6031 6032 LLT Ty = MRI.getType(Dst); 6033 // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the 6034 // logic doesn't have to handle that case. 
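  // Similarly, result types that are not naturally dword-based are bitcast to
  // an equivalent 32-bit layout below, so only dword-sized register layouts
  // reach instruction selection.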
6035 if (hasBufferRsrcWorkaround(Ty)) { 6036 Observer.changingInstr(MI); 6037 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0); 6038 Observer.changedInstr(MI); 6039 Dst = MI.getOperand(0).getReg(); 6040 B.setInsertPt(B.getMBB(), MI); 6041 } 6042 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) { 6043 Ty = getBitcastRegisterType(Ty); 6044 Observer.changingInstr(MI); 6045 Helper.bitcastDst(MI, Ty, 0); 6046 Observer.changedInstr(MI); 6047 Dst = MI.getOperand(0).getReg(); 6048 B.setInsertPt(B.getMBB(), MI); 6049 } 6050 6051 LLT EltTy = Ty.getScalarType(); 6052 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 6053 const bool Unpacked = ST.hasUnpackedD16VMem(); 6054 6055 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 6056 6057 unsigned Opc; 6058 6059 // TODO: Support TFE for typed and narrow loads. 6060 if (IsTyped) { 6061 if (IsTFE) 6062 return false; 6063 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 6064 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 6065 } else if (IsFormat) { 6066 if (IsD16) { 6067 if (IsTFE) 6068 return false; 6069 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16; 6070 } else { 6071 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE 6072 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 6073 } 6074 } else { 6075 switch (MemTy.getSizeInBits()) { 6076 case 8: 6077 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE 6078 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 6079 break; 6080 case 16: 6081 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE 6082 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 6083 break; 6084 default: 6085 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE 6086 : AMDGPU::G_AMDGPU_BUFFER_LOAD; 6087 break; 6088 } 6089 } 6090 6091 if (IsTFE) { 6092 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32); 6093 unsigned NumLoadDWords = NumValueDWords + 1; 6094 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32); 6095 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy); 6096 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, 6097 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); 6098 if (MemTy.getSizeInBits() < 32) { 6099 Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32); 6100 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg); 6101 B.buildTrunc(Dst, ExtDst); 6102 } else if (NumValueDWords == 1) { 6103 B.buildUnmerge({Dst, StatusDst}, LoadDstReg); 6104 } else { 6105 SmallVector<Register, 5> LoadElts; 6106 for (unsigned I = 0; I != NumValueDWords; ++I) 6107 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32)); 6108 LoadElts.push_back(StatusDst); 6109 B.buildUnmerge(LoadElts, LoadDstReg); 6110 LoadElts.truncate(NumValueDWords); 6111 B.buildMergeLikeInstr(Dst, LoadElts); 6112 } 6113 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) || 6114 (IsD16 && !Ty.isVector())) { 6115 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 6116 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, 6117 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); 6118 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 6119 B.buildTrunc(Dst, LoadDstReg); 6120 } else if (Unpacked && IsD16 && Ty.isVector()) { 6121 LLT UnpackedTy = Ty.changeElementSize(32); 6122 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 6123 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, 6124 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); 6125 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 6126 // FIXME: G_TRUNC should work, but legalization currently fails 
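    // Repack manually instead: split the unpacked 32-bit results apart,
    // truncate each piece back to the 16-bit element type, and rebuild the
    // destination vector.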
6127 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 6128 SmallVector<Register, 4> Repack; 6129 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 6130 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 6131 B.buildMergeLikeInstr(Dst, Repack); 6132 } else { 6133 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format, 6134 AuxiliaryData, MMO, IsTyped, HasVIndex, B); 6135 } 6136 6137 MI.eraseFromParent(); 6138 return true; 6139 } 6140 6141 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 6142 switch (IntrID) { 6143 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 6144 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap: 6145 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 6146 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap: 6147 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 6148 case Intrinsic::amdgcn_raw_buffer_atomic_add: 6149 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add: 6150 case Intrinsic::amdgcn_struct_buffer_atomic_add: 6151 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add: 6152 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 6153 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 6154 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub: 6155 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 6156 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub: 6157 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 6158 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 6159 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin: 6160 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 6161 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin: 6162 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 6163 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 6164 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin: 6165 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 6166 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin: 6167 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 6168 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 6169 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax: 6170 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 6171 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax: 6172 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 6173 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 6174 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax: 6175 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 6176 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax: 6177 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 6178 case Intrinsic::amdgcn_raw_buffer_atomic_and: 6179 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and: 6180 case Intrinsic::amdgcn_struct_buffer_atomic_and: 6181 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and: 6182 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 6183 case Intrinsic::amdgcn_raw_buffer_atomic_or: 6184 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or: 6185 case Intrinsic::amdgcn_struct_buffer_atomic_or: 6186 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or: 6187 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 6188 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 6189 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor: 6190 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 6191 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor: 6192 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 6193 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 6194 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc: 6195 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 6196 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc: 6197 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 6198 case 
Intrinsic::amdgcn_raw_buffer_atomic_dec: 6199 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec: 6200 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 6201 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec: 6202 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 6203 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 6204 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: 6205 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 6206 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: 6207 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 6208 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 6209 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: 6210 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 6211 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: 6212 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 6213 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 6214 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: 6215 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 6216 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin: 6217 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 6218 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 6219 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax: 6220 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 6221 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax: 6222 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 6223 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32: 6224 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32: 6225 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32; 6226 default: 6227 llvm_unreachable("unhandled atomic opcode"); 6228 } 6229 } 6230 6231 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 6232 MachineIRBuilder &B, 6233 Intrinsic::ID IID) const { 6234 const bool IsCmpSwap = 6235 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 6236 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap || 6237 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap || 6238 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap; 6239 6240 Register Dst = MI.getOperand(0).getReg(); 6241 // Since we don't have 128-bit atomics, we don't need to handle the case of 6242 // p8 arguments to the atomic itself 6243 Register VData = MI.getOperand(2).getReg(); 6244 6245 Register CmpVal; 6246 int OpOffset = 0; 6247 6248 if (IsCmpSwap) { 6249 CmpVal = MI.getOperand(3).getReg(); 6250 ++OpOffset; 6251 } 6252 6253 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset); 6254 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 6255 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 6256 6257 // The struct intrinsic variants add one additional operand over raw.
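  // HasVIndex below distinguishes the raw and struct forms purely by operand
  // count: the struct variants carry an extra vindex operand between the
  // resource and the voffset, so they have one more operand than the
  // corresponding raw atomic.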
6258 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 6259 Register VIndex; 6260 if (HasVIndex) { 6261 VIndex = MI.getOperand(4 + OpOffset).getReg(); 6262 ++OpOffset; 6263 } else { 6264 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 6265 } 6266 6267 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 6268 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 6269 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 6270 6271 MachineMemOperand *MMO = *MI.memoperands_begin(); 6272 6273 unsigned ImmOffset; 6274 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 6275 6276 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 6277 .addDef(Dst) 6278 .addUse(VData); // vdata 6279 6280 if (IsCmpSwap) 6281 MIB.addReg(CmpVal); 6282 6283 MIB.addUse(RSrc) // rsrc 6284 .addUse(VIndex) // vindex 6285 .addUse(VOffset) // voffset 6286 .addUse(SOffset) // soffset 6287 .addImm(ImmOffset) // offset(imm) 6288 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 6289 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 6290 .addMemOperand(MMO); 6291 6292 MI.eraseFromParent(); 6293 return true; 6294 } 6295 6296 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 6297 /// vector with s16 typed elements. 6298 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 6299 SmallVectorImpl<Register> &PackedAddrs, 6300 unsigned ArgOffset, 6301 const AMDGPU::ImageDimIntrinsicInfo *Intr, 6302 bool IsA16, bool IsG16) { 6303 const LLT S16 = LLT::scalar(16); 6304 const LLT V2S16 = LLT::fixed_vector(2, 16); 6305 auto EndIdx = Intr->VAddrEnd; 6306 6307 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 6308 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 6309 if (!SrcOp.isReg()) 6310 continue; // _L to _LZ may have eliminated this. 6311 6312 Register AddrReg = SrcOp.getReg(); 6313 6314 if ((I < Intr->GradientStart) || 6315 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 6316 (I >= Intr->CoordStart && !IsA16)) { 6317 if ((I < Intr->GradientStart) && IsA16 && 6318 (B.getMRI()->getType(AddrReg) == S16)) { 6319 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument"); 6320 // Special handling of bias when A16 is on. Bias is of type half but 6321 // occupies full 32-bit. 6322 PackedAddrs.push_back( 6323 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 6324 .getReg(0)); 6325 } else { 6326 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && 6327 "Bias needs to be converted to 16 bit in A16 mode"); 6328 // Handle any gradient or coordinate operands that should not be packed 6329 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 6330 PackedAddrs.push_back(AddrReg); 6331 } 6332 } else { 6333 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 6334 // derivatives dx/dh and dx/dv are packed with undef. 
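        // Pair this address with the next one into a <2 x s16> register,
        // unless this is the last operand, the last gradient of the dh or dv
        // block when the per-direction gradient count is odd, or the next
        // operand was dropped by the _L -> _LZ optimization; in those cases
        // pair it with undef instead.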
6335 if (((I + 1) >= EndIdx) || 6336 ((Intr->NumGradients / 2) % 2 == 1 && 6337 (I == static_cast<unsigned>(Intr->GradientStart + 6338 (Intr->NumGradients / 2) - 1) || 6339 I == static_cast<unsigned>(Intr->GradientStart + 6340 Intr->NumGradients - 1))) || 6341 // Check for _L to _LZ optimization 6342 !MI.getOperand(ArgOffset + I + 1).isReg()) { 6343 PackedAddrs.push_back( 6344 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 6345 .getReg(0)); 6346 } else { 6347 PackedAddrs.push_back( 6348 B.buildBuildVector( 6349 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 6350 .getReg(0)); 6351 ++I; 6352 } 6353 } 6354 } 6355 } 6356 6357 /// Convert from separate vaddr components to a single vector address register, 6358 /// and replace the remaining operands with $noreg. 6359 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 6360 int DimIdx, int NumVAddrs) { 6361 const LLT S32 = LLT::scalar(32); 6362 (void)S32; 6363 SmallVector<Register, 8> AddrRegs; 6364 for (int I = 0; I != NumVAddrs; ++I) { 6365 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 6366 if (SrcOp.isReg()) { 6367 AddrRegs.push_back(SrcOp.getReg()); 6368 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 6369 } 6370 } 6371 6372 int NumAddrRegs = AddrRegs.size(); 6373 if (NumAddrRegs != 1) { 6374 auto VAddr = 6375 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs); 6376 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 6377 } 6378 6379 for (int I = 1; I != NumVAddrs; ++I) { 6380 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 6381 if (SrcOp.isReg()) 6382 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 6383 } 6384 } 6385 6386 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 6387 /// 6388 /// Depending on the subtarget, loads/stores with 16-bit element data need to be 6389 /// rewritten to use the low half of 32-bit registers, or directly use a packed 6390 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 6391 /// registers. 6392 /// 6393 /// We don't want to directly select image instructions just yet, but also want 6394 /// to expose all register repacking to the legalizer/combiners. We also don't 6395 /// want a selected instruction entering RegBankSelect. In order to avoid 6396 /// defining a multitude of intermediate image instructions, directly hack on 6397 /// the intrinsic's arguments. In cases like a16 addresses, this requires 6398 /// padding now-unnecessary arguments with $noreg. 6399 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 6400 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 6401 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 6402 6403 const MachineFunction &MF = *MI.getMF(); 6404 const unsigned NumDefs = MI.getNumExplicitDefs(); 6405 const unsigned ArgOffset = NumDefs + 1; 6406 bool IsTFE = NumDefs == 2; 6407 // We are only processing the operands of d16 image operations on subtargets 6408 // that use the unpacked register layout, or need to repack the TFE result. 6409 6410 // TODO: Do we need to guard against already legalized intrinsics?
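  // Naming used below: D16 = 16-bit data (loaded/stored values), A16 = 16-bit
  // addresses/coordinates, G16 = 16-bit gradients. Each is a separate
  // hardware control and is legalized separately.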
6411 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 6412 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 6413 6414 MachineRegisterInfo *MRI = B.getMRI(); 6415 const LLT S32 = LLT::scalar(32); 6416 const LLT S16 = LLT::scalar(16); 6417 const LLT V2S16 = LLT::fixed_vector(2, 16); 6418 6419 unsigned DMask = 0; 6420 Register VData; 6421 LLT Ty; 6422 6423 if (!BaseOpcode->NoReturn || BaseOpcode->Store) { 6424 VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); 6425 Ty = MRI->getType(VData); 6426 } 6427 6428 const bool IsAtomicPacked16Bit = 6429 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || 6430 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16); 6431 6432 // Check for 16 bit addresses and pack if true. 6433 LLT GradTy = 6434 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 6435 LLT AddrTy = 6436 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 6437 const bool IsG16 = 6438 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16; 6439 const bool IsA16 = AddrTy == S16; 6440 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16; 6441 6442 int DMaskLanes = 0; 6443 if (!BaseOpcode->Atomic) { 6444 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 6445 if (BaseOpcode->Gather4) { 6446 DMaskLanes = 4; 6447 } else if (DMask != 0) { 6448 DMaskLanes = llvm::popcount(DMask); 6449 } else if (!IsTFE && !BaseOpcode->Store) { 6450 // If dmask is 0, this is a no-op load. This can be eliminated. 6451 B.buildUndef(MI.getOperand(0)); 6452 MI.eraseFromParent(); 6453 return true; 6454 } 6455 } 6456 6457 Observer.changingInstr(MI); 6458 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 6459 6460 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16 6461 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE; 6462 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 6463 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 6464 unsigned NewOpcode = LoadOpcode; 6465 if (BaseOpcode->Store) 6466 NewOpcode = StoreOpcode; 6467 else if (BaseOpcode->NoReturn) 6468 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET; 6469 6470 // Track that we legalized this 6471 MI.setDesc(B.getTII().get(NewOpcode)); 6472 6473 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 6474 // dmask to be at least 1 otherwise the instruction will fail 6475 if (IsTFE && DMask == 0) { 6476 DMask = 0x1; 6477 DMaskLanes = 1; 6478 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 6479 } 6480 6481 if (BaseOpcode->Atomic) { 6482 Register VData0 = MI.getOperand(2).getReg(); 6483 LLT Ty = MRI->getType(VData0); 6484 6485 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 6486 if (Ty.isVector() && !IsAtomicPacked16Bit) 6487 return false; 6488 6489 if (BaseOpcode->AtomicX2) { 6490 Register VData1 = MI.getOperand(3).getReg(); 6491 // The two values are packed in one register. 6492 LLT PackedTy = LLT::fixed_vector(2, Ty); 6493 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 6494 MI.getOperand(2).setReg(Concat.getReg(0)); 6495 MI.getOperand(3).setReg(AMDGPU::NoRegister); 6496 } 6497 } 6498 6499 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 6500 6501 // Rewrite the addressing register layout before doing anything else. 
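  // First reject combinations the hardware cannot encode: without the G16
  // feature the gradient width is tied to the A16 setting, and A16 itself
  // requires subtarget support.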
6502 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 6503 // 16 bit gradients are supported, but are tied to the A16 control 6504 // so both gradients and addresses must be 16 bit 6505 return false; 6506 } 6507 6508 if (IsA16 && !ST.hasA16()) { 6509 // A16 not supported 6510 return false; 6511 } 6512 6513 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler); 6514 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding(); 6515 6516 if (IsA16 || IsG16) { 6517 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the 6518 // instructions expect VGPR_32 6519 SmallVector<Register, 4> PackedRegs; 6520 6521 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16); 6522 6523 // See also below in the non-a16 branch 6524 const bool UseNSA = ST.hasNSAEncoding() && 6525 PackedRegs.size() >= ST.getNSAThreshold(MF) && 6526 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA); 6527 const bool UsePartialNSA = 6528 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize; 6529 6530 if (UsePartialNSA) { 6531 // Pack registers that would go over NSAMaxSize into last VAddr register 6532 LLT PackedAddrTy = 6533 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16); 6534 auto Concat = B.buildConcatVectors( 6535 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1)); 6536 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0); 6537 PackedRegs.resize(NSAMaxSize); 6538 } else if (!UseNSA && PackedRegs.size() > 1) { 6539 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 6540 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 6541 PackedRegs[0] = Concat.getReg(0); 6542 PackedRegs.resize(1); 6543 } 6544 6545 const unsigned NumPacked = PackedRegs.size(); 6546 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 6547 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 6548 if (!SrcOp.isReg()) { 6549 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 6550 continue; 6551 } 6552 6553 assert(SrcOp.getReg() != AMDGPU::NoRegister); 6554 6555 if (I - Intr->VAddrStart < NumPacked) 6556 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 6557 else 6558 SrcOp.setReg(AMDGPU::NoRegister); 6559 } 6560 } else { 6561 // If the register allocator cannot place the address registers contiguously 6562 // without introducing moves, then using the non-sequential address encoding 6563 // is always preferable, since it saves VALU instructions and is usually a 6564 // wash in terms of code size or even better. 6565 // 6566 // However, we currently have no way of hinting to the register allocator 6567 // that MIMG addresses should be placed contiguously when it is possible to 6568 // do so, so force non-NSA for the common 2-address case as a heuristic. 6569 // 6570 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 6571 // allocation when possible. 6572 // 6573 // Partial NSA is allowed on GFX11+ where the final register is a contiguous 6574 // set of the remaining addresses. 
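// Worked example for the heuristic below (illustrative numbers only,
// assuming an NSA limit of 5 with partial NSA available and a threshold of
// 3): 7 address dwords give UseNSA = true and UsePartialNSA = true, so the
// first 4 vaddr operands stay separate and the trailing 3 dwords are packed
// into one contiguous register tuple by convertImageAddrToPacked.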
6575 const bool UseNSA = ST.hasNSAEncoding() && 6576 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) && 6577 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA); 6578 const bool UsePartialNSA = 6579 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize; 6580 6581 if (UsePartialNSA) { 6582 convertImageAddrToPacked(B, MI, 6583 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1, 6584 Intr->NumVAddrs - NSAMaxSize + 1); 6585 } else if (!UseNSA && Intr->NumVAddrs > 1) { 6586 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 6587 Intr->NumVAddrs); 6588 } 6589 } 6590 6591 int Flags = 0; 6592 if (IsA16) 6593 Flags |= 1; 6594 if (IsG16) 6595 Flags |= 2; 6596 MI.addOperand(MachineOperand::CreateImm(Flags)); 6597 6598 if (BaseOpcode->NoReturn) { // No TFE for stores? 6599 // TODO: Handle dmask trim 6600 if (!Ty.isVector() || !IsD16) 6601 return true; 6602 6603 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 6604 if (RepackedReg != VData) { 6605 MI.getOperand(1).setReg(RepackedReg); 6606 } 6607 6608 return true; 6609 } 6610 6611 Register DstReg = MI.getOperand(0).getReg(); 6612 const LLT EltTy = Ty.getScalarType(); 6613 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 6614 6615 // Confirm that the return type is large enough for the dmask specified 6616 if (NumElts < DMaskLanes) 6617 return false; 6618 6619 if (NumElts > 4 || DMaskLanes > 4) 6620 return false; 6621 6622 // Image atomic instructions are using DMask to specify how many bits 6623 // input/output data will have. 32-bits (s32, v2s16) or 64-bits (s64, v4s16). 6624 // DMaskLanes for image atomic has default value '0'. 6625 // We must be sure that atomic variants (especially packed) will not be 6626 // truncated from v2s16 or v4s16 to s16 type. 6627 // 6628 // ChangeElementCount will be needed for image load where Ty is always scalar. 6629 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 6630 const LLT AdjustedTy = 6631 DMaskLanes == 0 6632 ? Ty 6633 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); 6634 6635 // The raw dword aligned data component of the load. The only legal cases 6636 // where this matters should be when using the packed D16 format, for 6637 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 6638 LLT RoundedTy; 6639 6640 // S32 vector to cover all data, plus TFE result element. 6641 LLT TFETy; 6642 6643 // Register type to use for each loaded component. Will be S32 or V2S16. 6644 LLT RegTy; 6645 6646 if (IsD16 && ST.hasUnpackedD16VMem()) { 6647 RoundedTy = 6648 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); 6649 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 6650 RegTy = S32; 6651 } else { 6652 unsigned EltSize = EltTy.getSizeInBits(); 6653 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 6654 unsigned RoundedSize = 32 * RoundedElts; 6655 RoundedTy = LLT::scalarOrVector( 6656 ElementCount::getFixed(RoundedSize / EltSize), EltSize); 6657 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 6658 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 6659 } 6660 6661 // The return type does not need adjustment. 6662 // TODO: Should we change s16 case to s32 or <2 x s16>? 6663 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 6664 return true; 6665 6666 Register Dst1Reg; 6667 6668 // Insert after the instruction. 6669 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 6670 6671 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 6672 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 
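// Illustrative example (assumed values) for the repacking below: a d16 load
// returning <4 x s16> with TFE on a packed-d16 subtarget gets RoundedTy =
// <4 x s16>, TFETy = <3 x s32> and RegTy = s32; the instruction then defines
// a single <3 x s32> register that is unmerged into two s32 data pieces
// (bitcast back to <2 x s16> further down) plus the s32 error-flag dword.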
6673 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 6674 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 6675 6676 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 6677 6678 MI.getOperand(0).setReg(NewResultReg); 6679 6680 // In the IR, TFE is supposed to be used with a 2 element struct return 6681 // type. The instruction really returns these two values in one contiguous 6682 // register, with one additional dword beyond the loaded data. Rewrite the 6683 // return type to use a single register result. 6684 6685 if (IsTFE) { 6686 Dst1Reg = MI.getOperand(1).getReg(); 6687 if (MRI->getType(Dst1Reg) != S32) 6688 return false; 6689 6690 // TODO: Make sure the TFE operand bit is set. 6691 MI.removeOperand(1); 6692 6693 // Handle the easy case that requires no repack instructions. 6694 if (Ty == S32) { 6695 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 6696 return true; 6697 } 6698 } 6699 6700 // Now figure out how to copy the new result register back into the old 6701 // result. 6702 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 6703 6704 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 6705 6706 if (ResultNumRegs == 1) { 6707 assert(!IsTFE); 6708 ResultRegs[0] = NewResultReg; 6709 } else { 6710 // We have to repack into a new vector of some kind. 6711 for (int I = 0; I != NumDataRegs; ++I) 6712 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 6713 B.buildUnmerge(ResultRegs, NewResultReg); 6714 6715 // Drop the final TFE element to get the data part. The TFE result is 6716 // directly written to the right place already. 6717 if (IsTFE) 6718 ResultRegs.resize(NumDataRegs); 6719 } 6720 6721 // For an s16 scalar result, we form an s32 result with a truncate regardless 6722 // of packed vs. unpacked. 6723 if (IsD16 && !Ty.isVector()) { 6724 B.buildTrunc(DstReg, ResultRegs[0]); 6725 return true; 6726 } 6727 6728 // Avoid a build/concat_vector of 1 entry. 6729 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 6730 B.buildBitcast(DstReg, ResultRegs[0]); 6731 return true; 6732 } 6733 6734 assert(Ty.isVector()); 6735 6736 if (IsD16) { 6737 // For packed D16 results with TFE enabled, all the data components are 6738 // S32. Cast back to the expected type. 6739 // 6740 // TODO: We don't really need to use load s32 elements. We would only need one 6741 // cast for the TFE result if a multiple of v2s16 was used. 6742 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 6743 for (Register &Reg : ResultRegs) 6744 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 6745 } else if (ST.hasUnpackedD16VMem()) { 6746 for (Register &Reg : ResultRegs) 6747 Reg = B.buildTrunc(S16, Reg).getReg(0); 6748 } 6749 } 6750 6751 auto padWithUndef = [&](LLT Ty, int NumElts) { 6752 if (NumElts == 0) 6753 return; 6754 Register Undef = B.buildUndef(Ty).getReg(0); 6755 for (int I = 0; I != NumElts; ++I) 6756 ResultRegs.push_back(Undef); 6757 }; 6758 6759 // Pad out any elements eliminated due to the dmask. 6760 LLT ResTy = MRI->getType(ResultRegs[0]); 6761 if (!ResTy.isVector()) { 6762 padWithUndef(ResTy, NumElts - ResultRegs.size()); 6763 B.buildBuildVector(DstReg, ResultRegs); 6764 return true; 6765 } 6766 6767 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 6768 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 6769 6770 // Deal with the one annoying legal case. 
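// That case is <3 x s16>: it is a legal result type but not dword sized, so
// the repacked data covers a <4 x s16> worth of registers and either the
// extra trailing element is dropped or the result is padded with undef when
// the data comes up short.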
6771 const LLT V3S16 = LLT::fixed_vector(3, 16); 6772 if (Ty == V3S16) { 6773 if (IsTFE) { 6774 if (ResultRegs.size() == 1) { 6775 NewResultReg = ResultRegs[0]; 6776 } else if (ResultRegs.size() == 2) { 6777 LLT V4S16 = LLT::fixed_vector(4, 16); 6778 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); 6779 } else { 6780 return false; 6781 } 6782 } 6783 6784 if (MRI->getType(DstReg).getNumElements() < 6785 MRI->getType(NewResultReg).getNumElements()) { 6786 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); 6787 } else { 6788 B.buildPadVectorWithUndefElements(DstReg, NewResultReg); 6789 } 6790 return true; 6791 } 6792 6793 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 6794 B.buildConcatVectors(DstReg, ResultRegs); 6795 return true; 6796 } 6797 6798 bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper, 6799 MachineInstr &MI) const { 6800 MachineIRBuilder &B = Helper.MIRBuilder; 6801 GISelChangeObserver &Observer = Helper.Observer; 6802 6803 Register OrigDst = MI.getOperand(0).getReg(); 6804 Register Dst; 6805 LLT Ty = B.getMRI()->getType(OrigDst); 6806 unsigned Size = Ty.getSizeInBits(); 6807 MachineFunction &MF = B.getMF(); 6808 unsigned Opc = 0; 6809 if (Size < 32 && ST.hasScalarSubwordLoads()) { 6810 assert(Size == 8 || Size == 16); 6811 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE 6812 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT; 6813 // The 8-bit and 16-bit scalar buffer load instructions have 32-bit 6814 // destination register. 6815 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32)); 6816 } else { 6817 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD; 6818 Dst = OrigDst; 6819 } 6820 6821 Observer.changingInstr(MI); 6822 6823 // Handle needing to s.buffer.load() a p8 value. 6824 if (hasBufferRsrcWorkaround(Ty)) { 6825 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0); 6826 B.setInsertPt(B.getMBB(), MI); 6827 } 6828 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 6829 Ty = getBitcastRegisterType(Ty); 6830 Helper.bitcastDst(MI, Ty, 0); 6831 B.setInsertPt(B.getMBB(), MI); 6832 } 6833 6834 // FIXME: We don't really need this intermediate instruction. The intrinsic 6835 // should be fixed to have a memory operand. Since it's readnone, we're not 6836 // allowed to add one. 6837 MI.setDesc(B.getTII().get(Opc)); 6838 MI.removeOperand(1); // Remove intrinsic ID 6839 6840 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 6841 const unsigned MemSize = (Size + 7) / 8; 6842 const Align MemAlign = B.getDataLayout().getABITypeAlign( 6843 getTypeForLLT(Ty, MF.getFunction().getContext())); 6844 MachineMemOperand *MMO = MF.getMachineMemOperand( 6845 MachinePointerInfo(), 6846 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 6847 MachineMemOperand::MOInvariant, 6848 MemSize, MemAlign); 6849 MI.addMemOperand(MF, MMO); 6850 if (Dst != OrigDst) { 6851 MI.getOperand(0).setReg(Dst); 6852 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 6853 B.buildTrunc(OrigDst, Dst); 6854 } 6855 6856 // If we don't have 96-bit result scalar loads, widening to 128-bit should 6857 // always be legal. We may need to restore this to a 96-bit result if it turns 6858 // out this needs to be converted to a vector load during RegBankSelect. 
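// For instance (illustrative only), a 96-bit <3 x s32> s.buffer.load result
// is widened to <4 x s32> here unless the subtarget has scalar dwordx3
// loads, in which case the odd size is kept.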
6859 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) { 6860 if (Ty.isVector()) 6861 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 6862 else 6863 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 6864 } 6865 6866 Observer.changedInstr(MI); 6867 return true; 6868 } 6869 6870 bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper, 6871 MachineInstr &MI) const { 6872 MachineIRBuilder &B = Helper.MIRBuilder; 6873 GISelChangeObserver &Observer = Helper.Observer; 6874 Observer.changingInstr(MI); 6875 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH)); 6876 MI.removeOperand(0); // Remove intrinsic ID 6877 castBufferRsrcArgToV4I32(MI, B, 0); 6878 Observer.changedInstr(MI); 6879 return true; 6880 } 6881 6882 // TODO: Move to selection 6883 bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI, 6884 MachineRegisterInfo &MRI, 6885 MachineIRBuilder &B) const { 6886 if (!ST.isTrapHandlerEnabled() || 6887 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 6888 return legalizeTrapEndpgm(MI, MRI, B); 6889 6890 return ST.supportsGetDoorbellID() ? 6891 legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B); 6892 } 6893 6894 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 6895 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 6896 const DebugLoc &DL = MI.getDebugLoc(); 6897 MachineBasicBlock &BB = B.getMBB(); 6898 MachineFunction *MF = BB.getParent(); 6899 6900 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) { 6901 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM)) 6902 .addImm(0); 6903 MI.eraseFromParent(); 6904 return true; 6905 } 6906 6907 // We need a block split to make the real endpgm a terminator. We also don't 6908 // want to break phis in successor blocks, so we can't just delete to the 6909 // end of the block. 6910 BB.splitAt(MI, false /*UpdateLiveIns*/); 6911 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 6912 MF->push_back(TrapBB); 6913 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM)) 6914 .addImm(0); 6915 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ)) 6916 .addMBB(TrapBB); 6917 6918 BB.addSuccessor(TrapBB); 6919 MI.eraseFromParent(); 6920 return true; 6921 } 6922 6923 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 6924 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 6925 MachineFunction &MF = B.getMF(); 6926 const LLT S64 = LLT::scalar(64); 6927 6928 Register SGPR01(AMDGPU::SGPR0_SGPR1); 6929 // For code object version 5, queue_ptr is passed through implicit kernarg. 6930 if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >= 6931 AMDGPU::AMDHSA_COV5) { 6932 AMDGPUTargetLowering::ImplicitParameter Param = 6933 AMDGPUTargetLowering::QUEUE_PTR; 6934 uint64_t Offset = 6935 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 6936 6937 Register KernargPtrReg = MRI.createGenericVirtualRegister( 6938 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 6939 6940 if (!loadInputValue(KernargPtrReg, B, 6941 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 6942 return false; 6943 6944 // TODO: can we be smarter about machine pointer info? 
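// In this COV5 path the queue pointer is not a preloaded SGPR input; it is
// loaded from the implicit kernarg area at the implicit-parameter offset
// computed above and then copied into SGPR0_SGPR1, which the trap handler
// takes as its input.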
6945 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 6946 MachineMemOperand *MMO = MF.getMachineMemOperand( 6947 PtrInfo, 6948 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 6949 MachineMemOperand::MOInvariant, 6950 LLT::scalar(64), commonAlignment(Align(64), Offset)); 6951 6952 // Pointer address 6953 Register LoadAddr = MRI.createGenericVirtualRegister( 6954 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 6955 B.buildPtrAdd(LoadAddr, KernargPtrReg, 6956 B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 6957 // Load address 6958 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0); 6959 B.buildCopy(SGPR01, Temp); 6960 B.buildInstr(AMDGPU::S_TRAP) 6961 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 6962 .addReg(SGPR01, RegState::Implicit); 6963 MI.eraseFromParent(); 6964 return true; 6965 } 6966 6967 // Pass queue pointer to trap handler as input, and insert trap instruction 6968 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 6969 Register LiveIn = 6970 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 6971 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 6972 return false; 6973 6974 B.buildCopy(SGPR01, LiveIn); 6975 B.buildInstr(AMDGPU::S_TRAP) 6976 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 6977 .addReg(SGPR01, RegState::Implicit); 6978 6979 MI.eraseFromParent(); 6980 return true; 6981 } 6982 6983 bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI, 6984 MachineRegisterInfo &MRI, 6985 MachineIRBuilder &B) const { 6986 // We need to simulate the 's_trap 2' instruction on targets that run in 6987 // PRIV=1 (where it is treated as a nop). 6988 if (ST.hasPrivEnabledTrap2NopBug()) { 6989 ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI, 6990 MI.getDebugLoc()); 6991 MI.eraseFromParent(); 6992 return true; 6993 } 6994 6995 B.buildInstr(AMDGPU::S_TRAP) 6996 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 6997 MI.eraseFromParent(); 6998 return true; 6999 } 7000 7001 bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI, 7002 MachineRegisterInfo &MRI, 7003 MachineIRBuilder &B) const { 7004 // Is non-HSA path or trap-handler disabled? 
Then, report a warning 7005 // accordingly 7006 if (!ST.isTrapHandlerEnabled() || 7007 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 7008 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 7009 "debugtrap handler not supported", 7010 MI.getDebugLoc(), DS_Warning); 7011 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 7012 Ctx.diagnose(NoTrap); 7013 } else { 7014 // Insert debug-trap instruction 7015 B.buildInstr(AMDGPU::S_TRAP) 7016 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 7017 } 7018 7019 MI.eraseFromParent(); 7020 return true; 7021 } 7022 7023 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 7024 MachineIRBuilder &B) const { 7025 MachineRegisterInfo &MRI = *B.getMRI(); 7026 const LLT S16 = LLT::scalar(16); 7027 const LLT S32 = LLT::scalar(32); 7028 const LLT V2S16 = LLT::fixed_vector(2, 16); 7029 const LLT V3S32 = LLT::fixed_vector(3, 32); 7030 7031 Register DstReg = MI.getOperand(0).getReg(); 7032 Register NodePtr = MI.getOperand(2).getReg(); 7033 Register RayExtent = MI.getOperand(3).getReg(); 7034 Register RayOrigin = MI.getOperand(4).getReg(); 7035 Register RayDir = MI.getOperand(5).getReg(); 7036 Register RayInvDir = MI.getOperand(6).getReg(); 7037 Register TDescr = MI.getOperand(7).getReg(); 7038 7039 if (!ST.hasGFX10_AEncoding()) { 7040 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 7041 "intrinsic not supported on subtarget", 7042 MI.getDebugLoc()); 7043 B.getMF().getFunction().getContext().diagnose(BadIntrin); 7044 return false; 7045 } 7046 7047 const bool IsGFX11 = AMDGPU::isGFX11(ST); 7048 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST); 7049 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST); 7050 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 7051 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 7052 const unsigned NumVDataDwords = 4; 7053 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); 7054 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords; 7055 const bool UseNSA = 7056 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize()); 7057 7058 const unsigned BaseOpcodes[2][2] = { 7059 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 7060 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 7061 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 7062 int Opcode; 7063 if (UseNSA) { 7064 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 7065 IsGFX12Plus ? AMDGPU::MIMGEncGfx12 7066 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA 7067 : AMDGPU::MIMGEncGfx10NSA, 7068 NumVDataDwords, NumVAddrDwords); 7069 } else { 7070 assert(!IsGFX12Plus); 7071 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 7072 IsGFX11 ? 
AMDGPU::MIMGEncGfx11Default 7073 : AMDGPU::MIMGEncGfx10Default, 7074 NumVDataDwords, NumVAddrDwords); 7075 } 7076 assert(Opcode != -1); 7077 7078 SmallVector<Register, 12> Ops; 7079 if (UseNSA && IsGFX11Plus) { 7080 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) { 7081 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 7082 auto Merged = B.buildMergeLikeInstr( 7083 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)}); 7084 Ops.push_back(Merged.getReg(0)); 7085 }; 7086 7087 Ops.push_back(NodePtr); 7088 Ops.push_back(RayExtent); 7089 packLanes(RayOrigin); 7090 7091 if (IsA16) { 7092 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 7093 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 7094 auto MergedDir = B.buildMergeLikeInstr( 7095 V3S32, 7096 {B.buildBitcast( 7097 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0), 7098 UnmergeRayDir.getReg(0)})) 7099 .getReg(0), 7100 B.buildBitcast( 7101 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1), 7102 UnmergeRayDir.getReg(1)})) 7103 .getReg(0), 7104 B.buildBitcast( 7105 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2), 7106 UnmergeRayDir.getReg(2)})) 7107 .getReg(0)}); 7108 Ops.push_back(MergedDir.getReg(0)); 7109 } else { 7110 packLanes(RayDir); 7111 packLanes(RayInvDir); 7112 } 7113 } else { 7114 if (Is64) { 7115 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 7116 Ops.push_back(Unmerge.getReg(0)); 7117 Ops.push_back(Unmerge.getReg(1)); 7118 } else { 7119 Ops.push_back(NodePtr); 7120 } 7121 Ops.push_back(RayExtent); 7122 7123 auto packLanes = [&Ops, &S32, &B](Register Src) { 7124 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 7125 Ops.push_back(Unmerge.getReg(0)); 7126 Ops.push_back(Unmerge.getReg(1)); 7127 Ops.push_back(Unmerge.getReg(2)); 7128 }; 7129 7130 packLanes(RayOrigin); 7131 if (IsA16) { 7132 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 7133 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 7134 Register R1 = MRI.createGenericVirtualRegister(S32); 7135 Register R2 = MRI.createGenericVirtualRegister(S32); 7136 Register R3 = MRI.createGenericVirtualRegister(S32); 7137 B.buildMergeLikeInstr(R1, 7138 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 7139 B.buildMergeLikeInstr( 7140 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 7141 B.buildMergeLikeInstr( 7142 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 7143 Ops.push_back(R1); 7144 Ops.push_back(R2); 7145 Ops.push_back(R3); 7146 } else { 7147 packLanes(RayDir); 7148 packLanes(RayInvDir); 7149 } 7150 } 7151 7152 if (!UseNSA) { 7153 // Build a single vector containing all the operands so far prepared. 7154 LLT OpTy = LLT::fixed_vector(Ops.size(), 32); 7155 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0); 7156 Ops.clear(); 7157 Ops.push_back(MergedOps); 7158 } 7159 7160 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 7161 .addDef(DstReg) 7162 .addImm(Opcode); 7163 7164 for (Register R : Ops) { 7165 MIB.addUse(R); 7166 } 7167 7168 MIB.addUse(TDescr) 7169 .addImm(IsA16 ? 
1 : 0) 7170 .cloneMemRefs(MI); 7171 7172 MI.eraseFromParent(); 7173 return true; 7174 } 7175 7176 bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI, 7177 MachineIRBuilder &B) const { 7178 const SITargetLowering *TLI = ST.getTargetLowering(); 7179 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore(); 7180 Register DstReg = MI.getOperand(0).getReg(); 7181 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr}); 7182 MI.eraseFromParent(); 7183 return true; 7184 } 7185 7186 bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI, 7187 MachineIRBuilder &B) const { 7188 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25]. 7189 if (!ST.hasArchitectedSGPRs()) 7190 return false; 7191 LLT S32 = LLT::scalar(32); 7192 Register DstReg = MI.getOperand(0).getReg(); 7193 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8)); 7194 auto LSB = B.buildConstant(S32, 25); 7195 auto Width = B.buildConstant(S32, 5); 7196 B.buildUbfx(DstReg, TTMP8, LSB, Width); 7197 MI.eraseFromParent(); 7198 return true; 7199 } 7200 7201 static constexpr unsigned FPEnvModeBitField = 7202 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23); 7203 7204 static constexpr unsigned FPEnvTrapBitField = 7205 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5); 7206 7207 bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI, 7208 MachineRegisterInfo &MRI, 7209 MachineIRBuilder &B) const { 7210 Register Src = MI.getOperand(0).getReg(); 7211 if (MRI.getType(Src) != S64) 7212 return false; 7213 7214 auto ModeReg = 7215 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32}, 7216 /*HasSideEffects=*/true, /*isConvergent=*/false) 7217 .addImm(FPEnvModeBitField); 7218 auto TrapReg = 7219 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32}, 7220 /*HasSideEffects=*/true, /*isConvergent=*/false) 7221 .addImm(FPEnvTrapBitField); 7222 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg}); 7223 MI.eraseFromParent(); 7224 return true; 7225 } 7226 7227 bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI, 7228 MachineRegisterInfo &MRI, 7229 MachineIRBuilder &B) const { 7230 Register Src = MI.getOperand(0).getReg(); 7231 if (MRI.getType(Src) != S64) 7232 return false; 7233 7234 auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0)); 7235 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(), 7236 /*HasSideEffects=*/true, /*isConvergent=*/false) 7237 .addImm(static_cast<int16_t>(FPEnvModeBitField)) 7238 .addReg(Unmerge.getReg(0)); 7239 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(), 7240 /*HasSideEffects=*/true, /*isConvergent=*/false) 7241 .addImm(static_cast<int16_t>(FPEnvTrapBitField)) 7242 .addReg(Unmerge.getReg(1)); 7243 MI.eraseFromParent(); 7244 return true; 7245 } 7246 7247 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 7248 MachineInstr &MI) const { 7249 MachineIRBuilder &B = Helper.MIRBuilder; 7250 MachineRegisterInfo &MRI = *B.getMRI(); 7251 7252 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
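// Sketch of what the amdgcn_if/else/loop cases below do: the G_BRCOND (and
// any trailing G_BR) produced by the IRTranslator is located via
// verifyCFIntrinsic and rewritten so that the SI_IF / SI_ELSE / SI_LOOP
// pseudo itself performs the exec-mask update and the branch, with the mask
// virtual registers constrained to the wave mask register class.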
7253 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID(); 7254 switch (IntrID) { 7255 case Intrinsic::amdgcn_if: 7256 case Intrinsic::amdgcn_else: { 7257 MachineInstr *Br = nullptr; 7258 MachineBasicBlock *UncondBrTarget = nullptr; 7259 bool Negated = false; 7260 if (MachineInstr *BrCond = 7261 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 7262 const SIRegisterInfo *TRI 7263 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 7264 7265 Register Def = MI.getOperand(1).getReg(); 7266 Register Use = MI.getOperand(3).getReg(); 7267 7268 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 7269 7270 if (Negated) 7271 std::swap(CondBrTarget, UncondBrTarget); 7272 7273 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 7274 if (IntrID == Intrinsic::amdgcn_if) { 7275 B.buildInstr(AMDGPU::SI_IF) 7276 .addDef(Def) 7277 .addUse(Use) 7278 .addMBB(UncondBrTarget); 7279 } else { 7280 B.buildInstr(AMDGPU::SI_ELSE) 7281 .addDef(Def) 7282 .addUse(Use) 7283 .addMBB(UncondBrTarget); 7284 } 7285 7286 if (Br) { 7287 Br->getOperand(0).setMBB(CondBrTarget); 7288 } else { 7289 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 7290 // since we're swapping branch targets it needs to be reinserted. 7291 // FIXME: IRTranslator should probably not do this 7292 B.buildBr(*CondBrTarget); 7293 } 7294 7295 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 7296 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 7297 MI.eraseFromParent(); 7298 BrCond->eraseFromParent(); 7299 return true; 7300 } 7301 7302 return false; 7303 } 7304 case Intrinsic::amdgcn_loop: { 7305 MachineInstr *Br = nullptr; 7306 MachineBasicBlock *UncondBrTarget = nullptr; 7307 bool Negated = false; 7308 if (MachineInstr *BrCond = 7309 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 7310 const SIRegisterInfo *TRI 7311 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 7312 7313 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 7314 Register Reg = MI.getOperand(2).getReg(); 7315 7316 if (Negated) 7317 std::swap(CondBrTarget, UncondBrTarget); 7318 7319 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 7320 B.buildInstr(AMDGPU::SI_LOOP) 7321 .addUse(Reg) 7322 .addMBB(UncondBrTarget); 7323 7324 if (Br) 7325 Br->getOperand(0).setMBB(CondBrTarget); 7326 else 7327 B.buildBr(*CondBrTarget); 7328 7329 MI.eraseFromParent(); 7330 BrCond->eraseFromParent(); 7331 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 7332 return true; 7333 } 7334 7335 return false; 7336 } 7337 case Intrinsic::amdgcn_addrspacecast_nonnull: 7338 return legalizeAddrSpaceCast(MI, MRI, B); 7339 case Intrinsic::amdgcn_make_buffer_rsrc: 7340 return legalizePointerAsRsrcIntrin(MI, MRI, B); 7341 case Intrinsic::amdgcn_kernarg_segment_ptr: 7342 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 7343 // This only makes sense to call in a kernel, so just lower to null. 
7344 B.buildConstant(MI.getOperand(0).getReg(), 0); 7345 MI.eraseFromParent(); 7346 return true; 7347 } 7348 7349 return legalizePreloadedArgIntrin( 7350 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 7351 case Intrinsic::amdgcn_implicitarg_ptr: 7352 return legalizeImplicitArgPtr(MI, MRI, B); 7353 case Intrinsic::amdgcn_workitem_id_x: 7354 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0, 7355 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 7356 case Intrinsic::amdgcn_workitem_id_y: 7357 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1, 7358 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 7359 case Intrinsic::amdgcn_workitem_id_z: 7360 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2, 7361 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 7362 case Intrinsic::amdgcn_workgroup_id_x: 7363 return legalizePreloadedArgIntrin(MI, MRI, B, 7364 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 7365 case Intrinsic::amdgcn_workgroup_id_y: 7366 return legalizePreloadedArgIntrin(MI, MRI, B, 7367 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 7368 case Intrinsic::amdgcn_workgroup_id_z: 7369 return legalizePreloadedArgIntrin(MI, MRI, B, 7370 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 7371 case Intrinsic::amdgcn_wave_id: 7372 return legalizeWaveID(MI, B); 7373 case Intrinsic::amdgcn_lds_kernel_id: 7374 return legalizePreloadedArgIntrin(MI, MRI, B, 7375 AMDGPUFunctionArgInfo::LDS_KERNEL_ID); 7376 case Intrinsic::amdgcn_dispatch_ptr: 7377 return legalizePreloadedArgIntrin(MI, MRI, B, 7378 AMDGPUFunctionArgInfo::DISPATCH_PTR); 7379 case Intrinsic::amdgcn_queue_ptr: 7380 return legalizePreloadedArgIntrin(MI, MRI, B, 7381 AMDGPUFunctionArgInfo::QUEUE_PTR); 7382 case Intrinsic::amdgcn_implicit_buffer_ptr: 7383 return legalizePreloadedArgIntrin( 7384 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 7385 case Intrinsic::amdgcn_dispatch_id: 7386 return legalizePreloadedArgIntrin(MI, MRI, B, 7387 AMDGPUFunctionArgInfo::DISPATCH_ID); 7388 case Intrinsic::r600_read_ngroups_x: 7389 // TODO: Emit error for hsa 7390 return legalizeKernargMemParameter(MI, B, 7391 SI::KernelInputOffsets::NGROUPS_X); 7392 case Intrinsic::r600_read_ngroups_y: 7393 return legalizeKernargMemParameter(MI, B, 7394 SI::KernelInputOffsets::NGROUPS_Y); 7395 case Intrinsic::r600_read_ngroups_z: 7396 return legalizeKernargMemParameter(MI, B, 7397 SI::KernelInputOffsets::NGROUPS_Z); 7398 case Intrinsic::r600_read_local_size_x: 7399 // TODO: Could insert G_ASSERT_ZEXT from s16 7400 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X); 7401 case Intrinsic::r600_read_local_size_y: 7402 // TODO: Could insert G_ASSERT_ZEXT from s16 7403 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y); 7404 // TODO: Could insert G_ASSERT_ZEXT from s16 7405 case Intrinsic::r600_read_local_size_z: 7406 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z); 7407 case Intrinsic::r600_read_global_size_x: 7408 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X); 7409 case Intrinsic::r600_read_global_size_y: 7410 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y); 7411 case Intrinsic::r600_read_global_size_z: 7412 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z); 7413 case Intrinsic::amdgcn_fdiv_fast: 7414 return legalizeFDIVFastIntrin(MI, MRI, B); 7415 case Intrinsic::amdgcn_is_shared: 7416 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 7417 case Intrinsic::amdgcn_is_private: 7418 return legalizeIsAddrSpace(MI, MRI, 
B, AMDGPUAS::PRIVATE_ADDRESS); 7419 case Intrinsic::amdgcn_wavefrontsize: { 7420 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 7421 MI.eraseFromParent(); 7422 return true; 7423 } 7424 case Intrinsic::amdgcn_s_buffer_load: 7425 return legalizeSBufferLoad(Helper, MI); 7426 case Intrinsic::amdgcn_raw_buffer_store: 7427 case Intrinsic::amdgcn_raw_ptr_buffer_store: 7428 case Intrinsic::amdgcn_struct_buffer_store: 7429 case Intrinsic::amdgcn_struct_ptr_buffer_store: 7430 return legalizeBufferStore(MI, Helper, false, false); 7431 case Intrinsic::amdgcn_raw_buffer_store_format: 7432 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: 7433 case Intrinsic::amdgcn_struct_buffer_store_format: 7434 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: 7435 return legalizeBufferStore(MI, Helper, false, true); 7436 case Intrinsic::amdgcn_raw_tbuffer_store: 7437 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: 7438 case Intrinsic::amdgcn_struct_tbuffer_store: 7439 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: 7440 return legalizeBufferStore(MI, Helper, true, true); 7441 case Intrinsic::amdgcn_raw_buffer_load: 7442 case Intrinsic::amdgcn_raw_ptr_buffer_load: 7443 case Intrinsic::amdgcn_raw_atomic_buffer_load: 7444 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load: 7445 case Intrinsic::amdgcn_struct_buffer_load: 7446 case Intrinsic::amdgcn_struct_ptr_buffer_load: 7447 case Intrinsic::amdgcn_struct_atomic_buffer_load: 7448 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: 7449 return legalizeBufferLoad(MI, Helper, false, false); 7450 case Intrinsic::amdgcn_raw_buffer_load_format: 7451 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: 7452 case Intrinsic::amdgcn_struct_buffer_load_format: 7453 case Intrinsic::amdgcn_struct_ptr_buffer_load_format: 7454 return legalizeBufferLoad(MI, Helper, true, false); 7455 case Intrinsic::amdgcn_raw_tbuffer_load: 7456 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: 7457 case Intrinsic::amdgcn_struct_tbuffer_load: 7458 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: 7459 return legalizeBufferLoad(MI, Helper, true, true); 7460 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 7461 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap: 7462 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 7463 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap: 7464 case Intrinsic::amdgcn_raw_buffer_atomic_add: 7465 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add: 7466 case Intrinsic::amdgcn_struct_buffer_atomic_add: 7467 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add: 7468 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 7469 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub: 7470 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 7471 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub: 7472 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 7473 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin: 7474 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 7475 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin: 7476 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 7477 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin: 7478 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 7479 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin: 7480 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 7481 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax: 7482 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 7483 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax: 7484 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 7485 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax: 7486 case 
Intrinsic::amdgcn_struct_buffer_atomic_umax: 7487 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax: 7488 case Intrinsic::amdgcn_raw_buffer_atomic_and: 7489 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and: 7490 case Intrinsic::amdgcn_struct_buffer_atomic_and: 7491 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and: 7492 case Intrinsic::amdgcn_raw_buffer_atomic_or: 7493 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or: 7494 case Intrinsic::amdgcn_struct_buffer_atomic_or: 7495 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or: 7496 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 7497 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor: 7498 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 7499 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor: 7500 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 7501 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc: 7502 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 7503 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc: 7504 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 7505 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec: 7506 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 7507 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec: 7508 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 7509 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: 7510 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 7511 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: 7512 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 7513 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: 7514 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 7515 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin: 7516 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 7517 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax: 7518 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 7519 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax: 7520 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 7521 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: 7522 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 7523 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: 7524 return legalizeBufferAtomic(MI, B, IntrID); 7525 case Intrinsic::amdgcn_rsq_clamp: 7526 return legalizeRsqClampIntrinsic(MI, MRI, B); 7527 case Intrinsic::amdgcn_image_bvh_intersect_ray: 7528 return legalizeBVHIntrinsic(MI, B); 7529 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: 7530 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: 7531 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: 7532 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16: 7533 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: 7534 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: 7535 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: 7536 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: { 7537 Register Index = MI.getOperand(5).getReg(); 7538 LLT S32 = LLT::scalar(32); 7539 if (MRI.getType(Index) != S32) 7540 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0)); 7541 return true; 7542 } 7543 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: 7544 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: 7545 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: { 7546 Register Index = MI.getOperand(7).getReg(); 7547 LLT S32 = LLT::scalar(32); 7548 if (MRI.getType(Index) != S32) 7549 MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0)); 7550 return true; 7551 } 7552 case Intrinsic::amdgcn_fmed3: { 7553 GISelChangeObserver &Observer = Helper.Observer; 7554 7555 // FIXME: This is to workaround the inability of tablegen match combiners to 7556 // match intrinsics in 
patterns. 7557 Observer.changingInstr(MI); 7558 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3)); 7559 MI.removeOperand(1); 7560 Observer.changedInstr(MI); 7561 return true; 7562 } 7563 case Intrinsic::amdgcn_readlane: 7564 case Intrinsic::amdgcn_writelane: 7565 case Intrinsic::amdgcn_readfirstlane: 7566 case Intrinsic::amdgcn_permlane16: 7567 case Intrinsic::amdgcn_permlanex16: 7568 case Intrinsic::amdgcn_permlane64: 7569 case Intrinsic::amdgcn_set_inactive: 7570 case Intrinsic::amdgcn_set_inactive_chain_arg: 7571 case Intrinsic::amdgcn_mov_dpp8: 7572 case Intrinsic::amdgcn_update_dpp: 7573 return legalizeLaneOp(Helper, MI, IntrID); 7574 case Intrinsic::amdgcn_s_buffer_prefetch_data: 7575 return legalizeSBufferPrefetch(Helper, MI); 7576 default: { 7577 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 7578 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 7579 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 7580 return true; 7581 } 7582 } 7583 7584 return true; 7585 } 7586