1 //===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file This file implements the LegalizerHelper class to legalize 10 /// individual instructions and the LegalizeMachineIR wrapper pass for the 11 /// primary legalization. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 16 #include "llvm/CodeGen/GlobalISel/CallLowering.h" 17 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" 18 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" 19 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" 20 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" 21 #include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h" 22 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 23 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 24 #include "llvm/CodeGen/GlobalISel/Utils.h" 25 #include "llvm/CodeGen/MachineConstantPool.h" 26 #include "llvm/CodeGen/MachineFrameInfo.h" 27 #include "llvm/CodeGen/MachineRegisterInfo.h" 28 #include "llvm/CodeGen/RuntimeLibcalls.h" 29 #include "llvm/CodeGen/TargetFrameLowering.h" 30 #include "llvm/CodeGen/TargetInstrInfo.h" 31 #include "llvm/CodeGen/TargetLowering.h" 32 #include "llvm/CodeGen/TargetOpcodes.h" 33 #include "llvm/CodeGen/TargetSubtargetInfo.h" 34 #include "llvm/IR/Instructions.h" 35 #include "llvm/Support/Debug.h" 36 #include "llvm/Support/MathExtras.h" 37 #include "llvm/Support/raw_ostream.h" 38 #include "llvm/Target/TargetMachine.h" 39 #include <numeric> 40 #include <optional> 41 42 #define DEBUG_TYPE "legalizer" 43 44 using namespace llvm; 45 using namespace LegalizeActions; 46 using namespace MIPatternMatch; 47 48 /// Try to break down \p OrigTy into \p NarrowTy sized pieces. 49 /// 50 /// Returns the number of \p NarrowTy elements needed to reconstruct \p OrigTy, 51 /// with any leftover piece as type \p LeftoverTy 52 /// 53 /// Returns -1 in the first element of the pair if the breakdown is not 54 /// satisfiable. 
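///
/// For example (values are illustrative, derived from the rules above):
/// breaking an s88 OrigTy into s32 pieces returns {2, 1} with LeftoverTy set
/// to s24, and breaking <3 x s32> into <2 x s32> returns {1, 1} with
/// LeftoverTy set to s32.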
55 static std::pair<int, int> 56 getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) { 57 assert(!LeftoverTy.isValid() && "this is an out argument"); 58 59 unsigned Size = OrigTy.getSizeInBits(); 60 unsigned NarrowSize = NarrowTy.getSizeInBits(); 61 unsigned NumParts = Size / NarrowSize; 62 unsigned LeftoverSize = Size - NumParts * NarrowSize; 63 assert(Size > NarrowSize); 64 65 if (LeftoverSize == 0) 66 return {NumParts, 0}; 67 68 if (NarrowTy.isVector()) { 69 unsigned EltSize = OrigTy.getScalarSizeInBits(); 70 if (LeftoverSize % EltSize != 0) 71 return {-1, -1}; 72 LeftoverTy = LLT::scalarOrVector( 73 ElementCount::getFixed(LeftoverSize / EltSize), EltSize); 74 } else { 75 LeftoverTy = LLT::scalar(LeftoverSize); 76 } 77 78 int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits(); 79 return std::make_pair(NumParts, NumLeftover); 80 } 81 82 static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) { 83 84 if (!Ty.isScalar()) 85 return nullptr; 86 87 switch (Ty.getSizeInBits()) { 88 case 16: 89 return Type::getHalfTy(Ctx); 90 case 32: 91 return Type::getFloatTy(Ctx); 92 case 64: 93 return Type::getDoubleTy(Ctx); 94 case 80: 95 return Type::getX86_FP80Ty(Ctx); 96 case 128: 97 return Type::getFP128Ty(Ctx); 98 default: 99 return nullptr; 100 } 101 } 102 103 LegalizerHelper::LegalizerHelper(MachineFunction &MF, 104 GISelChangeObserver &Observer, 105 MachineIRBuilder &Builder) 106 : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()), 107 LI(*MF.getSubtarget().getLegalizerInfo()), 108 TLI(*MF.getSubtarget().getTargetLowering()), KB(nullptr) {} 109 110 LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI, 111 GISelChangeObserver &Observer, 112 MachineIRBuilder &B, GISelKnownBits *KB) 113 : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI), 114 TLI(*MF.getSubtarget().getTargetLowering()), KB(KB) {} 115 116 LegalizerHelper::LegalizeResult 117 LegalizerHelper::legalizeInstrStep(MachineInstr &MI, 118 LostDebugLocObserver &LocObserver) { 119 LLVM_DEBUG(dbgs() << "Legalizing: " << MI); 120 121 MIRBuilder.setInstrAndDebugLoc(MI); 122 123 if (isa<GIntrinsic>(MI)) 124 return LI.legalizeIntrinsic(*this, MI) ? Legalized : UnableToLegalize; 125 auto Step = LI.getAction(MI, MRI); 126 switch (Step.Action) { 127 case Legal: 128 LLVM_DEBUG(dbgs() << ".. Already legal\n"); 129 return AlreadyLegal; 130 case Libcall: 131 LLVM_DEBUG(dbgs() << ".. Convert to libcall\n"); 132 return libcall(MI, LocObserver); 133 case NarrowScalar: 134 LLVM_DEBUG(dbgs() << ".. Narrow scalar\n"); 135 return narrowScalar(MI, Step.TypeIdx, Step.NewType); 136 case WidenScalar: 137 LLVM_DEBUG(dbgs() << ".. Widen scalar\n"); 138 return widenScalar(MI, Step.TypeIdx, Step.NewType); 139 case Bitcast: 140 LLVM_DEBUG(dbgs() << ".. Bitcast type\n"); 141 return bitcast(MI, Step.TypeIdx, Step.NewType); 142 case Lower: 143 LLVM_DEBUG(dbgs() << ".. Lower\n"); 144 return lower(MI, Step.TypeIdx, Step.NewType); 145 case FewerElements: 146 LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n"); 147 return fewerElementsVector(MI, Step.TypeIdx, Step.NewType); 148 case MoreElements: 149 LLVM_DEBUG(dbgs() << ".. Increase number of elements\n"); 150 return moreElementsVector(MI, Step.TypeIdx, Step.NewType); 151 case Custom: 152 LLVM_DEBUG(dbgs() << ".. Custom legalization\n"); 153 return LI.legalizeCustom(*this, MI, LocObserver) ? Legalized 154 : UnableToLegalize; 155 default: 156 LLVM_DEBUG(dbgs() << ".. 
Unable to legalize\n"); 157 return UnableToLegalize; 158 } 159 } 160 161 void LegalizerHelper::insertParts(Register DstReg, 162 LLT ResultTy, LLT PartTy, 163 ArrayRef<Register> PartRegs, 164 LLT LeftoverTy, 165 ArrayRef<Register> LeftoverRegs) { 166 if (!LeftoverTy.isValid()) { 167 assert(LeftoverRegs.empty()); 168 169 if (!ResultTy.isVector()) { 170 MIRBuilder.buildMergeLikeInstr(DstReg, PartRegs); 171 return; 172 } 173 174 if (PartTy.isVector()) 175 MIRBuilder.buildConcatVectors(DstReg, PartRegs); 176 else 177 MIRBuilder.buildBuildVector(DstReg, PartRegs); 178 return; 179 } 180 181 // Merge sub-vectors with different number of elements and insert into DstReg. 182 if (ResultTy.isVector()) { 183 assert(LeftoverRegs.size() == 1 && "Expected one leftover register"); 184 SmallVector<Register, 8> AllRegs; 185 for (auto Reg : concat<const Register>(PartRegs, LeftoverRegs)) 186 AllRegs.push_back(Reg); 187 return mergeMixedSubvectors(DstReg, AllRegs); 188 } 189 190 SmallVector<Register> GCDRegs; 191 LLT GCDTy = getGCDType(getGCDType(ResultTy, LeftoverTy), PartTy); 192 for (auto PartReg : concat<const Register>(PartRegs, LeftoverRegs)) 193 extractGCDType(GCDRegs, GCDTy, PartReg); 194 LLT ResultLCMTy = buildLCMMergePieces(ResultTy, LeftoverTy, GCDTy, GCDRegs); 195 buildWidenedRemergeToDst(DstReg, ResultLCMTy, GCDRegs); 196 } 197 198 void LegalizerHelper::appendVectorElts(SmallVectorImpl<Register> &Elts, 199 Register Reg) { 200 LLT Ty = MRI.getType(Reg); 201 SmallVector<Register, 8> RegElts; 202 extractParts(Reg, Ty.getScalarType(), Ty.getNumElements(), RegElts, 203 MIRBuilder, MRI); 204 Elts.append(RegElts); 205 } 206 207 /// Merge \p PartRegs with different types into \p DstReg. 208 void LegalizerHelper::mergeMixedSubvectors(Register DstReg, 209 ArrayRef<Register> PartRegs) { 210 SmallVector<Register, 8> AllElts; 211 for (unsigned i = 0; i < PartRegs.size() - 1; ++i) 212 appendVectorElts(AllElts, PartRegs[i]); 213 214 Register Leftover = PartRegs[PartRegs.size() - 1]; 215 if (MRI.getType(Leftover).isScalar()) 216 AllElts.push_back(Leftover); 217 else 218 appendVectorElts(AllElts, Leftover); 219 220 MIRBuilder.buildMergeLikeInstr(DstReg, AllElts); 221 } 222 223 /// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs. 224 static void getUnmergeResults(SmallVectorImpl<Register> &Regs, 225 const MachineInstr &MI) { 226 assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES); 227 228 const int StartIdx = Regs.size(); 229 const int NumResults = MI.getNumOperands() - 1; 230 Regs.resize(Regs.size() + NumResults); 231 for (int I = 0; I != NumResults; ++I) 232 Regs[StartIdx + I] = MI.getOperand(I).getReg(); 233 } 234 235 void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, 236 LLT GCDTy, Register SrcReg) { 237 LLT SrcTy = MRI.getType(SrcReg); 238 if (SrcTy == GCDTy) { 239 // If the source already evenly divides the result type, we don't need to do 240 // anything. 241 Parts.push_back(SrcReg); 242 } else { 243 // Need to split into common type sized pieces. 
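    // For example (illustrative types): with an s64 SrcReg and an s16 GCDTy,
    // the unmerge below yields four s16 registers, all of which are appended
    // to Parts.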
244 auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg); 245 getUnmergeResults(Parts, *Unmerge); 246 } 247 } 248 249 LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy, 250 LLT NarrowTy, Register SrcReg) { 251 LLT SrcTy = MRI.getType(SrcReg); 252 LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy); 253 extractGCDType(Parts, GCDTy, SrcReg); 254 return GCDTy; 255 } 256 257 LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy, 258 SmallVectorImpl<Register> &VRegs, 259 unsigned PadStrategy) { 260 LLT LCMTy = getLCMType(DstTy, NarrowTy); 261 262 int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits(); 263 int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits(); 264 int NumOrigSrc = VRegs.size(); 265 266 Register PadReg; 267 268 // Get a value we can use to pad the source value if the sources won't evenly 269 // cover the result type. 270 if (NumOrigSrc < NumParts * NumSubParts) { 271 if (PadStrategy == TargetOpcode::G_ZEXT) 272 PadReg = MIRBuilder.buildConstant(GCDTy, 0).getReg(0); 273 else if (PadStrategy == TargetOpcode::G_ANYEXT) 274 PadReg = MIRBuilder.buildUndef(GCDTy).getReg(0); 275 else { 276 assert(PadStrategy == TargetOpcode::G_SEXT); 277 278 // Shift the sign bit of the low register through the high register. 279 auto ShiftAmt = 280 MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1); 281 PadReg = MIRBuilder.buildAShr(GCDTy, VRegs.back(), ShiftAmt).getReg(0); 282 } 283 } 284 285 // Registers for the final merge to be produced. 286 SmallVector<Register, 4> Remerge(NumParts); 287 288 // Registers needed for intermediate merges, which will be merged into a 289 // source for Remerge. 290 SmallVector<Register, 4> SubMerge(NumSubParts); 291 292 // Once we've fully read off the end of the original source bits, we can reuse 293 // the same high bits for remaining padding elements. 294 Register AllPadReg; 295 296 // Build merges to the LCM type to cover the original result type. 297 for (int I = 0; I != NumParts; ++I) { 298 bool AllMergePartsArePadding = true; 299 300 // Build the requested merges to the requested type. 301 for (int J = 0; J != NumSubParts; ++J) { 302 int Idx = I * NumSubParts + J; 303 if (Idx >= NumOrigSrc) { 304 SubMerge[J] = PadReg; 305 continue; 306 } 307 308 SubMerge[J] = VRegs[Idx]; 309 310 // There are meaningful bits here we can't reuse later. 311 AllMergePartsArePadding = false; 312 } 313 314 // If we've filled up a complete piece with padding bits, we can directly 315 // emit the natural sized constant if applicable, rather than a merge of 316 // smaller constants. 317 if (AllMergePartsArePadding && !AllPadReg) { 318 if (PadStrategy == TargetOpcode::G_ANYEXT) 319 AllPadReg = MIRBuilder.buildUndef(NarrowTy).getReg(0); 320 else if (PadStrategy == TargetOpcode::G_ZEXT) 321 AllPadReg = MIRBuilder.buildConstant(NarrowTy, 0).getReg(0); 322 323 // If this is a sign extension, we can't materialize a trivial constant 324 // with the right type and have to produce a merge. 325 } 326 327 if (AllPadReg) { 328 // Avoid creating additional instructions if we're just adding additional 329 // copies of padding bits. 330 Remerge[I] = AllPadReg; 331 continue; 332 } 333 334 if (NumSubParts == 1) 335 Remerge[I] = SubMerge[0]; 336 else 337 Remerge[I] = MIRBuilder.buildMergeLikeInstr(NarrowTy, SubMerge).getReg(0); 338 339 // In the sign extend padding case, re-use the first all-signbit merge. 
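    // For example (an illustrative case): remerging three s32 sources into an
    // s192 LCM type with NarrowTy s64, GCDTy s32 and G_SEXT padding produces
    // parts {v0,v1}, {v2,ashr(v2,31)} and {ashr,ashr}; that last, all-signbit
    // part is cached here so any further all-padding parts can reuse it
    // instead of building new merges.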
340 if (AllMergePartsArePadding && !AllPadReg) 341 AllPadReg = Remerge[I]; 342 } 343 344 VRegs = std::move(Remerge); 345 return LCMTy; 346 } 347 348 void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy, 349 ArrayRef<Register> RemergeRegs) { 350 LLT DstTy = MRI.getType(DstReg); 351 352 // Create the merge to the widened source, and extract the relevant bits into 353 // the result. 354 355 if (DstTy == LCMTy) { 356 MIRBuilder.buildMergeLikeInstr(DstReg, RemergeRegs); 357 return; 358 } 359 360 auto Remerge = MIRBuilder.buildMergeLikeInstr(LCMTy, RemergeRegs); 361 if (DstTy.isScalar() && LCMTy.isScalar()) { 362 MIRBuilder.buildTrunc(DstReg, Remerge); 363 return; 364 } 365 366 if (LCMTy.isVector()) { 367 unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits(); 368 SmallVector<Register, 8> UnmergeDefs(NumDefs); 369 UnmergeDefs[0] = DstReg; 370 for (unsigned I = 1; I != NumDefs; ++I) 371 UnmergeDefs[I] = MRI.createGenericVirtualRegister(DstTy); 372 373 MIRBuilder.buildUnmerge(UnmergeDefs, 374 MIRBuilder.buildMergeLikeInstr(LCMTy, RemergeRegs)); 375 return; 376 } 377 378 llvm_unreachable("unhandled case"); 379 } 380 381 static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { 382 #define RTLIBCASE_INT(LibcallPrefix) \ 383 do { \ 384 switch (Size) { \ 385 case 32: \ 386 return RTLIB::LibcallPrefix##32; \ 387 case 64: \ 388 return RTLIB::LibcallPrefix##64; \ 389 case 128: \ 390 return RTLIB::LibcallPrefix##128; \ 391 default: \ 392 llvm_unreachable("unexpected size"); \ 393 } \ 394 } while (0) 395 396 #define RTLIBCASE(LibcallPrefix) \ 397 do { \ 398 switch (Size) { \ 399 case 32: \ 400 return RTLIB::LibcallPrefix##32; \ 401 case 64: \ 402 return RTLIB::LibcallPrefix##64; \ 403 case 80: \ 404 return RTLIB::LibcallPrefix##80; \ 405 case 128: \ 406 return RTLIB::LibcallPrefix##128; \ 407 default: \ 408 llvm_unreachable("unexpected size"); \ 409 } \ 410 } while (0) 411 412 switch (Opcode) { 413 case TargetOpcode::G_MUL: 414 RTLIBCASE_INT(MUL_I); 415 case TargetOpcode::G_SDIV: 416 RTLIBCASE_INT(SDIV_I); 417 case TargetOpcode::G_UDIV: 418 RTLIBCASE_INT(UDIV_I); 419 case TargetOpcode::G_SREM: 420 RTLIBCASE_INT(SREM_I); 421 case TargetOpcode::G_UREM: 422 RTLIBCASE_INT(UREM_I); 423 case TargetOpcode::G_CTLZ_ZERO_UNDEF: 424 RTLIBCASE_INT(CTLZ_I); 425 case TargetOpcode::G_FADD: 426 RTLIBCASE(ADD_F); 427 case TargetOpcode::G_FSUB: 428 RTLIBCASE(SUB_F); 429 case TargetOpcode::G_FMUL: 430 RTLIBCASE(MUL_F); 431 case TargetOpcode::G_FDIV: 432 RTLIBCASE(DIV_F); 433 case TargetOpcode::G_FEXP: 434 RTLIBCASE(EXP_F); 435 case TargetOpcode::G_FEXP2: 436 RTLIBCASE(EXP2_F); 437 case TargetOpcode::G_FEXP10: 438 RTLIBCASE(EXP10_F); 439 case TargetOpcode::G_FREM: 440 RTLIBCASE(REM_F); 441 case TargetOpcode::G_FPOW: 442 RTLIBCASE(POW_F); 443 case TargetOpcode::G_FPOWI: 444 RTLIBCASE(POWI_F); 445 case TargetOpcode::G_FMA: 446 RTLIBCASE(FMA_F); 447 case TargetOpcode::G_FSIN: 448 RTLIBCASE(SIN_F); 449 case TargetOpcode::G_FCOS: 450 RTLIBCASE(COS_F); 451 case TargetOpcode::G_FLOG10: 452 RTLIBCASE(LOG10_F); 453 case TargetOpcode::G_FLOG: 454 RTLIBCASE(LOG_F); 455 case TargetOpcode::G_FLOG2: 456 RTLIBCASE(LOG2_F); 457 case TargetOpcode::G_FLDEXP: 458 RTLIBCASE(LDEXP_F); 459 case TargetOpcode::G_FCEIL: 460 RTLIBCASE(CEIL_F); 461 case TargetOpcode::G_FFLOOR: 462 RTLIBCASE(FLOOR_F); 463 case TargetOpcode::G_FMINNUM: 464 RTLIBCASE(FMIN_F); 465 case TargetOpcode::G_FMAXNUM: 466 RTLIBCASE(FMAX_F); 467 case TargetOpcode::G_FSQRT: 468 RTLIBCASE(SQRT_F); 469 case TargetOpcode::G_FRINT: 470 
    RTLIBCASE(RINT_F);
  case TargetOpcode::G_FNEARBYINT:
    RTLIBCASE(NEARBYINT_F);
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
    RTLIBCASE(ROUNDEVEN_F);
  }
  llvm_unreachable("Unknown libcall function");
}

/// True if an instruction is in tail position in its caller. Intended for
/// legalizing libcalls as tail calls when possible.
static bool isLibCallInTailPosition(const CallLowering::ArgInfo &Result,
                                    MachineInstr &MI,
                                    const TargetInstrInfo &TII,
                                    MachineRegisterInfo &MRI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const Function &F = MBB.getParent()->getFunction();

  // Conservatively require the attributes of the call to match those of
  // the return. Ignore NoAlias and NonNull because they don't affect the
  // call sequence.
  AttributeList CallerAttrs = F.getAttributes();
  if (AttrBuilder(F.getContext(), CallerAttrs.getRetAttrs())
          .removeAttribute(Attribute::NoAlias)
          .removeAttribute(Attribute::NonNull)
          .hasAttributes())
    return false;

  // It's not safe to eliminate the sign / zero extension of the return value.
  if (CallerAttrs.hasRetAttr(Attribute::ZExt) ||
      CallerAttrs.hasRetAttr(Attribute::SExt))
    return false;

  // Only tail call if the following instruction is a standard return or if we
  // have a `thisreturn` callee, and a sequence like:
  //
  //   G_MEMCPY %0, %1, %2
  //   $x0 = COPY %0
  //   RET_ReallyLR implicit $x0
  auto Next = next_nodbg(MI.getIterator(), MBB.instr_end());
  if (Next != MBB.instr_end() && Next->isCopy()) {
    if (MI.getOpcode() == TargetOpcode::G_BZERO)
      return false;

    // For MEMCPY/MEMMOVE/MEMSET these will be the first use (the dst), as the
    // memcpy/etc routines return the same parameter. For others it will be the
    // returned value.
    Register VReg = MI.getOperand(0).getReg();
    if (!VReg.isVirtual() || VReg != Next->getOperand(1).getReg())
      return false;

    Register PReg = Next->getOperand(0).getReg();
    if (!PReg.isPhysical())
      return false;

    auto Ret = next_nodbg(Next, MBB.instr_end());
    if (Ret == MBB.instr_end() || !Ret->isReturn())
      return false;

    if (Ret->getNumImplicitOperands() != 1)
      return false;

    if (!Ret->getOperand(0).isReg() || PReg != Ret->getOperand(0).getReg())
      return false;

    // Skip over the COPY that we just validated.
536 Next = Ret; 537 } 538 539 if (Next == MBB.instr_end() || TII.isTailCall(*Next) || !Next->isReturn()) 540 return false; 541 542 return true; 543 } 544 545 LegalizerHelper::LegalizeResult 546 llvm::createLibcall(MachineIRBuilder &MIRBuilder, const char *Name, 547 const CallLowering::ArgInfo &Result, 548 ArrayRef<CallLowering::ArgInfo> Args, 549 const CallingConv::ID CC, LostDebugLocObserver &LocObserver, 550 MachineInstr *MI) { 551 auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering(); 552 553 CallLowering::CallLoweringInfo Info; 554 Info.CallConv = CC; 555 Info.Callee = MachineOperand::CreateES(Name); 556 Info.OrigRet = Result; 557 if (MI) 558 Info.IsTailCall = 559 (Result.Ty->isVoidTy() || 560 Result.Ty == MIRBuilder.getMF().getFunction().getReturnType()) && 561 isLibCallInTailPosition(Result, *MI, MIRBuilder.getTII(), 562 *MIRBuilder.getMRI()); 563 564 std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs)); 565 if (!CLI.lowerCall(MIRBuilder, Info)) 566 return LegalizerHelper::UnableToLegalize; 567 568 if (MI && Info.LoweredTailCall) { 569 assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?"); 570 571 // Check debug locations before removing the return. 572 LocObserver.checkpoint(true); 573 574 // We must have a return following the call (or debug insts) to get past 575 // isLibCallInTailPosition. 576 do { 577 MachineInstr *Next = MI->getNextNode(); 578 assert(Next && 579 (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) && 580 "Expected instr following MI to be return or debug inst?"); 581 // We lowered a tail call, so the call is now the return from the block. 582 // Delete the old return. 583 Next->eraseFromParent(); 584 } while (MI->getNextNode()); 585 586 // We expect to lose the debug location from the return. 587 LocObserver.checkpoint(false); 588 } 589 return LegalizerHelper::Legalized; 590 } 591 592 LegalizerHelper::LegalizeResult 593 llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall, 594 const CallLowering::ArgInfo &Result, 595 ArrayRef<CallLowering::ArgInfo> Args, 596 LostDebugLocObserver &LocObserver, MachineInstr *MI) { 597 auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering(); 598 const char *Name = TLI.getLibcallName(Libcall); 599 const CallingConv::ID CC = TLI.getLibcallCallingConv(Libcall); 600 return createLibcall(MIRBuilder, Name, Result, Args, CC, LocObserver, MI); 601 } 602 603 // Useful for libcalls where all operands have the same type. 604 static LegalizerHelper::LegalizeResult 605 simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size, 606 Type *OpType, LostDebugLocObserver &LocObserver) { 607 auto Libcall = getRTLibDesc(MI.getOpcode(), Size); 608 609 // FIXME: What does the original arg index mean here? 610 SmallVector<CallLowering::ArgInfo, 3> Args; 611 for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) 612 Args.push_back({MO.getReg(), OpType, 0}); 613 return createLibcall(MIRBuilder, Libcall, 614 {MI.getOperand(0).getReg(), OpType, 0}, Args, 615 LocObserver, &MI); 616 } 617 618 LegalizerHelper::LegalizeResult 619 llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, 620 MachineInstr &MI, LostDebugLocObserver &LocObserver) { 621 auto &Ctx = MIRBuilder.getMF().getFunction().getContext(); 622 623 SmallVector<CallLowering::ArgInfo, 3> Args; 624 // Add all the args, except for the last which is an imm denoting 'tail'. 
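  // For example (register names are illustrative):
  //   G_MEMCPY %dst:_(p0), %src:_(p0), %len:_(s64), 0
  // contributes three arguments (two pointers and one integer); the trailing
  // 0/1 immediate only indicates whether a tail call may be formed.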
625 for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) { 626 Register Reg = MI.getOperand(i).getReg(); 627 628 // Need derive an IR type for call lowering. 629 LLT OpLLT = MRI.getType(Reg); 630 Type *OpTy = nullptr; 631 if (OpLLT.isPointer()) 632 OpTy = PointerType::get(Ctx, OpLLT.getAddressSpace()); 633 else 634 OpTy = IntegerType::get(Ctx, OpLLT.getSizeInBits()); 635 Args.push_back({Reg, OpTy, 0}); 636 } 637 638 auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering(); 639 auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering(); 640 RTLIB::Libcall RTLibcall; 641 unsigned Opc = MI.getOpcode(); 642 switch (Opc) { 643 case TargetOpcode::G_BZERO: 644 RTLibcall = RTLIB::BZERO; 645 break; 646 case TargetOpcode::G_MEMCPY: 647 RTLibcall = RTLIB::MEMCPY; 648 Args[0].Flags[0].setReturned(); 649 break; 650 case TargetOpcode::G_MEMMOVE: 651 RTLibcall = RTLIB::MEMMOVE; 652 Args[0].Flags[0].setReturned(); 653 break; 654 case TargetOpcode::G_MEMSET: 655 RTLibcall = RTLIB::MEMSET; 656 Args[0].Flags[0].setReturned(); 657 break; 658 default: 659 llvm_unreachable("unsupported opcode"); 660 } 661 const char *Name = TLI.getLibcallName(RTLibcall); 662 663 // Unsupported libcall on the target. 664 if (!Name) { 665 LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for " 666 << MIRBuilder.getTII().getName(Opc) << "\n"); 667 return LegalizerHelper::UnableToLegalize; 668 } 669 670 CallLowering::CallLoweringInfo Info; 671 Info.CallConv = TLI.getLibcallCallingConv(RTLibcall); 672 Info.Callee = MachineOperand::CreateES(Name); 673 Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0); 674 Info.IsTailCall = 675 MI.getOperand(MI.getNumOperands() - 1).getImm() && 676 isLibCallInTailPosition(Info.OrigRet, MI, MIRBuilder.getTII(), MRI); 677 678 std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs)); 679 if (!CLI.lowerCall(MIRBuilder, Info)) 680 return LegalizerHelper::UnableToLegalize; 681 682 if (Info.LoweredTailCall) { 683 assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?"); 684 685 // Check debug locations before removing the return. 686 LocObserver.checkpoint(true); 687 688 // We must have a return following the call (or debug insts) to get past 689 // isLibCallInTailPosition. 690 do { 691 MachineInstr *Next = MI.getNextNode(); 692 assert(Next && 693 (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) && 694 "Expected instr following MI to be return or debug inst?"); 695 // We lowered a tail call, so the call is now the return from the block. 696 // Delete the old return. 697 Next->eraseFromParent(); 698 } while (MI.getNextNode()); 699 700 // We expect to lose the debug location from the return. 
701 LocObserver.checkpoint(false); 702 } 703 704 return LegalizerHelper::Legalized; 705 } 706 707 static RTLIB::Libcall getOutlineAtomicLibcall(MachineInstr &MI) { 708 unsigned Opc = MI.getOpcode(); 709 auto &AtomicMI = cast<GMemOperation>(MI); 710 auto &MMO = AtomicMI.getMMO(); 711 auto Ordering = MMO.getMergedOrdering(); 712 LLT MemType = MMO.getMemoryType(); 713 uint64_t MemSize = MemType.getSizeInBytes(); 714 if (MemType.isVector()) 715 return RTLIB::UNKNOWN_LIBCALL; 716 717 #define LCALLS(A, B) \ 718 { A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL } 719 #define LCALL5(A) \ 720 LCALLS(A, 1), LCALLS(A, 2), LCALLS(A, 4), LCALLS(A, 8), LCALLS(A, 16) 721 switch (Opc) { 722 case TargetOpcode::G_ATOMIC_CMPXCHG: 723 case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: { 724 const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_CAS)}; 725 return getOutlineAtomicHelper(LC, Ordering, MemSize); 726 } 727 case TargetOpcode::G_ATOMICRMW_XCHG: { 728 const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_SWP)}; 729 return getOutlineAtomicHelper(LC, Ordering, MemSize); 730 } 731 case TargetOpcode::G_ATOMICRMW_ADD: 732 case TargetOpcode::G_ATOMICRMW_SUB: { 733 const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDADD)}; 734 return getOutlineAtomicHelper(LC, Ordering, MemSize); 735 } 736 case TargetOpcode::G_ATOMICRMW_AND: { 737 const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDCLR)}; 738 return getOutlineAtomicHelper(LC, Ordering, MemSize); 739 } 740 case TargetOpcode::G_ATOMICRMW_OR: { 741 const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDSET)}; 742 return getOutlineAtomicHelper(LC, Ordering, MemSize); 743 } 744 case TargetOpcode::G_ATOMICRMW_XOR: { 745 const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDEOR)}; 746 return getOutlineAtomicHelper(LC, Ordering, MemSize); 747 } 748 default: 749 return RTLIB::UNKNOWN_LIBCALL; 750 } 751 #undef LCALLS 752 #undef LCALL5 753 } 754 755 static LegalizerHelper::LegalizeResult 756 createAtomicLibcall(MachineIRBuilder &MIRBuilder, MachineInstr &MI) { 757 auto &Ctx = MIRBuilder.getMF().getFunction().getContext(); 758 759 Type *RetTy; 760 SmallVector<Register> RetRegs; 761 SmallVector<CallLowering::ArgInfo, 3> Args; 762 unsigned Opc = MI.getOpcode(); 763 switch (Opc) { 764 case TargetOpcode::G_ATOMIC_CMPXCHG: 765 case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: { 766 Register Success; 767 LLT SuccessLLT; 768 auto [Ret, RetLLT, Mem, MemLLT, Cmp, CmpLLT, New, NewLLT] = 769 MI.getFirst4RegLLTs(); 770 RetRegs.push_back(Ret); 771 RetTy = IntegerType::get(Ctx, RetLLT.getSizeInBits()); 772 if (Opc == TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS) { 773 std::tie(Ret, RetLLT, Success, SuccessLLT, Mem, MemLLT, Cmp, CmpLLT, New, 774 NewLLT) = MI.getFirst5RegLLTs(); 775 RetRegs.push_back(Success); 776 RetTy = StructType::get( 777 Ctx, {RetTy, IntegerType::get(Ctx, SuccessLLT.getSizeInBits())}); 778 } 779 Args.push_back({Cmp, IntegerType::get(Ctx, CmpLLT.getSizeInBits()), 0}); 780 Args.push_back({New, IntegerType::get(Ctx, NewLLT.getSizeInBits()), 0}); 781 Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0}); 782 break; 783 } 784 case TargetOpcode::G_ATOMICRMW_XCHG: 785 case TargetOpcode::G_ATOMICRMW_ADD: 786 case TargetOpcode::G_ATOMICRMW_SUB: 787 case TargetOpcode::G_ATOMICRMW_AND: 788 case TargetOpcode::G_ATOMICRMW_OR: 789 case TargetOpcode::G_ATOMICRMW_XOR: { 790 auto [Ret, RetLLT, Mem, MemLLT, Val, ValLLT] = MI.getFirst3RegLLTs(); 791 RetRegs.push_back(Ret); 792 RetTy = 
IntegerType::get(Ctx, RetLLT.getSizeInBits()); 793 if (Opc == TargetOpcode::G_ATOMICRMW_AND) 794 Val = 795 MIRBuilder.buildXor(ValLLT, MIRBuilder.buildConstant(ValLLT, -1), Val) 796 .getReg(0); 797 else if (Opc == TargetOpcode::G_ATOMICRMW_SUB) 798 Val = 799 MIRBuilder.buildSub(ValLLT, MIRBuilder.buildConstant(ValLLT, 0), Val) 800 .getReg(0); 801 Args.push_back({Val, IntegerType::get(Ctx, ValLLT.getSizeInBits()), 0}); 802 Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0}); 803 break; 804 } 805 default: 806 llvm_unreachable("unsupported opcode"); 807 } 808 809 auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering(); 810 auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering(); 811 RTLIB::Libcall RTLibcall = getOutlineAtomicLibcall(MI); 812 const char *Name = TLI.getLibcallName(RTLibcall); 813 814 // Unsupported libcall on the target. 815 if (!Name) { 816 LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for " 817 << MIRBuilder.getTII().getName(Opc) << "\n"); 818 return LegalizerHelper::UnableToLegalize; 819 } 820 821 CallLowering::CallLoweringInfo Info; 822 Info.CallConv = TLI.getLibcallCallingConv(RTLibcall); 823 Info.Callee = MachineOperand::CreateES(Name); 824 Info.OrigRet = CallLowering::ArgInfo(RetRegs, RetTy, 0); 825 826 std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs)); 827 if (!CLI.lowerCall(MIRBuilder, Info)) 828 return LegalizerHelper::UnableToLegalize; 829 830 return LegalizerHelper::Legalized; 831 } 832 833 static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType, 834 Type *FromType) { 835 auto ToMVT = MVT::getVT(ToType); 836 auto FromMVT = MVT::getVT(FromType); 837 838 switch (Opcode) { 839 case TargetOpcode::G_FPEXT: 840 return RTLIB::getFPEXT(FromMVT, ToMVT); 841 case TargetOpcode::G_FPTRUNC: 842 return RTLIB::getFPROUND(FromMVT, ToMVT); 843 case TargetOpcode::G_FPTOSI: 844 return RTLIB::getFPTOSINT(FromMVT, ToMVT); 845 case TargetOpcode::G_FPTOUI: 846 return RTLIB::getFPTOUINT(FromMVT, ToMVT); 847 case TargetOpcode::G_SITOFP: 848 return RTLIB::getSINTTOFP(FromMVT, ToMVT); 849 case TargetOpcode::G_UITOFP: 850 return RTLIB::getUINTTOFP(FromMVT, ToMVT); 851 } 852 llvm_unreachable("Unsupported libcall function"); 853 } 854 855 static LegalizerHelper::LegalizeResult 856 conversionLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, Type *ToType, 857 Type *FromType, LostDebugLocObserver &LocObserver) { 858 RTLIB::Libcall Libcall = getConvRTLibDesc(MI.getOpcode(), ToType, FromType); 859 return createLibcall( 860 MIRBuilder, Libcall, {MI.getOperand(0).getReg(), ToType, 0}, 861 {{MI.getOperand(1).getReg(), FromType, 0}}, LocObserver, &MI); 862 } 863 864 static RTLIB::Libcall 865 getStateLibraryFunctionFor(MachineInstr &MI, const TargetLowering &TLI) { 866 RTLIB::Libcall RTLibcall; 867 switch (MI.getOpcode()) { 868 case TargetOpcode::G_GET_FPENV: 869 RTLibcall = RTLIB::FEGETENV; 870 break; 871 case TargetOpcode::G_SET_FPENV: 872 case TargetOpcode::G_RESET_FPENV: 873 RTLibcall = RTLIB::FESETENV; 874 break; 875 case TargetOpcode::G_GET_FPMODE: 876 RTLibcall = RTLIB::FEGETMODE; 877 break; 878 case TargetOpcode::G_SET_FPMODE: 879 case TargetOpcode::G_RESET_FPMODE: 880 RTLibcall = RTLIB::FESETMODE; 881 break; 882 default: 883 llvm_unreachable("Unexpected opcode"); 884 } 885 return RTLibcall; 886 } 887 888 // Some library functions that read FP state (fegetmode, fegetenv) write the 889 // state into a region in memory. 
// IR intrinsics that do the same operations (get_fpmode, get_fpenv) return the
// state as an integer value. To implement these intrinsics via the library
// functions, we need to use a temporary variable, for example:
//
//   %0:_(s32) = G_GET_FPMODE
//
// is transformed to:
//
//   %1:_(p0) = G_FRAME_INDEX %stack.0
//   BL &fegetmode
//   %0:_(s32) = G_LOAD %1
//
LegalizerHelper::LegalizeResult
LegalizerHelper::createGetStateLibcall(MachineIRBuilder &MIRBuilder,
                                       MachineInstr &MI,
                                       LostDebugLocObserver &LocObserver) {
  const DataLayout &DL = MIRBuilder.getDataLayout();
  auto &MF = MIRBuilder.getMF();
  auto &MRI = *MIRBuilder.getMRI();
  auto &Ctx = MF.getFunction().getContext();

  // Create a temporary where the library function will put the read state.
  Register Dst = MI.getOperand(0).getReg();
  LLT StateTy = MRI.getType(Dst);
  TypeSize StateSize = StateTy.getSizeInBytes();
  Align TempAlign = getStackTemporaryAlignment(StateTy);
  MachinePointerInfo TempPtrInfo;
  auto Temp = createStackTemporary(StateSize, TempAlign, TempPtrInfo);

  // Create a call to the library function, with the temporary as an argument.
  unsigned TempAddrSpace = DL.getAllocaAddrSpace();
  Type *StatePtrTy = PointerType::get(Ctx, TempAddrSpace);
  RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
  auto Res =
      createLibcall(MIRBuilder, RTLibcall,
                    CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
                    CallLowering::ArgInfo({Temp.getReg(0), StatePtrTy, 0}),
                    LocObserver, nullptr);
  if (Res != LegalizerHelper::Legalized)
    return Res;

  // Create a load from the temporary.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      TempPtrInfo, MachineMemOperand::MOLoad, StateTy, TempAlign);
  MIRBuilder.buildLoadInstr(TargetOpcode::G_LOAD, Dst, Temp, *MMO);

  return LegalizerHelper::Legalized;
}

// Similar to `createGetStateLibcall`, this function calls a library function
// using transient stack space. In this case the library function reads the
// content of the memory region.
LegalizerHelper::LegalizeResult
LegalizerHelper::createSetStateLibcall(MachineIRBuilder &MIRBuilder,
                                       MachineInstr &MI,
                                       LostDebugLocObserver &LocObserver) {
  const DataLayout &DL = MIRBuilder.getDataLayout();
  auto &MF = MIRBuilder.getMF();
  auto &MRI = *MIRBuilder.getMRI();
  auto &Ctx = MF.getFunction().getContext();

  // Create a temporary from which the library function will read the new
  // state.
  Register Src = MI.getOperand(0).getReg();
  LLT StateTy = MRI.getType(Src);
  TypeSize StateSize = StateTy.getSizeInBytes();
  Align TempAlign = getStackTemporaryAlignment(StateTy);
  MachinePointerInfo TempPtrInfo;
  auto Temp = createStackTemporary(StateSize, TempAlign, TempPtrInfo);

  // Put the new state into the temporary.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      TempPtrInfo, MachineMemOperand::MOStore, StateTy, TempAlign);
  MIRBuilder.buildStore(Src, Temp, *MMO);

  // Create a call to the library function, with the temporary as an argument.
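  // The overall effect mirrors the G_GET_FPMODE example above; e.g. (register
  // numbers are illustrative) G_SET_FPMODE %0:_(s32) becomes roughly:
  //
  //   %1:_(p0) = G_FRAME_INDEX %stack.0
  //   G_STORE %0:_(s32), %1:_(p0)
  //   BL &fesetmode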
  unsigned TempAddrSpace = DL.getAllocaAddrSpace();
  Type *StatePtrTy = PointerType::get(Ctx, TempAddrSpace);
  RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
  return createLibcall(MIRBuilder, RTLibcall,
                       CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
                       CallLowering::ArgInfo({Temp.getReg(0), StatePtrTy, 0}),
                       LocObserver, nullptr);
}

// This function is used to legalize operations that set the default
// environment state. In the C library a call like `fesetmode(FE_DFL_MODE)` is
// used for that. On most targets supported by glibc, FE_DFL_MODE is defined as
// `((const femode_t *) -1)`, and that assumption is used here. If it does not
// hold for some target, the target must provide custom lowering.
LegalizerHelper::LegalizeResult
LegalizerHelper::createResetStateLibcall(MachineIRBuilder &MIRBuilder,
                                         MachineInstr &MI,
                                         LostDebugLocObserver &LocObserver) {
  const DataLayout &DL = MIRBuilder.getDataLayout();
  auto &MF = MIRBuilder.getMF();
  auto &Ctx = MF.getFunction().getContext();

  // Create an argument for the library function.
  unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
  Type *StatePtrTy = PointerType::get(Ctx, AddrSpace);
  unsigned PtrSize = DL.getPointerSizeInBits(AddrSpace);
  LLT MemTy = LLT::pointer(AddrSpace, PtrSize);
  auto DefValue = MIRBuilder.buildConstant(LLT::scalar(PtrSize), -1LL);
  DstOp Dest(MRI.createGenericVirtualRegister(MemTy));
  MIRBuilder.buildIntToPtr(Dest, DefValue);

  RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
  return createLibcall(MIRBuilder, RTLibcall,
                       CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
                       CallLowering::ArgInfo({Dest.getReg(), StatePtrTy, 0}),
                       LocObserver, &MI);
}

LegalizerHelper::LegalizeResult
LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = IntegerType::get(Ctx, Size);
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy, LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FLDEXP:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP10:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FRINT:
  case TargetOpcode::G_FNEARBYINT:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
    LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    if (!HLTy || (Size != 32 &&
Size != 64 && Size != 80 && Size != 128)) { 1052 LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n"); 1053 return UnableToLegalize; 1054 } 1055 auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy, LocObserver); 1056 if (Status != Legalized) 1057 return Status; 1058 break; 1059 } 1060 case TargetOpcode::G_FPOWI: { 1061 LLT LLTy = MRI.getType(MI.getOperand(0).getReg()); 1062 unsigned Size = LLTy.getSizeInBits(); 1063 Type *HLTy = getFloatTypeForLLT(Ctx, LLTy); 1064 Type *ITy = IntegerType::get( 1065 Ctx, MRI.getType(MI.getOperand(2).getReg()).getSizeInBits()); 1066 if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) { 1067 LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n"); 1068 return UnableToLegalize; 1069 } 1070 auto Libcall = getRTLibDesc(MI.getOpcode(), Size); 1071 std::initializer_list<CallLowering::ArgInfo> Args = { 1072 {MI.getOperand(1).getReg(), HLTy, 0}, 1073 {MI.getOperand(2).getReg(), ITy, 1}}; 1074 LegalizeResult Status = 1075 createLibcall(MIRBuilder, Libcall, {MI.getOperand(0).getReg(), HLTy, 0}, 1076 Args, LocObserver, &MI); 1077 if (Status != Legalized) 1078 return Status; 1079 break; 1080 } 1081 case TargetOpcode::G_FPEXT: 1082 case TargetOpcode::G_FPTRUNC: { 1083 Type *FromTy = getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(1).getReg())); 1084 Type *ToTy = getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg())); 1085 if (!FromTy || !ToTy) 1086 return UnableToLegalize; 1087 LegalizeResult Status = 1088 conversionLibcall(MI, MIRBuilder, ToTy, FromTy, LocObserver); 1089 if (Status != Legalized) 1090 return Status; 1091 break; 1092 } 1093 case TargetOpcode::G_FPTOSI: 1094 case TargetOpcode::G_FPTOUI: { 1095 // FIXME: Support other types 1096 unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 1097 unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 1098 if ((ToSize != 32 && ToSize != 64) || (FromSize != 32 && FromSize != 64)) 1099 return UnableToLegalize; 1100 LegalizeResult Status = conversionLibcall( 1101 MI, MIRBuilder, 1102 ToSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx), 1103 FromSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx), 1104 LocObserver); 1105 if (Status != Legalized) 1106 return Status; 1107 break; 1108 } 1109 case TargetOpcode::G_SITOFP: 1110 case TargetOpcode::G_UITOFP: { 1111 // FIXME: Support other types 1112 unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 1113 unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 1114 if ((FromSize != 32 && FromSize != 64) || (ToSize != 32 && ToSize != 64)) 1115 return UnableToLegalize; 1116 LegalizeResult Status = conversionLibcall( 1117 MI, MIRBuilder, 1118 ToSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx), 1119 FromSize == 32 ? 
Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx), 1120 LocObserver); 1121 if (Status != Legalized) 1122 return Status; 1123 break; 1124 } 1125 case TargetOpcode::G_ATOMICRMW_XCHG: 1126 case TargetOpcode::G_ATOMICRMW_ADD: 1127 case TargetOpcode::G_ATOMICRMW_SUB: 1128 case TargetOpcode::G_ATOMICRMW_AND: 1129 case TargetOpcode::G_ATOMICRMW_OR: 1130 case TargetOpcode::G_ATOMICRMW_XOR: 1131 case TargetOpcode::G_ATOMIC_CMPXCHG: 1132 case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: { 1133 auto Status = createAtomicLibcall(MIRBuilder, MI); 1134 if (Status != Legalized) 1135 return Status; 1136 break; 1137 } 1138 case TargetOpcode::G_BZERO: 1139 case TargetOpcode::G_MEMCPY: 1140 case TargetOpcode::G_MEMMOVE: 1141 case TargetOpcode::G_MEMSET: { 1142 LegalizeResult Result = 1143 createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI, LocObserver); 1144 if (Result != Legalized) 1145 return Result; 1146 MI.eraseFromParent(); 1147 return Result; 1148 } 1149 case TargetOpcode::G_GET_FPENV: 1150 case TargetOpcode::G_GET_FPMODE: { 1151 LegalizeResult Result = createGetStateLibcall(MIRBuilder, MI, LocObserver); 1152 if (Result != Legalized) 1153 return Result; 1154 break; 1155 } 1156 case TargetOpcode::G_SET_FPENV: 1157 case TargetOpcode::G_SET_FPMODE: { 1158 LegalizeResult Result = createSetStateLibcall(MIRBuilder, MI, LocObserver); 1159 if (Result != Legalized) 1160 return Result; 1161 break; 1162 } 1163 case TargetOpcode::G_RESET_FPENV: 1164 case TargetOpcode::G_RESET_FPMODE: { 1165 LegalizeResult Result = 1166 createResetStateLibcall(MIRBuilder, MI, LocObserver); 1167 if (Result != Legalized) 1168 return Result; 1169 break; 1170 } 1171 } 1172 1173 MI.eraseFromParent(); 1174 return Legalized; 1175 } 1176 1177 LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, 1178 unsigned TypeIdx, 1179 LLT NarrowTy) { 1180 uint64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 1181 uint64_t NarrowSize = NarrowTy.getSizeInBits(); 1182 1183 switch (MI.getOpcode()) { 1184 default: 1185 return UnableToLegalize; 1186 case TargetOpcode::G_IMPLICIT_DEF: { 1187 Register DstReg = MI.getOperand(0).getReg(); 1188 LLT DstTy = MRI.getType(DstReg); 1189 1190 // If SizeOp0 is not an exact multiple of NarrowSize, emit 1191 // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed. 1192 // FIXME: Although this would also be legal for the general case, it causes 1193 // a lot of regressions in the emitted code (superfluous COPYs, artifact 1194 // combines not being hit). This seems to be a problem related to the 1195 // artifact combiner. 
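    // For example (illustrative): with NarrowTy s64,
    //   %0:_(s88) = G_IMPLICIT_DEF
    // becomes
    //   %1:_(s64) = G_IMPLICIT_DEF
    //   %0:_(s88) = G_ANYEXT %1:_(s64)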
1196 if (SizeOp0 % NarrowSize != 0) { 1197 LLT ImplicitTy = NarrowTy; 1198 if (DstTy.isVector()) 1199 ImplicitTy = LLT::vector(DstTy.getElementCount(), ImplicitTy); 1200 1201 Register ImplicitReg = MIRBuilder.buildUndef(ImplicitTy).getReg(0); 1202 MIRBuilder.buildAnyExt(DstReg, ImplicitReg); 1203 1204 MI.eraseFromParent(); 1205 return Legalized; 1206 } 1207 1208 int NumParts = SizeOp0 / NarrowSize; 1209 1210 SmallVector<Register, 2> DstRegs; 1211 for (int i = 0; i < NumParts; ++i) 1212 DstRegs.push_back(MIRBuilder.buildUndef(NarrowTy).getReg(0)); 1213 1214 if (DstTy.isVector()) 1215 MIRBuilder.buildBuildVector(DstReg, DstRegs); 1216 else 1217 MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs); 1218 MI.eraseFromParent(); 1219 return Legalized; 1220 } 1221 case TargetOpcode::G_CONSTANT: { 1222 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 1223 const APInt &Val = MI.getOperand(1).getCImm()->getValue(); 1224 unsigned TotalSize = Ty.getSizeInBits(); 1225 unsigned NarrowSize = NarrowTy.getSizeInBits(); 1226 int NumParts = TotalSize / NarrowSize; 1227 1228 SmallVector<Register, 4> PartRegs; 1229 for (int I = 0; I != NumParts; ++I) { 1230 unsigned Offset = I * NarrowSize; 1231 auto K = MIRBuilder.buildConstant(NarrowTy, 1232 Val.lshr(Offset).trunc(NarrowSize)); 1233 PartRegs.push_back(K.getReg(0)); 1234 } 1235 1236 LLT LeftoverTy; 1237 unsigned LeftoverBits = TotalSize - NumParts * NarrowSize; 1238 SmallVector<Register, 1> LeftoverRegs; 1239 if (LeftoverBits != 0) { 1240 LeftoverTy = LLT::scalar(LeftoverBits); 1241 auto K = MIRBuilder.buildConstant( 1242 LeftoverTy, 1243 Val.lshr(NumParts * NarrowSize).trunc(LeftoverBits)); 1244 LeftoverRegs.push_back(K.getReg(0)); 1245 } 1246 1247 insertParts(MI.getOperand(0).getReg(), 1248 Ty, NarrowTy, PartRegs, LeftoverTy, LeftoverRegs); 1249 1250 MI.eraseFromParent(); 1251 return Legalized; 1252 } 1253 case TargetOpcode::G_SEXT: 1254 case TargetOpcode::G_ZEXT: 1255 case TargetOpcode::G_ANYEXT: 1256 return narrowScalarExt(MI, TypeIdx, NarrowTy); 1257 case TargetOpcode::G_TRUNC: { 1258 if (TypeIdx != 1) 1259 return UnableToLegalize; 1260 1261 uint64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 1262 if (NarrowTy.getSizeInBits() * 2 != SizeOp1) { 1263 LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n"); 1264 return UnableToLegalize; 1265 } 1266 1267 auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1)); 1268 MIRBuilder.buildCopy(MI.getOperand(0), Unmerge.getReg(0)); 1269 MI.eraseFromParent(); 1270 return Legalized; 1271 } 1272 1273 case TargetOpcode::G_FREEZE: { 1274 if (TypeIdx != 0) 1275 return UnableToLegalize; 1276 1277 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 1278 // Should widen scalar first 1279 if (Ty.getSizeInBits() % NarrowTy.getSizeInBits() != 0) 1280 return UnableToLegalize; 1281 1282 auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1).getReg()); 1283 SmallVector<Register, 8> Parts; 1284 for (unsigned i = 0; i < Unmerge->getNumDefs(); ++i) { 1285 Parts.push_back( 1286 MIRBuilder.buildFreeze(NarrowTy, Unmerge.getReg(i)).getReg(0)); 1287 } 1288 1289 MIRBuilder.buildMergeLikeInstr(MI.getOperand(0).getReg(), Parts); 1290 MI.eraseFromParent(); 1291 return Legalized; 1292 } 1293 case TargetOpcode::G_ADD: 1294 case TargetOpcode::G_SUB: 1295 case TargetOpcode::G_SADDO: 1296 case TargetOpcode::G_SSUBO: 1297 case TargetOpcode::G_SADDE: 1298 case TargetOpcode::G_SSUBE: 1299 case TargetOpcode::G_UADDO: 1300 case TargetOpcode::G_USUBO: 1301 case TargetOpcode::G_UADDE: 1302 case 
TargetOpcode::G_USUBE: 1303 return narrowScalarAddSub(MI, TypeIdx, NarrowTy); 1304 case TargetOpcode::G_MUL: 1305 case TargetOpcode::G_UMULH: 1306 return narrowScalarMul(MI, NarrowTy); 1307 case TargetOpcode::G_EXTRACT: 1308 return narrowScalarExtract(MI, TypeIdx, NarrowTy); 1309 case TargetOpcode::G_INSERT: 1310 return narrowScalarInsert(MI, TypeIdx, NarrowTy); 1311 case TargetOpcode::G_LOAD: { 1312 auto &LoadMI = cast<GLoad>(MI); 1313 Register DstReg = LoadMI.getDstReg(); 1314 LLT DstTy = MRI.getType(DstReg); 1315 if (DstTy.isVector()) 1316 return UnableToLegalize; 1317 1318 if (8 * LoadMI.getMemSize() != DstTy.getSizeInBits()) { 1319 Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy); 1320 MIRBuilder.buildLoad(TmpReg, LoadMI.getPointerReg(), LoadMI.getMMO()); 1321 MIRBuilder.buildAnyExt(DstReg, TmpReg); 1322 LoadMI.eraseFromParent(); 1323 return Legalized; 1324 } 1325 1326 return reduceLoadStoreWidth(LoadMI, TypeIdx, NarrowTy); 1327 } 1328 case TargetOpcode::G_ZEXTLOAD: 1329 case TargetOpcode::G_SEXTLOAD: { 1330 auto &LoadMI = cast<GExtLoad>(MI); 1331 Register DstReg = LoadMI.getDstReg(); 1332 Register PtrReg = LoadMI.getPointerReg(); 1333 1334 Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy); 1335 auto &MMO = LoadMI.getMMO(); 1336 unsigned MemSize = MMO.getSizeInBits(); 1337 1338 if (MemSize == NarrowSize) { 1339 MIRBuilder.buildLoad(TmpReg, PtrReg, MMO); 1340 } else if (MemSize < NarrowSize) { 1341 MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), TmpReg, PtrReg, MMO); 1342 } else if (MemSize > NarrowSize) { 1343 // FIXME: Need to split the load. 1344 return UnableToLegalize; 1345 } 1346 1347 if (isa<GZExtLoad>(LoadMI)) 1348 MIRBuilder.buildZExt(DstReg, TmpReg); 1349 else 1350 MIRBuilder.buildSExt(DstReg, TmpReg); 1351 1352 LoadMI.eraseFromParent(); 1353 return Legalized; 1354 } 1355 case TargetOpcode::G_STORE: { 1356 auto &StoreMI = cast<GStore>(MI); 1357 1358 Register SrcReg = StoreMI.getValueReg(); 1359 LLT SrcTy = MRI.getType(SrcReg); 1360 if (SrcTy.isVector()) 1361 return UnableToLegalize; 1362 1363 int NumParts = SizeOp0 / NarrowSize; 1364 unsigned HandledSize = NumParts * NarrowTy.getSizeInBits(); 1365 unsigned LeftoverBits = SrcTy.getSizeInBits() - HandledSize; 1366 if (SrcTy.isVector() && LeftoverBits != 0) 1367 return UnableToLegalize; 1368 1369 if (8 * StoreMI.getMemSize() != SrcTy.getSizeInBits()) { 1370 Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy); 1371 MIRBuilder.buildTrunc(TmpReg, SrcReg); 1372 MIRBuilder.buildStore(TmpReg, StoreMI.getPointerReg(), StoreMI.getMMO()); 1373 StoreMI.eraseFromParent(); 1374 return Legalized; 1375 } 1376 1377 return reduceLoadStoreWidth(StoreMI, 0, NarrowTy); 1378 } 1379 case TargetOpcode::G_SELECT: 1380 return narrowScalarSelect(MI, TypeIdx, NarrowTy); 1381 case TargetOpcode::G_AND: 1382 case TargetOpcode::G_OR: 1383 case TargetOpcode::G_XOR: { 1384 // Legalize bitwise operation: 1385 // A = BinOp<Ty> B, C 1386 // into: 1387 // B1, ..., BN = G_UNMERGE_VALUES B 1388 // C1, ..., CN = G_UNMERGE_VALUES C 1389 // A1 = BinOp<Ty/N> B1, C2 1390 // ... 
1391 // AN = BinOp<Ty/N> BN, CN 1392 // A = G_MERGE_VALUES A1, ..., AN 1393 return narrowScalarBasic(MI, TypeIdx, NarrowTy); 1394 } 1395 case TargetOpcode::G_SHL: 1396 case TargetOpcode::G_LSHR: 1397 case TargetOpcode::G_ASHR: 1398 return narrowScalarShift(MI, TypeIdx, NarrowTy); 1399 case TargetOpcode::G_CTLZ: 1400 case TargetOpcode::G_CTLZ_ZERO_UNDEF: 1401 case TargetOpcode::G_CTTZ: 1402 case TargetOpcode::G_CTTZ_ZERO_UNDEF: 1403 case TargetOpcode::G_CTPOP: 1404 if (TypeIdx == 1) 1405 switch (MI.getOpcode()) { 1406 case TargetOpcode::G_CTLZ: 1407 case TargetOpcode::G_CTLZ_ZERO_UNDEF: 1408 return narrowScalarCTLZ(MI, TypeIdx, NarrowTy); 1409 case TargetOpcode::G_CTTZ: 1410 case TargetOpcode::G_CTTZ_ZERO_UNDEF: 1411 return narrowScalarCTTZ(MI, TypeIdx, NarrowTy); 1412 case TargetOpcode::G_CTPOP: 1413 return narrowScalarCTPOP(MI, TypeIdx, NarrowTy); 1414 default: 1415 return UnableToLegalize; 1416 } 1417 1418 Observer.changingInstr(MI); 1419 narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT); 1420 Observer.changedInstr(MI); 1421 return Legalized; 1422 case TargetOpcode::G_INTTOPTR: 1423 if (TypeIdx != 1) 1424 return UnableToLegalize; 1425 1426 Observer.changingInstr(MI); 1427 narrowScalarSrc(MI, NarrowTy, 1); 1428 Observer.changedInstr(MI); 1429 return Legalized; 1430 case TargetOpcode::G_PTRTOINT: 1431 if (TypeIdx != 0) 1432 return UnableToLegalize; 1433 1434 Observer.changingInstr(MI); 1435 narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT); 1436 Observer.changedInstr(MI); 1437 return Legalized; 1438 case TargetOpcode::G_PHI: { 1439 // FIXME: add support for when SizeOp0 isn't an exact multiple of 1440 // NarrowSize. 1441 if (SizeOp0 % NarrowSize != 0) 1442 return UnableToLegalize; 1443 1444 unsigned NumParts = SizeOp0 / NarrowSize; 1445 SmallVector<Register, 2> DstRegs(NumParts); 1446 SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2); 1447 Observer.changingInstr(MI); 1448 for (unsigned i = 1; i < MI.getNumOperands(); i += 2) { 1449 MachineBasicBlock &OpMBB = *MI.getOperand(i + 1).getMBB(); 1450 MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward()); 1451 extractParts(MI.getOperand(i).getReg(), NarrowTy, NumParts, 1452 SrcRegs[i / 2], MIRBuilder, MRI); 1453 } 1454 MachineBasicBlock &MBB = *MI.getParent(); 1455 MIRBuilder.setInsertPt(MBB, MI); 1456 for (unsigned i = 0; i < NumParts; ++i) { 1457 DstRegs[i] = MRI.createGenericVirtualRegister(NarrowTy); 1458 MachineInstrBuilder MIB = 1459 MIRBuilder.buildInstr(TargetOpcode::G_PHI).addDef(DstRegs[i]); 1460 for (unsigned j = 1; j < MI.getNumOperands(); j += 2) 1461 MIB.addUse(SrcRegs[j / 2][i]).add(MI.getOperand(j + 1)); 1462 } 1463 MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI()); 1464 MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), DstRegs); 1465 Observer.changedInstr(MI); 1466 MI.eraseFromParent(); 1467 return Legalized; 1468 } 1469 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1470 case TargetOpcode::G_INSERT_VECTOR_ELT: { 1471 if (TypeIdx != 2) 1472 return UnableToLegalize; 1473 1474 int OpIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 
2 : 3; 1475 Observer.changingInstr(MI); 1476 narrowScalarSrc(MI, NarrowTy, OpIdx); 1477 Observer.changedInstr(MI); 1478 return Legalized; 1479 } 1480 case TargetOpcode::G_ICMP: { 1481 Register LHS = MI.getOperand(2).getReg(); 1482 LLT SrcTy = MRI.getType(LHS); 1483 uint64_t SrcSize = SrcTy.getSizeInBits(); 1484 CmpInst::Predicate Pred = 1485 static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); 1486 1487 // TODO: Handle the non-equality case for weird sizes. 1488 if (NarrowSize * 2 != SrcSize && !ICmpInst::isEquality(Pred)) 1489 return UnableToLegalize; 1490 1491 LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover) 1492 SmallVector<Register, 4> LHSPartRegs, LHSLeftoverRegs; 1493 if (!extractParts(LHS, SrcTy, NarrowTy, LeftoverTy, LHSPartRegs, 1494 LHSLeftoverRegs, MIRBuilder, MRI)) 1495 return UnableToLegalize; 1496 1497 LLT Unused; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type. 1498 SmallVector<Register, 4> RHSPartRegs, RHSLeftoverRegs; 1499 if (!extractParts(MI.getOperand(3).getReg(), SrcTy, NarrowTy, Unused, 1500 RHSPartRegs, RHSLeftoverRegs, MIRBuilder, MRI)) 1501 return UnableToLegalize; 1502 1503 // We now have the LHS and RHS of the compare split into narrow-type 1504 // registers, plus potentially some leftover type. 1505 Register Dst = MI.getOperand(0).getReg(); 1506 LLT ResTy = MRI.getType(Dst); 1507 if (ICmpInst::isEquality(Pred)) { 1508 // For each part on the LHS and RHS, keep track of the result of XOR-ing 1509 // them together. For each equal part, the result should be all 0s. For 1510 // each non-equal part, we'll get at least one 1. 1511 auto Zero = MIRBuilder.buildConstant(NarrowTy, 0); 1512 SmallVector<Register, 4> Xors; 1513 for (auto LHSAndRHS : zip(LHSPartRegs, RHSPartRegs)) { 1514 auto LHS = std::get<0>(LHSAndRHS); 1515 auto RHS = std::get<1>(LHSAndRHS); 1516 auto Xor = MIRBuilder.buildXor(NarrowTy, LHS, RHS).getReg(0); 1517 Xors.push_back(Xor); 1518 } 1519 1520 // Build a G_XOR for each leftover register. Each G_XOR must be widened 1521 // to the desired narrow type so that we can OR them together later. 1522 SmallVector<Register, 4> WidenedXors; 1523 for (auto LHSAndRHS : zip(LHSLeftoverRegs, RHSLeftoverRegs)) { 1524 auto LHS = std::get<0>(LHSAndRHS); 1525 auto RHS = std::get<1>(LHSAndRHS); 1526 auto Xor = MIRBuilder.buildXor(LeftoverTy, LHS, RHS).getReg(0); 1527 LLT GCDTy = extractGCDType(WidenedXors, NarrowTy, LeftoverTy, Xor); 1528 buildLCMMergePieces(LeftoverTy, NarrowTy, GCDTy, WidenedXors, 1529 /* PadStrategy = */ TargetOpcode::G_ZEXT); 1530 Xors.insert(Xors.end(), WidenedXors.begin(), WidenedXors.end()); 1531 } 1532 1533 // Now, for each part we broke up, we know if they are equal/not equal 1534 // based off the G_XOR. We can OR these all together and compare against 1535 // 0 to get the result. 1536 assert(Xors.size() >= 2 && "Should have gotten at least two Xors?"); 1537 auto Or = MIRBuilder.buildOr(NarrowTy, Xors[0], Xors[1]); 1538 for (unsigned I = 2, E = Xors.size(); I < E; ++I) 1539 Or = MIRBuilder.buildOr(NarrowTy, Or, Xors[I]); 1540 MIRBuilder.buildICmp(Pred, Dst, Or, Zero); 1541 } else { 1542 // TODO: Handle non-power-of-two types. 
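    // For example (illustrative): an s64 signed-less-than compare narrowed to
    // s32 parts {LHSL, LHSH} and {RHSL, RHSH} becomes roughly:
    //   CmpH   = G_ICMP slt LHSH, RHSH
    //   CmpHEQ = G_ICMP eq  LHSH, RHSH
    //   CmpLU  = G_ICMP ult LHSL, RHSL
    //   Dst    = G_SELECT CmpHEQ, CmpLU, CmpH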
1543 assert(LHSPartRegs.size() == 2 && "Expected exactly 2 LHS part regs?"); 1544 assert(RHSPartRegs.size() == 2 && "Expected exactly 2 RHS part regs?"); 1545 Register LHSL = LHSPartRegs[0]; 1546 Register LHSH = LHSPartRegs[1]; 1547 Register RHSL = RHSPartRegs[0]; 1548 Register RHSH = RHSPartRegs[1]; 1549 MachineInstrBuilder CmpH = MIRBuilder.buildICmp(Pred, ResTy, LHSH, RHSH); 1550 MachineInstrBuilder CmpHEQ = 1551 MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy, LHSH, RHSH); 1552 MachineInstrBuilder CmpLU = MIRBuilder.buildICmp( 1553 ICmpInst::getUnsignedPredicate(Pred), ResTy, LHSL, RHSL); 1554 MIRBuilder.buildSelect(Dst, CmpHEQ, CmpLU, CmpH); 1555 } 1556 MI.eraseFromParent(); 1557 return Legalized; 1558 } 1559 case TargetOpcode::G_SEXT_INREG: { 1560 if (TypeIdx != 0) 1561 return UnableToLegalize; 1562 1563 int64_t SizeInBits = MI.getOperand(2).getImm(); 1564 1565 // So long as the new type has more bits than the bits we're extending we 1566 // don't need to break it apart. 1567 if (NarrowTy.getScalarSizeInBits() > SizeInBits) { 1568 Observer.changingInstr(MI); 1569 // We don't lose any non-extension bits by truncating the src and 1570 // sign-extending the dst. 1571 MachineOperand &MO1 = MI.getOperand(1); 1572 auto TruncMIB = MIRBuilder.buildTrunc(NarrowTy, MO1); 1573 MO1.setReg(TruncMIB.getReg(0)); 1574 1575 MachineOperand &MO2 = MI.getOperand(0); 1576 Register DstExt = MRI.createGenericVirtualRegister(NarrowTy); 1577 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt()); 1578 MIRBuilder.buildSExt(MO2, DstExt); 1579 MO2.setReg(DstExt); 1580 Observer.changedInstr(MI); 1581 return Legalized; 1582 } 1583 1584 // Break it apart. Components below the extension point are unmodified. The 1585 // component containing the extension point becomes a narrower SEXT_INREG. 1586 // Components above it are ashr'd from the component containing the 1587 // extension point. 1588 if (SizeOp0 % NarrowSize != 0) 1589 return UnableToLegalize; 1590 int NumParts = SizeOp0 / NarrowSize; 1591 1592 // List the registers where the destination will be scattered. 1593 SmallVector<Register, 2> DstRegs; 1594 // List the registers where the source will be split. 1595 SmallVector<Register, 2> SrcRegs; 1596 1597 // Create all the temporary registers. 1598 for (int i = 0; i < NumParts; ++i) { 1599 Register SrcReg = MRI.createGenericVirtualRegister(NarrowTy); 1600 1601 SrcRegs.push_back(SrcReg); 1602 } 1603 1604 // Explode the big arguments into smaller chunks. 1605 MIRBuilder.buildUnmerge(SrcRegs, MI.getOperand(1)); 1606 1607 Register AshrCstReg = 1608 MIRBuilder.buildConstant(NarrowTy, NarrowTy.getScalarSizeInBits() - 1) 1609 .getReg(0); 1610 Register FullExtensionReg; 1611 Register PartialExtensionReg; 1612 1613 // Do the operation on each small part. 
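    // Worked example (illustrative): narrowing
    //   %d:_(s128) = G_SEXT_INREG %s, 40
    // with NarrowTy s32: part 0 lies entirely below the extension point and is
    // copied; part 1 contains the extension point and becomes a G_SEXT_INREG
    // with 40 % 32 = 8 bits; parts 2 and 3 are G_ASHR of part 1 by 31, with
    // the first such ashr reused for part 3.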
1614 for (int i = 0; i < NumParts; ++i) { 1615 if ((i + 1) * NarrowTy.getScalarSizeInBits() <= SizeInBits) { 1616 DstRegs.push_back(SrcRegs[i]); 1617 PartialExtensionReg = DstRegs.back(); 1618 } else if (i * NarrowTy.getScalarSizeInBits() >= SizeInBits) { 1619 assert(PartialExtensionReg && 1620 "Expected to visit partial extension before full"); 1621 if (FullExtensionReg) { 1622 DstRegs.push_back(FullExtensionReg); 1623 continue; 1624 } 1625 DstRegs.push_back( 1626 MIRBuilder.buildAShr(NarrowTy, PartialExtensionReg, AshrCstReg) 1627 .getReg(0)); 1628 FullExtensionReg = DstRegs.back(); 1629 } else { 1630 DstRegs.push_back( 1631 MIRBuilder 1632 .buildInstr( 1633 TargetOpcode::G_SEXT_INREG, {NarrowTy}, 1634 {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()}) 1635 .getReg(0)); 1636 PartialExtensionReg = DstRegs.back(); 1637 } 1638 } 1639 1640 // Gather the destination registers into the final destination. 1641 Register DstReg = MI.getOperand(0).getReg(); 1642 MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs); 1643 MI.eraseFromParent(); 1644 return Legalized; 1645 } 1646 case TargetOpcode::G_BSWAP: 1647 case TargetOpcode::G_BITREVERSE: { 1648 if (SizeOp0 % NarrowSize != 0) 1649 return UnableToLegalize; 1650 1651 Observer.changingInstr(MI); 1652 SmallVector<Register, 2> SrcRegs, DstRegs; 1653 unsigned NumParts = SizeOp0 / NarrowSize; 1654 extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs, 1655 MIRBuilder, MRI); 1656 1657 for (unsigned i = 0; i < NumParts; ++i) { 1658 auto DstPart = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy}, 1659 {SrcRegs[NumParts - 1 - i]}); 1660 DstRegs.push_back(DstPart.getReg(0)); 1661 } 1662 1663 MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), DstRegs); 1664 1665 Observer.changedInstr(MI); 1666 MI.eraseFromParent(); 1667 return Legalized; 1668 } 1669 case TargetOpcode::G_PTR_ADD: 1670 case TargetOpcode::G_PTRMASK: { 1671 if (TypeIdx != 1) 1672 return UnableToLegalize; 1673 Observer.changingInstr(MI); 1674 narrowScalarSrc(MI, NarrowTy, 2); 1675 Observer.changedInstr(MI); 1676 return Legalized; 1677 } 1678 case TargetOpcode::G_FPTOUI: 1679 case TargetOpcode::G_FPTOSI: 1680 return narrowScalarFPTOI(MI, TypeIdx, NarrowTy); 1681 case TargetOpcode::G_FPEXT: 1682 if (TypeIdx != 0) 1683 return UnableToLegalize; 1684 Observer.changingInstr(MI); 1685 narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_FPEXT); 1686 Observer.changedInstr(MI); 1687 return Legalized; 1688 case TargetOpcode::G_FLDEXP: 1689 case TargetOpcode::G_STRICT_FLDEXP: 1690 return narrowScalarFLDEXP(MI, TypeIdx, NarrowTy); 1691 } 1692 } 1693 1694 Register LegalizerHelper::coerceToScalar(Register Val) { 1695 LLT Ty = MRI.getType(Val); 1696 if (Ty.isScalar()) 1697 return Val; 1698 1699 const DataLayout &DL = MIRBuilder.getDataLayout(); 1700 LLT NewTy = LLT::scalar(Ty.getSizeInBits()); 1701 if (Ty.isPointer()) { 1702 if (DL.isNonIntegralAddressSpace(Ty.getAddressSpace())) 1703 return Register(); 1704 return MIRBuilder.buildPtrToInt(NewTy, Val).getReg(0); 1705 } 1706 1707 Register NewVal = Val; 1708 1709 assert(Ty.isVector()); 1710 LLT EltTy = Ty.getElementType(); 1711 if (EltTy.isPointer()) 1712 NewVal = MIRBuilder.buildPtrToInt(NewTy, NewVal).getReg(0); 1713 return MIRBuilder.buildBitcast(NewTy, NewVal).getReg(0); 1714 } 1715 1716 void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy, 1717 unsigned OpIdx, unsigned ExtOpcode) { 1718 MachineOperand &MO = MI.getOperand(OpIdx); 1719 auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO}); 1720 MO.setReg(ExtB.getReg(0)); 1721 } 
1722 1723 void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy, 1724 unsigned OpIdx) { 1725 MachineOperand &MO = MI.getOperand(OpIdx); 1726 auto ExtB = MIRBuilder.buildTrunc(NarrowTy, MO); 1727 MO.setReg(ExtB.getReg(0)); 1728 } 1729 1730 void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy, 1731 unsigned OpIdx, unsigned TruncOpcode) { 1732 MachineOperand &MO = MI.getOperand(OpIdx); 1733 Register DstExt = MRI.createGenericVirtualRegister(WideTy); 1734 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt()); 1735 MIRBuilder.buildInstr(TruncOpcode, {MO}, {DstExt}); 1736 MO.setReg(DstExt); 1737 } 1738 1739 void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy, 1740 unsigned OpIdx, unsigned ExtOpcode) { 1741 MachineOperand &MO = MI.getOperand(OpIdx); 1742 Register DstTrunc = MRI.createGenericVirtualRegister(NarrowTy); 1743 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt()); 1744 MIRBuilder.buildInstr(ExtOpcode, {MO}, {DstTrunc}); 1745 MO.setReg(DstTrunc); 1746 } 1747 1748 void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy, 1749 unsigned OpIdx) { 1750 MachineOperand &MO = MI.getOperand(OpIdx); 1751 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt()); 1752 Register Dst = MO.getReg(); 1753 Register DstExt = MRI.createGenericVirtualRegister(WideTy); 1754 MO.setReg(DstExt); 1755 MIRBuilder.buildDeleteTrailingVectorElements(Dst, DstExt); 1756 } 1757 1758 void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy, 1759 unsigned OpIdx) { 1760 MachineOperand &MO = MI.getOperand(OpIdx); 1761 SmallVector<Register, 8> Regs; 1762 MO.setReg(MIRBuilder.buildPadVectorWithUndefElements(MoreTy, MO).getReg(0)); 1763 } 1764 1765 void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) { 1766 MachineOperand &Op = MI.getOperand(OpIdx); 1767 Op.setReg(MIRBuilder.buildBitcast(CastTy, Op).getReg(0)); 1768 } 1769 1770 void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) { 1771 MachineOperand &MO = MI.getOperand(OpIdx); 1772 Register CastDst = MRI.createGenericVirtualRegister(CastTy); 1773 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt()); 1774 MIRBuilder.buildBitcast(MO, CastDst); 1775 MO.setReg(CastDst); 1776 } 1777 1778 LegalizerHelper::LegalizeResult 1779 LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx, 1780 LLT WideTy) { 1781 if (TypeIdx != 1) 1782 return UnableToLegalize; 1783 1784 auto [DstReg, DstTy, Src1Reg, Src1Ty] = MI.getFirst2RegLLTs(); 1785 if (DstTy.isVector()) 1786 return UnableToLegalize; 1787 1788 LLT SrcTy = MRI.getType(Src1Reg); 1789 const int DstSize = DstTy.getSizeInBits(); 1790 const int SrcSize = SrcTy.getSizeInBits(); 1791 const int WideSize = WideTy.getSizeInBits(); 1792 const int NumMerge = (DstSize + WideSize - 1) / WideSize; 1793 1794 unsigned NumOps = MI.getNumOperands(); 1795 unsigned NumSrc = MI.getNumOperands() - 1; 1796 unsigned PartSize = DstTy.getSizeInBits() / NumSrc; 1797 1798 if (WideSize >= DstSize) { 1799 // Directly pack the bits in the target type. 
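    // For illustration only (register names and sizes are just an example;
    // the shift amounts are built as G_CONSTANTs but written inline here):
    // widening
    //   %d:_(s24) = G_MERGE_VALUES %a:_(s8), %b:_(s8), %c:_(s8)
    // to WideTy = s32 packs the pieces with shifts and ors, roughly
    //   %r0:_(s32)  = G_ZEXT %a
    //   %b32:_(s32) = G_ZEXT %b
    //   %t1:_(s32)  = G_SHL %b32, 8
    //   %r1:_(s32)  = G_OR %r0, %t1
    //   %c32:_(s32) = G_ZEXT %c
    //   %t2:_(s32)  = G_SHL %c32, 16
    //   %r2:_(s32)  = G_OR %r1, %t2
    //   %d:_(s24)   = G_TRUNC %r2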
1800 Register ResultReg = MIRBuilder.buildZExt(WideTy, Src1Reg).getReg(0); 1801 1802 for (unsigned I = 2; I != NumOps; ++I) { 1803 const unsigned Offset = (I - 1) * PartSize; 1804 1805 Register SrcReg = MI.getOperand(I).getReg(); 1806 assert(MRI.getType(SrcReg) == LLT::scalar(PartSize)); 1807 1808 auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg); 1809 1810 Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg : 1811 MRI.createGenericVirtualRegister(WideTy); 1812 1813 auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset); 1814 auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt); 1815 MIRBuilder.buildOr(NextResult, ResultReg, Shl); 1816 ResultReg = NextResult; 1817 } 1818 1819 if (WideSize > DstSize) 1820 MIRBuilder.buildTrunc(DstReg, ResultReg); 1821 else if (DstTy.isPointer()) 1822 MIRBuilder.buildIntToPtr(DstReg, ResultReg); 1823 1824 MI.eraseFromParent(); 1825 return Legalized; 1826 } 1827 1828 // Unmerge the original values to the GCD type, and recombine to the next 1829 // multiple greater than the original type. 1830 // 1831 // %3:_(s12) = G_MERGE_VALUES %0:_(s4), %1:_(s4), %2:_(s4) -> s6 1832 // %4:_(s2), %5:_(s2) = G_UNMERGE_VALUES %0 1833 // %6:_(s2), %7:_(s2) = G_UNMERGE_VALUES %1 1834 // %8:_(s2), %9:_(s2) = G_UNMERGE_VALUES %2 1835 // %10:_(s6) = G_MERGE_VALUES %4, %5, %6 1836 // %11:_(s6) = G_MERGE_VALUES %7, %8, %9 1837 // %12:_(s12) = G_MERGE_VALUES %10, %11 1838 // 1839 // Padding with undef if necessary: 1840 // 1841 // %2:_(s8) = G_MERGE_VALUES %0:_(s4), %1:_(s4) -> s6 1842 // %3:_(s2), %4:_(s2) = G_UNMERGE_VALUES %0 1843 // %5:_(s2), %6:_(s2) = G_UNMERGE_VALUES %1 1844 // %7:_(s2) = G_IMPLICIT_DEF 1845 // %8:_(s6) = G_MERGE_VALUES %3, %4, %5 1846 // %9:_(s6) = G_MERGE_VALUES %6, %7, %7 1847 // %10:_(s12) = G_MERGE_VALUES %8, %9 1848 1849 const int GCD = std::gcd(SrcSize, WideSize); 1850 LLT GCDTy = LLT::scalar(GCD); 1851 1852 SmallVector<Register, 8> Parts; 1853 SmallVector<Register, 8> NewMergeRegs; 1854 SmallVector<Register, 8> Unmerges; 1855 LLT WideDstTy = LLT::scalar(NumMerge * WideSize); 1856 1857 // Decompose the original operands if they don't evenly divide. 1858 for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) { 1859 Register SrcReg = MO.getReg(); 1860 if (GCD == SrcSize) { 1861 Unmerges.push_back(SrcReg); 1862 } else { 1863 auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg); 1864 for (int J = 0, JE = Unmerge->getNumOperands() - 1; J != JE; ++J) 1865 Unmerges.push_back(Unmerge.getReg(J)); 1866 } 1867 } 1868 1869 // Pad with undef to the next size that is a multiple of the requested size. 1870 if (static_cast<int>(Unmerges.size()) != NumMerge * WideSize) { 1871 Register UndefReg = MIRBuilder.buildUndef(GCDTy).getReg(0); 1872 for (int I = Unmerges.size(); I != NumMerge * WideSize; ++I) 1873 Unmerges.push_back(UndefReg); 1874 } 1875 1876 const int PartsPerGCD = WideSize / GCD; 1877 1878 // Build merges of each piece. 1879 ArrayRef<Register> Slicer(Unmerges); 1880 for (int I = 0; I != NumMerge; ++I, Slicer = Slicer.drop_front(PartsPerGCD)) { 1881 auto Merge = 1882 MIRBuilder.buildMergeLikeInstr(WideTy, Slicer.take_front(PartsPerGCD)); 1883 NewMergeRegs.push_back(Merge.getReg(0)); 1884 } 1885 1886 // A truncate may be necessary if the requested type doesn't evenly divide the 1887 // original result type. 
1888 if (DstTy.getSizeInBits() == WideDstTy.getSizeInBits()) {
1889 MIRBuilder.buildMergeLikeInstr(DstReg, NewMergeRegs);
1890 } else {
1891 auto FinalMerge = MIRBuilder.buildMergeLikeInstr(WideDstTy, NewMergeRegs);
1892 MIRBuilder.buildTrunc(DstReg, FinalMerge.getReg(0));
1893 }
1894
1895 MI.eraseFromParent();
1896 return Legalized;
1897 }
1898
1899 LegalizerHelper::LegalizeResult
1900 LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
1901 LLT WideTy) {
1902 if (TypeIdx != 0)
1903 return UnableToLegalize;
1904
1905 int NumDst = MI.getNumOperands() - 1;
1906 Register SrcReg = MI.getOperand(NumDst).getReg();
1907 LLT SrcTy = MRI.getType(SrcReg);
1908 if (SrcTy.isVector())
1909 return UnableToLegalize;
1910
1911 Register Dst0Reg = MI.getOperand(0).getReg();
1912 LLT DstTy = MRI.getType(Dst0Reg);
1913 if (!DstTy.isScalar())
1914 return UnableToLegalize;
1915
1916 if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
1917 if (SrcTy.isPointer()) {
1918 const DataLayout &DL = MIRBuilder.getDataLayout();
1919 if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) {
1920 LLVM_DEBUG(
1921 dbgs() << "Not casting non-integral address space integer\n");
1922 return UnableToLegalize;
1923 }
1924
1925 SrcTy = LLT::scalar(SrcTy.getSizeInBits());
1926 SrcReg = MIRBuilder.buildPtrToInt(SrcTy, SrcReg).getReg(0);
1927 }
1928
1929 // Widen SrcTy to WideTy. This does not affect the result, but since the
1930 // user requested this size, it is probably better handled than SrcTy and
1931 // should reduce the total number of legalization artifacts.
1932 if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
1933 SrcTy = WideTy;
1934 SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
1935 }
1936
1937 // There's no unmerge type to target. Directly extract the bits from the
1938 // source type.
1939 unsigned DstSize = DstTy.getSizeInBits();
1940
1941 MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
1942 for (int I = 1; I != NumDst; ++I) {
1943 auto ShiftAmt = MIRBuilder.buildConstant(SrcTy, DstSize * I);
1944 auto Shr = MIRBuilder.buildLShr(SrcTy, SrcReg, ShiftAmt);
1945 MIRBuilder.buildTrunc(MI.getOperand(I), Shr);
1946 }
1947
1948 MI.eraseFromParent();
1949 return Legalized;
1950 }
1951
1952 // Extend the source to a wider type.
1953 LLT LCMTy = getLCMType(SrcTy, WideTy);
1954
1955 Register WideSrc = SrcReg;
1956 if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) {
1957 // TODO: If this is an integral address space, cast to integer and anyext.
1958 if (SrcTy.isPointer()) {
1959 LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n");
1960 return UnableToLegalize;
1961 }
1962
1963 WideSrc = MIRBuilder.buildAnyExt(LCMTy, WideSrc).getReg(0);
1964 }
1965
1966 auto Unmerge = MIRBuilder.buildUnmerge(WideTy, WideSrc);
1967
1968 // Create a sequence of unmerges and merges to the original results. Since we
1969 // may have widened the source, we will need to pad the results with dead defs
1970 // to cover the source register.
1971 // e.g.
widen s48 to s64: 1972 // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96) 1973 // 1974 // => 1975 // %4:_(s192) = G_ANYEXT %0:_(s96) 1976 // %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge 1977 // ; unpack to GCD type, with extra dead defs 1978 // %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64) 1979 // %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64) 1980 // dead %16:_(s16), dead %17, dead %18, dead %18 = G_UNMERGE_VALUES %7:_(s64) 1981 // %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10 ; Remerge to destination 1982 // %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination 1983 const LLT GCDTy = getGCDType(WideTy, DstTy); 1984 const int NumUnmerge = Unmerge->getNumOperands() - 1; 1985 const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits(); 1986 1987 // Directly unmerge to the destination without going through a GCD type 1988 // if possible 1989 if (PartsPerRemerge == 1) { 1990 const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits(); 1991 1992 for (int I = 0; I != NumUnmerge; ++I) { 1993 auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES); 1994 1995 for (int J = 0; J != PartsPerUnmerge; ++J) { 1996 int Idx = I * PartsPerUnmerge + J; 1997 if (Idx < NumDst) 1998 MIB.addDef(MI.getOperand(Idx).getReg()); 1999 else { 2000 // Create dead def for excess components. 2001 MIB.addDef(MRI.createGenericVirtualRegister(DstTy)); 2002 } 2003 } 2004 2005 MIB.addUse(Unmerge.getReg(I)); 2006 } 2007 } else { 2008 SmallVector<Register, 16> Parts; 2009 for (int J = 0; J != NumUnmerge; ++J) 2010 extractGCDType(Parts, GCDTy, Unmerge.getReg(J)); 2011 2012 SmallVector<Register, 8> RemergeParts; 2013 for (int I = 0; I != NumDst; ++I) { 2014 for (int J = 0; J < PartsPerRemerge; ++J) { 2015 const int Idx = I * PartsPerRemerge + J; 2016 RemergeParts.emplace_back(Parts[Idx]); 2017 } 2018 2019 MIRBuilder.buildMergeLikeInstr(MI.getOperand(I).getReg(), RemergeParts); 2020 RemergeParts.clear(); 2021 } 2022 } 2023 2024 MI.eraseFromParent(); 2025 return Legalized; 2026 } 2027 2028 LegalizerHelper::LegalizeResult 2029 LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx, 2030 LLT WideTy) { 2031 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); 2032 unsigned Offset = MI.getOperand(2).getImm(); 2033 2034 if (TypeIdx == 0) { 2035 if (SrcTy.isVector() || DstTy.isVector()) 2036 return UnableToLegalize; 2037 2038 SrcOp Src(SrcReg); 2039 if (SrcTy.isPointer()) { 2040 // Extracts from pointers can be handled only if they are really just 2041 // simple integers. 2042 const DataLayout &DL = MIRBuilder.getDataLayout(); 2043 if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) 2044 return UnableToLegalize; 2045 2046 LLT SrcAsIntTy = LLT::scalar(SrcTy.getSizeInBits()); 2047 Src = MIRBuilder.buildPtrToInt(SrcAsIntTy, Src); 2048 SrcTy = SrcAsIntTy; 2049 } 2050 2051 if (DstTy.isPointer()) 2052 return UnableToLegalize; 2053 2054 if (Offset == 0) { 2055 // Avoid a shift in the degenerate case. 2056 MIRBuilder.buildTrunc(DstReg, 2057 MIRBuilder.buildAnyExtOrTrunc(WideTy, Src)); 2058 MI.eraseFromParent(); 2059 return Legalized; 2060 } 2061 2062 // Do a shift in the source type. 
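    // For illustration only (register names and sizes are just an example):
    // widening the destination of
    //   %d:_(s16) = G_EXTRACT %src:_(s48), 16
    // with WideTy = s64 (wider than the s48 source) expands to roughly
    //   %wide:_(s64) = G_ANYEXT %src
    //   %shr:_(s64)  = G_LSHR %wide, 16
    //   %d:_(s16)    = G_TRUNC %shr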
2063 LLT ShiftTy = SrcTy; 2064 if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) { 2065 Src = MIRBuilder.buildAnyExt(WideTy, Src); 2066 ShiftTy = WideTy; 2067 } 2068 2069 auto LShr = MIRBuilder.buildLShr( 2070 ShiftTy, Src, MIRBuilder.buildConstant(ShiftTy, Offset)); 2071 MIRBuilder.buildTrunc(DstReg, LShr); 2072 MI.eraseFromParent(); 2073 return Legalized; 2074 } 2075 2076 if (SrcTy.isScalar()) { 2077 Observer.changingInstr(MI); 2078 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT); 2079 Observer.changedInstr(MI); 2080 return Legalized; 2081 } 2082 2083 if (!SrcTy.isVector()) 2084 return UnableToLegalize; 2085 2086 if (DstTy != SrcTy.getElementType()) 2087 return UnableToLegalize; 2088 2089 if (Offset % SrcTy.getScalarSizeInBits() != 0) 2090 return UnableToLegalize; 2091 2092 Observer.changingInstr(MI); 2093 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT); 2094 2095 MI.getOperand(2).setImm((WideTy.getSizeInBits() / SrcTy.getSizeInBits()) * 2096 Offset); 2097 widenScalarDst(MI, WideTy.getScalarType(), 0); 2098 Observer.changedInstr(MI); 2099 return Legalized; 2100 } 2101 2102 LegalizerHelper::LegalizeResult 2103 LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx, 2104 LLT WideTy) { 2105 if (TypeIdx != 0 || WideTy.isVector()) 2106 return UnableToLegalize; 2107 Observer.changingInstr(MI); 2108 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT); 2109 widenScalarDst(MI, WideTy); 2110 Observer.changedInstr(MI); 2111 return Legalized; 2112 } 2113 2114 LegalizerHelper::LegalizeResult 2115 LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx, 2116 LLT WideTy) { 2117 unsigned Opcode; 2118 unsigned ExtOpcode; 2119 std::optional<Register> CarryIn; 2120 switch (MI.getOpcode()) { 2121 default: 2122 llvm_unreachable("Unexpected opcode!"); 2123 case TargetOpcode::G_SADDO: 2124 Opcode = TargetOpcode::G_ADD; 2125 ExtOpcode = TargetOpcode::G_SEXT; 2126 break; 2127 case TargetOpcode::G_SSUBO: 2128 Opcode = TargetOpcode::G_SUB; 2129 ExtOpcode = TargetOpcode::G_SEXT; 2130 break; 2131 case TargetOpcode::G_UADDO: 2132 Opcode = TargetOpcode::G_ADD; 2133 ExtOpcode = TargetOpcode::G_ZEXT; 2134 break; 2135 case TargetOpcode::G_USUBO: 2136 Opcode = TargetOpcode::G_SUB; 2137 ExtOpcode = TargetOpcode::G_ZEXT; 2138 break; 2139 case TargetOpcode::G_SADDE: 2140 Opcode = TargetOpcode::G_UADDE; 2141 ExtOpcode = TargetOpcode::G_SEXT; 2142 CarryIn = MI.getOperand(4).getReg(); 2143 break; 2144 case TargetOpcode::G_SSUBE: 2145 Opcode = TargetOpcode::G_USUBE; 2146 ExtOpcode = TargetOpcode::G_SEXT; 2147 CarryIn = MI.getOperand(4).getReg(); 2148 break; 2149 case TargetOpcode::G_UADDE: 2150 Opcode = TargetOpcode::G_UADDE; 2151 ExtOpcode = TargetOpcode::G_ZEXT; 2152 CarryIn = MI.getOperand(4).getReg(); 2153 break; 2154 case TargetOpcode::G_USUBE: 2155 Opcode = TargetOpcode::G_USUBE; 2156 ExtOpcode = TargetOpcode::G_ZEXT; 2157 CarryIn = MI.getOperand(4).getReg(); 2158 break; 2159 } 2160 2161 if (TypeIdx == 1) { 2162 unsigned BoolExtOp = MIRBuilder.getBoolExtOp(WideTy.isVector(), false); 2163 2164 Observer.changingInstr(MI); 2165 if (CarryIn) 2166 widenScalarSrc(MI, WideTy, 4, BoolExtOp); 2167 widenScalarDst(MI, WideTy, 1); 2168 2169 Observer.changedInstr(MI); 2170 return Legalized; 2171 } 2172 2173 auto LHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(2)}); 2174 auto RHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(3)}); 2175 // Do the arithmetic in the larger type. 
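  // For illustration only (register names and sizes are just an example):
  // widening
  //   %r:_(s8), %o:_(s1) = G_SADDO %a:_(s8), %b:_(s8)
  // to WideTy = s32 produces roughly
  //   %a32:_(s32) = G_SEXT %a
  //   %b32:_(s32) = G_SEXT %b
  //   %sum:_(s32) = G_ADD %a32, %b32
  //   %lo:_(s8)   = G_TRUNC %sum
  //   %ext:_(s32) = G_SEXT %lo
  //   %o:_(s1)    = G_ICMP intpred(ne), %sum, %ext
  //   %r:_(s8)    = G_TRUNC %sum
  // The carry-in variants follow the same pattern, but perform the wide
  // arithmetic with G_UADDE/G_USUBE and thread the carry through.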
2176 Register NewOp; 2177 if (CarryIn) { 2178 LLT CarryOutTy = MRI.getType(MI.getOperand(1).getReg()); 2179 NewOp = MIRBuilder 2180 .buildInstr(Opcode, {WideTy, CarryOutTy}, 2181 {LHSExt, RHSExt, *CarryIn}) 2182 .getReg(0); 2183 } else { 2184 NewOp = MIRBuilder.buildInstr(Opcode, {WideTy}, {LHSExt, RHSExt}).getReg(0); 2185 } 2186 LLT OrigTy = MRI.getType(MI.getOperand(0).getReg()); 2187 auto TruncOp = MIRBuilder.buildTrunc(OrigTy, NewOp); 2188 auto ExtOp = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {TruncOp}); 2189 // There is no overflow if the ExtOp is the same as NewOp. 2190 MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1), NewOp, ExtOp); 2191 // Now trunc the NewOp to the original result. 2192 MIRBuilder.buildTrunc(MI.getOperand(0), NewOp); 2193 MI.eraseFromParent(); 2194 return Legalized; 2195 } 2196 2197 LegalizerHelper::LegalizeResult 2198 LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx, 2199 LLT WideTy) { 2200 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT || 2201 MI.getOpcode() == TargetOpcode::G_SSUBSAT || 2202 MI.getOpcode() == TargetOpcode::G_SSHLSAT; 2203 bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT || 2204 MI.getOpcode() == TargetOpcode::G_USHLSAT; 2205 // We can convert this to: 2206 // 1. Any extend iN to iM 2207 // 2. SHL by M-N 2208 // 3. [US][ADD|SUB|SHL]SAT 2209 // 4. L/ASHR by M-N 2210 // 2211 // It may be more efficient to lower this to a min and a max operation in 2212 // the higher precision arithmetic if the promoted operation isn't legal, 2213 // but this decision is up to the target's lowering request. 2214 Register DstReg = MI.getOperand(0).getReg(); 2215 2216 unsigned NewBits = WideTy.getScalarSizeInBits(); 2217 unsigned SHLAmount = NewBits - MRI.getType(DstReg).getScalarSizeInBits(); 2218 2219 // Shifts must zero-extend the RHS to preserve the unsigned quantity, and 2220 // must not left shift the RHS to preserve the shift amount. 2221 auto LHS = MIRBuilder.buildAnyExt(WideTy, MI.getOperand(1)); 2222 auto RHS = IsShift ? MIRBuilder.buildZExt(WideTy, MI.getOperand(2)) 2223 : MIRBuilder.buildAnyExt(WideTy, MI.getOperand(2)); 2224 auto ShiftK = MIRBuilder.buildConstant(WideTy, SHLAmount); 2225 auto ShiftL = MIRBuilder.buildShl(WideTy, LHS, ShiftK); 2226 auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(WideTy, RHS, ShiftK); 2227 2228 auto WideInst = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy}, 2229 {ShiftL, ShiftR}, MI.getFlags()); 2230 2231 // Use a shift that will preserve the number of sign bits when the trunc is 2232 // folded away. 2233 auto Result = IsSigned ? 
MIRBuilder.buildAShr(WideTy, WideInst, ShiftK) 2234 : MIRBuilder.buildLShr(WideTy, WideInst, ShiftK); 2235 2236 MIRBuilder.buildTrunc(DstReg, Result); 2237 MI.eraseFromParent(); 2238 return Legalized; 2239 } 2240 2241 LegalizerHelper::LegalizeResult 2242 LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx, 2243 LLT WideTy) { 2244 if (TypeIdx == 1) { 2245 Observer.changingInstr(MI); 2246 widenScalarDst(MI, WideTy, 1); 2247 Observer.changedInstr(MI); 2248 return Legalized; 2249 } 2250 2251 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO; 2252 auto [Result, OriginalOverflow, LHS, RHS] = MI.getFirst4Regs(); 2253 LLT SrcTy = MRI.getType(LHS); 2254 LLT OverflowTy = MRI.getType(OriginalOverflow); 2255 unsigned SrcBitWidth = SrcTy.getScalarSizeInBits(); 2256 2257 // To determine if the result overflowed in the larger type, we extend the 2258 // input to the larger type, do the multiply (checking if it overflows), 2259 // then also check the high bits of the result to see if overflow happened 2260 // there. 2261 unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT; 2262 auto LeftOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {LHS}); 2263 auto RightOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {RHS}); 2264 2265 // Multiplication cannot overflow if the WideTy is >= 2 * original width, 2266 // so we don't need to check the overflow result of larger type Mulo. 2267 bool WideMulCanOverflow = WideTy.getScalarSizeInBits() < 2 * SrcBitWidth; 2268 2269 unsigned MulOpc = 2270 WideMulCanOverflow ? MI.getOpcode() : (unsigned)TargetOpcode::G_MUL; 2271 2272 MachineInstrBuilder Mulo; 2273 if (WideMulCanOverflow) 2274 Mulo = MIRBuilder.buildInstr(MulOpc, {WideTy, OverflowTy}, 2275 {LeftOperand, RightOperand}); 2276 else 2277 Mulo = MIRBuilder.buildInstr(MulOpc, {WideTy}, {LeftOperand, RightOperand}); 2278 2279 auto Mul = Mulo->getOperand(0); 2280 MIRBuilder.buildTrunc(Result, Mul); 2281 2282 MachineInstrBuilder ExtResult; 2283 // Overflow occurred if it occurred in the larger type, or if the high part 2284 // of the result does not zero/sign-extend the low part. Check this second 2285 // possibility first. 2286 if (IsSigned) { 2287 // For signed, overflow occurred when the high part does not sign-extend 2288 // the low part. 2289 ExtResult = MIRBuilder.buildSExtInReg(WideTy, Mul, SrcBitWidth); 2290 } else { 2291 // Unsigned overflow occurred when the high part does not zero-extend the 2292 // low part. 2293 ExtResult = MIRBuilder.buildZExtInReg(WideTy, Mul, SrcBitWidth); 2294 } 2295 2296 if (WideMulCanOverflow) { 2297 auto Overflow = 2298 MIRBuilder.buildICmp(CmpInst::ICMP_NE, OverflowTy, Mul, ExtResult); 2299 // Finally check if the multiplication in the larger type itself overflowed. 
2300 MIRBuilder.buildOr(OriginalOverflow, Mulo->getOperand(1), Overflow); 2301 } else { 2302 MIRBuilder.buildICmp(CmpInst::ICMP_NE, OriginalOverflow, Mul, ExtResult); 2303 } 2304 MI.eraseFromParent(); 2305 return Legalized; 2306 } 2307 2308 LegalizerHelper::LegalizeResult 2309 LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { 2310 switch (MI.getOpcode()) { 2311 default: 2312 return UnableToLegalize; 2313 case TargetOpcode::G_ATOMICRMW_XCHG: 2314 case TargetOpcode::G_ATOMICRMW_ADD: 2315 case TargetOpcode::G_ATOMICRMW_SUB: 2316 case TargetOpcode::G_ATOMICRMW_AND: 2317 case TargetOpcode::G_ATOMICRMW_OR: 2318 case TargetOpcode::G_ATOMICRMW_XOR: 2319 case TargetOpcode::G_ATOMICRMW_MIN: 2320 case TargetOpcode::G_ATOMICRMW_MAX: 2321 case TargetOpcode::G_ATOMICRMW_UMIN: 2322 case TargetOpcode::G_ATOMICRMW_UMAX: 2323 assert(TypeIdx == 0 && "atomicrmw with second scalar type"); 2324 Observer.changingInstr(MI); 2325 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT); 2326 widenScalarDst(MI, WideTy, 0); 2327 Observer.changedInstr(MI); 2328 return Legalized; 2329 case TargetOpcode::G_ATOMIC_CMPXCHG: 2330 assert(TypeIdx == 0 && "G_ATOMIC_CMPXCHG with second scalar type"); 2331 Observer.changingInstr(MI); 2332 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT); 2333 widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT); 2334 widenScalarDst(MI, WideTy, 0); 2335 Observer.changedInstr(MI); 2336 return Legalized; 2337 case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: 2338 if (TypeIdx == 0) { 2339 Observer.changingInstr(MI); 2340 widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT); 2341 widenScalarSrc(MI, WideTy, 4, TargetOpcode::G_ANYEXT); 2342 widenScalarDst(MI, WideTy, 0); 2343 Observer.changedInstr(MI); 2344 return Legalized; 2345 } 2346 assert(TypeIdx == 1 && 2347 "G_ATOMIC_CMPXCHG_WITH_SUCCESS with third scalar type"); 2348 Observer.changingInstr(MI); 2349 widenScalarDst(MI, WideTy, 1); 2350 Observer.changedInstr(MI); 2351 return Legalized; 2352 case TargetOpcode::G_EXTRACT: 2353 return widenScalarExtract(MI, TypeIdx, WideTy); 2354 case TargetOpcode::G_INSERT: 2355 return widenScalarInsert(MI, TypeIdx, WideTy); 2356 case TargetOpcode::G_MERGE_VALUES: 2357 return widenScalarMergeValues(MI, TypeIdx, WideTy); 2358 case TargetOpcode::G_UNMERGE_VALUES: 2359 return widenScalarUnmergeValues(MI, TypeIdx, WideTy); 2360 case TargetOpcode::G_SADDO: 2361 case TargetOpcode::G_SSUBO: 2362 case TargetOpcode::G_UADDO: 2363 case TargetOpcode::G_USUBO: 2364 case TargetOpcode::G_SADDE: 2365 case TargetOpcode::G_SSUBE: 2366 case TargetOpcode::G_UADDE: 2367 case TargetOpcode::G_USUBE: 2368 return widenScalarAddSubOverflow(MI, TypeIdx, WideTy); 2369 case TargetOpcode::G_UMULO: 2370 case TargetOpcode::G_SMULO: 2371 return widenScalarMulo(MI, TypeIdx, WideTy); 2372 case TargetOpcode::G_SADDSAT: 2373 case TargetOpcode::G_SSUBSAT: 2374 case TargetOpcode::G_SSHLSAT: 2375 case TargetOpcode::G_UADDSAT: 2376 case TargetOpcode::G_USUBSAT: 2377 case TargetOpcode::G_USHLSAT: 2378 return widenScalarAddSubShlSat(MI, TypeIdx, WideTy); 2379 case TargetOpcode::G_CTTZ: 2380 case TargetOpcode::G_CTTZ_ZERO_UNDEF: 2381 case TargetOpcode::G_CTLZ: 2382 case TargetOpcode::G_CTLZ_ZERO_UNDEF: 2383 case TargetOpcode::G_CTPOP: { 2384 if (TypeIdx == 0) { 2385 Observer.changingInstr(MI); 2386 widenScalarDst(MI, WideTy, 0); 2387 Observer.changedInstr(MI); 2388 return Legalized; 2389 } 2390 2391 Register SrcReg = MI.getOperand(1).getReg(); 2392 2393 // First extend the input. 
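    // For illustration only (register names and the s8/s32 sizes are just an
    // example): widening %res:_(s8) = G_CTTZ %x:_(s8) to WideTy = s32 produces
    // roughly
    //   %x32:_(s32) = G_ANYEXT %x
    //   %bit:_(s32) = G_CONSTANT i32 256  ; bit 8, just above the original width
    //   %or:_(s32)  = G_OR %x32, %bit
    //   %cnt:_(s32) = G_CTTZ_ZERO_UNDEF %or
    //   %res:_(s8)  = G_TRUNC %cnt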
2394 unsigned ExtOpc = MI.getOpcode() == TargetOpcode::G_CTTZ || 2395 MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF 2396 ? TargetOpcode::G_ANYEXT 2397 : TargetOpcode::G_ZEXT; 2398 auto MIBSrc = MIRBuilder.buildInstr(ExtOpc, {WideTy}, {SrcReg}); 2399 LLT CurTy = MRI.getType(SrcReg); 2400 unsigned NewOpc = MI.getOpcode(); 2401 if (NewOpc == TargetOpcode::G_CTTZ) { 2402 // The count is the same in the larger type except if the original 2403 // value was zero. This can be handled by setting the bit just off 2404 // the top of the original type. 2405 auto TopBit = 2406 APInt::getOneBitSet(WideTy.getSizeInBits(), CurTy.getSizeInBits()); 2407 MIBSrc = MIRBuilder.buildOr( 2408 WideTy, MIBSrc, MIRBuilder.buildConstant(WideTy, TopBit)); 2409 // Now we know the operand is non-zero, use the more relaxed opcode. 2410 NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF; 2411 } 2412 2413 // Perform the operation at the larger size. 2414 auto MIBNewOp = MIRBuilder.buildInstr(NewOpc, {WideTy}, {MIBSrc}); 2415 // This is already the correct result for CTPOP and CTTZs 2416 if (MI.getOpcode() == TargetOpcode::G_CTLZ || 2417 MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) { 2418 // The correct result is NewOp - (Difference in widety and current ty). 2419 unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits(); 2420 MIBNewOp = MIRBuilder.buildSub( 2421 WideTy, MIBNewOp, MIRBuilder.buildConstant(WideTy, SizeDiff)); 2422 } 2423 2424 MIRBuilder.buildZExtOrTrunc(MI.getOperand(0), MIBNewOp); 2425 MI.eraseFromParent(); 2426 return Legalized; 2427 } 2428 case TargetOpcode::G_BSWAP: { 2429 Observer.changingInstr(MI); 2430 Register DstReg = MI.getOperand(0).getReg(); 2431 2432 Register ShrReg = MRI.createGenericVirtualRegister(WideTy); 2433 Register DstExt = MRI.createGenericVirtualRegister(WideTy); 2434 Register ShiftAmtReg = MRI.createGenericVirtualRegister(WideTy); 2435 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT); 2436 2437 MI.getOperand(0).setReg(DstExt); 2438 2439 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt()); 2440 2441 LLT Ty = MRI.getType(DstReg); 2442 unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits(); 2443 MIRBuilder.buildConstant(ShiftAmtReg, DiffBits); 2444 MIRBuilder.buildLShr(ShrReg, DstExt, ShiftAmtReg); 2445 2446 MIRBuilder.buildTrunc(DstReg, ShrReg); 2447 Observer.changedInstr(MI); 2448 return Legalized; 2449 } 2450 case TargetOpcode::G_BITREVERSE: { 2451 Observer.changingInstr(MI); 2452 2453 Register DstReg = MI.getOperand(0).getReg(); 2454 LLT Ty = MRI.getType(DstReg); 2455 unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits(); 2456 2457 Register DstExt = MRI.createGenericVirtualRegister(WideTy); 2458 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT); 2459 MI.getOperand(0).setReg(DstExt); 2460 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt()); 2461 2462 auto ShiftAmt = MIRBuilder.buildConstant(WideTy, DiffBits); 2463 auto Shift = MIRBuilder.buildLShr(WideTy, DstExt, ShiftAmt); 2464 MIRBuilder.buildTrunc(DstReg, Shift); 2465 Observer.changedInstr(MI); 2466 return Legalized; 2467 } 2468 case TargetOpcode::G_FREEZE: 2469 Observer.changingInstr(MI); 2470 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT); 2471 widenScalarDst(MI, WideTy); 2472 Observer.changedInstr(MI); 2473 return Legalized; 2474 2475 case TargetOpcode::G_ABS: 2476 Observer.changingInstr(MI); 2477 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT); 2478 widenScalarDst(MI, WideTy); 2479 Observer.changedInstr(MI); 2480 
return Legalized;
2481
2482 case TargetOpcode::G_ADD:
2483 case TargetOpcode::G_AND:
2484 case TargetOpcode::G_MUL:
2485 case TargetOpcode::G_OR:
2486 case TargetOpcode::G_XOR:
2487 case TargetOpcode::G_SUB:
2488 // Perform operation at larger width (any extension is fine here, high bits
2489 // don't affect the result) and then truncate the result back to the
2490 // original type.
2491 Observer.changingInstr(MI);
2492 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2493 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2494 widenScalarDst(MI, WideTy);
2495 Observer.changedInstr(MI);
2496 return Legalized;
2497
2498 case TargetOpcode::G_SBFX:
2499 case TargetOpcode::G_UBFX:
2500 Observer.changingInstr(MI);
2501
2502 if (TypeIdx == 0) {
2503 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2504 widenScalarDst(MI, WideTy);
2505 } else {
2506 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2507 widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2508 }
2509
2510 Observer.changedInstr(MI);
2511 return Legalized;
2512
2513 case TargetOpcode::G_SHL:
2514 Observer.changingInstr(MI);
2515
2516 if (TypeIdx == 0) {
2517 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2518 widenScalarDst(MI, WideTy);
2519 } else {
2520 assert(TypeIdx == 1);
2521 // The "number of bits to shift" operand must preserve its value as an
2522 // unsigned integer:
2523 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2524 }
2525
2526 Observer.changedInstr(MI);
2527 return Legalized;
2528
2529 case TargetOpcode::G_ROTR:
2530 case TargetOpcode::G_ROTL:
2531 if (TypeIdx != 1)
2532 return UnableToLegalize;
2533
2534 Observer.changingInstr(MI);
2535 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2536 Observer.changedInstr(MI);
2537 return Legalized;
2538
2539 case TargetOpcode::G_SDIV:
2540 case TargetOpcode::G_SREM:
2541 case TargetOpcode::G_SMIN:
2542 case TargetOpcode::G_SMAX:
2543 Observer.changingInstr(MI);
2544 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2545 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2546 widenScalarDst(MI, WideTy);
2547 Observer.changedInstr(MI);
2548 return Legalized;
2549
2550 case TargetOpcode::G_SDIVREM:
2551 Observer.changingInstr(MI);
2552 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2553 widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2554 widenScalarDst(MI, WideTy);
2555 widenScalarDst(MI, WideTy, 1);
2556 Observer.changedInstr(MI);
2557 return Legalized;
2558
2559 case TargetOpcode::G_ASHR:
2560 case TargetOpcode::G_LSHR:
2561 Observer.changingInstr(MI);
2562
2563 if (TypeIdx == 0) {
2564 unsigned CvtOp = MI.getOpcode() == TargetOpcode::G_ASHR ?
2565 TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT; 2566 2567 widenScalarSrc(MI, WideTy, 1, CvtOp); 2568 widenScalarDst(MI, WideTy); 2569 } else { 2570 assert(TypeIdx == 1); 2571 // The "number of bits to shift" operand must preserve its value as an 2572 // unsigned integer: 2573 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT); 2574 } 2575 2576 Observer.changedInstr(MI); 2577 return Legalized; 2578 case TargetOpcode::G_UDIV: 2579 case TargetOpcode::G_UREM: 2580 case TargetOpcode::G_UMIN: 2581 case TargetOpcode::G_UMAX: 2582 Observer.changingInstr(MI); 2583 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT); 2584 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT); 2585 widenScalarDst(MI, WideTy); 2586 Observer.changedInstr(MI); 2587 return Legalized; 2588 2589 case TargetOpcode::G_UDIVREM: 2590 Observer.changingInstr(MI); 2591 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT); 2592 widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT); 2593 widenScalarDst(MI, WideTy); 2594 widenScalarDst(MI, WideTy, 1); 2595 Observer.changedInstr(MI); 2596 return Legalized; 2597 2598 case TargetOpcode::G_SELECT: 2599 Observer.changingInstr(MI); 2600 if (TypeIdx == 0) { 2601 // Perform operation at larger width (any extension is fine here, high 2602 // bits don't affect the result) and then truncate the result back to the 2603 // original type. 2604 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT); 2605 widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT); 2606 widenScalarDst(MI, WideTy); 2607 } else { 2608 bool IsVec = MRI.getType(MI.getOperand(1).getReg()).isVector(); 2609 // Explicit extension is required here since high bits affect the result. 2610 widenScalarSrc(MI, WideTy, 1, MIRBuilder.getBoolExtOp(IsVec, false)); 2611 } 2612 Observer.changedInstr(MI); 2613 return Legalized; 2614 2615 case TargetOpcode::G_FPTOSI: 2616 case TargetOpcode::G_FPTOUI: 2617 case TargetOpcode::G_IS_FPCLASS: 2618 Observer.changingInstr(MI); 2619 2620 if (TypeIdx == 0) 2621 widenScalarDst(MI, WideTy); 2622 else 2623 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT); 2624 2625 Observer.changedInstr(MI); 2626 return Legalized; 2627 case TargetOpcode::G_SITOFP: 2628 Observer.changingInstr(MI); 2629 2630 if (TypeIdx == 0) 2631 widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); 2632 else 2633 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT); 2634 2635 Observer.changedInstr(MI); 2636 return Legalized; 2637 case TargetOpcode::G_UITOFP: 2638 Observer.changingInstr(MI); 2639 2640 if (TypeIdx == 0) 2641 widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); 2642 else 2643 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT); 2644 2645 Observer.changedInstr(MI); 2646 return Legalized; 2647 case TargetOpcode::G_LOAD: 2648 case TargetOpcode::G_SEXTLOAD: 2649 case TargetOpcode::G_ZEXTLOAD: 2650 Observer.changingInstr(MI); 2651 widenScalarDst(MI, WideTy); 2652 Observer.changedInstr(MI); 2653 return Legalized; 2654 2655 case TargetOpcode::G_STORE: { 2656 if (TypeIdx != 0) 2657 return UnableToLegalize; 2658 2659 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2660 if (!Ty.isScalar()) 2661 return UnableToLegalize; 2662 2663 Observer.changingInstr(MI); 2664 2665 unsigned ExtType = Ty.getScalarSizeInBits() == 1 ? 
2666 TargetOpcode::G_ZEXT : TargetOpcode::G_ANYEXT; 2667 widenScalarSrc(MI, WideTy, 0, ExtType); 2668 2669 Observer.changedInstr(MI); 2670 return Legalized; 2671 } 2672 case TargetOpcode::G_CONSTANT: { 2673 MachineOperand &SrcMO = MI.getOperand(1); 2674 LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext(); 2675 unsigned ExtOpc = LI.getExtOpcodeForWideningConstant( 2676 MRI.getType(MI.getOperand(0).getReg())); 2677 assert((ExtOpc == TargetOpcode::G_ZEXT || ExtOpc == TargetOpcode::G_SEXT || 2678 ExtOpc == TargetOpcode::G_ANYEXT) && 2679 "Illegal Extend"); 2680 const APInt &SrcVal = SrcMO.getCImm()->getValue(); 2681 const APInt &Val = (ExtOpc == TargetOpcode::G_SEXT) 2682 ? SrcVal.sext(WideTy.getSizeInBits()) 2683 : SrcVal.zext(WideTy.getSizeInBits()); 2684 Observer.changingInstr(MI); 2685 SrcMO.setCImm(ConstantInt::get(Ctx, Val)); 2686 2687 widenScalarDst(MI, WideTy); 2688 Observer.changedInstr(MI); 2689 return Legalized; 2690 } 2691 case TargetOpcode::G_FCONSTANT: { 2692 // To avoid changing the bits of the constant due to extension to a larger 2693 // type and then using G_FPTRUNC, we simply convert to a G_CONSTANT. 2694 MachineOperand &SrcMO = MI.getOperand(1); 2695 APInt Val = SrcMO.getFPImm()->getValueAPF().bitcastToAPInt(); 2696 MIRBuilder.setInstrAndDebugLoc(MI); 2697 auto IntCst = MIRBuilder.buildConstant(MI.getOperand(0).getReg(), Val); 2698 widenScalarDst(*IntCst, WideTy, 0, TargetOpcode::G_TRUNC); 2699 MI.eraseFromParent(); 2700 return Legalized; 2701 } 2702 case TargetOpcode::G_IMPLICIT_DEF: { 2703 Observer.changingInstr(MI); 2704 widenScalarDst(MI, WideTy); 2705 Observer.changedInstr(MI); 2706 return Legalized; 2707 } 2708 case TargetOpcode::G_BRCOND: 2709 Observer.changingInstr(MI); 2710 widenScalarSrc(MI, WideTy, 0, MIRBuilder.getBoolExtOp(false, false)); 2711 Observer.changedInstr(MI); 2712 return Legalized; 2713 2714 case TargetOpcode::G_FCMP: 2715 Observer.changingInstr(MI); 2716 if (TypeIdx == 0) 2717 widenScalarDst(MI, WideTy); 2718 else { 2719 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT); 2720 widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_FPEXT); 2721 } 2722 Observer.changedInstr(MI); 2723 return Legalized; 2724 2725 case TargetOpcode::G_ICMP: 2726 Observer.changingInstr(MI); 2727 if (TypeIdx == 0) 2728 widenScalarDst(MI, WideTy); 2729 else { 2730 unsigned ExtOpcode = CmpInst::isSigned(static_cast<CmpInst::Predicate>( 2731 MI.getOperand(1).getPredicate())) 2732 ? 
TargetOpcode::G_SEXT 2733 : TargetOpcode::G_ZEXT; 2734 widenScalarSrc(MI, WideTy, 2, ExtOpcode); 2735 widenScalarSrc(MI, WideTy, 3, ExtOpcode); 2736 } 2737 Observer.changedInstr(MI); 2738 return Legalized; 2739 2740 case TargetOpcode::G_PTR_ADD: 2741 assert(TypeIdx == 1 && "unable to legalize pointer of G_PTR_ADD"); 2742 Observer.changingInstr(MI); 2743 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT); 2744 Observer.changedInstr(MI); 2745 return Legalized; 2746 2747 case TargetOpcode::G_PHI: { 2748 assert(TypeIdx == 0 && "Expecting only Idx 0"); 2749 2750 Observer.changingInstr(MI); 2751 for (unsigned I = 1; I < MI.getNumOperands(); I += 2) { 2752 MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB(); 2753 MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward()); 2754 widenScalarSrc(MI, WideTy, I, TargetOpcode::G_ANYEXT); 2755 } 2756 2757 MachineBasicBlock &MBB = *MI.getParent(); 2758 MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI()); 2759 widenScalarDst(MI, WideTy); 2760 Observer.changedInstr(MI); 2761 return Legalized; 2762 } 2763 case TargetOpcode::G_EXTRACT_VECTOR_ELT: { 2764 if (TypeIdx == 0) { 2765 Register VecReg = MI.getOperand(1).getReg(); 2766 LLT VecTy = MRI.getType(VecReg); 2767 Observer.changingInstr(MI); 2768 2769 widenScalarSrc( 2770 MI, LLT::vector(VecTy.getElementCount(), WideTy.getSizeInBits()), 1, 2771 TargetOpcode::G_ANYEXT); 2772 2773 widenScalarDst(MI, WideTy, 0); 2774 Observer.changedInstr(MI); 2775 return Legalized; 2776 } 2777 2778 if (TypeIdx != 2) 2779 return UnableToLegalize; 2780 Observer.changingInstr(MI); 2781 // TODO: Probably should be zext 2782 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT); 2783 Observer.changedInstr(MI); 2784 return Legalized; 2785 } 2786 case TargetOpcode::G_INSERT_VECTOR_ELT: { 2787 if (TypeIdx == 0) { 2788 Observer.changingInstr(MI); 2789 const LLT WideEltTy = WideTy.getElementType(); 2790 2791 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT); 2792 widenScalarSrc(MI, WideEltTy, 2, TargetOpcode::G_ANYEXT); 2793 widenScalarDst(MI, WideTy, 0); 2794 Observer.changedInstr(MI); 2795 return Legalized; 2796 } 2797 2798 if (TypeIdx == 1) { 2799 Observer.changingInstr(MI); 2800 2801 Register VecReg = MI.getOperand(1).getReg(); 2802 LLT VecTy = MRI.getType(VecReg); 2803 LLT WideVecTy = LLT::vector(VecTy.getElementCount(), WideTy); 2804 2805 widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_ANYEXT); 2806 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT); 2807 widenScalarDst(MI, WideVecTy, 0); 2808 Observer.changedInstr(MI); 2809 return Legalized; 2810 } 2811 2812 if (TypeIdx == 2) { 2813 Observer.changingInstr(MI); 2814 // TODO: Probably should be zext 2815 widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT); 2816 Observer.changedInstr(MI); 2817 return Legalized; 2818 } 2819 2820 return UnableToLegalize; 2821 } 2822 case TargetOpcode::G_FADD: 2823 case TargetOpcode::G_FMUL: 2824 case TargetOpcode::G_FSUB: 2825 case TargetOpcode::G_FMA: 2826 case TargetOpcode::G_FMAD: 2827 case TargetOpcode::G_FNEG: 2828 case TargetOpcode::G_FABS: 2829 case TargetOpcode::G_FCANONICALIZE: 2830 case TargetOpcode::G_FMINNUM: 2831 case TargetOpcode::G_FMAXNUM: 2832 case TargetOpcode::G_FMINNUM_IEEE: 2833 case TargetOpcode::G_FMAXNUM_IEEE: 2834 case TargetOpcode::G_FMINIMUM: 2835 case TargetOpcode::G_FMAXIMUM: 2836 case TargetOpcode::G_FDIV: 2837 case TargetOpcode::G_FREM: 2838 case TargetOpcode::G_FCEIL: 2839 case TargetOpcode::G_FFLOOR: 2840 case TargetOpcode::G_FCOS: 2841 case TargetOpcode::G_FSIN: 2842 case TargetOpcode::G_FLOG10: 2843 
case TargetOpcode::G_FLOG: 2844 case TargetOpcode::G_FLOG2: 2845 case TargetOpcode::G_FRINT: 2846 case TargetOpcode::G_FNEARBYINT: 2847 case TargetOpcode::G_FSQRT: 2848 case TargetOpcode::G_FEXP: 2849 case TargetOpcode::G_FEXP2: 2850 case TargetOpcode::G_FEXP10: 2851 case TargetOpcode::G_FPOW: 2852 case TargetOpcode::G_INTRINSIC_TRUNC: 2853 case TargetOpcode::G_INTRINSIC_ROUND: 2854 case TargetOpcode::G_INTRINSIC_ROUNDEVEN: 2855 assert(TypeIdx == 0); 2856 Observer.changingInstr(MI); 2857 2858 for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) 2859 widenScalarSrc(MI, WideTy, I, TargetOpcode::G_FPEXT); 2860 2861 widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); 2862 Observer.changedInstr(MI); 2863 return Legalized; 2864 case TargetOpcode::G_FPOWI: 2865 case TargetOpcode::G_FLDEXP: 2866 case TargetOpcode::G_STRICT_FLDEXP: { 2867 if (TypeIdx == 0) { 2868 if (MI.getOpcode() == TargetOpcode::G_STRICT_FLDEXP) 2869 return UnableToLegalize; 2870 2871 Observer.changingInstr(MI); 2872 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT); 2873 widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); 2874 Observer.changedInstr(MI); 2875 return Legalized; 2876 } 2877 2878 if (TypeIdx == 1) { 2879 // For some reason SelectionDAG tries to promote to a libcall without 2880 // actually changing the integer type for promotion. 2881 Observer.changingInstr(MI); 2882 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT); 2883 Observer.changedInstr(MI); 2884 return Legalized; 2885 } 2886 2887 return UnableToLegalize; 2888 } 2889 case TargetOpcode::G_FFREXP: { 2890 Observer.changingInstr(MI); 2891 2892 if (TypeIdx == 0) { 2893 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT); 2894 widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); 2895 } else { 2896 widenScalarDst(MI, WideTy, 1); 2897 } 2898 2899 Observer.changedInstr(MI); 2900 return Legalized; 2901 } 2902 case TargetOpcode::G_INTTOPTR: 2903 if (TypeIdx != 1) 2904 return UnableToLegalize; 2905 2906 Observer.changingInstr(MI); 2907 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT); 2908 Observer.changedInstr(MI); 2909 return Legalized; 2910 case TargetOpcode::G_PTRTOINT: 2911 if (TypeIdx != 0) 2912 return UnableToLegalize; 2913 2914 Observer.changingInstr(MI); 2915 widenScalarDst(MI, WideTy, 0); 2916 Observer.changedInstr(MI); 2917 return Legalized; 2918 case TargetOpcode::G_BUILD_VECTOR: { 2919 Observer.changingInstr(MI); 2920 2921 const LLT WideEltTy = TypeIdx == 1 ? WideTy : WideTy.getElementType(); 2922 for (int I = 1, E = MI.getNumOperands(); I != E; ++I) 2923 widenScalarSrc(MI, WideEltTy, I, TargetOpcode::G_ANYEXT); 2924 2925 // Avoid changing the result vector type if the source element type was 2926 // requested. 
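    // For illustration only (register names and sizes are just an example):
    // with TypeIdx == 1, widening the sources of
    //   %v:_(<4 x s8>) = G_BUILD_VECTOR %a:_(s8), %b:_(s8), %c:_(s8), %d:_(s8)
    // to s32 produces roughly
    //   %a32:_(s32) = G_ANYEXT %a
    //   ... (likewise for %b, %c, %d)
    //   %v:_(<4 x s8>) = G_BUILD_VECTOR_TRUNC %a32, %b32, %c32, %d32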
2927 if (TypeIdx == 1) { 2928 MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BUILD_VECTOR_TRUNC)); 2929 } else { 2930 widenScalarDst(MI, WideTy, 0); 2931 } 2932 2933 Observer.changedInstr(MI); 2934 return Legalized; 2935 } 2936 case TargetOpcode::G_SEXT_INREG: 2937 if (TypeIdx != 0) 2938 return UnableToLegalize; 2939 2940 Observer.changingInstr(MI); 2941 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT); 2942 widenScalarDst(MI, WideTy, 0, TargetOpcode::G_TRUNC); 2943 Observer.changedInstr(MI); 2944 return Legalized; 2945 case TargetOpcode::G_PTRMASK: { 2946 if (TypeIdx != 1) 2947 return UnableToLegalize; 2948 Observer.changingInstr(MI); 2949 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT); 2950 Observer.changedInstr(MI); 2951 return Legalized; 2952 } 2953 case TargetOpcode::G_VECREDUCE_FADD: 2954 case TargetOpcode::G_VECREDUCE_FMUL: 2955 case TargetOpcode::G_VECREDUCE_FMIN: 2956 case TargetOpcode::G_VECREDUCE_FMAX: 2957 case TargetOpcode::G_VECREDUCE_FMINIMUM: 2958 case TargetOpcode::G_VECREDUCE_FMAXIMUM: 2959 if (TypeIdx != 0) 2960 return UnableToLegalize; 2961 Observer.changingInstr(MI); 2962 Register VecReg = MI.getOperand(1).getReg(); 2963 LLT VecTy = MRI.getType(VecReg); 2964 LLT WideVecTy = VecTy.isVector() 2965 ? LLT::vector(VecTy.getElementCount(), WideTy) 2966 : WideTy; 2967 widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_FPEXT); 2968 widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); 2969 Observer.changedInstr(MI); 2970 return Legalized; 2971 } 2972 } 2973 2974 static void getUnmergePieces(SmallVectorImpl<Register> &Pieces, 2975 MachineIRBuilder &B, Register Src, LLT Ty) { 2976 auto Unmerge = B.buildUnmerge(Ty, Src); 2977 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 2978 Pieces.push_back(Unmerge.getReg(I)); 2979 } 2980 2981 LegalizerHelper::LegalizeResult 2982 LegalizerHelper::lowerFConstant(MachineInstr &MI) { 2983 Register Dst = MI.getOperand(0).getReg(); 2984 2985 MachineFunction &MF = MIRBuilder.getMF(); 2986 const DataLayout &DL = MIRBuilder.getDataLayout(); 2987 2988 unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace(); 2989 LLT AddrPtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace)); 2990 Align Alignment = Align(DL.getABITypeAlign( 2991 getFloatTypeForLLT(MF.getFunction().getContext(), MRI.getType(Dst)))); 2992 2993 auto Addr = MIRBuilder.buildConstantPool( 2994 AddrPtrTy, MF.getConstantPool()->getConstantPoolIndex( 2995 MI.getOperand(1).getFPImm(), Alignment)); 2996 2997 MachineMemOperand *MMO = MF.getMachineMemOperand( 2998 MachinePointerInfo::getConstantPool(MF), MachineMemOperand::MOLoad, 2999 MRI.getType(Dst), Alignment); 3000 3001 MIRBuilder.buildLoadInstr(TargetOpcode::G_LOAD, Dst, Addr, *MMO); 3002 MI.eraseFromParent(); 3003 3004 return Legalized; 3005 } 3006 3007 LegalizerHelper::LegalizeResult 3008 LegalizerHelper::lowerBitcast(MachineInstr &MI) { 3009 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs(); 3010 if (SrcTy.isVector()) { 3011 LLT SrcEltTy = SrcTy.getElementType(); 3012 SmallVector<Register, 8> SrcRegs; 3013 3014 if (DstTy.isVector()) { 3015 int NumDstElt = DstTy.getNumElements(); 3016 int NumSrcElt = SrcTy.getNumElements(); 3017 3018 LLT DstEltTy = DstTy.getElementType(); 3019 LLT DstCastTy = DstEltTy; // Intermediate bitcast result type 3020 LLT SrcPartTy = SrcEltTy; // Original unmerge result type. 3021 3022 // If there's an element size mismatch, insert intermediate casts to match 3023 // the result element type. 3024 if (NumSrcElt < NumDstElt) { // Source element type is larger. 
3025 // %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>) 3026 // 3027 // => 3028 // 3029 // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0 3030 // %3:_(<2 x s8>) = G_BITCAST %2 3031 // %4:_(<2 x s8>) = G_BITCAST %3 3032 // %1:_(<4 x s16>) = G_CONCAT_VECTORS %3, %4 3033 DstCastTy = LLT::fixed_vector(NumDstElt / NumSrcElt, DstEltTy); 3034 SrcPartTy = SrcEltTy; 3035 } else if (NumSrcElt > NumDstElt) { // Source element type is smaller. 3036 // 3037 // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>) 3038 // 3039 // => 3040 // 3041 // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0 3042 // %3:_(s16) = G_BITCAST %2 3043 // %4:_(s16) = G_BITCAST %3 3044 // %1:_(<2 x s16>) = G_BUILD_VECTOR %3, %4 3045 SrcPartTy = LLT::fixed_vector(NumSrcElt / NumDstElt, SrcEltTy); 3046 DstCastTy = DstEltTy; 3047 } 3048 3049 getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcPartTy); 3050 for (Register &SrcReg : SrcRegs) 3051 SrcReg = MIRBuilder.buildBitcast(DstCastTy, SrcReg).getReg(0); 3052 } else 3053 getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcEltTy); 3054 3055 MIRBuilder.buildMergeLikeInstr(Dst, SrcRegs); 3056 MI.eraseFromParent(); 3057 return Legalized; 3058 } 3059 3060 if (DstTy.isVector()) { 3061 SmallVector<Register, 8> SrcRegs; 3062 getUnmergePieces(SrcRegs, MIRBuilder, Src, DstTy.getElementType()); 3063 MIRBuilder.buildMergeLikeInstr(Dst, SrcRegs); 3064 MI.eraseFromParent(); 3065 return Legalized; 3066 } 3067 3068 return UnableToLegalize; 3069 } 3070 3071 /// Figure out the bit offset into a register when coercing a vector index for 3072 /// the wide element type. This is only for the case when promoting vector to 3073 /// one with larger elements. 3074 // 3075 /// 3076 /// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize)) 3077 /// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize) 3078 static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B, 3079 Register Idx, 3080 unsigned NewEltSize, 3081 unsigned OldEltSize) { 3082 const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize); 3083 LLT IdxTy = B.getMRI()->getType(Idx); 3084 3085 // Now figure out the amount we need to shift to get the target bits. 3086 auto OffsetMask = B.buildConstant( 3087 IdxTy, ~(APInt::getAllOnes(IdxTy.getSizeInBits()) << Log2EltRatio)); 3088 auto OffsetIdx = B.buildAnd(IdxTy, Idx, OffsetMask); 3089 return B.buildShl(IdxTy, OffsetIdx, 3090 B.buildConstant(IdxTy, Log2_32(OldEltSize))).getReg(0); 3091 } 3092 3093 /// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this 3094 /// is casting to a vector with a smaller element size, perform multiple element 3095 /// extracts and merge the results. If this is coercing to a vector with larger 3096 /// elements, index the bitcasted vector and extract the target element with bit 3097 /// operations. This is intended to force the indexing in the native register 3098 /// size for architectures that can dynamically index the register file. 3099 LegalizerHelper::LegalizeResult 3100 LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx, 3101 LLT CastTy) { 3102 if (TypeIdx != 1) 3103 return UnableToLegalize; 3104 3105 auto [Dst, DstTy, SrcVec, SrcVecTy, Idx, IdxTy] = MI.getFirst3RegLLTs(); 3106 3107 LLT SrcEltTy = SrcVecTy.getElementType(); 3108 unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1; 3109 unsigned OldNumElts = SrcVecTy.getNumElements(); 3110 3111 LLT NewEltTy = CastTy.isVector() ? 
CastTy.getElementType() : CastTy; 3112 Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0); 3113 3114 const unsigned NewEltSize = NewEltTy.getSizeInBits(); 3115 const unsigned OldEltSize = SrcEltTy.getSizeInBits(); 3116 if (NewNumElts > OldNumElts) { 3117 // Decreasing the vector element size 3118 // 3119 // e.g. i64 = extract_vector_elt x:v2i64, y:i32 3120 // => 3121 // v4i32:castx = bitcast x:v2i64 3122 // 3123 // i64 = bitcast 3124 // (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))), 3125 // (i32 (extract_vector_elt castx, (2 * y + 1))) 3126 // 3127 if (NewNumElts % OldNumElts != 0) 3128 return UnableToLegalize; 3129 3130 // Type of the intermediate result vector. 3131 const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts; 3132 LLT MidTy = 3133 LLT::scalarOrVector(ElementCount::getFixed(NewEltsPerOldElt), NewEltTy); 3134 3135 auto NewEltsPerOldEltK = MIRBuilder.buildConstant(IdxTy, NewEltsPerOldElt); 3136 3137 SmallVector<Register, 8> NewOps(NewEltsPerOldElt); 3138 auto NewBaseIdx = MIRBuilder.buildMul(IdxTy, Idx, NewEltsPerOldEltK); 3139 3140 for (unsigned I = 0; I < NewEltsPerOldElt; ++I) { 3141 auto IdxOffset = MIRBuilder.buildConstant(IdxTy, I); 3142 auto TmpIdx = MIRBuilder.buildAdd(IdxTy, NewBaseIdx, IdxOffset); 3143 auto Elt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, TmpIdx); 3144 NewOps[I] = Elt.getReg(0); 3145 } 3146 3147 auto NewVec = MIRBuilder.buildBuildVector(MidTy, NewOps); 3148 MIRBuilder.buildBitcast(Dst, NewVec); 3149 MI.eraseFromParent(); 3150 return Legalized; 3151 } 3152 3153 if (NewNumElts < OldNumElts) { 3154 if (NewEltSize % OldEltSize != 0) 3155 return UnableToLegalize; 3156 3157 // This only depends on powers of 2 because we use bit tricks to figure out 3158 // the bit offset we need to shift to get the target element. A general 3159 // expansion could emit division/multiply. 3160 if (!isPowerOf2_32(NewEltSize / OldEltSize)) 3161 return UnableToLegalize; 3162 3163 // Increasing the vector element size. 3164 // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx 3165 // 3166 // => 3167 // 3168 // %cast = G_BITCAST %vec 3169 // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize) 3170 // %wide_elt = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx 3171 // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize)) 3172 // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize) 3173 // %elt_bits = G_LSHR %wide_elt, %offset_bits 3174 // %elt = G_TRUNC %elt_bits 3175 3176 const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize); 3177 auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio); 3178 3179 // Divide to get the index in the wider element type. 3180 auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio); 3181 3182 Register WideElt = CastVec; 3183 if (CastTy.isVector()) { 3184 WideElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, 3185 ScaledIdx).getReg(0); 3186 } 3187 3188 // Compute the bit offset into the register of the target element. 3189 Register OffsetBits = getBitcastWiderVectorElementOffset( 3190 MIRBuilder, Idx, NewEltSize, OldEltSize); 3191 3192 // Shift the wide element to get the target element. 
3193 auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits); 3194 MIRBuilder.buildTrunc(Dst, ExtractedBits); 3195 MI.eraseFromParent(); 3196 return Legalized; 3197 } 3198 3199 return UnableToLegalize; 3200 } 3201 3202 /// Emit code to insert \p InsertReg into \p TargetRet at \p OffsetBits in \p 3203 /// TargetReg, while preserving other bits in \p TargetReg. 3204 /// 3205 /// (InsertReg << Offset) | (TargetReg & ~(-1 >> InsertReg.size()) << Offset) 3206 static Register buildBitFieldInsert(MachineIRBuilder &B, 3207 Register TargetReg, Register InsertReg, 3208 Register OffsetBits) { 3209 LLT TargetTy = B.getMRI()->getType(TargetReg); 3210 LLT InsertTy = B.getMRI()->getType(InsertReg); 3211 auto ZextVal = B.buildZExt(TargetTy, InsertReg); 3212 auto ShiftedInsertVal = B.buildShl(TargetTy, ZextVal, OffsetBits); 3213 3214 // Produce a bitmask of the value to insert 3215 auto EltMask = B.buildConstant( 3216 TargetTy, APInt::getLowBitsSet(TargetTy.getSizeInBits(), 3217 InsertTy.getSizeInBits())); 3218 // Shift it into position 3219 auto ShiftedMask = B.buildShl(TargetTy, EltMask, OffsetBits); 3220 auto InvShiftedMask = B.buildNot(TargetTy, ShiftedMask); 3221 3222 // Clear out the bits in the wide element 3223 auto MaskedOldElt = B.buildAnd(TargetTy, TargetReg, InvShiftedMask); 3224 3225 // The value to insert has all zeros already, so stick it into the masked 3226 // wide element. 3227 return B.buildOr(TargetTy, MaskedOldElt, ShiftedInsertVal).getReg(0); 3228 } 3229 3230 /// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this 3231 /// is increasing the element size, perform the indexing in the target element 3232 /// type, and use bit operations to insert at the element position. This is 3233 /// intended for architectures that can dynamically index the register file and 3234 /// want to force indexing in the native register size. 3235 LegalizerHelper::LegalizeResult 3236 LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx, 3237 LLT CastTy) { 3238 if (TypeIdx != 0) 3239 return UnableToLegalize; 3240 3241 auto [Dst, DstTy, SrcVec, SrcVecTy, Val, ValTy, Idx, IdxTy] = 3242 MI.getFirst4RegLLTs(); 3243 LLT VecTy = DstTy; 3244 3245 LLT VecEltTy = VecTy.getElementType(); 3246 LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy; 3247 const unsigned NewEltSize = NewEltTy.getSizeInBits(); 3248 const unsigned OldEltSize = VecEltTy.getSizeInBits(); 3249 3250 unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1; 3251 unsigned OldNumElts = VecTy.getNumElements(); 3252 3253 Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0); 3254 if (NewNumElts < OldNumElts) { 3255 if (NewEltSize % OldEltSize != 0) 3256 return UnableToLegalize; 3257 3258 // This only depends on powers of 2 because we use bit tricks to figure out 3259 // the bit offset we need to shift to get the target element. A general 3260 // expansion could emit division/multiply. 3261 if (!isPowerOf2_32(NewEltSize / OldEltSize)) 3262 return UnableToLegalize; 3263 3264 const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize); 3265 auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio); 3266 3267 // Divide to get the index in the wider element type. 
3268 auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio); 3269 3270 Register ExtractedElt = CastVec; 3271 if (CastTy.isVector()) { 3272 ExtractedElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, 3273 ScaledIdx).getReg(0); 3274 } 3275 3276 // Compute the bit offset into the register of the target element. 3277 Register OffsetBits = getBitcastWiderVectorElementOffset( 3278 MIRBuilder, Idx, NewEltSize, OldEltSize); 3279 3280 Register InsertedElt = buildBitFieldInsert(MIRBuilder, ExtractedElt, 3281 Val, OffsetBits); 3282 if (CastTy.isVector()) { 3283 InsertedElt = MIRBuilder.buildInsertVectorElement( 3284 CastTy, CastVec, InsertedElt, ScaledIdx).getReg(0); 3285 } 3286 3287 MIRBuilder.buildBitcast(Dst, InsertedElt); 3288 MI.eraseFromParent(); 3289 return Legalized; 3290 } 3291 3292 return UnableToLegalize; 3293 } 3294 3295 LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) { 3296 // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT 3297 Register DstReg = LoadMI.getDstReg(); 3298 Register PtrReg = LoadMI.getPointerReg(); 3299 LLT DstTy = MRI.getType(DstReg); 3300 MachineMemOperand &MMO = LoadMI.getMMO(); 3301 LLT MemTy = MMO.getMemoryType(); 3302 MachineFunction &MF = MIRBuilder.getMF(); 3303 3304 unsigned MemSizeInBits = MemTy.getSizeInBits(); 3305 unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes(); 3306 3307 if (MemSizeInBits != MemStoreSizeInBits) { 3308 if (MemTy.isVector()) 3309 return UnableToLegalize; 3310 3311 // Promote to a byte-sized load if not loading an integral number of 3312 // bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24. 3313 LLT WideMemTy = LLT::scalar(MemStoreSizeInBits); 3314 MachineMemOperand *NewMMO = 3315 MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideMemTy); 3316 3317 Register LoadReg = DstReg; 3318 LLT LoadTy = DstTy; 3319 3320 // If this wasn't already an extending load, we need to widen the result 3321 // register to avoid creating a load with a narrower result than the source. 3322 if (MemStoreSizeInBits > DstTy.getSizeInBits()) { 3323 LoadTy = WideMemTy; 3324 LoadReg = MRI.createGenericVirtualRegister(WideMemTy); 3325 } 3326 3327 if (isa<GSExtLoad>(LoadMI)) { 3328 auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO); 3329 MIRBuilder.buildSExtInReg(LoadReg, NewLoad, MemSizeInBits); 3330 } else if (isa<GZExtLoad>(LoadMI) || WideMemTy == LoadTy) { 3331 auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO); 3332 // The extra bits are guaranteed to be zero, since we stored them that 3333 // way. A zext load from Wide thus automatically gives zext from MemVT. 3334 MIRBuilder.buildAssertZExt(LoadReg, NewLoad, MemSizeInBits); 3335 } else { 3336 MIRBuilder.buildLoad(LoadReg, PtrReg, *NewMMO); 3337 } 3338 3339 if (DstTy != LoadTy) 3340 MIRBuilder.buildTrunc(DstReg, LoadReg); 3341 3342 LoadMI.eraseFromParent(); 3343 return Legalized; 3344 } 3345 3346 // Big endian lowering not implemented. 3347 if (MIRBuilder.getDataLayout().isBigEndian()) 3348 return UnableToLegalize; 3349 3350 // This load needs splitting into power of 2 sized loads. 3351 // 3352 // Our strategy here is to generate anyextending loads for the smaller 3353 // types up to next power-2 result type, and then combine the two larger 3354 // result values together, before truncating back down to the non-pow-2 3355 // type. 3356 // E.g. 
  //      v1 = i24 load =>
  //      v2 = i32 zextload (2 byte)
  //      v3 = i32 load (1 byte)
  //      v4 = i32 shl v3, 16
  //      v5 = i32 or v4, v2
  //      v1 = i24 trunc v5
  // By doing this we generate the correct truncate which should get
  // combined away as an artifact with a matching extend.

  uint64_t LargeSplitSize, SmallSplitSize;

  if (!isPowerOf2_32(MemSizeInBits)) {
    // This load needs splitting into power of 2 sized loads.
    LargeSplitSize = llvm::bit_floor(MemSizeInBits);
    SmallSplitSize = MemSizeInBits - LargeSplitSize;
  } else {
    // This is already a power of 2, but we still need to split this in half.
    //
    // Assume we're being asked to decompose an unaligned load.
    // TODO: If this requires multiple splits, handle them all at once.
    auto &Ctx = MF.getFunction().getContext();
    if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
      return UnableToLegalize;

    SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
  }

  if (MemTy.isVector()) {
    // TODO: Handle vector extloads
    if (MemTy != DstTy)
      return UnableToLegalize;

    // TODO: We can do better than scalarizing the vector and at least split it
    // in half.
    return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType());
  }

  MachineMemOperand *LargeMMO =
      MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
  MachineMemOperand *SmallMMO =
      MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);

  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AnyExtSize = PowerOf2Ceil(DstTy.getSizeInBits());
  LLT AnyExtTy = LLT::scalar(AnyExtSize);
  auto LargeLoad = MIRBuilder.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, AnyExtTy,
                                             PtrReg, *LargeMMO);

  auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()),
                                            LargeSplitSize / 8);
  Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
  auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst);
  auto SmallLoad = MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), AnyExtTy,
                                             SmallPtr, *SmallMMO);

  auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize);
  auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt);

  if (AnyExtTy == DstTy)
    MIRBuilder.buildOr(DstReg, Shift, LargeLoad);
  else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) {
    auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
    MIRBuilder.buildTrunc(DstReg, {Or});
  } else {
    assert(DstTy.isPointer() && "expected pointer");
    auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);

    // FIXME: We currently consider this to be illegal for non-integral address
    // spaces, but we still need a way to reinterpret the bits.
    MIRBuilder.buildIntToPtr(DstReg, Or);
  }

  LoadMI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
  // Lower a non-power of 2 store into multiple pow-2 stores.
  // E.g. split an i24 store into an i16 store + i8 store.
  // We do this by first extending the stored value to the next largest power
  // of 2 type, and then using truncating stores to store the components.
  // By doing this, as with G_LOAD, we generate an extend that can be
  // artifact-combined away instead of leaving behind extracts.
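  // For example, on a little-endian target an s24 store is lowered roughly to
  // (illustrative register names, not actual output):
  //   %ext:_(s32) = G_ANYEXT %val:_(s24)
  //   %hi:_(s32)  = G_LSHR %ext, 16
  //   G_STORE %ext, %ptr           :: (store (s16))  ; low 16 bits at +0
  //   %ptr2:_(p0) = G_PTR_ADD %ptr, 2
  //   G_STORE %hi, %ptr2           :: (store (s8))   ; remaining 8 bits at +2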
3439 Register SrcReg = StoreMI.getValueReg(); 3440 Register PtrReg = StoreMI.getPointerReg(); 3441 LLT SrcTy = MRI.getType(SrcReg); 3442 MachineFunction &MF = MIRBuilder.getMF(); 3443 MachineMemOperand &MMO = **StoreMI.memoperands_begin(); 3444 LLT MemTy = MMO.getMemoryType(); 3445 3446 unsigned StoreWidth = MemTy.getSizeInBits(); 3447 unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes(); 3448 3449 if (StoreWidth != StoreSizeInBits) { 3450 if (SrcTy.isVector()) 3451 return UnableToLegalize; 3452 3453 // Promote to a byte-sized store with upper bits zero if not 3454 // storing an integral number of bytes. For example, promote 3455 // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1) 3456 LLT WideTy = LLT::scalar(StoreSizeInBits); 3457 3458 if (StoreSizeInBits > SrcTy.getSizeInBits()) { 3459 // Avoid creating a store with a narrower source than result. 3460 SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0); 3461 SrcTy = WideTy; 3462 } 3463 3464 auto ZextInReg = MIRBuilder.buildZExtInReg(SrcTy, SrcReg, StoreWidth); 3465 3466 MachineMemOperand *NewMMO = 3467 MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideTy); 3468 MIRBuilder.buildStore(ZextInReg, PtrReg, *NewMMO); 3469 StoreMI.eraseFromParent(); 3470 return Legalized; 3471 } 3472 3473 if (MemTy.isVector()) { 3474 // TODO: Handle vector trunc stores 3475 if (MemTy != SrcTy) 3476 return UnableToLegalize; 3477 3478 // TODO: We can do better than scalarizing the vector and at least split it 3479 // in half. 3480 return reduceLoadStoreWidth(StoreMI, 0, SrcTy.getElementType()); 3481 } 3482 3483 unsigned MemSizeInBits = MemTy.getSizeInBits(); 3484 uint64_t LargeSplitSize, SmallSplitSize; 3485 3486 if (!isPowerOf2_32(MemSizeInBits)) { 3487 LargeSplitSize = llvm::bit_floor<uint64_t>(MemTy.getSizeInBits()); 3488 SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize; 3489 } else { 3490 auto &Ctx = MF.getFunction().getContext(); 3491 if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO)) 3492 return UnableToLegalize; // Don't know what we're being asked to do. 3493 3494 SmallSplitSize = LargeSplitSize = MemSizeInBits / 2; 3495 } 3496 3497 // Extend to the next pow-2. If this store was itself the result of lowering, 3498 // e.g. an s56 store being broken into s32 + s24, we might have a stored type 3499 // that's wider than the stored size. 3500 unsigned AnyExtSize = PowerOf2Ceil(MemTy.getSizeInBits()); 3501 const LLT NewSrcTy = LLT::scalar(AnyExtSize); 3502 3503 if (SrcTy.isPointer()) { 3504 const LLT IntPtrTy = LLT::scalar(SrcTy.getSizeInBits()); 3505 SrcReg = MIRBuilder.buildPtrToInt(IntPtrTy, SrcReg).getReg(0); 3506 } 3507 3508 auto ExtVal = MIRBuilder.buildAnyExtOrTrunc(NewSrcTy, SrcReg); 3509 3510 // Obtain the smaller value by shifting away the larger value. 3511 auto ShiftAmt = MIRBuilder.buildConstant(NewSrcTy, LargeSplitSize); 3512 auto SmallVal = MIRBuilder.buildLShr(NewSrcTy, ExtVal, ShiftAmt); 3513 3514 // Generate the PtrAdd and truncating stores. 
3515 LLT PtrTy = MRI.getType(PtrReg); 3516 auto OffsetCst = MIRBuilder.buildConstant( 3517 LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8); 3518 auto SmallPtr = 3519 MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst); 3520 3521 MachineMemOperand *LargeMMO = 3522 MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8); 3523 MachineMemOperand *SmallMMO = 3524 MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8); 3525 MIRBuilder.buildStore(ExtVal, PtrReg, *LargeMMO); 3526 MIRBuilder.buildStore(SmallVal, SmallPtr, *SmallMMO); 3527 StoreMI.eraseFromParent(); 3528 return Legalized; 3529 } 3530 3531 LegalizerHelper::LegalizeResult 3532 LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) { 3533 switch (MI.getOpcode()) { 3534 case TargetOpcode::G_LOAD: { 3535 if (TypeIdx != 0) 3536 return UnableToLegalize; 3537 MachineMemOperand &MMO = **MI.memoperands_begin(); 3538 3539 // Not sure how to interpret a bitcast of an extending load. 3540 if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits()) 3541 return UnableToLegalize; 3542 3543 Observer.changingInstr(MI); 3544 bitcastDst(MI, CastTy, 0); 3545 MMO.setType(CastTy); 3546 Observer.changedInstr(MI); 3547 return Legalized; 3548 } 3549 case TargetOpcode::G_STORE: { 3550 if (TypeIdx != 0) 3551 return UnableToLegalize; 3552 3553 MachineMemOperand &MMO = **MI.memoperands_begin(); 3554 3555 // Not sure how to interpret a bitcast of a truncating store. 3556 if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits()) 3557 return UnableToLegalize; 3558 3559 Observer.changingInstr(MI); 3560 bitcastSrc(MI, CastTy, 0); 3561 MMO.setType(CastTy); 3562 Observer.changedInstr(MI); 3563 return Legalized; 3564 } 3565 case TargetOpcode::G_SELECT: { 3566 if (TypeIdx != 0) 3567 return UnableToLegalize; 3568 3569 if (MRI.getType(MI.getOperand(1).getReg()).isVector()) { 3570 LLVM_DEBUG( 3571 dbgs() << "bitcast action not implemented for vector select\n"); 3572 return UnableToLegalize; 3573 } 3574 3575 Observer.changingInstr(MI); 3576 bitcastSrc(MI, CastTy, 2); 3577 bitcastSrc(MI, CastTy, 3); 3578 bitcastDst(MI, CastTy, 0); 3579 Observer.changedInstr(MI); 3580 return Legalized; 3581 } 3582 case TargetOpcode::G_AND: 3583 case TargetOpcode::G_OR: 3584 case TargetOpcode::G_XOR: { 3585 Observer.changingInstr(MI); 3586 bitcastSrc(MI, CastTy, 1); 3587 bitcastSrc(MI, CastTy, 2); 3588 bitcastDst(MI, CastTy, 0); 3589 Observer.changedInstr(MI); 3590 return Legalized; 3591 } 3592 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 3593 return bitcastExtractVectorElt(MI, TypeIdx, CastTy); 3594 case TargetOpcode::G_INSERT_VECTOR_ELT: 3595 return bitcastInsertVectorElt(MI, TypeIdx, CastTy); 3596 default: 3597 return UnableToLegalize; 3598 } 3599 } 3600 3601 // Legalize an instruction by changing the opcode in place. 
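// The operands are reused as-is, so this is only suitable when the new opcode
// expects the same operand structure, e.g. the G_FRINT ->
// G_INTRINSIC_ROUNDEVEN rewrite in lower() below.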
3602 void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) { 3603 Observer.changingInstr(MI); 3604 MI.setDesc(MIRBuilder.getTII().get(NewOpcode)); 3605 Observer.changedInstr(MI); 3606 } 3607 3608 LegalizerHelper::LegalizeResult 3609 LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { 3610 using namespace TargetOpcode; 3611 3612 switch(MI.getOpcode()) { 3613 default: 3614 return UnableToLegalize; 3615 case TargetOpcode::G_FCONSTANT: 3616 return lowerFConstant(MI); 3617 case TargetOpcode::G_BITCAST: 3618 return lowerBitcast(MI); 3619 case TargetOpcode::G_SREM: 3620 case TargetOpcode::G_UREM: { 3621 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3622 auto Quot = 3623 MIRBuilder.buildInstr(MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, {Ty}, 3624 {MI.getOperand(1), MI.getOperand(2)}); 3625 3626 auto Prod = MIRBuilder.buildMul(Ty, Quot, MI.getOperand(2)); 3627 MIRBuilder.buildSub(MI.getOperand(0), MI.getOperand(1), Prod); 3628 MI.eraseFromParent(); 3629 return Legalized; 3630 } 3631 case TargetOpcode::G_SADDO: 3632 case TargetOpcode::G_SSUBO: 3633 return lowerSADDO_SSUBO(MI); 3634 case TargetOpcode::G_UMULH: 3635 case TargetOpcode::G_SMULH: 3636 return lowerSMULH_UMULH(MI); 3637 case TargetOpcode::G_SMULO: 3638 case TargetOpcode::G_UMULO: { 3639 // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the 3640 // result. 3641 auto [Res, Overflow, LHS, RHS] = MI.getFirst4Regs(); 3642 LLT Ty = MRI.getType(Res); 3643 3644 unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO 3645 ? TargetOpcode::G_SMULH 3646 : TargetOpcode::G_UMULH; 3647 3648 Observer.changingInstr(MI); 3649 const auto &TII = MIRBuilder.getTII(); 3650 MI.setDesc(TII.get(TargetOpcode::G_MUL)); 3651 MI.removeOperand(1); 3652 Observer.changedInstr(MI); 3653 3654 auto HiPart = MIRBuilder.buildInstr(Opcode, {Ty}, {LHS, RHS}); 3655 auto Zero = MIRBuilder.buildConstant(Ty, 0); 3656 3657 // Move insert point forward so we can use the Res register if needed. 3658 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt()); 3659 3660 // For *signed* multiply, overflow is detected by checking: 3661 // (hi != (lo >> bitwidth-1)) 3662 if (Opcode == TargetOpcode::G_SMULH) { 3663 auto ShiftAmt = MIRBuilder.buildConstant(Ty, Ty.getSizeInBits() - 1); 3664 auto Shifted = MIRBuilder.buildAShr(Ty, Res, ShiftAmt); 3665 MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Shifted); 3666 } else { 3667 MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero); 3668 } 3669 return Legalized; 3670 } 3671 case TargetOpcode::G_FNEG: { 3672 auto [Res, SubByReg] = MI.getFirst2Regs(); 3673 LLT Ty = MRI.getType(Res); 3674 3675 // TODO: Handle vector types once we are able to 3676 // represent them. 3677 if (Ty.isVector()) 3678 return UnableToLegalize; 3679 auto SignMask = 3680 MIRBuilder.buildConstant(Ty, APInt::getSignMask(Ty.getSizeInBits())); 3681 MIRBuilder.buildXor(Res, SubByReg, SignMask); 3682 MI.eraseFromParent(); 3683 return Legalized; 3684 } 3685 case TargetOpcode::G_FSUB: 3686 case TargetOpcode::G_STRICT_FSUB: { 3687 auto [Res, LHS, RHS] = MI.getFirst3Regs(); 3688 LLT Ty = MRI.getType(Res); 3689 3690 // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)). 
3691 auto Neg = MIRBuilder.buildFNeg(Ty, RHS); 3692 3693 if (MI.getOpcode() == TargetOpcode::G_STRICT_FSUB) 3694 MIRBuilder.buildStrictFAdd(Res, LHS, Neg, MI.getFlags()); 3695 else 3696 MIRBuilder.buildFAdd(Res, LHS, Neg, MI.getFlags()); 3697 3698 MI.eraseFromParent(); 3699 return Legalized; 3700 } 3701 case TargetOpcode::G_FMAD: 3702 return lowerFMad(MI); 3703 case TargetOpcode::G_FFLOOR: 3704 return lowerFFloor(MI); 3705 case TargetOpcode::G_INTRINSIC_ROUND: 3706 return lowerIntrinsicRound(MI); 3707 case TargetOpcode::G_FRINT: { 3708 // Since round even is the assumed rounding mode for unconstrained FP 3709 // operations, rint and roundeven are the same operation. 3710 changeOpcode(MI, TargetOpcode::G_INTRINSIC_ROUNDEVEN); 3711 return Legalized; 3712 } 3713 case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: { 3714 auto [OldValRes, SuccessRes, Addr, CmpVal, NewVal] = MI.getFirst5Regs(); 3715 MIRBuilder.buildAtomicCmpXchg(OldValRes, Addr, CmpVal, NewVal, 3716 **MI.memoperands_begin()); 3717 MIRBuilder.buildICmp(CmpInst::ICMP_EQ, SuccessRes, OldValRes, CmpVal); 3718 MI.eraseFromParent(); 3719 return Legalized; 3720 } 3721 case TargetOpcode::G_LOAD: 3722 case TargetOpcode::G_SEXTLOAD: 3723 case TargetOpcode::G_ZEXTLOAD: 3724 return lowerLoad(cast<GAnyLoad>(MI)); 3725 case TargetOpcode::G_STORE: 3726 return lowerStore(cast<GStore>(MI)); 3727 case TargetOpcode::G_CTLZ_ZERO_UNDEF: 3728 case TargetOpcode::G_CTTZ_ZERO_UNDEF: 3729 case TargetOpcode::G_CTLZ: 3730 case TargetOpcode::G_CTTZ: 3731 case TargetOpcode::G_CTPOP: 3732 return lowerBitCount(MI); 3733 case G_UADDO: { 3734 auto [Res, CarryOut, LHS, RHS] = MI.getFirst4Regs(); 3735 3736 MIRBuilder.buildAdd(Res, LHS, RHS); 3737 MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, RHS); 3738 3739 MI.eraseFromParent(); 3740 return Legalized; 3741 } 3742 case G_UADDE: { 3743 auto [Res, CarryOut, LHS, RHS, CarryIn] = MI.getFirst5Regs(); 3744 const LLT CondTy = MRI.getType(CarryOut); 3745 const LLT Ty = MRI.getType(Res); 3746 3747 // Initial add of the two operands. 3748 auto TmpRes = MIRBuilder.buildAdd(Ty, LHS, RHS); 3749 3750 // Initial check for carry. 3751 auto Carry = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, TmpRes, LHS); 3752 3753 // Add the sum and the carry. 3754 auto ZExtCarryIn = MIRBuilder.buildZExt(Ty, CarryIn); 3755 MIRBuilder.buildAdd(Res, TmpRes, ZExtCarryIn); 3756 3757 // Second check for carry. We can only carry if the initial sum is all 1s 3758 // and the carry is set, resulting in a new sum of 0. 3759 auto Zero = MIRBuilder.buildConstant(Ty, 0); 3760 auto ResEqZero = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, Res, Zero); 3761 auto Carry2 = MIRBuilder.buildAnd(CondTy, ResEqZero, CarryIn); 3762 MIRBuilder.buildOr(CarryOut, Carry, Carry2); 3763 3764 MI.eraseFromParent(); 3765 return Legalized; 3766 } 3767 case G_USUBO: { 3768 auto [Res, BorrowOut, LHS, RHS] = MI.getFirst4Regs(); 3769 3770 MIRBuilder.buildSub(Res, LHS, RHS); 3771 MIRBuilder.buildICmp(CmpInst::ICMP_ULT, BorrowOut, LHS, RHS); 3772 3773 MI.eraseFromParent(); 3774 return Legalized; 3775 } 3776 case G_USUBE: { 3777 auto [Res, BorrowOut, LHS, RHS, BorrowIn] = MI.getFirst5Regs(); 3778 const LLT CondTy = MRI.getType(BorrowOut); 3779 const LLT Ty = MRI.getType(Res); 3780 3781 // Initial subtract of the two operands. 3782 auto TmpRes = MIRBuilder.buildSub(Ty, LHS, RHS); 3783 3784 // Initial check for borrow. 3785 auto Borrow = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, CondTy, TmpRes, LHS); 3786 3787 // Subtract the borrow from the first subtract. 
3788 auto ZExtBorrowIn = MIRBuilder.buildZExt(Ty, BorrowIn); 3789 MIRBuilder.buildSub(Res, TmpRes, ZExtBorrowIn); 3790 3791 // Second check for borrow. We can only borrow if the initial difference is 3792 // 0 and the borrow is set, resulting in a new difference of all 1s. 3793 auto Zero = MIRBuilder.buildConstant(Ty, 0); 3794 auto TmpResEqZero = 3795 MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, TmpRes, Zero); 3796 auto Borrow2 = MIRBuilder.buildAnd(CondTy, TmpResEqZero, BorrowIn); 3797 MIRBuilder.buildOr(BorrowOut, Borrow, Borrow2); 3798 3799 MI.eraseFromParent(); 3800 return Legalized; 3801 } 3802 case G_UITOFP: 3803 return lowerUITOFP(MI); 3804 case G_SITOFP: 3805 return lowerSITOFP(MI); 3806 case G_FPTOUI: 3807 return lowerFPTOUI(MI); 3808 case G_FPTOSI: 3809 return lowerFPTOSI(MI); 3810 case G_FPTRUNC: 3811 return lowerFPTRUNC(MI); 3812 case G_FPOWI: 3813 return lowerFPOWI(MI); 3814 case G_SMIN: 3815 case G_SMAX: 3816 case G_UMIN: 3817 case G_UMAX: 3818 return lowerMinMax(MI); 3819 case G_FCOPYSIGN: 3820 return lowerFCopySign(MI); 3821 case G_FMINNUM: 3822 case G_FMAXNUM: 3823 return lowerFMinNumMaxNum(MI); 3824 case G_MERGE_VALUES: 3825 return lowerMergeValues(MI); 3826 case G_UNMERGE_VALUES: 3827 return lowerUnmergeValues(MI); 3828 case TargetOpcode::G_SEXT_INREG: { 3829 assert(MI.getOperand(2).isImm() && "Expected immediate"); 3830 int64_t SizeInBits = MI.getOperand(2).getImm(); 3831 3832 auto [DstReg, SrcReg] = MI.getFirst2Regs(); 3833 LLT DstTy = MRI.getType(DstReg); 3834 Register TmpRes = MRI.createGenericVirtualRegister(DstTy); 3835 3836 auto MIBSz = MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - SizeInBits); 3837 MIRBuilder.buildShl(TmpRes, SrcReg, MIBSz->getOperand(0)); 3838 MIRBuilder.buildAShr(DstReg, TmpRes, MIBSz->getOperand(0)); 3839 MI.eraseFromParent(); 3840 return Legalized; 3841 } 3842 case G_EXTRACT_VECTOR_ELT: 3843 case G_INSERT_VECTOR_ELT: 3844 return lowerExtractInsertVectorElt(MI); 3845 case G_SHUFFLE_VECTOR: 3846 return lowerShuffleVector(MI); 3847 case G_DYN_STACKALLOC: 3848 return lowerDynStackAlloc(MI); 3849 case G_STACKSAVE: 3850 return lowerStackSave(MI); 3851 case G_STACKRESTORE: 3852 return lowerStackRestore(MI); 3853 case G_EXTRACT: 3854 return lowerExtract(MI); 3855 case G_INSERT: 3856 return lowerInsert(MI); 3857 case G_BSWAP: 3858 return lowerBswap(MI); 3859 case G_BITREVERSE: 3860 return lowerBitreverse(MI); 3861 case G_READ_REGISTER: 3862 case G_WRITE_REGISTER: 3863 return lowerReadWriteRegister(MI); 3864 case G_UADDSAT: 3865 case G_USUBSAT: { 3866 // Try to make a reasonable guess about which lowering strategy to use. The 3867 // target can override this with custom lowering and calling the 3868 // implementation functions. 3869 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3870 if (LI.isLegalOrCustom({G_UMIN, Ty})) 3871 return lowerAddSubSatToMinMax(MI); 3872 return lowerAddSubSatToAddoSubo(MI); 3873 } 3874 case G_SADDSAT: 3875 case G_SSUBSAT: { 3876 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3877 3878 // FIXME: It would probably make more sense to see if G_SADDO is preferred, 3879 // since it's a shorter expansion. However, we would need to figure out the 3880 // preferred boolean type for the carry out for the query. 
3881 if (LI.isLegalOrCustom({G_SMIN, Ty}) && LI.isLegalOrCustom({G_SMAX, Ty})) 3882 return lowerAddSubSatToMinMax(MI); 3883 return lowerAddSubSatToAddoSubo(MI); 3884 } 3885 case G_SSHLSAT: 3886 case G_USHLSAT: 3887 return lowerShlSat(MI); 3888 case G_ABS: 3889 return lowerAbsToAddXor(MI); 3890 case G_SELECT: 3891 return lowerSelect(MI); 3892 case G_IS_FPCLASS: 3893 return lowerISFPCLASS(MI); 3894 case G_SDIVREM: 3895 case G_UDIVREM: 3896 return lowerDIVREM(MI); 3897 case G_FSHL: 3898 case G_FSHR: 3899 return lowerFunnelShift(MI); 3900 case G_ROTL: 3901 case G_ROTR: 3902 return lowerRotate(MI); 3903 case G_MEMSET: 3904 case G_MEMCPY: 3905 case G_MEMMOVE: 3906 return lowerMemCpyFamily(MI); 3907 case G_MEMCPY_INLINE: 3908 return lowerMemcpyInline(MI); 3909 case G_ZEXT: 3910 case G_SEXT: 3911 case G_ANYEXT: 3912 return lowerEXT(MI); 3913 case G_TRUNC: 3914 return lowerTRUNC(MI); 3915 GISEL_VECREDUCE_CASES_NONSEQ 3916 return lowerVectorReduction(MI); 3917 case G_VAARG: 3918 return lowerVAArg(MI); 3919 } 3920 } 3921 3922 Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty, 3923 Align MinAlign) const { 3924 // FIXME: We're missing a way to go back from LLT to llvm::Type to query the 3925 // datalayout for the preferred alignment. Also there should be a target hook 3926 // for this to allow targets to reduce the alignment and ignore the 3927 // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of 3928 // the type. 3929 return std::max(Align(PowerOf2Ceil(Ty.getSizeInBytes())), MinAlign); 3930 } 3931 3932 MachineInstrBuilder 3933 LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment, 3934 MachinePointerInfo &PtrInfo) { 3935 MachineFunction &MF = MIRBuilder.getMF(); 3936 const DataLayout &DL = MIRBuilder.getDataLayout(); 3937 int FrameIdx = MF.getFrameInfo().CreateStackObject(Bytes, Alignment, false); 3938 3939 unsigned AddrSpace = DL.getAllocaAddrSpace(); 3940 LLT FramePtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace)); 3941 3942 PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIdx); 3943 return MIRBuilder.buildFrameIndex(FramePtrTy, FrameIdx); 3944 } 3945 3946 static Register clampDynamicVectorIndex(MachineIRBuilder &B, Register IdxReg, 3947 LLT VecTy) { 3948 int64_t IdxVal; 3949 if (mi_match(IdxReg, *B.getMRI(), m_ICst(IdxVal))) 3950 return IdxReg; 3951 3952 LLT IdxTy = B.getMRI()->getType(IdxReg); 3953 unsigned NElts = VecTy.getNumElements(); 3954 if (isPowerOf2_32(NElts)) { 3955 APInt Imm = APInt::getLowBitsSet(IdxTy.getSizeInBits(), Log2_32(NElts)); 3956 return B.buildAnd(IdxTy, IdxReg, B.buildConstant(IdxTy, Imm)).getReg(0); 3957 } 3958 3959 return B.buildUMin(IdxTy, IdxReg, B.buildConstant(IdxTy, NElts - 1)) 3960 .getReg(0); 3961 } 3962 3963 Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy, 3964 Register Index) { 3965 LLT EltTy = VecTy.getElementType(); 3966 3967 // Calculate the element offset and add it to the pointer. 3968 unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size. 
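  // The address computed below is VecPtr + clamped index * EltSize; e.g.
  // element 2 of a <4 x s32> stack temporary is at byte offset 8. The index is
  // clamped first so a variable index cannot address past the end of the
  // object.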
  assert(EltSize * 8 == EltTy.getSizeInBits() &&
         "Converting bits to bytes lost precision");

  Index = clampDynamicVectorIndex(MIRBuilder, Index, VecTy);

  LLT IdxTy = MRI.getType(Index);
  auto Mul = MIRBuilder.buildMul(IdxTy, Index,
                                 MIRBuilder.buildConstant(IdxTy, EltSize));

  LLT PtrTy = MRI.getType(VecPtr);
  return MIRBuilder.buildPtrAdd(PtrTy, VecPtr, Mul).getReg(0);
}

#ifndef NDEBUG
/// Check that all vector operands have the same number of elements. Other
/// operands should be listed in \p NonVecOpIndices.
static bool hasSameNumEltsOnAllVectorOperands(
    GenericMachineInstr &MI, MachineRegisterInfo &MRI,
    std::initializer_list<unsigned> NonVecOpIndices) {
  if (MI.getNumMemOperands() != 0)
    return false;

  LLT VecTy = MRI.getType(MI.getReg(0));
  if (!VecTy.isVector())
    return false;
  unsigned NumElts = VecTy.getNumElements();

  for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
    MachineOperand &Op = MI.getOperand(OpIdx);
    if (!Op.isReg()) {
      if (!is_contained(NonVecOpIndices, OpIdx))
        return false;
      continue;
    }

    LLT Ty = MRI.getType(Op.getReg());
    if (!Ty.isVector()) {
      if (!is_contained(NonVecOpIndices, OpIdx))
        return false;
      continue;
    }

    if (Ty.getNumElements() != NumElts)
      return false;
  }

  return true;
}
#endif

/// Fill \p DstOps with DstOps that together cover the same number of elements
/// as \p Ty. These DstOps are either scalars (when \p NumElts = 1) or vectors
/// with \p NumElts elements. When Ty.getNumElements() is not a multiple of
/// \p NumElts, the last DstOp (the leftover) has fewer than \p NumElts
/// elements.
static void makeDstOps(SmallVectorImpl<DstOp> &DstOps, LLT Ty,
                       unsigned NumElts) {
  LLT LeftoverTy;
  assert(Ty.isVector() && "Expected vector type");
  LLT EltTy = Ty.getElementType();
  LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy);
  int NumParts, NumLeftover;
  std::tie(NumParts, NumLeftover) =
      getNarrowTypeBreakDown(Ty, NarrowTy, LeftoverTy);

  assert(NumParts > 0 && "Error in getNarrowTypeBreakDown");
  for (int i = 0; i < NumParts; ++i) {
    DstOps.push_back(NarrowTy);
  }

  if (LeftoverTy.isValid()) {
    assert(NumLeftover == 1 && "expected exactly one leftover");
    DstOps.push_back(LeftoverTy);
  }
}

/// Operand \p Op is used on \p N sub-instructions. Fill \p Ops with \p N
/// SrcOps made from \p Op depending on operand type.
static void broadcastSrcOp(SmallVectorImpl<SrcOp> &Ops, unsigned N,
                           MachineOperand &Op) {
  for (unsigned i = 0; i < N; ++i) {
    if (Op.isReg())
      Ops.push_back(Op.getReg());
    else if (Op.isImm())
      Ops.push_back(Op.getImm());
    else if (Op.isPredicate())
      Ops.push_back(static_cast<CmpInst::Predicate>(Op.getPredicate()));
    else
      llvm_unreachable("Unsupported type");
  }
}

// Handle splitting vector operations which need to have the same number of
// elements in each type index, but each type index may have a different
// element type.
//
// e.g. <4 x s64> = G_SHL <4 x s64>, <4 x s32> ->
//      <2 x s64> = G_SHL <2 x s64>, <2 x s32>
//      <2 x s64> = G_SHL <2 x s64>, <2 x s32>
//
// Also handles some irregular breakdown cases,
// e.g.
<3 x s64> = G_SHL <3 x s64>, <3 x s32> -> 4070 // <2 x s64> = G_SHL <2 x s64>, <2 x s32> 4071 // s64 = G_SHL s64, s32 4072 LegalizerHelper::LegalizeResult 4073 LegalizerHelper::fewerElementsVectorMultiEltType( 4074 GenericMachineInstr &MI, unsigned NumElts, 4075 std::initializer_list<unsigned> NonVecOpIndices) { 4076 assert(hasSameNumEltsOnAllVectorOperands(MI, MRI, NonVecOpIndices) && 4077 "Non-compatible opcode or not specified non-vector operands"); 4078 unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements(); 4079 4080 unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs(); 4081 unsigned NumDefs = MI.getNumDefs(); 4082 4083 // Create DstOps (sub-vectors with NumElts elts + Leftover) for each output. 4084 // Build instructions with DstOps to use instruction found by CSE directly. 4085 // CSE copies found instruction into given vreg when building with vreg dest. 4086 SmallVector<SmallVector<DstOp, 8>, 2> OutputOpsPieces(NumDefs); 4087 // Output registers will be taken from created instructions. 4088 SmallVector<SmallVector<Register, 8>, 2> OutputRegs(NumDefs); 4089 for (unsigned i = 0; i < NumDefs; ++i) { 4090 makeDstOps(OutputOpsPieces[i], MRI.getType(MI.getReg(i)), NumElts); 4091 } 4092 4093 // Split vector input operands into sub-vectors with NumElts elts + Leftover. 4094 // Operands listed in NonVecOpIndices will be used as is without splitting; 4095 // examples: compare predicate in icmp and fcmp (op 1), vector select with i1 4096 // scalar condition (op 1), immediate in sext_inreg (op 2). 4097 SmallVector<SmallVector<SrcOp, 8>, 3> InputOpsPieces(NumInputs); 4098 for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands(); 4099 ++UseIdx, ++UseNo) { 4100 if (is_contained(NonVecOpIndices, UseIdx)) { 4101 broadcastSrcOp(InputOpsPieces[UseNo], OutputOpsPieces[0].size(), 4102 MI.getOperand(UseIdx)); 4103 } else { 4104 SmallVector<Register, 8> SplitPieces; 4105 extractVectorParts(MI.getReg(UseIdx), NumElts, SplitPieces, MIRBuilder, 4106 MRI); 4107 for (auto Reg : SplitPieces) 4108 InputOpsPieces[UseNo].push_back(Reg); 4109 } 4110 } 4111 4112 unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0; 4113 4114 // Take i-th piece of each input operand split and build sub-vector/scalar 4115 // instruction. Set i-th DstOp(s) from OutputOpsPieces as destination(s). 4116 for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) { 4117 SmallVector<DstOp, 2> Defs; 4118 for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo) 4119 Defs.push_back(OutputOpsPieces[DstNo][i]); 4120 4121 SmallVector<SrcOp, 3> Uses; 4122 for (unsigned InputNo = 0; InputNo < NumInputs; ++InputNo) 4123 Uses.push_back(InputOpsPieces[InputNo][i]); 4124 4125 auto I = MIRBuilder.buildInstr(MI.getOpcode(), Defs, Uses, MI.getFlags()); 4126 for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo) 4127 OutputRegs[DstNo].push_back(I.getReg(DstNo)); 4128 } 4129 4130 // Merge small outputs into MI's output for each def operand. 
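  // e.g. for <3 x s64> defs split with NumElts = 2, the pieces are a <2 x s64>
  // plus an s64 leftover; mergeMixedSubvectors rebuilds the original def from
  // that mixed sequence, while an even breakdown only needs a plain merge.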
  if (NumLeftovers) {
    for (unsigned i = 0; i < NumDefs; ++i)
      mergeMixedSubvectors(MI.getReg(i), OutputRegs[i]);
  } else {
    for (unsigned i = 0; i < NumDefs; ++i)
      MIRBuilder.buildMergeLikeInstr(MI.getReg(i), OutputRegs[i]);
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorPhi(GenericMachineInstr &MI,
                                        unsigned NumElts) {
  unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements();

  unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
  unsigned NumDefs = MI.getNumDefs();

  SmallVector<DstOp, 8> OutputOpsPieces;
  SmallVector<Register, 8> OutputRegs;
  makeDstOps(OutputOpsPieces, MRI.getType(MI.getReg(0)), NumElts);

  // Instructions that perform the register split will be inserted in the basic
  // block where the register is defined (the basic block is in the next
  // operand).
  SmallVector<SmallVector<Register, 8>, 3> InputOpsPieces(NumInputs / 2);
  for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
       UseIdx += 2, ++UseNo) {
    MachineBasicBlock &OpMBB = *MI.getOperand(UseIdx + 1).getMBB();
    MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward());
    extractVectorParts(MI.getReg(UseIdx), NumElts, InputOpsPieces[UseNo],
                       MIRBuilder, MRI);
  }

  // Build PHIs with fewer elements.
  unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
  MIRBuilder.setInsertPt(*MI.getParent(), MI);
  for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
    auto Phi = MIRBuilder.buildInstr(TargetOpcode::G_PHI);
    Phi.addDef(
        MRI.createGenericVirtualRegister(OutputOpsPieces[i].getLLTTy(MRI)));
    OutputRegs.push_back(Phi.getReg(0));

    for (unsigned j = 0; j < NumInputs / 2; ++j) {
      Phi.addUse(InputOpsPieces[j][i]);
      Phi.add(MI.getOperand(1 + j * 2 + 1));
    }
  }

  // Merge small outputs into MI's def.
  if (NumLeftovers) {
    mergeMixedSubvectors(MI.getReg(0), OutputRegs);
  } else {
    MIRBuilder.buildMergeLikeInstr(MI.getReg(0), OutputRegs);
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI,
                                                  unsigned TypeIdx,
                                                  LLT NarrowTy) {
  const int NumDst = MI.getNumOperands() - 1;
  const Register SrcReg = MI.getOperand(NumDst).getReg();
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
  LLT SrcTy = MRI.getType(SrcReg);

  if (TypeIdx != 1 || NarrowTy == DstTy)
    return UnableToLegalize;

  // Requires compatible types. Otherwise SrcReg should have been defined by a
  // merge-like instruction that would get artifact combined. Most likely the
  // instruction that defines SrcReg has to perform more/fewer elements
  // legalization compatible with NarrowTy.
  assert(SrcTy.isVector() && NarrowTy.isVector() && "Expected vector types");
  assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");

  if ((SrcTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
      (NarrowTy.getSizeInBits() % DstTy.getSizeInBits() != 0))
    return UnableToLegalize;

  // This is most likely DstTy (smaller than register size) packed in SrcTy
  // (larger than register size), and since the unmerge was not combined it
  // will be lowered to bit sequence extracts from a register. Unpack SrcTy
  // into NarrowTy (register size) pieces first, then unpack each NarrowTy
  // piece into DstTy.

  // %1:_(DstTy), %2, %3, %4 = G_UNMERGE_VALUES %0:_(SrcTy)
  //
  // %5:_(NarrowTy), %6 = G_UNMERGE_VALUES %0:_(SrcTy) - reg sequence
  // %1:_(DstTy), %2 = G_UNMERGE_VALUES %5:_(NarrowTy) - sequence of bits in reg
  // %3:_(DstTy), %4 = G_UNMERGE_VALUES %6:_(NarrowTy)
  auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, SrcReg);
  const int NumUnmerge = Unmerge->getNumOperands() - 1;
  const int PartsPerUnmerge = NumDst / NumUnmerge;

  for (int I = 0; I != NumUnmerge; ++I) {
    auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);

    for (int J = 0; J != PartsPerUnmerge; ++J)
      MIB.addDef(MI.getOperand(I * PartsPerUnmerge + J).getReg());
    MIB.addUse(Unmerge.getReg(I));
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx,
                                          LLT NarrowTy) {
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  // Requires compatible types. Otherwise the user of DstReg did not perform
  // the unmerge that should have been artifact combined. Most likely the
  // instruction that uses DstReg has to do more/fewer elements legalization
  // compatible with NarrowTy.
  assert(DstTy.isVector() && NarrowTy.isVector() && "Expected vector types");
  assert((DstTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
  if (NarrowTy == SrcTy)
    return UnableToLegalize;

  // This attempts to lower part of an LCMTy merge/unmerge sequence. The
  // intended use is for old MIR tests. Since the change to more/fewer elements
  // legalization it should no longer be possible to generate MIR like this
  // when starting from LLVM IR, because the LCMTy approach was replaced with
  // merge/unmerge to vector elements.
4257 if (TypeIdx == 1) { 4258 assert(SrcTy.isVector() && "Expected vector types"); 4259 assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type"); 4260 if ((DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) || 4261 (NarrowTy.getNumElements() >= SrcTy.getNumElements())) 4262 return UnableToLegalize; 4263 // %2:_(DstTy) = G_CONCAT_VECTORS %0:_(SrcTy), %1:_(SrcTy) 4264 // 4265 // %3:_(EltTy), %4, %5 = G_UNMERGE_VALUES %0:_(SrcTy) 4266 // %6:_(EltTy), %7, %8 = G_UNMERGE_VALUES %1:_(SrcTy) 4267 // %9:_(NarrowTy) = G_BUILD_VECTOR %3:_(EltTy), %4 4268 // %10:_(NarrowTy) = G_BUILD_VECTOR %5:_(EltTy), %6 4269 // %11:_(NarrowTy) = G_BUILD_VECTOR %7:_(EltTy), %8 4270 // %2:_(DstTy) = G_CONCAT_VECTORS %9:_(NarrowTy), %10, %11 4271 4272 SmallVector<Register, 8> Elts; 4273 LLT EltTy = MRI.getType(MI.getOperand(1).getReg()).getScalarType(); 4274 for (unsigned i = 1; i < MI.getNumOperands(); ++i) { 4275 auto Unmerge = MIRBuilder.buildUnmerge(EltTy, MI.getOperand(i).getReg()); 4276 for (unsigned j = 0; j < Unmerge->getNumDefs(); ++j) 4277 Elts.push_back(Unmerge.getReg(j)); 4278 } 4279 4280 SmallVector<Register, 8> NarrowTyElts; 4281 unsigned NumNarrowTyElts = NarrowTy.getNumElements(); 4282 unsigned NumNarrowTyPieces = DstTy.getNumElements() / NumNarrowTyElts; 4283 for (unsigned i = 0, Offset = 0; i < NumNarrowTyPieces; 4284 ++i, Offset += NumNarrowTyElts) { 4285 ArrayRef<Register> Pieces(&Elts[Offset], NumNarrowTyElts); 4286 NarrowTyElts.push_back( 4287 MIRBuilder.buildMergeLikeInstr(NarrowTy, Pieces).getReg(0)); 4288 } 4289 4290 MIRBuilder.buildMergeLikeInstr(DstReg, NarrowTyElts); 4291 MI.eraseFromParent(); 4292 return Legalized; 4293 } 4294 4295 assert(TypeIdx == 0 && "Bad type index"); 4296 if ((NarrowTy.getSizeInBits() % SrcTy.getSizeInBits() != 0) || 4297 (DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0)) 4298 return UnableToLegalize; 4299 4300 // This is most likely SrcTy (smaller then register size) packed in DstTy 4301 // (larger then register size) and since merge was not combined it will be 4302 // lowered to bit sequence packing into register. Merge SrcTy to NarrowTy 4303 // (register size) pieces first. Then merge each of NarrowTy pieces to DstTy. 4304 4305 // %0:_(DstTy) = G_MERGE_VALUES %1:_(SrcTy), %2, %3, %4 4306 // 4307 // %5:_(NarrowTy) = G_MERGE_VALUES %1:_(SrcTy), %2 - sequence of bits in reg 4308 // %6:_(NarrowTy) = G_MERGE_VALUES %3:_(SrcTy), %4 4309 // %0:_(DstTy) = G_MERGE_VALUES %5:_(NarrowTy), %6 - reg sequence 4310 SmallVector<Register, 8> NarrowTyElts; 4311 unsigned NumParts = DstTy.getNumElements() / NarrowTy.getNumElements(); 4312 unsigned NumSrcElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1; 4313 unsigned NumElts = NarrowTy.getNumElements() / NumSrcElts; 4314 for (unsigned i = 0; i < NumParts; ++i) { 4315 SmallVector<Register, 8> Sources; 4316 for (unsigned j = 0; j < NumElts; ++j) 4317 Sources.push_back(MI.getOperand(1 + i * NumElts + j).getReg()); 4318 NarrowTyElts.push_back( 4319 MIRBuilder.buildMergeLikeInstr(NarrowTy, Sources).getReg(0)); 4320 } 4321 4322 MIRBuilder.buildMergeLikeInstr(DstReg, NarrowTyElts); 4323 MI.eraseFromParent(); 4324 return Legalized; 4325 } 4326 4327 LegalizerHelper::LegalizeResult 4328 LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI, 4329 unsigned TypeIdx, 4330 LLT NarrowVecTy) { 4331 auto [DstReg, SrcVec] = MI.getFirst2Regs(); 4332 Register InsertVal; 4333 bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT; 4334 4335 assert((IsInsert ? 
TypeIdx == 0 : TypeIdx == 1) && "not a vector type index"); 4336 if (IsInsert) 4337 InsertVal = MI.getOperand(2).getReg(); 4338 4339 Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg(); 4340 4341 // TODO: Handle total scalarization case. 4342 if (!NarrowVecTy.isVector()) 4343 return UnableToLegalize; 4344 4345 LLT VecTy = MRI.getType(SrcVec); 4346 4347 // If the index is a constant, we can really break this down as you would 4348 // expect, and index into the target size pieces. 4349 int64_t IdxVal; 4350 auto MaybeCst = getIConstantVRegValWithLookThrough(Idx, MRI); 4351 if (MaybeCst) { 4352 IdxVal = MaybeCst->Value.getSExtValue(); 4353 // Avoid out of bounds indexing the pieces. 4354 if (IdxVal >= VecTy.getNumElements()) { 4355 MIRBuilder.buildUndef(DstReg); 4356 MI.eraseFromParent(); 4357 return Legalized; 4358 } 4359 4360 SmallVector<Register, 8> VecParts; 4361 LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec); 4362 4363 // Build a sequence of NarrowTy pieces in VecParts for this operand. 4364 LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts, 4365 TargetOpcode::G_ANYEXT); 4366 4367 unsigned NewNumElts = NarrowVecTy.getNumElements(); 4368 4369 LLT IdxTy = MRI.getType(Idx); 4370 int64_t PartIdx = IdxVal / NewNumElts; 4371 auto NewIdx = 4372 MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx); 4373 4374 if (IsInsert) { 4375 LLT PartTy = MRI.getType(VecParts[PartIdx]); 4376 4377 // Use the adjusted index to insert into one of the subvectors. 4378 auto InsertPart = MIRBuilder.buildInsertVectorElement( 4379 PartTy, VecParts[PartIdx], InsertVal, NewIdx); 4380 VecParts[PartIdx] = InsertPart.getReg(0); 4381 4382 // Recombine the inserted subvector with the others to reform the result 4383 // vector. 4384 buildWidenedRemergeToDst(DstReg, LCMTy, VecParts); 4385 } else { 4386 MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx); 4387 } 4388 4389 MI.eraseFromParent(); 4390 return Legalized; 4391 } 4392 4393 // With a variable index, we can't perform the operation in a smaller type, so 4394 // we're forced to expand this. 4395 // 4396 // TODO: We could emit a chain of compare/select to figure out which piece to 4397 // index. 4398 return lowerExtractInsertVectorElt(MI); 4399 } 4400 4401 LegalizerHelper::LegalizeResult 4402 LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx, 4403 LLT NarrowTy) { 4404 // FIXME: Don't know how to handle secondary types yet. 4405 if (TypeIdx != 0) 4406 return UnableToLegalize; 4407 4408 // This implementation doesn't work for atomics. Give up instead of doing 4409 // something invalid. 4410 if (LdStMI.isAtomic()) 4411 return UnableToLegalize; 4412 4413 bool IsLoad = isa<GLoad>(LdStMI); 4414 Register ValReg = LdStMI.getReg(0); 4415 Register AddrReg = LdStMI.getPointerReg(); 4416 LLT ValTy = MRI.getType(ValReg); 4417 4418 // FIXME: Do we need a distinct NarrowMemory legalize action? 
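  // e.g. narrowing an s96 load with NarrowTy = s32 emits three s32 loads
  // covering byte offsets 0, 4 and 8, then reassembles the result with
  // insertParts; an uneven breakdown also produces a leftover-typed piece.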
4419 if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize()) { 4420 LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n"); 4421 return UnableToLegalize; 4422 } 4423 4424 int NumParts = -1; 4425 int NumLeftover = -1; 4426 LLT LeftoverTy; 4427 SmallVector<Register, 8> NarrowRegs, NarrowLeftoverRegs; 4428 if (IsLoad) { 4429 std::tie(NumParts, NumLeftover) = getNarrowTypeBreakDown(ValTy, NarrowTy, LeftoverTy); 4430 } else { 4431 if (extractParts(ValReg, ValTy, NarrowTy, LeftoverTy, NarrowRegs, 4432 NarrowLeftoverRegs, MIRBuilder, MRI)) { 4433 NumParts = NarrowRegs.size(); 4434 NumLeftover = NarrowLeftoverRegs.size(); 4435 } 4436 } 4437 4438 if (NumParts == -1) 4439 return UnableToLegalize; 4440 4441 LLT PtrTy = MRI.getType(AddrReg); 4442 const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits()); 4443 4444 unsigned TotalSize = ValTy.getSizeInBits(); 4445 4446 // Split the load/store into PartTy sized pieces starting at Offset. If this 4447 // is a load, return the new registers in ValRegs. For a store, each elements 4448 // of ValRegs should be PartTy. Returns the next offset that needs to be 4449 // handled. 4450 bool isBigEndian = MIRBuilder.getDataLayout().isBigEndian(); 4451 auto MMO = LdStMI.getMMO(); 4452 auto splitTypePieces = [=](LLT PartTy, SmallVectorImpl<Register> &ValRegs, 4453 unsigned NumParts, unsigned Offset) -> unsigned { 4454 MachineFunction &MF = MIRBuilder.getMF(); 4455 unsigned PartSize = PartTy.getSizeInBits(); 4456 for (unsigned Idx = 0, E = NumParts; Idx != E && Offset < TotalSize; 4457 ++Idx) { 4458 unsigned ByteOffset = Offset / 8; 4459 Register NewAddrReg; 4460 4461 MIRBuilder.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset); 4462 4463 MachineMemOperand *NewMMO = 4464 MF.getMachineMemOperand(&MMO, ByteOffset, PartTy); 4465 4466 if (IsLoad) { 4467 Register Dst = MRI.createGenericVirtualRegister(PartTy); 4468 ValRegs.push_back(Dst); 4469 MIRBuilder.buildLoad(Dst, NewAddrReg, *NewMMO); 4470 } else { 4471 MIRBuilder.buildStore(ValRegs[Idx], NewAddrReg, *NewMMO); 4472 } 4473 Offset = isBigEndian ? Offset - PartSize : Offset + PartSize; 4474 } 4475 4476 return Offset; 4477 }; 4478 4479 unsigned Offset = isBigEndian ? TotalSize - NarrowTy.getSizeInBits() : 0; 4480 unsigned HandledOffset = 4481 splitTypePieces(NarrowTy, NarrowRegs, NumParts, Offset); 4482 4483 // Handle the rest of the register if this isn't an even type breakdown. 4484 if (LeftoverTy.isValid()) 4485 splitTypePieces(LeftoverTy, NarrowLeftoverRegs, NumLeftover, HandledOffset); 4486 4487 if (IsLoad) { 4488 insertParts(ValReg, ValTy, NarrowTy, NarrowRegs, 4489 LeftoverTy, NarrowLeftoverRegs); 4490 } 4491 4492 LdStMI.eraseFromParent(); 4493 return Legalized; 4494 } 4495 4496 LegalizerHelper::LegalizeResult 4497 LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, 4498 LLT NarrowTy) { 4499 using namespace TargetOpcode; 4500 GenericMachineInstr &GMI = cast<GenericMachineInstr>(MI); 4501 unsigned NumElts = NarrowTy.isVector() ? 
NarrowTy.getNumElements() : 1; 4502 4503 switch (MI.getOpcode()) { 4504 case G_IMPLICIT_DEF: 4505 case G_TRUNC: 4506 case G_AND: 4507 case G_OR: 4508 case G_XOR: 4509 case G_ADD: 4510 case G_SUB: 4511 case G_MUL: 4512 case G_PTR_ADD: 4513 case G_SMULH: 4514 case G_UMULH: 4515 case G_FADD: 4516 case G_FMUL: 4517 case G_FSUB: 4518 case G_FNEG: 4519 case G_FABS: 4520 case G_FCANONICALIZE: 4521 case G_FDIV: 4522 case G_FREM: 4523 case G_FMA: 4524 case G_FMAD: 4525 case G_FPOW: 4526 case G_FEXP: 4527 case G_FEXP2: 4528 case G_FEXP10: 4529 case G_FLOG: 4530 case G_FLOG2: 4531 case G_FLOG10: 4532 case G_FLDEXP: 4533 case G_FNEARBYINT: 4534 case G_FCEIL: 4535 case G_FFLOOR: 4536 case G_FRINT: 4537 case G_INTRINSIC_ROUND: 4538 case G_INTRINSIC_ROUNDEVEN: 4539 case G_INTRINSIC_TRUNC: 4540 case G_FCOS: 4541 case G_FSIN: 4542 case G_FSQRT: 4543 case G_BSWAP: 4544 case G_BITREVERSE: 4545 case G_SDIV: 4546 case G_UDIV: 4547 case G_SREM: 4548 case G_UREM: 4549 case G_SDIVREM: 4550 case G_UDIVREM: 4551 case G_SMIN: 4552 case G_SMAX: 4553 case G_UMIN: 4554 case G_UMAX: 4555 case G_ABS: 4556 case G_FMINNUM: 4557 case G_FMAXNUM: 4558 case G_FMINNUM_IEEE: 4559 case G_FMAXNUM_IEEE: 4560 case G_FMINIMUM: 4561 case G_FMAXIMUM: 4562 case G_FSHL: 4563 case G_FSHR: 4564 case G_ROTL: 4565 case G_ROTR: 4566 case G_FREEZE: 4567 case G_SADDSAT: 4568 case G_SSUBSAT: 4569 case G_UADDSAT: 4570 case G_USUBSAT: 4571 case G_UMULO: 4572 case G_SMULO: 4573 case G_SHL: 4574 case G_LSHR: 4575 case G_ASHR: 4576 case G_SSHLSAT: 4577 case G_USHLSAT: 4578 case G_CTLZ: 4579 case G_CTLZ_ZERO_UNDEF: 4580 case G_CTTZ: 4581 case G_CTTZ_ZERO_UNDEF: 4582 case G_CTPOP: 4583 case G_FCOPYSIGN: 4584 case G_ZEXT: 4585 case G_SEXT: 4586 case G_ANYEXT: 4587 case G_FPEXT: 4588 case G_FPTRUNC: 4589 case G_SITOFP: 4590 case G_UITOFP: 4591 case G_FPTOSI: 4592 case G_FPTOUI: 4593 case G_INTTOPTR: 4594 case G_PTRTOINT: 4595 case G_ADDRSPACE_CAST: 4596 case G_UADDO: 4597 case G_USUBO: 4598 case G_UADDE: 4599 case G_USUBE: 4600 case G_SADDO: 4601 case G_SSUBO: 4602 case G_SADDE: 4603 case G_SSUBE: 4604 case G_STRICT_FADD: 4605 case G_STRICT_FSUB: 4606 case G_STRICT_FMUL: 4607 case G_STRICT_FMA: 4608 case G_STRICT_FLDEXP: 4609 case G_FFREXP: 4610 return fewerElementsVectorMultiEltType(GMI, NumElts); 4611 case G_ICMP: 4612 case G_FCMP: 4613 return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*cpm predicate*/}); 4614 case G_IS_FPCLASS: 4615 return fewerElementsVectorMultiEltType(GMI, NumElts, {2, 3 /*mask,fpsem*/}); 4616 case G_SELECT: 4617 if (MRI.getType(MI.getOperand(1).getReg()).isVector()) 4618 return fewerElementsVectorMultiEltType(GMI, NumElts); 4619 return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*scalar cond*/}); 4620 case G_PHI: 4621 return fewerElementsVectorPhi(GMI, NumElts); 4622 case G_UNMERGE_VALUES: 4623 return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy); 4624 case G_BUILD_VECTOR: 4625 assert(TypeIdx == 0 && "not a vector type index"); 4626 return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy); 4627 case G_CONCAT_VECTORS: 4628 if (TypeIdx != 1) // TODO: This probably does work as expected already. 
4629 return UnableToLegalize; 4630 return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy); 4631 case G_EXTRACT_VECTOR_ELT: 4632 case G_INSERT_VECTOR_ELT: 4633 return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowTy); 4634 case G_LOAD: 4635 case G_STORE: 4636 return reduceLoadStoreWidth(cast<GLoadStore>(MI), TypeIdx, NarrowTy); 4637 case G_SEXT_INREG: 4638 return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*imm*/}); 4639 GISEL_VECREDUCE_CASES_NONSEQ 4640 return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy); 4641 case TargetOpcode::G_VECREDUCE_SEQ_FADD: 4642 case TargetOpcode::G_VECREDUCE_SEQ_FMUL: 4643 return fewerElementsVectorSeqReductions(MI, TypeIdx, NarrowTy); 4644 case G_SHUFFLE_VECTOR: 4645 return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy); 4646 case G_FPOWI: 4647 return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*pow*/}); 4648 default: 4649 return UnableToLegalize; 4650 } 4651 } 4652 4653 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle( 4654 MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) { 4655 assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); 4656 if (TypeIdx != 0) 4657 return UnableToLegalize; 4658 4659 auto [DstReg, DstTy, Src1Reg, Src1Ty, Src2Reg, Src2Ty] = 4660 MI.getFirst3RegLLTs(); 4661 ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask(); 4662 // The shuffle should be canonicalized by now. 4663 if (DstTy != Src1Ty) 4664 return UnableToLegalize; 4665 if (DstTy != Src2Ty) 4666 return UnableToLegalize; 4667 4668 if (!isPowerOf2_32(DstTy.getNumElements())) 4669 return UnableToLegalize; 4670 4671 // We only support splitting a shuffle into 2, so adjust NarrowTy accordingly. 4672 // Further legalization attempts will be needed to do split further. 4673 NarrowTy = 4674 DstTy.changeElementCount(DstTy.getElementCount().divideCoefficientBy(2)); 4675 unsigned NewElts = NarrowTy.getNumElements(); 4676 4677 SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs; 4678 extractParts(Src1Reg, NarrowTy, 2, SplitSrc1Regs, MIRBuilder, MRI); 4679 extractParts(Src2Reg, NarrowTy, 2, SplitSrc2Regs, MIRBuilder, MRI); 4680 Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0], 4681 SplitSrc2Regs[1]}; 4682 4683 Register Hi, Lo; 4684 4685 // If Lo or Hi uses elements from at most two of the four input vectors, then 4686 // express it as a vector shuffle of those two inputs. Otherwise extract the 4687 // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR. 4688 SmallVector<int, 16> Ops; 4689 for (unsigned High = 0; High < 2; ++High) { 4690 Register &Output = High ? Hi : Lo; 4691 4692 // Build a shuffle mask for the output, discovering on the fly which 4693 // input vectors to use as shuffle operands (recorded in InputUsed). 4694 // If building a suitable shuffle vector proves too hard, then bail 4695 // out with useBuildVector set. 4696 unsigned InputUsed[2] = {-1U, -1U}; // Not yet discovered. 4697 unsigned FirstMaskIdx = High * NewElts; 4698 bool UseBuildVector = false; 4699 for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) { 4700 // The mask element. This indexes into the input. 4701 int Idx = Mask[FirstMaskIdx + MaskOffset]; 4702 4703 // The input vector this mask element indexes into. 4704 unsigned Input = (unsigned)Idx / NewElts; 4705 4706 if (Input >= std::size(Inputs)) { 4707 // The mask element does not index into any input vector. 4708 Ops.push_back(-1); 4709 continue; 4710 } 4711 4712 // Turn the index into an offset from the start of the input vector. 
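      // e.g. with NewElts = 4, mask index 6 refers to input vector 6 / 4 = 1
      // and becomes offset 6 - 1 * 4 = 2 within that input.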
4713 Idx -= Input * NewElts; 4714 4715 // Find or create a shuffle vector operand to hold this input. 4716 unsigned OpNo; 4717 for (OpNo = 0; OpNo < std::size(InputUsed); ++OpNo) { 4718 if (InputUsed[OpNo] == Input) { 4719 // This input vector is already an operand. 4720 break; 4721 } else if (InputUsed[OpNo] == -1U) { 4722 // Create a new operand for this input vector. 4723 InputUsed[OpNo] = Input; 4724 break; 4725 } 4726 } 4727 4728 if (OpNo >= std::size(InputUsed)) { 4729 // More than two input vectors used! Give up on trying to create a 4730 // shuffle vector. Insert all elements into a BUILD_VECTOR instead. 4731 UseBuildVector = true; 4732 break; 4733 } 4734 4735 // Add the mask index for the new shuffle vector. 4736 Ops.push_back(Idx + OpNo * NewElts); 4737 } 4738 4739 if (UseBuildVector) { 4740 LLT EltTy = NarrowTy.getElementType(); 4741 SmallVector<Register, 16> SVOps; 4742 4743 // Extract the input elements by hand. 4744 for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) { 4745 // The mask element. This indexes into the input. 4746 int Idx = Mask[FirstMaskIdx + MaskOffset]; 4747 4748 // The input vector this mask element indexes into. 4749 unsigned Input = (unsigned)Idx / NewElts; 4750 4751 if (Input >= std::size(Inputs)) { 4752 // The mask element is "undef" or indexes off the end of the input. 4753 SVOps.push_back(MIRBuilder.buildUndef(EltTy).getReg(0)); 4754 continue; 4755 } 4756 4757 // Turn the index into an offset from the start of the input vector. 4758 Idx -= Input * NewElts; 4759 4760 // Extract the vector element by hand. 4761 SVOps.push_back(MIRBuilder 4762 .buildExtractVectorElement( 4763 EltTy, Inputs[Input], 4764 MIRBuilder.buildConstant(LLT::scalar(32), Idx)) 4765 .getReg(0)); 4766 } 4767 4768 // Construct the Lo/Hi output using a G_BUILD_VECTOR. 4769 Output = MIRBuilder.buildBuildVector(NarrowTy, SVOps).getReg(0); 4770 } else if (InputUsed[0] == -1U) { 4771 // No input vectors were used! The result is undefined. 4772 Output = MIRBuilder.buildUndef(NarrowTy).getReg(0); 4773 } else { 4774 Register Op0 = Inputs[InputUsed[0]]; 4775 // If only one input was used, use an undefined vector for the other. 4776 Register Op1 = InputUsed[1] == -1U 4777 ? MIRBuilder.buildUndef(NarrowTy).getReg(0) 4778 : Inputs[InputUsed[1]]; 4779 // At least one input vector was used. Create a new shuffle vector. 4780 Output = MIRBuilder.buildShuffleVector(NarrowTy, Op0, Op1, Ops).getReg(0); 4781 } 4782 4783 Ops.clear(); 4784 } 4785 4786 MIRBuilder.buildConcatVectors(DstReg, {Lo, Hi}); 4787 MI.eraseFromParent(); 4788 return Legalized; 4789 } 4790 4791 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions( 4792 MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) { 4793 auto &RdxMI = cast<GVecReduce>(MI); 4794 4795 if (TypeIdx != 1) 4796 return UnableToLegalize; 4797 4798 // The semantics of the normal non-sequential reductions allow us to freely 4799 // re-associate the operation. 4800 auto [DstReg, DstTy, SrcReg, SrcTy] = RdxMI.getFirst2RegLLTs(); 4801 4802 if (NarrowTy.isVector() && 4803 (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0)) 4804 return UnableToLegalize; 4805 4806 unsigned ScalarOpc = RdxMI.getScalarOpcForReduction(); 4807 SmallVector<Register> SplitSrcs; 4808 // If NarrowTy is a scalar then we're being asked to scalarize. 4809 const unsigned NumParts = 4810 NarrowTy.isVector() ? 
SrcTy.getNumElements() / NarrowTy.getNumElements() 4811 : SrcTy.getNumElements(); 4812 4813 extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs, MIRBuilder, MRI); 4814 if (NarrowTy.isScalar()) { 4815 if (DstTy != NarrowTy) 4816 return UnableToLegalize; // FIXME: handle implicit extensions. 4817 4818 if (isPowerOf2_32(NumParts)) { 4819 // Generate a tree of scalar operations to reduce the critical path. 4820 SmallVector<Register> PartialResults; 4821 unsigned NumPartsLeft = NumParts; 4822 while (NumPartsLeft > 1) { 4823 for (unsigned Idx = 0; Idx < NumPartsLeft - 1; Idx += 2) { 4824 PartialResults.emplace_back( 4825 MIRBuilder 4826 .buildInstr(ScalarOpc, {NarrowTy}, 4827 {SplitSrcs[Idx], SplitSrcs[Idx + 1]}) 4828 .getReg(0)); 4829 } 4830 SplitSrcs = PartialResults; 4831 PartialResults.clear(); 4832 NumPartsLeft = SplitSrcs.size(); 4833 } 4834 assert(SplitSrcs.size() == 1); 4835 MIRBuilder.buildCopy(DstReg, SplitSrcs[0]); 4836 MI.eraseFromParent(); 4837 return Legalized; 4838 } 4839 // If we can't generate a tree, then just do sequential operations. 4840 Register Acc = SplitSrcs[0]; 4841 for (unsigned Idx = 1; Idx < NumParts; ++Idx) 4842 Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[Idx]}) 4843 .getReg(0); 4844 MIRBuilder.buildCopy(DstReg, Acc); 4845 MI.eraseFromParent(); 4846 return Legalized; 4847 } 4848 SmallVector<Register> PartialReductions; 4849 for (unsigned Part = 0; Part < NumParts; ++Part) { 4850 PartialReductions.push_back( 4851 MIRBuilder.buildInstr(RdxMI.getOpcode(), {DstTy}, {SplitSrcs[Part]}) 4852 .getReg(0)); 4853 } 4854 4855 // If the types involved are powers of 2, we can generate intermediate vector 4856 // ops, before generating a final reduction operation. 4857 if (isPowerOf2_32(SrcTy.getNumElements()) && 4858 isPowerOf2_32(NarrowTy.getNumElements())) { 4859 return tryNarrowPow2Reduction(MI, SrcReg, SrcTy, NarrowTy, ScalarOpc); 4860 } 4861 4862 Register Acc = PartialReductions[0]; 4863 for (unsigned Part = 1; Part < NumParts; ++Part) { 4864 if (Part == NumParts - 1) { 4865 MIRBuilder.buildInstr(ScalarOpc, {DstReg}, 4866 {Acc, PartialReductions[Part]}); 4867 } else { 4868 Acc = MIRBuilder 4869 .buildInstr(ScalarOpc, {DstTy}, {Acc, PartialReductions[Part]}) 4870 .getReg(0); 4871 } 4872 } 4873 MI.eraseFromParent(); 4874 return Legalized; 4875 } 4876 4877 LegalizerHelper::LegalizeResult 4878 LegalizerHelper::fewerElementsVectorSeqReductions(MachineInstr &MI, 4879 unsigned int TypeIdx, 4880 LLT NarrowTy) { 4881 auto [DstReg, DstTy, ScalarReg, ScalarTy, SrcReg, SrcTy] = 4882 MI.getFirst3RegLLTs(); 4883 if (!NarrowTy.isScalar() || TypeIdx != 2 || DstTy != ScalarTy || 4884 DstTy != NarrowTy) 4885 return UnableToLegalize; 4886 4887 assert((MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FADD || 4888 MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FMUL) && 4889 "Unexpected vecreduce opcode"); 4890 unsigned ScalarOpc = MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FADD 4891 ? 
TargetOpcode::G_FADD 4892 : TargetOpcode::G_FMUL; 4893 4894 SmallVector<Register> SplitSrcs; 4895 unsigned NumParts = SrcTy.getNumElements(); 4896 extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs, MIRBuilder, MRI); 4897 Register Acc = ScalarReg; 4898 for (unsigned i = 0; i < NumParts; i++) 4899 Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[i]}) 4900 .getReg(0); 4901 4902 MIRBuilder.buildCopy(DstReg, Acc); 4903 MI.eraseFromParent(); 4904 return Legalized; 4905 } 4906 4907 LegalizerHelper::LegalizeResult 4908 LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg, 4909 LLT SrcTy, LLT NarrowTy, 4910 unsigned ScalarOpc) { 4911 SmallVector<Register> SplitSrcs; 4912 // Split the sources into NarrowTy size pieces. 4913 extractParts(SrcReg, NarrowTy, 4914 SrcTy.getNumElements() / NarrowTy.getNumElements(), SplitSrcs, 4915 MIRBuilder, MRI); 4916 // We're going to do a tree reduction using vector operations until we have 4917 // one NarrowTy size value left. 4918 while (SplitSrcs.size() > 1) { 4919 SmallVector<Register> PartialRdxs; 4920 for (unsigned Idx = 0; Idx < SplitSrcs.size()-1; Idx += 2) { 4921 Register LHS = SplitSrcs[Idx]; 4922 Register RHS = SplitSrcs[Idx + 1]; 4923 // Create the intermediate vector op. 4924 Register Res = 4925 MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {LHS, RHS}).getReg(0); 4926 PartialRdxs.push_back(Res); 4927 } 4928 SplitSrcs = std::move(PartialRdxs); 4929 } 4930 // Finally generate the requested NarrowTy based reduction. 4931 Observer.changingInstr(MI); 4932 MI.getOperand(1).setReg(SplitSrcs[0]); 4933 Observer.changedInstr(MI); 4934 return Legalized; 4935 } 4936 4937 LegalizerHelper::LegalizeResult 4938 LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt, 4939 const LLT HalfTy, const LLT AmtTy) { 4940 4941 Register InL = MRI.createGenericVirtualRegister(HalfTy); 4942 Register InH = MRI.createGenericVirtualRegister(HalfTy); 4943 MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1)); 4944 4945 if (Amt.isZero()) { 4946 MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {InL, InH}); 4947 MI.eraseFromParent(); 4948 return Legalized; 4949 } 4950 4951 LLT NVT = HalfTy; 4952 unsigned NVTBits = HalfTy.getSizeInBits(); 4953 unsigned VTBits = 2 * NVTBits; 4954 4955 SrcOp Lo(Register(0)), Hi(Register(0)); 4956 if (MI.getOpcode() == TargetOpcode::G_SHL) { 4957 if (Amt.ugt(VTBits)) { 4958 Lo = Hi = MIRBuilder.buildConstant(NVT, 0); 4959 } else if (Amt.ugt(NVTBits)) { 4960 Lo = MIRBuilder.buildConstant(NVT, 0); 4961 Hi = MIRBuilder.buildShl(NVT, InL, 4962 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits)); 4963 } else if (Amt == NVTBits) { 4964 Lo = MIRBuilder.buildConstant(NVT, 0); 4965 Hi = InL; 4966 } else { 4967 Lo = MIRBuilder.buildShl(NVT, InL, MIRBuilder.buildConstant(AmtTy, Amt)); 4968 auto OrLHS = 4969 MIRBuilder.buildShl(NVT, InH, MIRBuilder.buildConstant(AmtTy, Amt)); 4970 auto OrRHS = MIRBuilder.buildLShr( 4971 NVT, InL, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits)); 4972 Hi = MIRBuilder.buildOr(NVT, OrLHS, OrRHS); 4973 } 4974 } else if (MI.getOpcode() == TargetOpcode::G_LSHR) { 4975 if (Amt.ugt(VTBits)) { 4976 Lo = Hi = MIRBuilder.buildConstant(NVT, 0); 4977 } else if (Amt.ugt(NVTBits)) { 4978 Lo = MIRBuilder.buildLShr(NVT, InH, 4979 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits)); 4980 Hi = MIRBuilder.buildConstant(NVT, 0); 4981 } else if (Amt == NVTBits) { 4982 Lo = InH; 4983 Hi = MIRBuilder.buildConstant(NVT, 0); 4984 } else { 4985 auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt); 4986 
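// With 0 < Amt < NVTBits the result is assembled from both halves:
//   Lo = (InL >> Amt) | (InH << (NVTBits - Amt))
//   Hi = InH >> Amt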
4987 auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst); 4988 auto OrRHS = MIRBuilder.buildShl( 4989 NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits)); 4990 4991 Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS); 4992 Hi = MIRBuilder.buildLShr(NVT, InH, ShiftAmtConst); 4993 } 4994 } else { 4995 if (Amt.ugt(VTBits)) { 4996 Hi = Lo = MIRBuilder.buildAShr( 4997 NVT, InH, MIRBuilder.buildConstant(AmtTy, NVTBits - 1)); 4998 } else if (Amt.ugt(NVTBits)) { 4999 Lo = MIRBuilder.buildAShr(NVT, InH, 5000 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits)); 5001 Hi = MIRBuilder.buildAShr(NVT, InH, 5002 MIRBuilder.buildConstant(AmtTy, NVTBits - 1)); 5003 } else if (Amt == NVTBits) { 5004 Lo = InH; 5005 Hi = MIRBuilder.buildAShr(NVT, InH, 5006 MIRBuilder.buildConstant(AmtTy, NVTBits - 1)); 5007 } else { 5008 auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt); 5009 5010 auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst); 5011 auto OrRHS = MIRBuilder.buildShl( 5012 NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits)); 5013 5014 Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS); 5015 Hi = MIRBuilder.buildAShr(NVT, InH, ShiftAmtConst); 5016 } 5017 } 5018 5019 MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {Lo, Hi}); 5020 MI.eraseFromParent(); 5021 5022 return Legalized; 5023 } 5024 5025 // TODO: Optimize if constant shift amount. 5026 LegalizerHelper::LegalizeResult 5027 LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx, 5028 LLT RequestedTy) { 5029 if (TypeIdx == 1) { 5030 Observer.changingInstr(MI); 5031 narrowScalarSrc(MI, RequestedTy, 2); 5032 Observer.changedInstr(MI); 5033 return Legalized; 5034 } 5035 5036 Register DstReg = MI.getOperand(0).getReg(); 5037 LLT DstTy = MRI.getType(DstReg); 5038 if (DstTy.isVector()) 5039 return UnableToLegalize; 5040 5041 Register Amt = MI.getOperand(2).getReg(); 5042 LLT ShiftAmtTy = MRI.getType(Amt); 5043 const unsigned DstEltSize = DstTy.getScalarSizeInBits(); 5044 if (DstEltSize % 2 != 0) 5045 return UnableToLegalize; 5046 5047 // Ignore the input type. We can only go to exactly half the size of the 5048 // input. If that isn't small enough, the resulting pieces will be further 5049 // legalized. 5050 const unsigned NewBitSize = DstEltSize / 2; 5051 const LLT HalfTy = LLT::scalar(NewBitSize); 5052 const LLT CondTy = LLT::scalar(1); 5053 5054 if (auto VRegAndVal = getIConstantVRegValWithLookThrough(Amt, MRI)) { 5055 return narrowScalarShiftByConstant(MI, VRegAndVal->Value, HalfTy, 5056 ShiftAmtTy); 5057 } 5058 5059 // TODO: Expand with known bits. 5060 5061 // Handle the fully general expansion by an unknown amount. 
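// The source is unmerged into two NewBitSize halves (InL, InH) and the result
// is assembled with selects. Illustrative expansion for G_SHL when
// Amt < NewBitSize:
//   Lo = InL << Amt
//   Hi = (InH << Amt) | (InL >> (NewBitSize - Amt))
// and when Amt >= NewBitSize:
//   Lo = 0
//   Hi = InL << (Amt - NewBitSize)
// An extra select on Amt == 0 keeps the original high half so no piece ever
// shifts by the full NewBitSize width.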
5062 auto NewBits = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize); 5063 5064 Register InL = MRI.createGenericVirtualRegister(HalfTy); 5065 Register InH = MRI.createGenericVirtualRegister(HalfTy); 5066 MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1)); 5067 5068 auto AmtExcess = MIRBuilder.buildSub(ShiftAmtTy, Amt, NewBits); 5069 auto AmtLack = MIRBuilder.buildSub(ShiftAmtTy, NewBits, Amt); 5070 5071 auto Zero = MIRBuilder.buildConstant(ShiftAmtTy, 0); 5072 auto IsShort = MIRBuilder.buildICmp(ICmpInst::ICMP_ULT, CondTy, Amt, NewBits); 5073 auto IsZero = MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, CondTy, Amt, Zero); 5074 5075 Register ResultRegs[2]; 5076 switch (MI.getOpcode()) { 5077 case TargetOpcode::G_SHL: { 5078 // Short: ShAmt < NewBitSize 5079 auto LoS = MIRBuilder.buildShl(HalfTy, InL, Amt); 5080 5081 auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, AmtLack); 5082 auto HiOr = MIRBuilder.buildShl(HalfTy, InH, Amt); 5083 auto HiS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr); 5084 5085 // Long: ShAmt >= NewBitSize 5086 auto LoL = MIRBuilder.buildConstant(HalfTy, 0); // Lo part is zero. 5087 auto HiL = MIRBuilder.buildShl(HalfTy, InL, AmtExcess); // Hi from Lo part. 5088 5089 auto Lo = MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL); 5090 auto Hi = MIRBuilder.buildSelect( 5091 HalfTy, IsZero, InH, MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL)); 5092 5093 ResultRegs[0] = Lo.getReg(0); 5094 ResultRegs[1] = Hi.getReg(0); 5095 break; 5096 } 5097 case TargetOpcode::G_LSHR: 5098 case TargetOpcode::G_ASHR: { 5099 // Short: ShAmt < NewBitSize 5100 auto HiS = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy}, {InH, Amt}); 5101 5102 auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, Amt); 5103 auto HiOr = MIRBuilder.buildShl(HalfTy, InH, AmtLack); 5104 auto LoS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr); 5105 5106 // Long: ShAmt >= NewBitSize 5107 MachineInstrBuilder HiL; 5108 if (MI.getOpcode() == TargetOpcode::G_LSHR) { 5109 HiL = MIRBuilder.buildConstant(HalfTy, 0); // Hi part is zero. 5110 } else { 5111 auto ShiftAmt = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize - 1); 5112 HiL = MIRBuilder.buildAShr(HalfTy, InH, ShiftAmt); // Sign of Hi part. 5113 } 5114 auto LoL = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy}, 5115 {InH, AmtExcess}); // Lo from Hi part. 
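// Select the short or long expansion; when the shift amount is zero, forward
// InL unchanged, since the InH << AmtLack term in the short form would
// otherwise be a shift by the full narrow width.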
5116 5117 auto Lo = MIRBuilder.buildSelect( 5118 HalfTy, IsZero, InL, MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL)); 5119 5120 auto Hi = MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL); 5121 5122 ResultRegs[0] = Lo.getReg(0); 5123 ResultRegs[1] = Hi.getReg(0); 5124 break; 5125 } 5126 default: 5127 llvm_unreachable("not a shift"); 5128 } 5129 5130 MIRBuilder.buildMergeLikeInstr(DstReg, ResultRegs); 5131 MI.eraseFromParent(); 5132 return Legalized; 5133 } 5134 5135 LegalizerHelper::LegalizeResult 5136 LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx, 5137 LLT MoreTy) { 5138 assert(TypeIdx == 0 && "Expecting only Idx 0"); 5139 5140 Observer.changingInstr(MI); 5141 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 5142 MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB(); 5143 MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator()); 5144 moreElementsVectorSrc(MI, MoreTy, I); 5145 } 5146 5147 MachineBasicBlock &MBB = *MI.getParent(); 5148 MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI()); 5149 moreElementsVectorDst(MI, MoreTy, 0); 5150 Observer.changedInstr(MI); 5151 return Legalized; 5152 } 5153 5154 LegalizerHelper::LegalizeResult 5155 LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx, 5156 LLT MoreTy) { 5157 unsigned Opc = MI.getOpcode(); 5158 switch (Opc) { 5159 case TargetOpcode::G_IMPLICIT_DEF: 5160 case TargetOpcode::G_LOAD: { 5161 if (TypeIdx != 0) 5162 return UnableToLegalize; 5163 Observer.changingInstr(MI); 5164 moreElementsVectorDst(MI, MoreTy, 0); 5165 Observer.changedInstr(MI); 5166 return Legalized; 5167 } 5168 case TargetOpcode::G_STORE: 5169 if (TypeIdx != 0) 5170 return UnableToLegalize; 5171 Observer.changingInstr(MI); 5172 moreElementsVectorSrc(MI, MoreTy, 0); 5173 Observer.changedInstr(MI); 5174 return Legalized; 5175 case TargetOpcode::G_AND: 5176 case TargetOpcode::G_OR: 5177 case TargetOpcode::G_XOR: 5178 case TargetOpcode::G_ADD: 5179 case TargetOpcode::G_SUB: 5180 case TargetOpcode::G_MUL: 5181 case TargetOpcode::G_FADD: 5182 case TargetOpcode::G_FSUB: 5183 case TargetOpcode::G_FMUL: 5184 case TargetOpcode::G_FDIV: 5185 case TargetOpcode::G_UADDSAT: 5186 case TargetOpcode::G_USUBSAT: 5187 case TargetOpcode::G_SADDSAT: 5188 case TargetOpcode::G_SSUBSAT: 5189 case TargetOpcode::G_SMIN: 5190 case TargetOpcode::G_SMAX: 5191 case TargetOpcode::G_UMIN: 5192 case TargetOpcode::G_UMAX: 5193 case TargetOpcode::G_FMINNUM: 5194 case TargetOpcode::G_FMAXNUM: 5195 case TargetOpcode::G_FMINNUM_IEEE: 5196 case TargetOpcode::G_FMAXNUM_IEEE: 5197 case TargetOpcode::G_FMINIMUM: 5198 case TargetOpcode::G_FMAXIMUM: 5199 case TargetOpcode::G_STRICT_FADD: 5200 case TargetOpcode::G_STRICT_FSUB: 5201 case TargetOpcode::G_STRICT_FMUL: 5202 case TargetOpcode::G_SHL: 5203 case TargetOpcode::G_ASHR: 5204 case TargetOpcode::G_LSHR: { 5205 Observer.changingInstr(MI); 5206 moreElementsVectorSrc(MI, MoreTy, 1); 5207 moreElementsVectorSrc(MI, MoreTy, 2); 5208 moreElementsVectorDst(MI, MoreTy, 0); 5209 Observer.changedInstr(MI); 5210 return Legalized; 5211 } 5212 case TargetOpcode::G_FMA: 5213 case TargetOpcode::G_STRICT_FMA: 5214 case TargetOpcode::G_FSHR: 5215 case TargetOpcode::G_FSHL: { 5216 Observer.changingInstr(MI); 5217 moreElementsVectorSrc(MI, MoreTy, 1); 5218 moreElementsVectorSrc(MI, MoreTy, 2); 5219 moreElementsVectorSrc(MI, MoreTy, 3); 5220 moreElementsVectorDst(MI, MoreTy, 0); 5221 Observer.changedInstr(MI); 5222 return Legalized; 5223 } 5224 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 5225 case 
TargetOpcode::G_EXTRACT: 5226 if (TypeIdx != 1) 5227 return UnableToLegalize; 5228 Observer.changingInstr(MI); 5229 moreElementsVectorSrc(MI, MoreTy, 1); 5230 Observer.changedInstr(MI); 5231 return Legalized; 5232 case TargetOpcode::G_INSERT: 5233 case TargetOpcode::G_INSERT_VECTOR_ELT: 5234 case TargetOpcode::G_FREEZE: 5235 case TargetOpcode::G_FNEG: 5236 case TargetOpcode::G_FABS: 5237 case TargetOpcode::G_FSQRT: 5238 case TargetOpcode::G_FCEIL: 5239 case TargetOpcode::G_FFLOOR: 5240 case TargetOpcode::G_FNEARBYINT: 5241 case TargetOpcode::G_FRINT: 5242 case TargetOpcode::G_INTRINSIC_ROUND: 5243 case TargetOpcode::G_INTRINSIC_ROUNDEVEN: 5244 case TargetOpcode::G_INTRINSIC_TRUNC: 5245 case TargetOpcode::G_BSWAP: 5246 case TargetOpcode::G_FCANONICALIZE: 5247 case TargetOpcode::G_SEXT_INREG: 5248 if (TypeIdx != 0) 5249 return UnableToLegalize; 5250 Observer.changingInstr(MI); 5251 moreElementsVectorSrc(MI, MoreTy, 1); 5252 moreElementsVectorDst(MI, MoreTy, 0); 5253 Observer.changedInstr(MI); 5254 return Legalized; 5255 case TargetOpcode::G_SELECT: { 5256 auto [DstReg, DstTy, CondReg, CondTy] = MI.getFirst2RegLLTs(); 5257 if (TypeIdx == 1) { 5258 if (!CondTy.isScalar() || 5259 DstTy.getElementCount() != MoreTy.getElementCount()) 5260 return UnableToLegalize; 5261 5262 // This is turning a scalar select of vectors into a vector 5263 // select. Broadcast the select condition. 5264 auto ShufSplat = MIRBuilder.buildShuffleSplat(MoreTy, CondReg); 5265 Observer.changingInstr(MI); 5266 MI.getOperand(1).setReg(ShufSplat.getReg(0)); 5267 Observer.changedInstr(MI); 5268 return Legalized; 5269 } 5270 5271 if (CondTy.isVector()) 5272 return UnableToLegalize; 5273 5274 Observer.changingInstr(MI); 5275 moreElementsVectorSrc(MI, MoreTy, 2); 5276 moreElementsVectorSrc(MI, MoreTy, 3); 5277 moreElementsVectorDst(MI, MoreTy, 0); 5278 Observer.changedInstr(MI); 5279 return Legalized; 5280 } 5281 case TargetOpcode::G_UNMERGE_VALUES: 5282 return UnableToLegalize; 5283 case TargetOpcode::G_PHI: 5284 return moreElementsVectorPhi(MI, TypeIdx, MoreTy); 5285 case TargetOpcode::G_SHUFFLE_VECTOR: 5286 return moreElementsVectorShuffle(MI, TypeIdx, MoreTy); 5287 case TargetOpcode::G_BUILD_VECTOR: { 5288 SmallVector<SrcOp, 8> Elts; 5289 for (auto Op : MI.uses()) { 5290 Elts.push_back(Op.getReg()); 5291 } 5292 5293 for (unsigned i = Elts.size(); i < MoreTy.getNumElements(); ++i) { 5294 Elts.push_back(MIRBuilder.buildUndef(MoreTy.getScalarType())); 5295 } 5296 5297 MIRBuilder.buildDeleteTrailingVectorElements( 5298 MI.getOperand(0).getReg(), MIRBuilder.buildInstr(Opc, {MoreTy}, Elts)); 5299 MI.eraseFromParent(); 5300 return Legalized; 5301 } 5302 case TargetOpcode::G_TRUNC: 5303 case TargetOpcode::G_FPTRUNC: 5304 case TargetOpcode::G_FPEXT: 5305 case TargetOpcode::G_FPTOSI: 5306 case TargetOpcode::G_FPTOUI: 5307 case TargetOpcode::G_SITOFP: 5308 case TargetOpcode::G_UITOFP: { 5309 if (TypeIdx != 0) 5310 return UnableToLegalize; 5311 Observer.changingInstr(MI); 5312 LLT SrcTy = LLT::fixed_vector( 5313 MoreTy.getNumElements(), 5314 MRI.getType(MI.getOperand(1).getReg()).getElementType()); 5315 moreElementsVectorSrc(MI, SrcTy, 1); 5316 moreElementsVectorDst(MI, MoreTy, 0); 5317 Observer.changedInstr(MI); 5318 return Legalized; 5319 } 5320 case TargetOpcode::G_ICMP: { 5321 // TODO: the symmetric MoreTy works for targets like, e.g. NEON. 5322 // For targets, like e.g. MVE, the result is a predicated vector (i1). 5323 // This will need some refactoring. 
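// For now both compare operands and the result are widened to the same MoreTy.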
5324 Observer.changingInstr(MI); 5325 moreElementsVectorSrc(MI, MoreTy, 2); 5326 moreElementsVectorSrc(MI, MoreTy, 3); 5327 moreElementsVectorDst(MI, MoreTy, 0); 5328 Observer.changedInstr(MI); 5329 return Legalized; 5330 } 5331 default: 5332 return UnableToLegalize; 5333 } 5334 } 5335 5336 LegalizerHelper::LegalizeResult 5337 LegalizerHelper::equalizeVectorShuffleLengths(MachineInstr &MI) { 5338 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); 5339 ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask(); 5340 unsigned MaskNumElts = Mask.size(); 5341 unsigned SrcNumElts = SrcTy.getNumElements(); 5342 LLT DestEltTy = DstTy.getElementType(); 5343 5344 if (MaskNumElts == SrcNumElts) 5345 return Legalized; 5346 5347 if (MaskNumElts < SrcNumElts) { 5348 // Extend mask to match new destination vector size with 5349 // undef values. 5350 SmallVector<int, 16> NewMask(Mask); 5351 for (unsigned I = MaskNumElts; I < SrcNumElts; ++I) 5352 NewMask.push_back(-1); 5353 5354 moreElementsVectorDst(MI, SrcTy, 0); 5355 MIRBuilder.setInstrAndDebugLoc(MI); 5356 MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(), 5357 MI.getOperand(1).getReg(), 5358 MI.getOperand(2).getReg(), NewMask); 5359 MI.eraseFromParent(); 5360 5361 return Legalized; 5362 } 5363 5364 unsigned PaddedMaskNumElts = alignTo(MaskNumElts, SrcNumElts); 5365 unsigned NumConcat = PaddedMaskNumElts / SrcNumElts; 5366 LLT PaddedTy = LLT::fixed_vector(PaddedMaskNumElts, DestEltTy); 5367 5368 // Create new source vectors by concatenating the initial 5369 // source vectors with undefined vectors of the same size. 5370 auto Undef = MIRBuilder.buildUndef(SrcTy); 5371 SmallVector<Register, 8> MOps1(NumConcat, Undef.getReg(0)); 5372 SmallVector<Register, 8> MOps2(NumConcat, Undef.getReg(0)); 5373 MOps1[0] = MI.getOperand(1).getReg(); 5374 MOps2[0] = MI.getOperand(2).getReg(); 5375 5376 auto Src1 = MIRBuilder.buildConcatVectors(PaddedTy, MOps1); 5377 auto Src2 = MIRBuilder.buildConcatVectors(PaddedTy, MOps2); 5378 5379 // Readjust mask for new input vector length. 5380 SmallVector<int, 8> MappedOps(PaddedMaskNumElts, -1); 5381 for (unsigned I = 0; I != MaskNumElts; ++I) { 5382 int Idx = Mask[I]; 5383 if (Idx >= static_cast<int>(SrcNumElts)) 5384 Idx += PaddedMaskNumElts - SrcNumElts; 5385 MappedOps[I] = Idx; 5386 } 5387 5388 // If we got more elements than required, extract subvector. 
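// E.g. with a 6-element mask and <4 x s32> sources, the padded shuffle is
// built as <8 x s32>, so the first six lanes are extracted here to form the
// final result.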
5389 if (MaskNumElts != PaddedMaskNumElts) { 5390 auto Shuffle = 5391 MIRBuilder.buildShuffleVector(PaddedTy, Src1, Src2, MappedOps); 5392 5393 SmallVector<Register, 16> Elts(MaskNumElts); 5394 for (unsigned I = 0; I < MaskNumElts; ++I) { 5395 Elts[I] = 5396 MIRBuilder.buildExtractVectorElementConstant(DestEltTy, Shuffle, I) 5397 .getReg(0); 5398 } 5399 MIRBuilder.buildBuildVector(DstReg, Elts); 5400 } else { 5401 MIRBuilder.buildShuffleVector(DstReg, Src1, Src2, MappedOps); 5402 } 5403 5404 MI.eraseFromParent(); 5405 return LegalizerHelper::LegalizeResult::Legalized; 5406 } 5407 5408 LegalizerHelper::LegalizeResult 5409 LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI, 5410 unsigned int TypeIdx, LLT MoreTy) { 5411 auto [DstTy, Src1Ty, Src2Ty] = MI.getFirst3LLTs(); 5412 ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask(); 5413 unsigned NumElts = DstTy.getNumElements(); 5414 unsigned WidenNumElts = MoreTy.getNumElements(); 5415 5416 if (DstTy.isVector() && Src1Ty.isVector() && 5417 DstTy.getNumElements() != Src1Ty.getNumElements()) { 5418 return equalizeVectorShuffleLengths(MI); 5419 } 5420 5421 if (TypeIdx != 0) 5422 return UnableToLegalize; 5423 5424 // Expect a canonicalized shuffle. 5425 if (DstTy != Src1Ty || DstTy != Src2Ty) 5426 return UnableToLegalize; 5427 5428 moreElementsVectorSrc(MI, MoreTy, 1); 5429 moreElementsVectorSrc(MI, MoreTy, 2); 5430 5431 // Adjust mask based on new input vector length. 5432 SmallVector<int, 16> NewMask; 5433 for (unsigned I = 0; I != NumElts; ++I) { 5434 int Idx = Mask[I]; 5435 if (Idx < static_cast<int>(NumElts)) 5436 NewMask.push_back(Idx); 5437 else 5438 NewMask.push_back(Idx - NumElts + WidenNumElts); 5439 } 5440 for (unsigned I = NumElts; I != WidenNumElts; ++I) 5441 NewMask.push_back(-1); 5442 moreElementsVectorDst(MI, MoreTy, 0); 5443 MIRBuilder.setInstrAndDebugLoc(MI); 5444 MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(), 5445 MI.getOperand(1).getReg(), 5446 MI.getOperand(2).getReg(), NewMask); 5447 MI.eraseFromParent(); 5448 return Legalized; 5449 } 5450 5451 void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs, 5452 ArrayRef<Register> Src1Regs, 5453 ArrayRef<Register> Src2Regs, 5454 LLT NarrowTy) { 5455 MachineIRBuilder &B = MIRBuilder; 5456 unsigned SrcParts = Src1Regs.size(); 5457 unsigned DstParts = DstRegs.size(); 5458 5459 unsigned DstIdx = 0; // Low bits of the result. 5460 Register FactorSum = 5461 B.buildMul(NarrowTy, Src1Regs[DstIdx], Src2Regs[DstIdx]).getReg(0); 5462 DstRegs[DstIdx] = FactorSum; 5463 5464 unsigned CarrySumPrevDstIdx; 5465 SmallVector<Register, 4> Factors; 5466 5467 for (DstIdx = 1; DstIdx < DstParts; DstIdx++) { 5468 // Collect low parts of muls for DstIdx. 5469 for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1; 5470 i <= std::min(DstIdx, SrcParts - 1); ++i) { 5471 MachineInstrBuilder Mul = 5472 B.buildMul(NarrowTy, Src1Regs[DstIdx - i], Src2Regs[i]); 5473 Factors.push_back(Mul.getReg(0)); 5474 } 5475 // Collect high parts of muls from previous DstIdx. 5476 for (unsigned i = DstIdx < SrcParts ? 0 : DstIdx - SrcParts; 5477 i <= std::min(DstIdx - 1, SrcParts - 1); ++i) { 5478 MachineInstrBuilder Umulh = 5479 B.buildUMulH(NarrowTy, Src1Regs[DstIdx - 1 - i], Src2Regs[i]); 5480 Factors.push_back(Umulh.getReg(0)); 5481 } 5482 // Add CarrySum from additions calculated for previous DstIdx. 5483 if (DstIdx != 1) { 5484 Factors.push_back(CarrySumPrevDstIdx); 5485 } 5486 5487 Register CarrySum; 5488 // Add all factors and accumulate all carries into CarrySum. 
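// Each partial sum is formed with G_UADDO so the carry-out of every addition
// can be zero-extended and accumulated into CarrySum, which is then added
// into the factors of the next limb.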
5489 if (DstIdx != DstParts - 1) { 5490 MachineInstrBuilder Uaddo = 5491 B.buildUAddo(NarrowTy, LLT::scalar(1), Factors[0], Factors[1]); 5492 FactorSum = Uaddo.getReg(0); 5493 CarrySum = B.buildZExt(NarrowTy, Uaddo.getReg(1)).getReg(0); 5494 for (unsigned i = 2; i < Factors.size(); ++i) { 5495 MachineInstrBuilder Uaddo = 5496 B.buildUAddo(NarrowTy, LLT::scalar(1), FactorSum, Factors[i]); 5497 FactorSum = Uaddo.getReg(0); 5498 MachineInstrBuilder Carry = B.buildZExt(NarrowTy, Uaddo.getReg(1)); 5499 CarrySum = B.buildAdd(NarrowTy, CarrySum, Carry).getReg(0); 5500 } 5501 } else { 5502 // Since value for the next index is not calculated, neither is CarrySum. 5503 FactorSum = B.buildAdd(NarrowTy, Factors[0], Factors[1]).getReg(0); 5504 for (unsigned i = 2; i < Factors.size(); ++i) 5505 FactorSum = B.buildAdd(NarrowTy, FactorSum, Factors[i]).getReg(0); 5506 } 5507 5508 CarrySumPrevDstIdx = CarrySum; 5509 DstRegs[DstIdx] = FactorSum; 5510 Factors.clear(); 5511 } 5512 } 5513 5514 LegalizerHelper::LegalizeResult 5515 LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx, 5516 LLT NarrowTy) { 5517 if (TypeIdx != 0) 5518 return UnableToLegalize; 5519 5520 Register DstReg = MI.getOperand(0).getReg(); 5521 LLT DstType = MRI.getType(DstReg); 5522 // FIXME: add support for vector types 5523 if (DstType.isVector()) 5524 return UnableToLegalize; 5525 5526 unsigned Opcode = MI.getOpcode(); 5527 unsigned OpO, OpE, OpF; 5528 switch (Opcode) { 5529 case TargetOpcode::G_SADDO: 5530 case TargetOpcode::G_SADDE: 5531 case TargetOpcode::G_UADDO: 5532 case TargetOpcode::G_UADDE: 5533 case TargetOpcode::G_ADD: 5534 OpO = TargetOpcode::G_UADDO; 5535 OpE = TargetOpcode::G_UADDE; 5536 OpF = TargetOpcode::G_UADDE; 5537 if (Opcode == TargetOpcode::G_SADDO || Opcode == TargetOpcode::G_SADDE) 5538 OpF = TargetOpcode::G_SADDE; 5539 break; 5540 case TargetOpcode::G_SSUBO: 5541 case TargetOpcode::G_SSUBE: 5542 case TargetOpcode::G_USUBO: 5543 case TargetOpcode::G_USUBE: 5544 case TargetOpcode::G_SUB: 5545 OpO = TargetOpcode::G_USUBO; 5546 OpE = TargetOpcode::G_USUBE; 5547 OpF = TargetOpcode::G_USUBE; 5548 if (Opcode == TargetOpcode::G_SSUBO || Opcode == TargetOpcode::G_SSUBE) 5549 OpF = TargetOpcode::G_SSUBE; 5550 break; 5551 default: 5552 llvm_unreachable("Unexpected add/sub opcode!"); 5553 } 5554 5555 // 1 for a plain add/sub, 2 if this is an operation with a carry-out. 
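// E.g. a 64-bit G_ADD narrowed to 32-bit parts becomes, roughly:
//   %lo:_(s32), %c:_(s1) = G_UADDO %a_lo, %b_lo
//   %hi:_(s32), %c2:_(s1) = G_UADDE %a_hi, %b_hi, %c
// with the opcode of the final piece (OpF) chosen to preserve the signed or
// unsigned overflow semantics of the original instruction.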
5556 unsigned NumDefs = MI.getNumExplicitDefs(); 5557 Register Src1 = MI.getOperand(NumDefs).getReg(); 5558 Register Src2 = MI.getOperand(NumDefs + 1).getReg(); 5559 Register CarryDst, CarryIn; 5560 if (NumDefs == 2) 5561 CarryDst = MI.getOperand(1).getReg(); 5562 if (MI.getNumOperands() == NumDefs + 3) 5563 CarryIn = MI.getOperand(NumDefs + 2).getReg(); 5564 5565 LLT RegTy = MRI.getType(MI.getOperand(0).getReg()); 5566 LLT LeftoverTy, DummyTy; 5567 SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs; 5568 extractParts(Src1, RegTy, NarrowTy, LeftoverTy, Src1Regs, Src1Left, 5569 MIRBuilder, MRI); 5570 extractParts(Src2, RegTy, NarrowTy, DummyTy, Src2Regs, Src2Left, MIRBuilder, 5571 MRI); 5572 5573 int NarrowParts = Src1Regs.size(); 5574 for (int I = 0, E = Src1Left.size(); I != E; ++I) { 5575 Src1Regs.push_back(Src1Left[I]); 5576 Src2Regs.push_back(Src2Left[I]); 5577 } 5578 DstRegs.reserve(Src1Regs.size()); 5579 5580 for (int i = 0, e = Src1Regs.size(); i != e; ++i) { 5581 Register DstReg = 5582 MRI.createGenericVirtualRegister(MRI.getType(Src1Regs[i])); 5583 Register CarryOut = MRI.createGenericVirtualRegister(LLT::scalar(1)); 5584 // Forward the final carry-out to the destination register 5585 if (i == e - 1 && CarryDst) 5586 CarryOut = CarryDst; 5587 5588 if (!CarryIn) { 5589 MIRBuilder.buildInstr(OpO, {DstReg, CarryOut}, 5590 {Src1Regs[i], Src2Regs[i]}); 5591 } else if (i == e - 1) { 5592 MIRBuilder.buildInstr(OpF, {DstReg, CarryOut}, 5593 {Src1Regs[i], Src2Regs[i], CarryIn}); 5594 } else { 5595 MIRBuilder.buildInstr(OpE, {DstReg, CarryOut}, 5596 {Src1Regs[i], Src2Regs[i], CarryIn}); 5597 } 5598 5599 DstRegs.push_back(DstReg); 5600 CarryIn = CarryOut; 5601 } 5602 insertParts(MI.getOperand(0).getReg(), RegTy, NarrowTy, 5603 ArrayRef(DstRegs).take_front(NarrowParts), LeftoverTy, 5604 ArrayRef(DstRegs).drop_front(NarrowParts)); 5605 5606 MI.eraseFromParent(); 5607 return Legalized; 5608 } 5609 5610 LegalizerHelper::LegalizeResult 5611 LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) { 5612 auto [DstReg, Src1, Src2] = MI.getFirst3Regs(); 5613 5614 LLT Ty = MRI.getType(DstReg); 5615 if (Ty.isVector()) 5616 return UnableToLegalize; 5617 5618 unsigned Size = Ty.getSizeInBits(); 5619 unsigned NarrowSize = NarrowTy.getSizeInBits(); 5620 if (Size % NarrowSize != 0) 5621 return UnableToLegalize; 5622 5623 unsigned NumParts = Size / NarrowSize; 5624 bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH; 5625 unsigned DstTmpParts = NumParts * (IsMulHigh ? 2 : 1); 5626 5627 SmallVector<Register, 2> Src1Parts, Src2Parts; 5628 SmallVector<Register, 2> DstTmpRegs(DstTmpParts); 5629 extractParts(Src1, NarrowTy, NumParts, Src1Parts, MIRBuilder, MRI); 5630 extractParts(Src2, NarrowTy, NumParts, Src2Parts, MIRBuilder, MRI); 5631 multiplyRegisters(DstTmpRegs, Src1Parts, Src2Parts, NarrowTy); 5632 5633 // Take only high half of registers if this is high mul. 
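// For G_UMULH the temporary holds all 2 * NumParts limbs of the full product,
// so the result is just its upper NumParts limbs.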
5634 ArrayRef<Register> DstRegs(&DstTmpRegs[DstTmpParts - NumParts], NumParts); 5635 MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs); 5636 MI.eraseFromParent(); 5637 return Legalized; 5638 } 5639 5640 LegalizerHelper::LegalizeResult 5641 LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx, 5642 LLT NarrowTy) { 5643 if (TypeIdx != 0) 5644 return UnableToLegalize; 5645 5646 bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI; 5647 5648 Register Src = MI.getOperand(1).getReg(); 5649 LLT SrcTy = MRI.getType(Src); 5650 5651 // If all finite floats fit into the narrowed integer type, we can just swap 5652 // out the result type. This is practically only useful for conversions from 5653 // half to at least 16-bits, so just handle the one case. 5654 if (SrcTy.getScalarType() != LLT::scalar(16) || 5655 NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u)) 5656 return UnableToLegalize; 5657 5658 Observer.changingInstr(MI); 5659 narrowScalarDst(MI, NarrowTy, 0, 5660 IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT); 5661 Observer.changedInstr(MI); 5662 return Legalized; 5663 } 5664 5665 LegalizerHelper::LegalizeResult 5666 LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx, 5667 LLT NarrowTy) { 5668 if (TypeIdx != 1) 5669 return UnableToLegalize; 5670 5671 uint64_t NarrowSize = NarrowTy.getSizeInBits(); 5672 5673 int64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 5674 // FIXME: add support for when SizeOp1 isn't an exact multiple of 5675 // NarrowSize. 5676 if (SizeOp1 % NarrowSize != 0) 5677 return UnableToLegalize; 5678 int NumParts = SizeOp1 / NarrowSize; 5679 5680 SmallVector<Register, 2> SrcRegs, DstRegs; 5681 SmallVector<uint64_t, 2> Indexes; 5682 extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs, 5683 MIRBuilder, MRI); 5684 5685 Register OpReg = MI.getOperand(0).getReg(); 5686 uint64_t OpStart = MI.getOperand(2).getImm(); 5687 uint64_t OpSize = MRI.getType(OpReg).getSizeInBits(); 5688 for (int i = 0; i < NumParts; ++i) { 5689 unsigned SrcStart = i * NarrowSize; 5690 5691 if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) { 5692 // No part of the extract uses this subregister, ignore it. 5693 continue; 5694 } else if (SrcStart == OpStart && NarrowTy == MRI.getType(OpReg)) { 5695 // The entire subregister is extracted, forward the value. 5696 DstRegs.push_back(SrcRegs[i]); 5697 continue; 5698 } 5699 5700 // OpSegStart is where this destination segment would start in OpReg if it 5701 // extended infinitely in both directions. 5702 int64_t ExtractOffset; 5703 uint64_t SegSize; 5704 if (OpStart < SrcStart) { 5705 ExtractOffset = 0; 5706 SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart); 5707 } else { 5708 ExtractOffset = OpStart - SrcStart; 5709 SegSize = std::min(SrcStart + NarrowSize - OpStart, OpSize); 5710 } 5711 5712 Register SegReg = SrcRegs[i]; 5713 if (ExtractOffset != 0 || SegSize != NarrowSize) { 5714 // A genuine extract is needed. 
5715 SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize)); 5716 MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset); 5717 } 5718 5719 DstRegs.push_back(SegReg); 5720 } 5721 5722 Register DstReg = MI.getOperand(0).getReg(); 5723 if (MRI.getType(DstReg).isVector()) 5724 MIRBuilder.buildBuildVector(DstReg, DstRegs); 5725 else if (DstRegs.size() > 1) 5726 MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs); 5727 else 5728 MIRBuilder.buildCopy(DstReg, DstRegs[0]); 5729 MI.eraseFromParent(); 5730 return Legalized; 5731 } 5732 5733 LegalizerHelper::LegalizeResult 5734 LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx, 5735 LLT NarrowTy) { 5736 // FIXME: Don't know how to handle secondary types yet. 5737 if (TypeIdx != 0) 5738 return UnableToLegalize; 5739 5740 SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs; 5741 SmallVector<uint64_t, 2> Indexes; 5742 LLT RegTy = MRI.getType(MI.getOperand(0).getReg()); 5743 LLT LeftoverTy; 5744 extractParts(MI.getOperand(1).getReg(), RegTy, NarrowTy, LeftoverTy, SrcRegs, 5745 LeftoverRegs, MIRBuilder, MRI); 5746 5747 for (Register Reg : LeftoverRegs) 5748 SrcRegs.push_back(Reg); 5749 5750 uint64_t NarrowSize = NarrowTy.getSizeInBits(); 5751 Register OpReg = MI.getOperand(2).getReg(); 5752 uint64_t OpStart = MI.getOperand(3).getImm(); 5753 uint64_t OpSize = MRI.getType(OpReg).getSizeInBits(); 5754 for (int I = 0, E = SrcRegs.size(); I != E; ++I) { 5755 unsigned DstStart = I * NarrowSize; 5756 5757 if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) { 5758 // The entire subregister is defined by this insert, forward the new 5759 // value. 5760 DstRegs.push_back(OpReg); 5761 continue; 5762 } 5763 5764 Register SrcReg = SrcRegs[I]; 5765 if (MRI.getType(SrcRegs[I]) == LeftoverTy) { 5766 // The leftover reg is smaller than NarrowTy, so we need to extend it. 5767 SrcReg = MRI.createGenericVirtualRegister(NarrowTy); 5768 MIRBuilder.buildAnyExt(SrcReg, SrcRegs[I]); 5769 } 5770 5771 if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) { 5772 // No part of the insert affects this subregister, forward the original. 5773 DstRegs.push_back(SrcReg); 5774 continue; 5775 } 5776 5777 // OpSegStart is where this destination segment would start in OpReg if it 5778 // extended infinitely in both directions. 5779 int64_t ExtractOffset, InsertOffset; 5780 uint64_t SegSize; 5781 if (OpStart < DstStart) { 5782 InsertOffset = 0; 5783 ExtractOffset = DstStart - OpStart; 5784 SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart); 5785 } else { 5786 InsertOffset = OpStart - DstStart; 5787 ExtractOffset = 0; 5788 SegSize = 5789 std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart); 5790 } 5791 5792 Register SegReg = OpReg; 5793 if (ExtractOffset != 0 || SegSize != OpSize) { 5794 // A genuine extract is needed. 
5795 SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize)); 5796 MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset); 5797 } 5798 5799 Register DstReg = MRI.createGenericVirtualRegister(NarrowTy); 5800 MIRBuilder.buildInsert(DstReg, SrcReg, SegReg, InsertOffset); 5801 DstRegs.push_back(DstReg); 5802 } 5803 5804 uint64_t WideSize = DstRegs.size() * NarrowSize; 5805 Register DstReg = MI.getOperand(0).getReg(); 5806 if (WideSize > RegTy.getSizeInBits()) { 5807 Register MergeReg = MRI.createGenericVirtualRegister(LLT::scalar(WideSize)); 5808 MIRBuilder.buildMergeLikeInstr(MergeReg, DstRegs); 5809 MIRBuilder.buildTrunc(DstReg, MergeReg); 5810 } else 5811 MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs); 5812 5813 MI.eraseFromParent(); 5814 return Legalized; 5815 } 5816 5817 LegalizerHelper::LegalizeResult 5818 LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx, 5819 LLT NarrowTy) { 5820 Register DstReg = MI.getOperand(0).getReg(); 5821 LLT DstTy = MRI.getType(DstReg); 5822 5823 assert(MI.getNumOperands() == 3 && TypeIdx == 0); 5824 5825 SmallVector<Register, 4> DstRegs, DstLeftoverRegs; 5826 SmallVector<Register, 4> Src0Regs, Src0LeftoverRegs; 5827 SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs; 5828 LLT LeftoverTy; 5829 if (!extractParts(MI.getOperand(1).getReg(), DstTy, NarrowTy, LeftoverTy, 5830 Src0Regs, Src0LeftoverRegs, MIRBuilder, MRI)) 5831 return UnableToLegalize; 5832 5833 LLT Unused; 5834 if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, Unused, 5835 Src1Regs, Src1LeftoverRegs, MIRBuilder, MRI)) 5836 llvm_unreachable("inconsistent extractParts result"); 5837 5838 for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) { 5839 auto Inst = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy}, 5840 {Src0Regs[I], Src1Regs[I]}); 5841 DstRegs.push_back(Inst.getReg(0)); 5842 } 5843 5844 for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) { 5845 auto Inst = MIRBuilder.buildInstr( 5846 MI.getOpcode(), 5847 {LeftoverTy}, {Src0LeftoverRegs[I], Src1LeftoverRegs[I]}); 5848 DstLeftoverRegs.push_back(Inst.getReg(0)); 5849 } 5850 5851 insertParts(DstReg, DstTy, NarrowTy, DstRegs, 5852 LeftoverTy, DstLeftoverRegs); 5853 5854 MI.eraseFromParent(); 5855 return Legalized; 5856 } 5857 5858 LegalizerHelper::LegalizeResult 5859 LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx, 5860 LLT NarrowTy) { 5861 if (TypeIdx != 0) 5862 return UnableToLegalize; 5863 5864 auto [DstReg, SrcReg] = MI.getFirst2Regs(); 5865 5866 LLT DstTy = MRI.getType(DstReg); 5867 if (DstTy.isVector()) 5868 return UnableToLegalize; 5869 5870 SmallVector<Register, 8> Parts; 5871 LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg); 5872 LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, MI.getOpcode()); 5873 buildWidenedRemergeToDst(DstReg, LCMTy, Parts); 5874 5875 MI.eraseFromParent(); 5876 return Legalized; 5877 } 5878 5879 LegalizerHelper::LegalizeResult 5880 LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx, 5881 LLT NarrowTy) { 5882 if (TypeIdx != 0) 5883 return UnableToLegalize; 5884 5885 Register CondReg = MI.getOperand(1).getReg(); 5886 LLT CondTy = MRI.getType(CondReg); 5887 if (CondTy.isVector()) // TODO: Handle vselect 5888 return UnableToLegalize; 5889 5890 Register DstReg = MI.getOperand(0).getReg(); 5891 LLT DstTy = MRI.getType(DstReg); 5892 5893 SmallVector<Register, 4> DstRegs, DstLeftoverRegs; 5894 SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs; 5895 SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs; 5896 
LLT LeftoverTy; 5897 if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, LeftoverTy, 5898 Src1Regs, Src1LeftoverRegs, MIRBuilder, MRI)) 5899 return UnableToLegalize; 5900 5901 LLT Unused; 5902 if (!extractParts(MI.getOperand(3).getReg(), DstTy, NarrowTy, Unused, 5903 Src2Regs, Src2LeftoverRegs, MIRBuilder, MRI)) 5904 llvm_unreachable("inconsistent extractParts result"); 5905 5906 for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) { 5907 auto Select = MIRBuilder.buildSelect(NarrowTy, 5908 CondReg, Src1Regs[I], Src2Regs[I]); 5909 DstRegs.push_back(Select.getReg(0)); 5910 } 5911 5912 for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) { 5913 auto Select = MIRBuilder.buildSelect( 5914 LeftoverTy, CondReg, Src1LeftoverRegs[I], Src2LeftoverRegs[I]); 5915 DstLeftoverRegs.push_back(Select.getReg(0)); 5916 } 5917 5918 insertParts(DstReg, DstTy, NarrowTy, DstRegs, 5919 LeftoverTy, DstLeftoverRegs); 5920 5921 MI.eraseFromParent(); 5922 return Legalized; 5923 } 5924 5925 LegalizerHelper::LegalizeResult 5926 LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx, 5927 LLT NarrowTy) { 5928 if (TypeIdx != 1) 5929 return UnableToLegalize; 5930 5931 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); 5932 unsigned NarrowSize = NarrowTy.getSizeInBits(); 5933 5934 if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) { 5935 const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF; 5936 5937 MachineIRBuilder &B = MIRBuilder; 5938 auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg); 5939 // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi) 5940 auto C_0 = B.buildConstant(NarrowTy, 0); 5941 auto HiIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), 5942 UnmergeSrc.getReg(1), C_0); 5943 auto LoCTLZ = IsUndef ? 5944 B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)) : 5945 B.buildCTLZ(DstTy, UnmergeSrc.getReg(0)); 5946 auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize); 5947 auto HiIsZeroCTLZ = B.buildAdd(DstTy, LoCTLZ, C_NarrowSize); 5948 auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)); 5949 B.buildSelect(DstReg, HiIsZero, HiIsZeroCTLZ, HiCTLZ); 5950 5951 MI.eraseFromParent(); 5952 return Legalized; 5953 } 5954 5955 return UnableToLegalize; 5956 } 5957 5958 LegalizerHelper::LegalizeResult 5959 LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx, 5960 LLT NarrowTy) { 5961 if (TypeIdx != 1) 5962 return UnableToLegalize; 5963 5964 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); 5965 unsigned NarrowSize = NarrowTy.getSizeInBits(); 5966 5967 if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) { 5968 const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF; 5969 5970 MachineIRBuilder &B = MIRBuilder; 5971 auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg); 5972 // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo) 5973 auto C_0 = B.buildConstant(NarrowTy, 0); 5974 auto LoIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), 5975 UnmergeSrc.getReg(0), C_0); 5976 auto HiCTTZ = IsUndef ? 
5977 B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)) : 5978 B.buildCTTZ(DstTy, UnmergeSrc.getReg(1)); 5979 auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize); 5980 auto LoIsZeroCTTZ = B.buildAdd(DstTy, HiCTTZ, C_NarrowSize); 5981 auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)); 5982 B.buildSelect(DstReg, LoIsZero, LoIsZeroCTTZ, LoCTTZ); 5983 5984 MI.eraseFromParent(); 5985 return Legalized; 5986 } 5987 5988 return UnableToLegalize; 5989 } 5990 5991 LegalizerHelper::LegalizeResult 5992 LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx, 5993 LLT NarrowTy) { 5994 if (TypeIdx != 1) 5995 return UnableToLegalize; 5996 5997 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); 5998 unsigned NarrowSize = NarrowTy.getSizeInBits(); 5999 6000 if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) { 6001 auto UnmergeSrc = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1)); 6002 6003 auto LoCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(0)); 6004 auto HiCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(1)); 6005 MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP); 6006 6007 MI.eraseFromParent(); 6008 return Legalized; 6009 } 6010 6011 return UnableToLegalize; 6012 } 6013 6014 LegalizerHelper::LegalizeResult 6015 LegalizerHelper::narrowScalarFLDEXP(MachineInstr &MI, unsigned TypeIdx, 6016 LLT NarrowTy) { 6017 if (TypeIdx != 1) 6018 return UnableToLegalize; 6019 6020 MachineIRBuilder &B = MIRBuilder; 6021 Register ExpReg = MI.getOperand(2).getReg(); 6022 LLT ExpTy = MRI.getType(ExpReg); 6023 6024 unsigned ClampSize = NarrowTy.getScalarSizeInBits(); 6025 6026 // Clamp the exponent to the range of the target type. 6027 auto MinExp = B.buildConstant(ExpTy, minIntN(ClampSize)); 6028 auto ClampMin = B.buildSMax(ExpTy, ExpReg, MinExp); 6029 auto MaxExp = B.buildConstant(ExpTy, maxIntN(ClampSize)); 6030 auto Clamp = B.buildSMin(ExpTy, ClampMin, MaxExp); 6031 6032 auto Trunc = B.buildTrunc(NarrowTy, Clamp); 6033 Observer.changingInstr(MI); 6034 MI.getOperand(2).setReg(Trunc.getReg(0)); 6035 Observer.changedInstr(MI); 6036 return Legalized; 6037 } 6038 6039 LegalizerHelper::LegalizeResult 6040 LegalizerHelper::lowerBitCount(MachineInstr &MI) { 6041 unsigned Opc = MI.getOpcode(); 6042 const auto &TII = MIRBuilder.getTII(); 6043 auto isSupported = [this](const LegalityQuery &Q) { 6044 auto QAction = LI.getAction(Q).Action; 6045 return QAction == Legal || QAction == Libcall || QAction == Custom; 6046 }; 6047 switch (Opc) { 6048 default: 6049 return UnableToLegalize; 6050 case TargetOpcode::G_CTLZ_ZERO_UNDEF: { 6051 // This trivially expands to CTLZ. 6052 Observer.changingInstr(MI); 6053 MI.setDesc(TII.get(TargetOpcode::G_CTLZ)); 6054 Observer.changedInstr(MI); 6055 return Legalized; 6056 } 6057 case TargetOpcode::G_CTLZ: { 6058 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); 6059 unsigned Len = SrcTy.getSizeInBits(); 6060 6061 if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) { 6062 // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero. 
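// i.e. ctlz(x) == (x == 0) ? SrcBitWidth : ctlz_zero_undef(x)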
6063 auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(DstTy, SrcReg); 6064 auto ZeroSrc = MIRBuilder.buildConstant(SrcTy, 0); 6065 auto ICmp = MIRBuilder.buildICmp( 6066 CmpInst::ICMP_EQ, SrcTy.changeElementSize(1), SrcReg, ZeroSrc); 6067 auto LenConst = MIRBuilder.buildConstant(DstTy, Len); 6068 MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CtlzZU); 6069 MI.eraseFromParent(); 6070 return Legalized; 6071 } 6072 // for now, we do this: 6073 // NewLen = NextPowerOf2(Len); 6074 // x = x | (x >> 1); 6075 // x = x | (x >> 2); 6076 // ... 6077 // x = x | (x >>16); 6078 // x = x | (x >>32); // for 64-bit input 6079 // Upto NewLen/2 6080 // return Len - popcount(x); 6081 // 6082 // Ref: "Hacker's Delight" by Henry Warren 6083 Register Op = SrcReg; 6084 unsigned NewLen = PowerOf2Ceil(Len); 6085 for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) { 6086 auto MIBShiftAmt = MIRBuilder.buildConstant(SrcTy, 1ULL << i); 6087 auto MIBOp = MIRBuilder.buildOr( 6088 SrcTy, Op, MIRBuilder.buildLShr(SrcTy, Op, MIBShiftAmt)); 6089 Op = MIBOp.getReg(0); 6090 } 6091 auto MIBPop = MIRBuilder.buildCTPOP(DstTy, Op); 6092 MIRBuilder.buildSub(MI.getOperand(0), MIRBuilder.buildConstant(DstTy, Len), 6093 MIBPop); 6094 MI.eraseFromParent(); 6095 return Legalized; 6096 } 6097 case TargetOpcode::G_CTTZ_ZERO_UNDEF: { 6098 // This trivially expands to CTTZ. 6099 Observer.changingInstr(MI); 6100 MI.setDesc(TII.get(TargetOpcode::G_CTTZ)); 6101 Observer.changedInstr(MI); 6102 return Legalized; 6103 } 6104 case TargetOpcode::G_CTTZ: { 6105 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); 6106 6107 unsigned Len = SrcTy.getSizeInBits(); 6108 if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) { 6109 // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select with 6110 // zero. 6111 auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(DstTy, SrcReg); 6112 auto Zero = MIRBuilder.buildConstant(SrcTy, 0); 6113 auto ICmp = MIRBuilder.buildICmp( 6114 CmpInst::ICMP_EQ, DstTy.changeElementSize(1), SrcReg, Zero); 6115 auto LenConst = MIRBuilder.buildConstant(DstTy, Len); 6116 MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CttzZU); 6117 MI.eraseFromParent(); 6118 return Legalized; 6119 } 6120 // for now, we use: { return popcount(~x & (x - 1)); } 6121 // unless the target has ctlz but not ctpop, in which case we use: 6122 // { return 32 - nlz(~x & (x-1)); } 6123 // Ref: "Hacker's Delight" by Henry Warren 6124 auto MIBCstNeg1 = MIRBuilder.buildConstant(SrcTy, -1); 6125 auto MIBNot = MIRBuilder.buildXor(SrcTy, SrcReg, MIBCstNeg1); 6126 auto MIBTmp = MIRBuilder.buildAnd( 6127 SrcTy, MIBNot, MIRBuilder.buildAdd(SrcTy, SrcReg, MIBCstNeg1)); 6128 if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) && 6129 isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) { 6130 auto MIBCstLen = MIRBuilder.buildConstant(SrcTy, Len); 6131 MIRBuilder.buildSub(MI.getOperand(0), MIBCstLen, 6132 MIRBuilder.buildCTLZ(SrcTy, MIBTmp)); 6133 MI.eraseFromParent(); 6134 return Legalized; 6135 } 6136 Observer.changingInstr(MI); 6137 MI.setDesc(TII.get(TargetOpcode::G_CTPOP)); 6138 MI.getOperand(1).setReg(MIBTmp.getReg(0)); 6139 Observer.changedInstr(MI); 6140 return Legalized; 6141 } 6142 case TargetOpcode::G_CTPOP: { 6143 Register SrcReg = MI.getOperand(1).getReg(); 6144 LLT Ty = MRI.getType(SrcReg); 6145 unsigned Size = Ty.getSizeInBits(); 6146 MachineIRBuilder &B = MIRBuilder; 6147 6148 // Count set bits in blocks of 2 bits. 
Default approach would be 6149 // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 } 6150 // We use following formula instead: 6151 // B2Count = val - { (val >> 1) & 0x55555555 } 6152 // since it gives same result in blocks of 2 with one instruction less. 6153 auto C_1 = B.buildConstant(Ty, 1); 6154 auto B2Set1LoTo1Hi = B.buildLShr(Ty, SrcReg, C_1); 6155 APInt B2Mask1HiTo0 = APInt::getSplat(Size, APInt(8, 0x55)); 6156 auto C_B2Mask1HiTo0 = B.buildConstant(Ty, B2Mask1HiTo0); 6157 auto B2Count1Hi = B.buildAnd(Ty, B2Set1LoTo1Hi, C_B2Mask1HiTo0); 6158 auto B2Count = B.buildSub(Ty, SrcReg, B2Count1Hi); 6159 6160 // In order to get count in blocks of 4 add values from adjacent block of 2. 6161 // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 } 6162 auto C_2 = B.buildConstant(Ty, 2); 6163 auto B4Set2LoTo2Hi = B.buildLShr(Ty, B2Count, C_2); 6164 APInt B4Mask2HiTo0 = APInt::getSplat(Size, APInt(8, 0x33)); 6165 auto C_B4Mask2HiTo0 = B.buildConstant(Ty, B4Mask2HiTo0); 6166 auto B4HiB2Count = B.buildAnd(Ty, B4Set2LoTo2Hi, C_B4Mask2HiTo0); 6167 auto B4LoB2Count = B.buildAnd(Ty, B2Count, C_B4Mask2HiTo0); 6168 auto B4Count = B.buildAdd(Ty, B4HiB2Count, B4LoB2Count); 6169 6170 // For count in blocks of 8 bits we don't have to mask high 4 bits before 6171 // addition since count value sits in range {0,...,8} and 4 bits are enough 6172 // to hold such binary values. After addition high 4 bits still hold count 6173 // of set bits in high 4 bit block, set them to zero and get 8 bit result. 6174 // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F 6175 auto C_4 = B.buildConstant(Ty, 4); 6176 auto B8HiB4Count = B.buildLShr(Ty, B4Count, C_4); 6177 auto B8CountDirty4Hi = B.buildAdd(Ty, B8HiB4Count, B4Count); 6178 APInt B8Mask4HiTo0 = APInt::getSplat(Size, APInt(8, 0x0F)); 6179 auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0); 6180 auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0); 6181 6182 assert(Size<=128 && "Scalar size is too large for CTPOP lower algorithm"); 6183 // 8 bits can hold CTPOP result of 128 bit int or smaller. Mul with this 6184 // bitmask will set 8 msb in ResTmp to sum of all B8Counts in 8 bit blocks. 6185 auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01))); 6186 auto ResTmp = B.buildMul(Ty, B8Count, MulMask); 6187 6188 // Shift count result from 8 high bits to low bits. 6189 auto C_SizeM8 = B.buildConstant(Ty, Size - 8); 6190 B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8); 6191 6192 MI.eraseFromParent(); 6193 return Legalized; 6194 } 6195 } 6196 } 6197 6198 // Check that (every element of) Reg is undef or not an exact multiple of BW. 6199 static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI, 6200 Register Reg, unsigned BW) { 6201 return matchUnaryPredicate( 6202 MRI, Reg, 6203 [=](const Constant *C) { 6204 // Null constant here means an undef. 6205 const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(C); 6206 return !CI || CI->getValue().urem(BW) != 0; 6207 }, 6208 /*AllowUndefs*/ true); 6209 } 6210 6211 LegalizerHelper::LegalizeResult 6212 LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) { 6213 auto [Dst, X, Y, Z] = MI.getFirst4Regs(); 6214 LLT Ty = MRI.getType(Dst); 6215 LLT ShTy = MRI.getType(Z); 6216 6217 unsigned BW = Ty.getScalarSizeInBits(); 6218 6219 if (!isPowerOf2_32(BW)) 6220 return UnableToLegalize; 6221 6222 const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL; 6223 unsigned RevOpcode = IsFSHL ? 
TargetOpcode::G_FSHR : TargetOpcode::G_FSHL; 6224 6225 if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) { 6226 // fshl X, Y, Z -> fshr X, Y, -Z 6227 // fshr X, Y, Z -> fshl X, Y, -Z 6228 auto Zero = MIRBuilder.buildConstant(ShTy, 0); 6229 Z = MIRBuilder.buildSub(Ty, Zero, Z).getReg(0); 6230 } else { 6231 // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z 6232 // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z 6233 auto One = MIRBuilder.buildConstant(ShTy, 1); 6234 if (IsFSHL) { 6235 Y = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0); 6236 X = MIRBuilder.buildLShr(Ty, X, One).getReg(0); 6237 } else { 6238 X = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0); 6239 Y = MIRBuilder.buildShl(Ty, Y, One).getReg(0); 6240 } 6241 6242 Z = MIRBuilder.buildNot(ShTy, Z).getReg(0); 6243 } 6244 6245 MIRBuilder.buildInstr(RevOpcode, {Dst}, {X, Y, Z}); 6246 MI.eraseFromParent(); 6247 return Legalized; 6248 } 6249 6250 LegalizerHelper::LegalizeResult 6251 LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) { 6252 auto [Dst, X, Y, Z] = MI.getFirst4Regs(); 6253 LLT Ty = MRI.getType(Dst); 6254 LLT ShTy = MRI.getType(Z); 6255 6256 const unsigned BW = Ty.getScalarSizeInBits(); 6257 const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL; 6258 6259 Register ShX, ShY; 6260 Register ShAmt, InvShAmt; 6261 6262 // FIXME: Emit optimized urem by constant instead of letting it expand later. 6263 if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) { 6264 // fshl: X << C | Y >> (BW - C) 6265 // fshr: X << (BW - C) | Y >> C 6266 // where C = Z % BW is not zero 6267 auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW); 6268 ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0); 6269 InvShAmt = MIRBuilder.buildSub(ShTy, BitWidthC, ShAmt).getReg(0); 6270 ShX = MIRBuilder.buildShl(Ty, X, IsFSHL ? ShAmt : InvShAmt).getReg(0); 6271 ShY = MIRBuilder.buildLShr(Ty, Y, IsFSHL ? 
InvShAmt : ShAmt).getReg(0); 6272 } else { 6273 // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW)) 6274 // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW) 6275 auto Mask = MIRBuilder.buildConstant(ShTy, BW - 1); 6276 if (isPowerOf2_32(BW)) { 6277 // Z % BW -> Z & (BW - 1) 6278 ShAmt = MIRBuilder.buildAnd(ShTy, Z, Mask).getReg(0); 6279 // (BW - 1) - (Z % BW) -> ~Z & (BW - 1) 6280 auto NotZ = MIRBuilder.buildNot(ShTy, Z); 6281 InvShAmt = MIRBuilder.buildAnd(ShTy, NotZ, Mask).getReg(0); 6282 } else { 6283 auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW); 6284 ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0); 6285 InvShAmt = MIRBuilder.buildSub(ShTy, Mask, ShAmt).getReg(0); 6286 } 6287 6288 auto One = MIRBuilder.buildConstant(ShTy, 1); 6289 if (IsFSHL) { 6290 ShX = MIRBuilder.buildShl(Ty, X, ShAmt).getReg(0); 6291 auto ShY1 = MIRBuilder.buildLShr(Ty, Y, One); 6292 ShY = MIRBuilder.buildLShr(Ty, ShY1, InvShAmt).getReg(0); 6293 } else { 6294 auto ShX1 = MIRBuilder.buildShl(Ty, X, One); 6295 ShX = MIRBuilder.buildShl(Ty, ShX1, InvShAmt).getReg(0); 6296 ShY = MIRBuilder.buildLShr(Ty, Y, ShAmt).getReg(0); 6297 } 6298 } 6299 6300 MIRBuilder.buildOr(Dst, ShX, ShY); 6301 MI.eraseFromParent(); 6302 return Legalized; 6303 } 6304 6305 LegalizerHelper::LegalizeResult 6306 LegalizerHelper::lowerFunnelShift(MachineInstr &MI) { 6307 // These operations approximately do the following (while avoiding undefined 6308 // shifts by BW): 6309 // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW))) 6310 // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW)) 6311 Register Dst = MI.getOperand(0).getReg(); 6312 LLT Ty = MRI.getType(Dst); 6313 LLT ShTy = MRI.getType(MI.getOperand(3).getReg()); 6314 6315 bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL; 6316 unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL; 6317 6318 // TODO: Use smarter heuristic that accounts for vector legalization. 6319 if (LI.getAction({RevOpcode, {Ty, ShTy}}).Action == Lower) 6320 return lowerFunnelShiftAsShifts(MI); 6321 6322 // This only works for powers of 2, fallback to shifts if it fails. 
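// Illustrative example (for exposition only, not part of the lowering): with
// BW = 8 and a shift amount known to be non-zero mod BW, e.g. Z = 3,
//   fshl(X, Y, 3) = (X << 3) | (Y >> 5) = fshr(X, Y, 8 - 3),
// so rewriting to the reverse opcode only requires negating the shift amount.
// When Z might be a multiple of BW, a plain negation would be wrong for
// Z == 0, so the inverse lowering instead pre-shifts one operand by 1 and
// uses ~Z.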
6323 LegalizerHelper::LegalizeResult Result = lowerFunnelShiftWithInverse(MI);
6324 if (Result == UnableToLegalize)
6325 return lowerFunnelShiftAsShifts(MI);
6326 return Result;
6327 }
6328
6329 LegalizerHelper::LegalizeResult LegalizerHelper::lowerEXT(MachineInstr &MI) {
6330 auto [Dst, Src] = MI.getFirst2Regs();
6331 LLT DstTy = MRI.getType(Dst);
6332 LLT SrcTy = MRI.getType(Src);
6333
6334 uint32_t DstTySize = DstTy.getSizeInBits();
6335 uint32_t DstTyScalarSize = DstTy.getScalarSizeInBits();
6336 uint32_t SrcTyScalarSize = SrcTy.getScalarSizeInBits();
6337
6338 if (!isPowerOf2_32(DstTySize) || !isPowerOf2_32(DstTyScalarSize) ||
6339 !isPowerOf2_32(SrcTyScalarSize))
6340 return UnableToLegalize;
6341
6342 // The step between the extends is too large; split it by creating an
6343 // intermediate extend instruction.
6344 if (SrcTyScalarSize * 2 < DstTyScalarSize) {
6345 LLT MidTy = SrcTy.changeElementSize(SrcTyScalarSize * 2);
6346 // If the destination type is illegal, split it into multiple statements
6347 // zext x -> zext(merge(zext(unmerge), zext(unmerge)))
6348 auto NewExt = MIRBuilder.buildInstr(MI.getOpcode(), {MidTy}, {Src});
6349 // Unmerge the vector
6350 LLT EltTy = MidTy.changeElementCount(
6351 MidTy.getElementCount().divideCoefficientBy(2));
6352 auto UnmergeSrc = MIRBuilder.buildUnmerge(EltTy, NewExt);
6353
6354 // ZExt the vectors
6355 LLT ZExtResTy = DstTy.changeElementCount(
6356 DstTy.getElementCount().divideCoefficientBy(2));
6357 auto ZExtRes1 = MIRBuilder.buildInstr(MI.getOpcode(), {ZExtResTy},
6358 {UnmergeSrc.getReg(0)});
6359 auto ZExtRes2 = MIRBuilder.buildInstr(MI.getOpcode(), {ZExtResTy},
6360 {UnmergeSrc.getReg(1)});
6361
6362 // Merge the resulting vectors
6363 MIRBuilder.buildMergeLikeInstr(Dst, {ZExtRes1, ZExtRes2});
6364
6365 MI.eraseFromParent();
6366 return Legalized;
6367 }
6368 return UnableToLegalize;
6369 }
6370
6371 LegalizerHelper::LegalizeResult LegalizerHelper::lowerTRUNC(MachineInstr &MI) {
6372 // MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
6373 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
6374 // Similar to how operand splitting is done in SelectionDAG, we can handle
6375 // %res(v8s8) = G_TRUNC %in(v8s32) by generating:
6376 // %inlo(<4 x s32>), %inhi(<4 x s32>) = G_UNMERGE %in(<8 x s32>)
6377 // %lo16(<4 x s16>) = G_TRUNC %inlo
6378 // %hi16(<4 x s16>) = G_TRUNC %inhi
6379 // %in16(<8 x s16>) = G_CONCAT_VECTORS %lo16, %hi16
6380 // %res(<8 x s8>) = G_TRUNC %in16
6381
6382 assert(MI.getOpcode() == TargetOpcode::G_TRUNC);
6383
6384 Register DstReg = MI.getOperand(0).getReg();
6385 Register SrcReg = MI.getOperand(1).getReg();
6386 LLT DstTy = MRI.getType(DstReg);
6387 LLT SrcTy = MRI.getType(SrcReg);
6388
6389 if (DstTy.isVector() && isPowerOf2_32(DstTy.getNumElements()) &&
6390 isPowerOf2_32(DstTy.getScalarSizeInBits()) &&
6391 isPowerOf2_32(SrcTy.getNumElements()) &&
6392 isPowerOf2_32(SrcTy.getScalarSizeInBits())) {
6393 // Split input type.
6394 LLT SplitSrcTy = SrcTy.changeElementCount(
6395 SrcTy.getElementCount().divideCoefficientBy(2));
6396
6397 // First, split the source into two smaller vectors.
6398 SmallVector<Register, 2> SplitSrcs;
6399 extractParts(SrcReg, SplitSrcTy, 2, SplitSrcs, MIRBuilder, MRI);
6400
6401 // Truncate the splits into intermediate narrower elements.
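// Illustrative note (assumed example types): for <8 x s8> from <8 x s32> each
// <4 x s32> half is first truncated to <4 x s16> (since 8 * 2 < 32), the
// halves are concatenated to <8 x s16>, and a final G_TRUNC produces the
// <8 x s8> result. For a single-step case such as <8 x s16> from <8 x s32>,
// the intermediate type already matches the destination and the final step
// degenerates to a COPY.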
6402 LLT InterTy; 6403 if (DstTy.getScalarSizeInBits() * 2 < SrcTy.getScalarSizeInBits()) 6404 InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits() * 2); 6405 else 6406 InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits()); 6407 for (unsigned I = 0; I < SplitSrcs.size(); ++I) { 6408 SplitSrcs[I] = MIRBuilder.buildTrunc(InterTy, SplitSrcs[I]).getReg(0); 6409 } 6410 6411 // Combine the new truncates into one vector 6412 auto Merge = MIRBuilder.buildMergeLikeInstr( 6413 DstTy.changeElementSize(InterTy.getScalarSizeInBits()), SplitSrcs); 6414 6415 // Truncate the new vector to the final result type 6416 if (DstTy.getScalarSizeInBits() * 2 < SrcTy.getScalarSizeInBits()) 6417 MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), Merge.getReg(0)); 6418 else 6419 MIRBuilder.buildCopy(MI.getOperand(0).getReg(), Merge.getReg(0)); 6420 6421 MI.eraseFromParent(); 6422 6423 return Legalized; 6424 } 6425 return UnableToLegalize; 6426 } 6427 6428 LegalizerHelper::LegalizeResult 6429 LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) { 6430 auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs(); 6431 auto Zero = MIRBuilder.buildConstant(AmtTy, 0); 6432 bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL; 6433 unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL; 6434 auto Neg = MIRBuilder.buildSub(AmtTy, Zero, Amt); 6435 MIRBuilder.buildInstr(RevRot, {Dst}, {Src, Neg}); 6436 MI.eraseFromParent(); 6437 return Legalized; 6438 } 6439 6440 LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) { 6441 auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs(); 6442 6443 unsigned EltSizeInBits = DstTy.getScalarSizeInBits(); 6444 bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL; 6445 6446 MIRBuilder.setInstrAndDebugLoc(MI); 6447 6448 // If a rotate in the other direction is supported, use it. 6449 unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL; 6450 if (LI.isLegalOrCustom({RevRot, {DstTy, SrcTy}}) && 6451 isPowerOf2_32(EltSizeInBits)) 6452 return lowerRotateWithReverseRotate(MI); 6453 6454 // If a funnel shift is supported, use it. 6455 unsigned FShOpc = IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR; 6456 unsigned RevFsh = !IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR; 6457 bool IsFShLegal = false; 6458 if ((IsFShLegal = LI.isLegalOrCustom({FShOpc, {DstTy, AmtTy}})) || 6459 LI.isLegalOrCustom({RevFsh, {DstTy, AmtTy}})) { 6460 auto buildFunnelShift = [&](unsigned Opc, Register R1, Register R2, 6461 Register R3) { 6462 MIRBuilder.buildInstr(Opc, {R1}, {R2, R2, R3}); 6463 MI.eraseFromParent(); 6464 return Legalized; 6465 }; 6466 // If a funnel shift in the other direction is supported, use it. 6467 if (IsFShLegal) { 6468 return buildFunnelShift(FShOpc, Dst, Src, Amt); 6469 } else if (isPowerOf2_32(EltSizeInBits)) { 6470 Amt = MIRBuilder.buildNeg(DstTy, Amt).getReg(0); 6471 return buildFunnelShift(RevFsh, Dst, Src, Amt); 6472 } 6473 } 6474 6475 auto Zero = MIRBuilder.buildConstant(AmtTy, 0); 6476 unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR; 6477 unsigned RevShiftOpc = IsLeft ? 
TargetOpcode::G_LSHR : TargetOpcode::G_SHL; 6478 auto BitWidthMinusOneC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits - 1); 6479 Register ShVal; 6480 Register RevShiftVal; 6481 if (isPowerOf2_32(EltSizeInBits)) { 6482 // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1)) 6483 // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1)) 6484 auto NegAmt = MIRBuilder.buildSub(AmtTy, Zero, Amt); 6485 auto ShAmt = MIRBuilder.buildAnd(AmtTy, Amt, BitWidthMinusOneC); 6486 ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0); 6487 auto RevAmt = MIRBuilder.buildAnd(AmtTy, NegAmt, BitWidthMinusOneC); 6488 RevShiftVal = 6489 MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, RevAmt}).getReg(0); 6490 } else { 6491 // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w)) 6492 // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w)) 6493 auto BitWidthC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits); 6494 auto ShAmt = MIRBuilder.buildURem(AmtTy, Amt, BitWidthC); 6495 ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0); 6496 auto RevAmt = MIRBuilder.buildSub(AmtTy, BitWidthMinusOneC, ShAmt); 6497 auto One = MIRBuilder.buildConstant(AmtTy, 1); 6498 auto Inner = MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, One}); 6499 RevShiftVal = 6500 MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Inner, RevAmt}).getReg(0); 6501 } 6502 MIRBuilder.buildOr(Dst, ShVal, RevShiftVal); 6503 MI.eraseFromParent(); 6504 return Legalized; 6505 } 6506 6507 // Expand s32 = G_UITOFP s64 using bit operations to an IEEE float 6508 // representation. 6509 LegalizerHelper::LegalizeResult 6510 LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) { 6511 auto [Dst, Src] = MI.getFirst2Regs(); 6512 const LLT S64 = LLT::scalar(64); 6513 const LLT S32 = LLT::scalar(32); 6514 const LLT S1 = LLT::scalar(1); 6515 6516 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32); 6517 6518 // unsigned cul2f(ulong u) { 6519 // uint lz = clz(u); 6520 // uint e = (u != 0) ? 127U + 63U - lz : 0; 6521 // u = (u << lz) & 0x7fffffffffffffffUL; 6522 // ulong t = u & 0xffffffffffUL; 6523 // uint v = (e << 23) | (uint)(u >> 40); 6524 // uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? 
v & 1U : 0U); 6525 // return as_float(v + r); 6526 // } 6527 6528 auto Zero32 = MIRBuilder.buildConstant(S32, 0); 6529 auto Zero64 = MIRBuilder.buildConstant(S64, 0); 6530 6531 auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(S32, Src); 6532 6533 auto K = MIRBuilder.buildConstant(S32, 127U + 63U); 6534 auto Sub = MIRBuilder.buildSub(S32, K, LZ); 6535 6536 auto NotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, Src, Zero64); 6537 auto E = MIRBuilder.buildSelect(S32, NotZero, Sub, Zero32); 6538 6539 auto Mask0 = MIRBuilder.buildConstant(S64, (-1ULL) >> 1); 6540 auto ShlLZ = MIRBuilder.buildShl(S64, Src, LZ); 6541 6542 auto U = MIRBuilder.buildAnd(S64, ShlLZ, Mask0); 6543 6544 auto Mask1 = MIRBuilder.buildConstant(S64, 0xffffffffffULL); 6545 auto T = MIRBuilder.buildAnd(S64, U, Mask1); 6546 6547 auto UShl = MIRBuilder.buildLShr(S64, U, MIRBuilder.buildConstant(S64, 40)); 6548 auto ShlE = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 23)); 6549 auto V = MIRBuilder.buildOr(S32, ShlE, MIRBuilder.buildTrunc(S32, UShl)); 6550 6551 auto C = MIRBuilder.buildConstant(S64, 0x8000000000ULL); 6552 auto RCmp = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, S1, T, C); 6553 auto TCmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, T, C); 6554 auto One = MIRBuilder.buildConstant(S32, 1); 6555 6556 auto VTrunc1 = MIRBuilder.buildAnd(S32, V, One); 6557 auto Select0 = MIRBuilder.buildSelect(S32, TCmp, VTrunc1, Zero32); 6558 auto R = MIRBuilder.buildSelect(S32, RCmp, One, Select0); 6559 MIRBuilder.buildAdd(Dst, V, R); 6560 6561 MI.eraseFromParent(); 6562 return Legalized; 6563 } 6564 6565 LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) { 6566 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs(); 6567 6568 if (SrcTy == LLT::scalar(1)) { 6569 auto True = MIRBuilder.buildFConstant(DstTy, 1.0); 6570 auto False = MIRBuilder.buildFConstant(DstTy, 0.0); 6571 MIRBuilder.buildSelect(Dst, Src, True, False); 6572 MI.eraseFromParent(); 6573 return Legalized; 6574 } 6575 6576 if (SrcTy != LLT::scalar(64)) 6577 return UnableToLegalize; 6578 6579 if (DstTy == LLT::scalar(32)) { 6580 // TODO: SelectionDAG has several alternative expansions to port which may 6581 // be more reasonble depending on the available instructions. If a target 6582 // has sitofp, does not have CTLZ, or can efficiently use f64 as an 6583 // intermediate type, this is probably worse. 6584 return lowerU64ToF32BitOps(MI); 6585 } 6586 6587 return UnableToLegalize; 6588 } 6589 6590 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) { 6591 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs(); 6592 6593 const LLT S64 = LLT::scalar(64); 6594 const LLT S32 = LLT::scalar(32); 6595 const LLT S1 = LLT::scalar(1); 6596 6597 if (SrcTy == S1) { 6598 auto True = MIRBuilder.buildFConstant(DstTy, -1.0); 6599 auto False = MIRBuilder.buildFConstant(DstTy, 0.0); 6600 MIRBuilder.buildSelect(Dst, Src, True, False); 6601 MI.eraseFromParent(); 6602 return Legalized; 6603 } 6604 6605 if (SrcTy != S64) 6606 return UnableToLegalize; 6607 6608 if (DstTy == S32) { 6609 // signed cl2f(long l) { 6610 // long s = l >> 63; 6611 // float r = cul2f((l + s) ^ s); 6612 // return s ? 
-r : r; 6613 // } 6614 Register L = Src; 6615 auto SignBit = MIRBuilder.buildConstant(S64, 63); 6616 auto S = MIRBuilder.buildAShr(S64, L, SignBit); 6617 6618 auto LPlusS = MIRBuilder.buildAdd(S64, L, S); 6619 auto Xor = MIRBuilder.buildXor(S64, LPlusS, S); 6620 auto R = MIRBuilder.buildUITOFP(S32, Xor); 6621 6622 auto RNeg = MIRBuilder.buildFNeg(S32, R); 6623 auto SignNotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, S, 6624 MIRBuilder.buildConstant(S64, 0)); 6625 MIRBuilder.buildSelect(Dst, SignNotZero, RNeg, R); 6626 MI.eraseFromParent(); 6627 return Legalized; 6628 } 6629 6630 return UnableToLegalize; 6631 } 6632 6633 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) { 6634 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs(); 6635 const LLT S64 = LLT::scalar(64); 6636 const LLT S32 = LLT::scalar(32); 6637 6638 if (SrcTy != S64 && SrcTy != S32) 6639 return UnableToLegalize; 6640 if (DstTy != S32 && DstTy != S64) 6641 return UnableToLegalize; 6642 6643 // FPTOSI gives same result as FPTOUI for positive signed integers. 6644 // FPTOUI needs to deal with fp values that convert to unsigned integers 6645 // greater or equal to 2^31 for float or 2^63 for double. For brevity 2^Exp. 6646 6647 APInt TwoPExpInt = APInt::getSignMask(DstTy.getSizeInBits()); 6648 APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle() 6649 : APFloat::IEEEdouble(), 6650 APInt::getZero(SrcTy.getSizeInBits())); 6651 TwoPExpFP.convertFromAPInt(TwoPExpInt, false, APFloat::rmNearestTiesToEven); 6652 6653 MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(DstTy, Src); 6654 6655 MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP); 6656 // For fp Value greater or equal to Threshold(2^Exp), we use FPTOSI on 6657 // (Value - 2^Exp) and add 2^Exp by setting highest bit in result to 1. 6658 MachineInstrBuilder FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold); 6659 MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub); 6660 MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(DstTy, TwoPExpInt); 6661 MachineInstrBuilder Res = MIRBuilder.buildXor(DstTy, ResLowBits, ResHighBit); 6662 6663 const LLT S1 = LLT::scalar(1); 6664 6665 MachineInstrBuilder FCMP = 6666 MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, S1, Src, Threshold); 6667 MIRBuilder.buildSelect(Dst, FCMP, FPTOSI, Res); 6668 6669 MI.eraseFromParent(); 6670 return Legalized; 6671 } 6672 6673 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) { 6674 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs(); 6675 const LLT S64 = LLT::scalar(64); 6676 const LLT S32 = LLT::scalar(32); 6677 6678 // FIXME: Only f32 to i64 conversions are supported. 
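// Illustrative trace (assumed input, for exposition only): for Src = 5.0f
// (bit pattern 0x40A00000) the biased exponent field is 129, so Exponent is
// 129 - 127 = 2 and the implicit-bit significand R is 0x00A00000. Since
// Exponent <= 23, the magnitude is produced as R >> (23 - 2) = 5; the sign is
// zero, so the xor/sub fixup is a no-op, and Exponent >= 0 keeps that value
// through the final select.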
6679 if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64) 6680 return UnableToLegalize; 6681 6682 // Expand f32 -> i64 conversion 6683 // This algorithm comes from compiler-rt's implementation of fixsfdi: 6684 // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c 6685 6686 unsigned SrcEltBits = SrcTy.getScalarSizeInBits(); 6687 6688 auto ExponentMask = MIRBuilder.buildConstant(SrcTy, 0x7F800000); 6689 auto ExponentLoBit = MIRBuilder.buildConstant(SrcTy, 23); 6690 6691 auto AndExpMask = MIRBuilder.buildAnd(SrcTy, Src, ExponentMask); 6692 auto ExponentBits = MIRBuilder.buildLShr(SrcTy, AndExpMask, ExponentLoBit); 6693 6694 auto SignMask = MIRBuilder.buildConstant(SrcTy, 6695 APInt::getSignMask(SrcEltBits)); 6696 auto AndSignMask = MIRBuilder.buildAnd(SrcTy, Src, SignMask); 6697 auto SignLowBit = MIRBuilder.buildConstant(SrcTy, SrcEltBits - 1); 6698 auto Sign = MIRBuilder.buildAShr(SrcTy, AndSignMask, SignLowBit); 6699 Sign = MIRBuilder.buildSExt(DstTy, Sign); 6700 6701 auto MantissaMask = MIRBuilder.buildConstant(SrcTy, 0x007FFFFF); 6702 auto AndMantissaMask = MIRBuilder.buildAnd(SrcTy, Src, MantissaMask); 6703 auto K = MIRBuilder.buildConstant(SrcTy, 0x00800000); 6704 6705 auto R = MIRBuilder.buildOr(SrcTy, AndMantissaMask, K); 6706 R = MIRBuilder.buildZExt(DstTy, R); 6707 6708 auto Bias = MIRBuilder.buildConstant(SrcTy, 127); 6709 auto Exponent = MIRBuilder.buildSub(SrcTy, ExponentBits, Bias); 6710 auto SubExponent = MIRBuilder.buildSub(SrcTy, Exponent, ExponentLoBit); 6711 auto ExponentSub = MIRBuilder.buildSub(SrcTy, ExponentLoBit, Exponent); 6712 6713 auto Shl = MIRBuilder.buildShl(DstTy, R, SubExponent); 6714 auto Srl = MIRBuilder.buildLShr(DstTy, R, ExponentSub); 6715 6716 const LLT S1 = LLT::scalar(1); 6717 auto CmpGt = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, 6718 S1, Exponent, ExponentLoBit); 6719 6720 R = MIRBuilder.buildSelect(DstTy, CmpGt, Shl, Srl); 6721 6722 auto XorSign = MIRBuilder.buildXor(DstTy, R, Sign); 6723 auto Ret = MIRBuilder.buildSub(DstTy, XorSign, Sign); 6724 6725 auto ZeroSrcTy = MIRBuilder.buildConstant(SrcTy, 0); 6726 6727 auto ExponentLt0 = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, 6728 S1, Exponent, ZeroSrcTy); 6729 6730 auto ZeroDstTy = MIRBuilder.buildConstant(DstTy, 0); 6731 MIRBuilder.buildSelect(Dst, ExponentLt0, ZeroDstTy, Ret); 6732 6733 MI.eraseFromParent(); 6734 return Legalized; 6735 } 6736 6737 // f64 -> f16 conversion using round-to-nearest-even rounding mode. 6738 LegalizerHelper::LegalizeResult 6739 LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) { 6740 const LLT S1 = LLT::scalar(1); 6741 const LLT S32 = LLT::scalar(32); 6742 6743 auto [Dst, Src] = MI.getFirst2Regs(); 6744 assert(MRI.getType(Dst).getScalarType() == LLT::scalar(16) && 6745 MRI.getType(Src).getScalarType() == LLT::scalar(64)); 6746 6747 if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly. 
6748 return UnableToLegalize; 6749 6750 if (MIRBuilder.getMF().getTarget().Options.UnsafeFPMath) { 6751 unsigned Flags = MI.getFlags(); 6752 auto Src32 = MIRBuilder.buildFPTrunc(S32, Src, Flags); 6753 MIRBuilder.buildFPTrunc(Dst, Src32, Flags); 6754 MI.eraseFromParent(); 6755 return Legalized; 6756 } 6757 6758 const unsigned ExpMask = 0x7ff; 6759 const unsigned ExpBiasf64 = 1023; 6760 const unsigned ExpBiasf16 = 15; 6761 6762 auto Unmerge = MIRBuilder.buildUnmerge(S32, Src); 6763 Register U = Unmerge.getReg(0); 6764 Register UH = Unmerge.getReg(1); 6765 6766 auto E = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 20)); 6767 E = MIRBuilder.buildAnd(S32, E, MIRBuilder.buildConstant(S32, ExpMask)); 6768 6769 // Subtract the fp64 exponent bias (1023) to get the real exponent and 6770 // add the f16 bias (15) to get the biased exponent for the f16 format. 6771 E = MIRBuilder.buildAdd( 6772 S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16)); 6773 6774 auto M = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 8)); 6775 M = MIRBuilder.buildAnd(S32, M, MIRBuilder.buildConstant(S32, 0xffe)); 6776 6777 auto MaskedSig = MIRBuilder.buildAnd(S32, UH, 6778 MIRBuilder.buildConstant(S32, 0x1ff)); 6779 MaskedSig = MIRBuilder.buildOr(S32, MaskedSig, U); 6780 6781 auto Zero = MIRBuilder.buildConstant(S32, 0); 6782 auto SigCmpNE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, MaskedSig, Zero); 6783 auto Lo40Set = MIRBuilder.buildZExt(S32, SigCmpNE0); 6784 M = MIRBuilder.buildOr(S32, M, Lo40Set); 6785 6786 // (M != 0 ? 0x0200 : 0) | 0x7c00; 6787 auto Bits0x200 = MIRBuilder.buildConstant(S32, 0x0200); 6788 auto CmpM_NE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, M, Zero); 6789 auto SelectCC = MIRBuilder.buildSelect(S32, CmpM_NE0, Bits0x200, Zero); 6790 6791 auto Bits0x7c00 = MIRBuilder.buildConstant(S32, 0x7c00); 6792 auto I = MIRBuilder.buildOr(S32, SelectCC, Bits0x7c00); 6793 6794 // N = M | (E << 12); 6795 auto EShl12 = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 12)); 6796 auto N = MIRBuilder.buildOr(S32, M, EShl12); 6797 6798 // B = clamp(1-E, 0, 13); 6799 auto One = MIRBuilder.buildConstant(S32, 1); 6800 auto OneSubExp = MIRBuilder.buildSub(S32, One, E); 6801 auto B = MIRBuilder.buildSMax(S32, OneSubExp, Zero); 6802 B = MIRBuilder.buildSMin(S32, B, MIRBuilder.buildConstant(S32, 13)); 6803 6804 auto SigSetHigh = MIRBuilder.buildOr(S32, M, 6805 MIRBuilder.buildConstant(S32, 0x1000)); 6806 6807 auto D = MIRBuilder.buildLShr(S32, SigSetHigh, B); 6808 auto D0 = MIRBuilder.buildShl(S32, D, B); 6809 6810 auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, 6811 D0, SigSetHigh); 6812 auto D1 = MIRBuilder.buildZExt(S32, D0_NE_SigSetHigh); 6813 D = MIRBuilder.buildOr(S32, D, D1); 6814 6815 auto CmpELtOne = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, S1, E, One); 6816 auto V = MIRBuilder.buildSelect(S32, CmpELtOne, D, N); 6817 6818 auto VLow3 = MIRBuilder.buildAnd(S32, V, MIRBuilder.buildConstant(S32, 7)); 6819 V = MIRBuilder.buildLShr(S32, V, MIRBuilder.buildConstant(S32, 2)); 6820 6821 auto VLow3Eq3 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, VLow3, 6822 MIRBuilder.buildConstant(S32, 3)); 6823 auto V0 = MIRBuilder.buildZExt(S32, VLow3Eq3); 6824 6825 auto VLow3Gt5 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, VLow3, 6826 MIRBuilder.buildConstant(S32, 5)); 6827 auto V1 = MIRBuilder.buildZExt(S32, VLow3Gt5); 6828 6829 V1 = MIRBuilder.buildOr(S32, V0, V1); 6830 V = MIRBuilder.buildAdd(S32, V, V1); 6831 6832 auto CmpEGt30 = 
MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, 6833 E, MIRBuilder.buildConstant(S32, 30)); 6834 V = MIRBuilder.buildSelect(S32, CmpEGt30, 6835 MIRBuilder.buildConstant(S32, 0x7c00), V); 6836 6837 auto CmpEGt1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, 6838 E, MIRBuilder.buildConstant(S32, 1039)); 6839 V = MIRBuilder.buildSelect(S32, CmpEGt1039, I, V); 6840 6841 // Extract the sign bit. 6842 auto Sign = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 16)); 6843 Sign = MIRBuilder.buildAnd(S32, Sign, MIRBuilder.buildConstant(S32, 0x8000)); 6844 6845 // Insert the sign bit 6846 V = MIRBuilder.buildOr(S32, Sign, V); 6847 6848 MIRBuilder.buildTrunc(Dst, V); 6849 MI.eraseFromParent(); 6850 return Legalized; 6851 } 6852 6853 LegalizerHelper::LegalizeResult 6854 LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) { 6855 auto [DstTy, SrcTy] = MI.getFirst2LLTs(); 6856 const LLT S64 = LLT::scalar(64); 6857 const LLT S16 = LLT::scalar(16); 6858 6859 if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64) 6860 return lowerFPTRUNC_F64_TO_F16(MI); 6861 6862 return UnableToLegalize; 6863 } 6864 6865 // TODO: If RHS is a constant SelectionDAGBuilder expands this into a 6866 // multiplication tree. 6867 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) { 6868 auto [Dst, Src0, Src1] = MI.getFirst3Regs(); 6869 LLT Ty = MRI.getType(Dst); 6870 6871 auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1); 6872 MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags()); 6873 MI.eraseFromParent(); 6874 return Legalized; 6875 } 6876 6877 static CmpInst::Predicate minMaxToCompare(unsigned Opc) { 6878 switch (Opc) { 6879 case TargetOpcode::G_SMIN: 6880 return CmpInst::ICMP_SLT; 6881 case TargetOpcode::G_SMAX: 6882 return CmpInst::ICMP_SGT; 6883 case TargetOpcode::G_UMIN: 6884 return CmpInst::ICMP_ULT; 6885 case TargetOpcode::G_UMAX: 6886 return CmpInst::ICMP_UGT; 6887 default: 6888 llvm_unreachable("not in integer min/max"); 6889 } 6890 } 6891 6892 LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) { 6893 auto [Dst, Src0, Src1] = MI.getFirst3Regs(); 6894 6895 const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode()); 6896 LLT CmpType = MRI.getType(Dst).changeElementSize(1); 6897 6898 auto Cmp = MIRBuilder.buildICmp(Pred, CmpType, Src0, Src1); 6899 MIRBuilder.buildSelect(Dst, Cmp, Src0, Src1); 6900 6901 MI.eraseFromParent(); 6902 return Legalized; 6903 } 6904 6905 LegalizerHelper::LegalizeResult 6906 LegalizerHelper::lowerFCopySign(MachineInstr &MI) { 6907 auto [Dst, DstTy, Src0, Src0Ty, Src1, Src1Ty] = MI.getFirst3RegLLTs(); 6908 const int Src0Size = Src0Ty.getScalarSizeInBits(); 6909 const int Src1Size = Src1Ty.getScalarSizeInBits(); 6910 6911 auto SignBitMask = MIRBuilder.buildConstant( 6912 Src0Ty, APInt::getSignMask(Src0Size)); 6913 6914 auto NotSignBitMask = MIRBuilder.buildConstant( 6915 Src0Ty, APInt::getLowBitsSet(Src0Size, Src0Size - 1)); 6916 6917 Register And0 = MIRBuilder.buildAnd(Src0Ty, Src0, NotSignBitMask).getReg(0); 6918 Register And1; 6919 if (Src0Ty == Src1Ty) { 6920 And1 = MIRBuilder.buildAnd(Src1Ty, Src1, SignBitMask).getReg(0); 6921 } else if (Src0Size > Src1Size) { 6922 auto ShiftAmt = MIRBuilder.buildConstant(Src0Ty, Src0Size - Src1Size); 6923 auto Zext = MIRBuilder.buildZExt(Src0Ty, Src1); 6924 auto Shift = MIRBuilder.buildShl(Src0Ty, Zext, ShiftAmt); 6925 And1 = MIRBuilder.buildAnd(Src0Ty, Shift, SignBitMask).getReg(0); 6926 } else { 6927 auto ShiftAmt = MIRBuilder.buildConstant(Src1Ty, Src1Size - Src0Size); 6928 auto 
Shift = MIRBuilder.buildLShr(Src1Ty, Src1, ShiftAmt);
6929 auto Trunc = MIRBuilder.buildTrunc(Src0Ty, Shift);
6930 And1 = MIRBuilder.buildAnd(Src0Ty, Trunc, SignBitMask).getReg(0);
6931 }
6932
6933 // Be careful about setting nsz/nnan/ninf on every instruction, since the
6934 // constants are a nan and -0.0, but the final result should preserve
6935 // everything.
6936 unsigned Flags = MI.getFlags();
6937 MIRBuilder.buildOr(Dst, And0, And1, Flags);
6938
6939 MI.eraseFromParent();
6940 return Legalized;
6941 }
6942
6943 LegalizerHelper::LegalizeResult
6944 LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
6945 unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ?
6946 TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE;
6947
6948 auto [Dst, Src0, Src1] = MI.getFirst3Regs();
6949 LLT Ty = MRI.getType(Dst);
6950
6951 if (!MI.getFlag(MachineInstr::FmNoNans)) {
6952 // Insert canonicalizes if it's possible we need to quiet to get correct
6953 // sNaN behavior.
6954
6955 // Note this must be done here, and not as an optimization combine in the
6956 // absence of a dedicated quiet-sNaN instruction, as we're using an
6957 // omni-purpose G_FCANONICALIZE.
6958 if (!isKnownNeverSNaN(Src0, MRI))
6959 Src0 = MIRBuilder.buildFCanonicalize(Ty, Src0, MI.getFlags()).getReg(0);
6960
6961 if (!isKnownNeverSNaN(Src1, MRI))
6962 Src1 = MIRBuilder.buildFCanonicalize(Ty, Src1, MI.getFlags()).getReg(0);
6963 }
6964
6965 // If there are no NaNs, it's safe to simply replace this with the non-IEEE
6966 // version.
6967 MIRBuilder.buildInstr(NewOp, {Dst}, {Src0, Src1}, MI.getFlags());
6968 MI.eraseFromParent();
6969 return Legalized;
6970 }
6971
6972 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
6973 // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
6974 Register DstReg = MI.getOperand(0).getReg();
6975 LLT Ty = MRI.getType(DstReg);
6976 unsigned Flags = MI.getFlags();
6977
6978 auto Mul = MIRBuilder.buildFMul(Ty, MI.getOperand(1), MI.getOperand(2),
6979 Flags);
6980 MIRBuilder.buildFAdd(DstReg, Mul, MI.getOperand(3), Flags);
6981 MI.eraseFromParent();
6982 return Legalized;
6983 }
6984
6985 LegalizerHelper::LegalizeResult
6986 LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
6987 auto [DstReg, X] = MI.getFirst2Regs();
6988 const unsigned Flags = MI.getFlags();
6989 const LLT Ty = MRI.getType(DstReg);
6990 const LLT CondTy = Ty.changeElementSize(1);
6991
6992 // round(x) =>
6993 // t = trunc(x);
6994 // d = fabs(x - t);
6995 // o = copysign(d >= 0.5 ?
1.0 : 0.0, x); 6996 // return t + o; 6997 6998 auto T = MIRBuilder.buildIntrinsicTrunc(Ty, X, Flags); 6999 7000 auto Diff = MIRBuilder.buildFSub(Ty, X, T, Flags); 7001 auto AbsDiff = MIRBuilder.buildFAbs(Ty, Diff, Flags); 7002 7003 auto Half = MIRBuilder.buildFConstant(Ty, 0.5); 7004 auto Cmp = 7005 MIRBuilder.buildFCmp(CmpInst::FCMP_OGE, CondTy, AbsDiff, Half, Flags); 7006 7007 // Could emit G_UITOFP instead 7008 auto One = MIRBuilder.buildFConstant(Ty, 1.0); 7009 auto Zero = MIRBuilder.buildFConstant(Ty, 0.0); 7010 auto BoolFP = MIRBuilder.buildSelect(Ty, Cmp, One, Zero); 7011 auto SignedOffset = MIRBuilder.buildFCopysign(Ty, BoolFP, X); 7012 7013 MIRBuilder.buildFAdd(DstReg, T, SignedOffset, Flags); 7014 7015 MI.eraseFromParent(); 7016 return Legalized; 7017 } 7018 7019 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFFloor(MachineInstr &MI) { 7020 auto [DstReg, SrcReg] = MI.getFirst2Regs(); 7021 unsigned Flags = MI.getFlags(); 7022 LLT Ty = MRI.getType(DstReg); 7023 const LLT CondTy = Ty.changeElementSize(1); 7024 7025 // result = trunc(src); 7026 // if (src < 0.0 && src != result) 7027 // result += -1.0. 7028 7029 auto Trunc = MIRBuilder.buildIntrinsicTrunc(Ty, SrcReg, Flags); 7030 auto Zero = MIRBuilder.buildFConstant(Ty, 0.0); 7031 7032 auto Lt0 = MIRBuilder.buildFCmp(CmpInst::FCMP_OLT, CondTy, 7033 SrcReg, Zero, Flags); 7034 auto NeTrunc = MIRBuilder.buildFCmp(CmpInst::FCMP_ONE, CondTy, 7035 SrcReg, Trunc, Flags); 7036 auto And = MIRBuilder.buildAnd(CondTy, Lt0, NeTrunc); 7037 auto AddVal = MIRBuilder.buildSITOFP(Ty, And); 7038 7039 MIRBuilder.buildFAdd(DstReg, Trunc, AddVal, Flags); 7040 MI.eraseFromParent(); 7041 return Legalized; 7042 } 7043 7044 LegalizerHelper::LegalizeResult 7045 LegalizerHelper::lowerMergeValues(MachineInstr &MI) { 7046 const unsigned NumOps = MI.getNumOperands(); 7047 auto [DstReg, DstTy, Src0Reg, Src0Ty] = MI.getFirst2RegLLTs(); 7048 unsigned PartSize = Src0Ty.getSizeInBits(); 7049 7050 LLT WideTy = LLT::scalar(DstTy.getSizeInBits()); 7051 Register ResultReg = MIRBuilder.buildZExt(WideTy, Src0Reg).getReg(0); 7052 7053 for (unsigned I = 2; I != NumOps; ++I) { 7054 const unsigned Offset = (I - 1) * PartSize; 7055 7056 Register SrcReg = MI.getOperand(I).getReg(); 7057 auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg); 7058 7059 Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg : 7060 MRI.createGenericVirtualRegister(WideTy); 7061 7062 auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset); 7063 auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt); 7064 MIRBuilder.buildOr(NextResult, ResultReg, Shl); 7065 ResultReg = NextResult; 7066 } 7067 7068 if (DstTy.isPointer()) { 7069 if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace( 7070 DstTy.getAddressSpace())) { 7071 LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n"); 7072 return UnableToLegalize; 7073 } 7074 7075 MIRBuilder.buildIntToPtr(DstReg, ResultReg); 7076 } 7077 7078 MI.eraseFromParent(); 7079 return Legalized; 7080 } 7081 7082 LegalizerHelper::LegalizeResult 7083 LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) { 7084 const unsigned NumDst = MI.getNumOperands() - 1; 7085 Register SrcReg = MI.getOperand(NumDst).getReg(); 7086 Register Dst0Reg = MI.getOperand(0).getReg(); 7087 LLT DstTy = MRI.getType(Dst0Reg); 7088 if (DstTy.isPointer()) 7089 return UnableToLegalize; // TODO 7090 7091 SrcReg = coerceToScalar(SrcReg); 7092 if (!SrcReg) 7093 return UnableToLegalize; 7094 7095 // Expand scalarizing unmerge as bitcast to integer and shift. 
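// Illustrative example (assumed types): unmerging an s64 source into four s16
// results becomes
//   %d0 = G_TRUNC %src
//   %d1 = G_TRUNC (G_LSHR %src, 16)
//   %d2 = G_TRUNC (G_LSHR %src, 32)
//   %d3 = G_TRUNC (G_LSHR %src, 48)
// i.e. each successive destination is the next DstSize-bit slice of the
// coerced scalar.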
7096 LLT IntTy = MRI.getType(SrcReg); 7097 7098 MIRBuilder.buildTrunc(Dst0Reg, SrcReg); 7099 7100 const unsigned DstSize = DstTy.getSizeInBits(); 7101 unsigned Offset = DstSize; 7102 for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) { 7103 auto ShiftAmt = MIRBuilder.buildConstant(IntTy, Offset); 7104 auto Shift = MIRBuilder.buildLShr(IntTy, SrcReg, ShiftAmt); 7105 MIRBuilder.buildTrunc(MI.getOperand(I), Shift); 7106 } 7107 7108 MI.eraseFromParent(); 7109 return Legalized; 7110 } 7111 7112 /// Lower a vector extract or insert by writing the vector to a stack temporary 7113 /// and reloading the element or vector. 7114 /// 7115 /// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx 7116 /// => 7117 /// %stack_temp = G_FRAME_INDEX 7118 /// G_STORE %vec, %stack_temp 7119 /// %idx = clamp(%idx, %vec.getNumElements()) 7120 /// %element_ptr = G_PTR_ADD %stack_temp, %idx 7121 /// %dst = G_LOAD %element_ptr 7122 LegalizerHelper::LegalizeResult 7123 LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) { 7124 Register DstReg = MI.getOperand(0).getReg(); 7125 Register SrcVec = MI.getOperand(1).getReg(); 7126 Register InsertVal; 7127 if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT) 7128 InsertVal = MI.getOperand(2).getReg(); 7129 7130 Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg(); 7131 7132 LLT VecTy = MRI.getType(SrcVec); 7133 LLT EltTy = VecTy.getElementType(); 7134 unsigned NumElts = VecTy.getNumElements(); 7135 7136 int64_t IdxVal; 7137 if (mi_match(Idx, MRI, m_ICst(IdxVal)) && IdxVal <= NumElts) { 7138 SmallVector<Register, 8> SrcRegs; 7139 extractParts(SrcVec, EltTy, NumElts, SrcRegs, MIRBuilder, MRI); 7140 7141 if (InsertVal) { 7142 SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 7143 MIRBuilder.buildMergeLikeInstr(DstReg, SrcRegs); 7144 } else { 7145 MIRBuilder.buildCopy(DstReg, SrcRegs[IdxVal]); 7146 } 7147 7148 MI.eraseFromParent(); 7149 return Legalized; 7150 } 7151 7152 if (!EltTy.isByteSized()) { // Not implemented. 7153 LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n"); 7154 return UnableToLegalize; 7155 } 7156 7157 unsigned EltBytes = EltTy.getSizeInBytes(); 7158 Align VecAlign = getStackTemporaryAlignment(VecTy); 7159 Align EltAlign; 7160 7161 MachinePointerInfo PtrInfo; 7162 auto StackTemp = createStackTemporary( 7163 TypeSize::getFixed(VecTy.getSizeInBytes()), VecAlign, PtrInfo); 7164 MIRBuilder.buildStore(SrcVec, StackTemp, PtrInfo, VecAlign); 7165 7166 // Get the pointer to the element, and be sure not to hit undefined behavior 7167 // if the index is out of bounds. 7168 Register EltPtr = getVectorElementPointer(StackTemp.getReg(0), VecTy, Idx); 7169 7170 if (mi_match(Idx, MRI, m_ICst(IdxVal))) { 7171 int64_t Offset = IdxVal * EltBytes; 7172 PtrInfo = PtrInfo.getWithOffset(Offset); 7173 EltAlign = commonAlignment(VecAlign, Offset); 7174 } else { 7175 // We lose information with a variable offset. 7176 EltAlign = getStackTemporaryAlignment(EltTy); 7177 PtrInfo = MachinePointerInfo(MRI.getType(EltPtr).getAddressSpace()); 7178 } 7179 7180 if (InsertVal) { 7181 // Write the inserted element 7182 MIRBuilder.buildStore(InsertVal, EltPtr, PtrInfo, EltAlign); 7183 7184 // Reload the whole vector. 
7185 MIRBuilder.buildLoad(DstReg, StackTemp, PtrInfo, VecAlign); 7186 } else { 7187 MIRBuilder.buildLoad(DstReg, EltPtr, PtrInfo, EltAlign); 7188 } 7189 7190 MI.eraseFromParent(); 7191 return Legalized; 7192 } 7193 7194 LegalizerHelper::LegalizeResult 7195 LegalizerHelper::lowerShuffleVector(MachineInstr &MI) { 7196 auto [DstReg, DstTy, Src0Reg, Src0Ty, Src1Reg, Src1Ty] = 7197 MI.getFirst3RegLLTs(); 7198 LLT IdxTy = LLT::scalar(32); 7199 7200 ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask(); 7201 Register Undef; 7202 SmallVector<Register, 32> BuildVec; 7203 LLT EltTy = DstTy.getScalarType(); 7204 7205 for (int Idx : Mask) { 7206 if (Idx < 0) { 7207 if (!Undef.isValid()) 7208 Undef = MIRBuilder.buildUndef(EltTy).getReg(0); 7209 BuildVec.push_back(Undef); 7210 continue; 7211 } 7212 7213 if (Src0Ty.isScalar()) { 7214 BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg); 7215 } else { 7216 int NumElts = Src0Ty.getNumElements(); 7217 Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg; 7218 int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts; 7219 auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx); 7220 auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK); 7221 BuildVec.push_back(Extract.getReg(0)); 7222 } 7223 } 7224 7225 if (DstTy.isScalar()) 7226 MIRBuilder.buildCopy(DstReg, BuildVec[0]); 7227 else 7228 MIRBuilder.buildBuildVector(DstReg, BuildVec); 7229 MI.eraseFromParent(); 7230 return Legalized; 7231 } 7232 7233 Register LegalizerHelper::getDynStackAllocTargetPtr(Register SPReg, 7234 Register AllocSize, 7235 Align Alignment, 7236 LLT PtrTy) { 7237 LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits()); 7238 7239 auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg); 7240 SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp); 7241 7242 // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't 7243 // have to generate an extra instruction to negate the alloc and then use 7244 // G_PTR_ADD to add the negative offset. 
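// Illustrative example (assumed values): with a 16-byte Alignment the mask
// built below is -16 (...fff0 in IntPtrTy), so ANDing the decremented SP with
// it rounds the new stack pointer down to the next 16-byte boundary, which is
// the safe direction on a downward-growing stack.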
7245 auto Alloc = MIRBuilder.buildSub(IntPtrTy, SPTmp, AllocSize); 7246 if (Alignment > Align(1)) { 7247 APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true); 7248 AlignMask.negate(); 7249 auto AlignCst = MIRBuilder.buildConstant(IntPtrTy, AlignMask); 7250 Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst); 7251 } 7252 7253 return MIRBuilder.buildCast(PtrTy, Alloc).getReg(0); 7254 } 7255 7256 LegalizerHelper::LegalizeResult 7257 LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) { 7258 const auto &MF = *MI.getMF(); 7259 const auto &TFI = *MF.getSubtarget().getFrameLowering(); 7260 if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp) 7261 return UnableToLegalize; 7262 7263 Register Dst = MI.getOperand(0).getReg(); 7264 Register AllocSize = MI.getOperand(1).getReg(); 7265 Align Alignment = assumeAligned(MI.getOperand(2).getImm()); 7266 7267 LLT PtrTy = MRI.getType(Dst); 7268 Register SPReg = TLI.getStackPointerRegisterToSaveRestore(); 7269 Register SPTmp = 7270 getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy); 7271 7272 MIRBuilder.buildCopy(SPReg, SPTmp); 7273 MIRBuilder.buildCopy(Dst, SPTmp); 7274 7275 MI.eraseFromParent(); 7276 return Legalized; 7277 } 7278 7279 LegalizerHelper::LegalizeResult 7280 LegalizerHelper::lowerStackSave(MachineInstr &MI) { 7281 Register StackPtr = TLI.getStackPointerRegisterToSaveRestore(); 7282 if (!StackPtr) 7283 return UnableToLegalize; 7284 7285 MIRBuilder.buildCopy(MI.getOperand(0), StackPtr); 7286 MI.eraseFromParent(); 7287 return Legalized; 7288 } 7289 7290 LegalizerHelper::LegalizeResult 7291 LegalizerHelper::lowerStackRestore(MachineInstr &MI) { 7292 Register StackPtr = TLI.getStackPointerRegisterToSaveRestore(); 7293 if (!StackPtr) 7294 return UnableToLegalize; 7295 7296 MIRBuilder.buildCopy(StackPtr, MI.getOperand(0)); 7297 MI.eraseFromParent(); 7298 return Legalized; 7299 } 7300 7301 LegalizerHelper::LegalizeResult 7302 LegalizerHelper::lowerExtract(MachineInstr &MI) { 7303 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); 7304 unsigned Offset = MI.getOperand(2).getImm(); 7305 7306 // Extract sub-vector or one element 7307 if (SrcTy.isVector()) { 7308 unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits(); 7309 unsigned DstSize = DstTy.getSizeInBits(); 7310 7311 if ((Offset % SrcEltSize == 0) && (DstSize % SrcEltSize == 0) && 7312 (Offset + DstSize <= SrcTy.getSizeInBits())) { 7313 // Unmerge and allow access to each Src element for the artifact combiner. 7314 auto Unmerge = MIRBuilder.buildUnmerge(SrcTy.getElementType(), SrcReg); 7315 7316 // Take element(s) we need to extract and copy it (merge them). 
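// Illustrative example (assumed types): extracting <2 x s32> at bit Offset 64
// from a <4 x s32> source unmerges the source into four s32 pieces and
// re-merges pieces 2 and 3; extracting a single s32 at Offset 32 just copies
// piece 1.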
7317 SmallVector<Register, 8> SubVectorElts; 7318 for (unsigned Idx = Offset / SrcEltSize; 7319 Idx < (Offset + DstSize) / SrcEltSize; ++Idx) { 7320 SubVectorElts.push_back(Unmerge.getReg(Idx)); 7321 } 7322 if (SubVectorElts.size() == 1) 7323 MIRBuilder.buildCopy(DstReg, SubVectorElts[0]); 7324 else 7325 MIRBuilder.buildMergeLikeInstr(DstReg, SubVectorElts); 7326 7327 MI.eraseFromParent(); 7328 return Legalized; 7329 } 7330 } 7331 7332 if (DstTy.isScalar() && 7333 (SrcTy.isScalar() || 7334 (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) { 7335 LLT SrcIntTy = SrcTy; 7336 if (!SrcTy.isScalar()) { 7337 SrcIntTy = LLT::scalar(SrcTy.getSizeInBits()); 7338 SrcReg = MIRBuilder.buildBitcast(SrcIntTy, SrcReg).getReg(0); 7339 } 7340 7341 if (Offset == 0) 7342 MIRBuilder.buildTrunc(DstReg, SrcReg); 7343 else { 7344 auto ShiftAmt = MIRBuilder.buildConstant(SrcIntTy, Offset); 7345 auto Shr = MIRBuilder.buildLShr(SrcIntTy, SrcReg, ShiftAmt); 7346 MIRBuilder.buildTrunc(DstReg, Shr); 7347 } 7348 7349 MI.eraseFromParent(); 7350 return Legalized; 7351 } 7352 7353 return UnableToLegalize; 7354 } 7355 7356 LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) { 7357 auto [Dst, Src, InsertSrc] = MI.getFirst3Regs(); 7358 uint64_t Offset = MI.getOperand(3).getImm(); 7359 7360 LLT DstTy = MRI.getType(Src); 7361 LLT InsertTy = MRI.getType(InsertSrc); 7362 7363 // Insert sub-vector or one element 7364 if (DstTy.isVector() && !InsertTy.isPointer()) { 7365 LLT EltTy = DstTy.getElementType(); 7366 unsigned EltSize = EltTy.getSizeInBits(); 7367 unsigned InsertSize = InsertTy.getSizeInBits(); 7368 7369 if ((Offset % EltSize == 0) && (InsertSize % EltSize == 0) && 7370 (Offset + InsertSize <= DstTy.getSizeInBits())) { 7371 auto UnmergeSrc = MIRBuilder.buildUnmerge(EltTy, Src); 7372 SmallVector<Register, 8> DstElts; 7373 unsigned Idx = 0; 7374 // Elements from Src before insert start Offset 7375 for (; Idx < Offset / EltSize; ++Idx) { 7376 DstElts.push_back(UnmergeSrc.getReg(Idx)); 7377 } 7378 7379 // Replace elements in Src with elements from InsertSrc 7380 if (InsertTy.getSizeInBits() > EltSize) { 7381 auto UnmergeInsertSrc = MIRBuilder.buildUnmerge(EltTy, InsertSrc); 7382 for (unsigned i = 0; Idx < (Offset + InsertSize) / EltSize; 7383 ++Idx, ++i) { 7384 DstElts.push_back(UnmergeInsertSrc.getReg(i)); 7385 } 7386 } else { 7387 DstElts.push_back(InsertSrc); 7388 ++Idx; 7389 } 7390 7391 // Remaining elements from Src after insert 7392 for (; Idx < DstTy.getNumElements(); ++Idx) { 7393 DstElts.push_back(UnmergeSrc.getReg(Idx)); 7394 } 7395 7396 MIRBuilder.buildMergeLikeInstr(Dst, DstElts); 7397 MI.eraseFromParent(); 7398 return Legalized; 7399 } 7400 } 7401 7402 if (InsertTy.isVector() || 7403 (DstTy.isVector() && DstTy.getElementType() != InsertTy)) 7404 return UnableToLegalize; 7405 7406 const DataLayout &DL = MIRBuilder.getDataLayout(); 7407 if ((DstTy.isPointer() && 7408 DL.isNonIntegralAddressSpace(DstTy.getAddressSpace())) || 7409 (InsertTy.isPointer() && 7410 DL.isNonIntegralAddressSpace(InsertTy.getAddressSpace()))) { 7411 LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n"); 7412 return UnableToLegalize; 7413 } 7414 7415 LLT IntDstTy = DstTy; 7416 7417 if (!DstTy.isScalar()) { 7418 IntDstTy = LLT::scalar(DstTy.getSizeInBits()); 7419 Src = MIRBuilder.buildCast(IntDstTy, Src).getReg(0); 7420 } 7421 7422 if (!InsertTy.isScalar()) { 7423 const LLT IntInsertTy = LLT::scalar(InsertTy.getSizeInBits()); 7424 InsertSrc = MIRBuilder.buildPtrToInt(IntInsertTy, 
InsertSrc).getReg(0); 7425 } 7426 7427 Register ExtInsSrc = MIRBuilder.buildZExt(IntDstTy, InsertSrc).getReg(0); 7428 if (Offset != 0) { 7429 auto ShiftAmt = MIRBuilder.buildConstant(IntDstTy, Offset); 7430 ExtInsSrc = MIRBuilder.buildShl(IntDstTy, ExtInsSrc, ShiftAmt).getReg(0); 7431 } 7432 7433 APInt MaskVal = APInt::getBitsSetWithWrap( 7434 DstTy.getSizeInBits(), Offset + InsertTy.getSizeInBits(), Offset); 7435 7436 auto Mask = MIRBuilder.buildConstant(IntDstTy, MaskVal); 7437 auto MaskedSrc = MIRBuilder.buildAnd(IntDstTy, Src, Mask); 7438 auto Or = MIRBuilder.buildOr(IntDstTy, MaskedSrc, ExtInsSrc); 7439 7440 MIRBuilder.buildCast(Dst, Or); 7441 MI.eraseFromParent(); 7442 return Legalized; 7443 } 7444 7445 LegalizerHelper::LegalizeResult 7446 LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) { 7447 auto [Dst0, Dst0Ty, Dst1, Dst1Ty, LHS, LHSTy, RHS, RHSTy] = 7448 MI.getFirst4RegLLTs(); 7449 const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO; 7450 7451 LLT Ty = Dst0Ty; 7452 LLT BoolTy = Dst1Ty; 7453 7454 if (IsAdd) 7455 MIRBuilder.buildAdd(Dst0, LHS, RHS); 7456 else 7457 MIRBuilder.buildSub(Dst0, LHS, RHS); 7458 7459 // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow. 7460 7461 auto Zero = MIRBuilder.buildConstant(Ty, 0); 7462 7463 // For an addition, the result should be less than one of the operands (LHS) 7464 // if and only if the other operand (RHS) is negative, otherwise there will 7465 // be overflow. 7466 // For a subtraction, the result should be less than one of the operands 7467 // (LHS) if and only if the other operand (RHS) is (non-zero) positive, 7468 // otherwise there will be overflow. 7469 auto ResultLowerThanLHS = 7470 MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, Dst0, LHS); 7471 auto ConditionRHS = MIRBuilder.buildICmp( 7472 IsAdd ? 
CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero); 7473 7474 MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS); 7475 MI.eraseFromParent(); 7476 return Legalized; 7477 } 7478 7479 LegalizerHelper::LegalizeResult 7480 LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) { 7481 auto [Res, LHS, RHS] = MI.getFirst3Regs(); 7482 LLT Ty = MRI.getType(Res); 7483 bool IsSigned; 7484 bool IsAdd; 7485 unsigned BaseOp; 7486 switch (MI.getOpcode()) { 7487 default: 7488 llvm_unreachable("unexpected addsat/subsat opcode"); 7489 case TargetOpcode::G_UADDSAT: 7490 IsSigned = false; 7491 IsAdd = true; 7492 BaseOp = TargetOpcode::G_ADD; 7493 break; 7494 case TargetOpcode::G_SADDSAT: 7495 IsSigned = true; 7496 IsAdd = true; 7497 BaseOp = TargetOpcode::G_ADD; 7498 break; 7499 case TargetOpcode::G_USUBSAT: 7500 IsSigned = false; 7501 IsAdd = false; 7502 BaseOp = TargetOpcode::G_SUB; 7503 break; 7504 case TargetOpcode::G_SSUBSAT: 7505 IsSigned = true; 7506 IsAdd = false; 7507 BaseOp = TargetOpcode::G_SUB; 7508 break; 7509 } 7510 7511 if (IsSigned) { 7512 // sadd.sat(a, b) -> 7513 // hi = 0x7fffffff - smax(a, 0) 7514 // lo = 0x80000000 - smin(a, 0) 7515 // a + smin(smax(lo, b), hi) 7516 // ssub.sat(a, b) -> 7517 // lo = smax(a, -1) - 0x7fffffff 7518 // hi = smin(a, -1) - 0x80000000 7519 // a - smin(smax(lo, b), hi) 7520 // TODO: AMDGPU can use a "median of 3" instruction here: 7521 // a +/- med3(lo, b, hi) 7522 uint64_t NumBits = Ty.getScalarSizeInBits(); 7523 auto MaxVal = 7524 MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(NumBits)); 7525 auto MinVal = 7526 MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits)); 7527 MachineInstrBuilder Hi, Lo; 7528 if (IsAdd) { 7529 auto Zero = MIRBuilder.buildConstant(Ty, 0); 7530 Hi = MIRBuilder.buildSub(Ty, MaxVal, MIRBuilder.buildSMax(Ty, LHS, Zero)); 7531 Lo = MIRBuilder.buildSub(Ty, MinVal, MIRBuilder.buildSMin(Ty, LHS, Zero)); 7532 } else { 7533 auto NegOne = MIRBuilder.buildConstant(Ty, -1); 7534 Lo = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMax(Ty, LHS, NegOne), 7535 MaxVal); 7536 Hi = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMin(Ty, LHS, NegOne), 7537 MinVal); 7538 } 7539 auto RHSClamped = 7540 MIRBuilder.buildSMin(Ty, MIRBuilder.buildSMax(Ty, Lo, RHS), Hi); 7541 MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, RHSClamped}); 7542 } else { 7543 // uadd.sat(a, b) -> a + umin(~a, b) 7544 // usub.sat(a, b) -> a - umin(a, b) 7545 Register Not = IsAdd ? 
MIRBuilder.buildNot(Ty, LHS).getReg(0) : LHS; 7546 auto Min = MIRBuilder.buildUMin(Ty, Not, RHS); 7547 MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, Min}); 7548 } 7549 7550 MI.eraseFromParent(); 7551 return Legalized; 7552 } 7553 7554 LegalizerHelper::LegalizeResult 7555 LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) { 7556 auto [Res, LHS, RHS] = MI.getFirst3Regs(); 7557 LLT Ty = MRI.getType(Res); 7558 LLT BoolTy = Ty.changeElementSize(1); 7559 bool IsSigned; 7560 bool IsAdd; 7561 unsigned OverflowOp; 7562 switch (MI.getOpcode()) { 7563 default: 7564 llvm_unreachable("unexpected addsat/subsat opcode"); 7565 case TargetOpcode::G_UADDSAT: 7566 IsSigned = false; 7567 IsAdd = true; 7568 OverflowOp = TargetOpcode::G_UADDO; 7569 break; 7570 case TargetOpcode::G_SADDSAT: 7571 IsSigned = true; 7572 IsAdd = true; 7573 OverflowOp = TargetOpcode::G_SADDO; 7574 break; 7575 case TargetOpcode::G_USUBSAT: 7576 IsSigned = false; 7577 IsAdd = false; 7578 OverflowOp = TargetOpcode::G_USUBO; 7579 break; 7580 case TargetOpcode::G_SSUBSAT: 7581 IsSigned = true; 7582 IsAdd = false; 7583 OverflowOp = TargetOpcode::G_SSUBO; 7584 break; 7585 } 7586 7587 auto OverflowRes = 7588 MIRBuilder.buildInstr(OverflowOp, {Ty, BoolTy}, {LHS, RHS}); 7589 Register Tmp = OverflowRes.getReg(0); 7590 Register Ov = OverflowRes.getReg(1); 7591 MachineInstrBuilder Clamp; 7592 if (IsSigned) { 7593 // sadd.sat(a, b) -> 7594 // {tmp, ov} = saddo(a, b) 7595 // ov ? (tmp >>s 31) + 0x80000000 : r 7596 // ssub.sat(a, b) -> 7597 // {tmp, ov} = ssubo(a, b) 7598 // ov ? (tmp >>s 31) + 0x80000000 : r 7599 uint64_t NumBits = Ty.getScalarSizeInBits(); 7600 auto ShiftAmount = MIRBuilder.buildConstant(Ty, NumBits - 1); 7601 auto Sign = MIRBuilder.buildAShr(Ty, Tmp, ShiftAmount); 7602 auto MinVal = 7603 MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits)); 7604 Clamp = MIRBuilder.buildAdd(Ty, Sign, MinVal); 7605 } else { 7606 // uadd.sat(a, b) -> 7607 // {tmp, ov} = uaddo(a, b) 7608 // ov ? 0xffffffff : tmp 7609 // usub.sat(a, b) -> 7610 // {tmp, ov} = usubo(a, b) 7611 // ov ? 0 : tmp 7612 Clamp = MIRBuilder.buildConstant(Ty, IsAdd ? -1 : 0); 7613 } 7614 MIRBuilder.buildSelect(Res, Ov, Clamp, Tmp); 7615 7616 MI.eraseFromParent(); 7617 return Legalized; 7618 } 7619 7620 LegalizerHelper::LegalizeResult 7621 LegalizerHelper::lowerShlSat(MachineInstr &MI) { 7622 assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT || 7623 MI.getOpcode() == TargetOpcode::G_USHLSAT) && 7624 "Expected shlsat opcode!"); 7625 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT; 7626 auto [Res, LHS, RHS] = MI.getFirst3Regs(); 7627 LLT Ty = MRI.getType(Res); 7628 LLT BoolTy = Ty.changeElementSize(1); 7629 7630 unsigned BW = Ty.getScalarSizeInBits(); 7631 auto Result = MIRBuilder.buildShl(Ty, LHS, RHS); 7632 auto Orig = IsSigned ? 
MIRBuilder.buildAShr(Ty, Result, RHS) 7633 : MIRBuilder.buildLShr(Ty, Result, RHS); 7634 7635 MachineInstrBuilder SatVal; 7636 if (IsSigned) { 7637 auto SatMin = MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(BW)); 7638 auto SatMax = MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(BW)); 7639 auto Cmp = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, LHS, 7640 MIRBuilder.buildConstant(Ty, 0)); 7641 SatVal = MIRBuilder.buildSelect(Ty, Cmp, SatMin, SatMax); 7642 } else { 7643 SatVal = MIRBuilder.buildConstant(Ty, APInt::getMaxValue(BW)); 7644 } 7645 auto Ov = MIRBuilder.buildICmp(CmpInst::ICMP_NE, BoolTy, LHS, Orig); 7646 MIRBuilder.buildSelect(Res, Ov, SatVal, Result); 7647 7648 MI.eraseFromParent(); 7649 return Legalized; 7650 } 7651 7652 LegalizerHelper::LegalizeResult LegalizerHelper::lowerBswap(MachineInstr &MI) { 7653 auto [Dst, Src] = MI.getFirst2Regs(); 7654 const LLT Ty = MRI.getType(Src); 7655 unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8; 7656 unsigned BaseShiftAmt = (SizeInBytes - 1) * 8; 7657 7658 // Swap most and least significant byte, set remaining bytes in Res to zero. 7659 auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt); 7660 auto LSByteShiftedLeft = MIRBuilder.buildShl(Ty, Src, ShiftAmt); 7661 auto MSByteShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt); 7662 auto Res = MIRBuilder.buildOr(Ty, MSByteShiftedRight, LSByteShiftedLeft); 7663 7664 // Set i-th high/low byte in Res to i-th low/high byte from Src. 7665 for (unsigned i = 1; i < SizeInBytes / 2; ++i) { 7666 // AND with Mask leaves byte i unchanged and sets remaining bytes to 0. 7667 APInt APMask(SizeInBytes * 8, 0xFF << (i * 8)); 7668 auto Mask = MIRBuilder.buildConstant(Ty, APMask); 7669 auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt - 16 * i); 7670 // Low byte shifted left to place of high byte: (Src & Mask) << ShiftAmt. 7671 auto LoByte = MIRBuilder.buildAnd(Ty, Src, Mask); 7672 auto LoShiftedLeft = MIRBuilder.buildShl(Ty, LoByte, ShiftAmt); 7673 Res = MIRBuilder.buildOr(Ty, Res, LoShiftedLeft); 7674 // High byte shifted right to place of low byte: (Src >> ShiftAmt) & Mask. 
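// Illustrative 32-bit trace (assumed input): for Src = 0xAABBCCDD the initial
// swap above yields 0xDD0000AA, and the single loop iteration (i == 1, Mask ==
// 0xFF00, ShiftAmt == 8) ORs in 0x00CC0000 from the low-byte half above and
// 0x0000BB00 from the high-byte half below, giving 0xDDCCBBAA.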
7675 auto SrcShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt); 7676 auto HiShiftedRight = MIRBuilder.buildAnd(Ty, SrcShiftedRight, Mask); 7677 Res = MIRBuilder.buildOr(Ty, Res, HiShiftedRight); 7678 } 7679 Res.getInstr()->getOperand(0).setReg(Dst); 7680 7681 MI.eraseFromParent(); 7682 return Legalized; 7683 } 7684 7685 //{ (Src & Mask) >> N } | { (Src << N) & Mask } 7686 static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B, 7687 MachineInstrBuilder Src, APInt Mask) { 7688 const LLT Ty = Dst.getLLTTy(*B.getMRI()); 7689 MachineInstrBuilder C_N = B.buildConstant(Ty, N); 7690 MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Ty, Mask); 7691 auto LHS = B.buildLShr(Ty, B.buildAnd(Ty, Src, MaskLoNTo0), C_N); 7692 auto RHS = B.buildAnd(Ty, B.buildShl(Ty, Src, C_N), MaskLoNTo0); 7693 return B.buildOr(Dst, LHS, RHS); 7694 } 7695 7696 LegalizerHelper::LegalizeResult 7697 LegalizerHelper::lowerBitreverse(MachineInstr &MI) { 7698 auto [Dst, Src] = MI.getFirst2Regs(); 7699 const LLT Ty = MRI.getType(Src); 7700 unsigned Size = Ty.getSizeInBits(); 7701 7702 MachineInstrBuilder BSWAP = 7703 MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {Ty}, {Src}); 7704 7705 // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654 7706 // [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4] 7707 // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0] 7708 MachineInstrBuilder Swap4 = 7709 SwapN(4, Ty, MIRBuilder, BSWAP, APInt::getSplat(Size, APInt(8, 0xF0))); 7710 7711 // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76 7712 // [(val & 0xCCCCCCCC) >> 2] & [(val & 0x33333333) << 2] 7713 // -> [(val & 0xCCCCCCCC) >> 2] & [(val << 2) & 0xCCCCCCCC] 7714 MachineInstrBuilder Swap2 = 7715 SwapN(2, Ty, MIRBuilder, Swap4, APInt::getSplat(Size, APInt(8, 0xCC))); 7716 7717 // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5 6|7 7718 // [(val & 0xAAAAAAAA) >> 1] & [(val & 0x55555555) << 1] 7719 // -> [(val & 0xAAAAAAAA) >> 1] & [(val << 1) & 0xAAAAAAAA] 7720 SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA))); 7721 7722 MI.eraseFromParent(); 7723 return Legalized; 7724 } 7725 7726 LegalizerHelper::LegalizeResult 7727 LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) { 7728 MachineFunction &MF = MIRBuilder.getMF(); 7729 7730 bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER; 7731 int NameOpIdx = IsRead ? 1 : 0; 7732 int ValRegIndex = IsRead ? 0 : 1; 7733 7734 Register ValReg = MI.getOperand(ValRegIndex).getReg(); 7735 const LLT Ty = MRI.getType(ValReg); 7736 const MDString *RegStr = cast<MDString>( 7737 cast<MDNode>(MI.getOperand(NameOpIdx).getMetadata())->getOperand(0)); 7738 7739 Register PhysReg = TLI.getRegisterByName(RegStr->getString().data(), Ty, MF); 7740 if (!PhysReg.isValid()) 7741 return UnableToLegalize; 7742 7743 if (IsRead) 7744 MIRBuilder.buildCopy(ValReg, PhysReg); 7745 else 7746 MIRBuilder.buildCopy(PhysReg, ValReg); 7747 7748 MI.eraseFromParent(); 7749 return Legalized; 7750 } 7751 7752 LegalizerHelper::LegalizeResult 7753 LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) { 7754 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULH; 7755 unsigned ExtOp = IsSigned ? 
TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT; 7756 Register Result = MI.getOperand(0).getReg(); 7757 LLT OrigTy = MRI.getType(Result); 7758 auto SizeInBits = OrigTy.getScalarSizeInBits(); 7759 LLT WideTy = OrigTy.changeElementSize(SizeInBits * 2); 7760 7761 auto LHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(1)}); 7762 auto RHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(2)}); 7763 auto Mul = MIRBuilder.buildMul(WideTy, LHS, RHS); 7764 unsigned ShiftOp = IsSigned ? TargetOpcode::G_ASHR : TargetOpcode::G_LSHR; 7765 7766 auto ShiftAmt = MIRBuilder.buildConstant(WideTy, SizeInBits); 7767 auto Shifted = MIRBuilder.buildInstr(ShiftOp, {WideTy}, {Mul, ShiftAmt}); 7768 MIRBuilder.buildTrunc(Result, Shifted); 7769 7770 MI.eraseFromParent(); 7771 return Legalized; 7772 } 7773 7774 LegalizerHelper::LegalizeResult 7775 LegalizerHelper::lowerISFPCLASS(MachineInstr &MI) { 7776 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); 7777 FPClassTest Mask = static_cast<FPClassTest>(MI.getOperand(2).getImm()); 7778 7779 if (Mask == fcNone) { 7780 MIRBuilder.buildConstant(DstReg, 0); 7781 MI.eraseFromParent(); 7782 return Legalized; 7783 } 7784 if (Mask == fcAllFlags) { 7785 MIRBuilder.buildConstant(DstReg, 1); 7786 MI.eraseFromParent(); 7787 return Legalized; 7788 } 7789 7790 // TODO: Try inverting the test with getInvertedFPClassTest like the DAG 7791 // version 7792 7793 unsigned BitSize = SrcTy.getScalarSizeInBits(); 7794 const fltSemantics &Semantics = getFltSemanticForLLT(SrcTy.getScalarType()); 7795 7796 LLT IntTy = LLT::scalar(BitSize); 7797 if (SrcTy.isVector()) 7798 IntTy = LLT::vector(SrcTy.getElementCount(), IntTy); 7799 auto AsInt = MIRBuilder.buildCopy(IntTy, SrcReg); 7800 7801 // Various masks. 7802 APInt SignBit = APInt::getSignMask(BitSize); 7803 APInt ValueMask = APInt::getSignedMaxValue(BitSize); // All bits but sign. 7804 APInt Inf = APFloat::getInf(Semantics).bitcastToAPInt(); // Exp and int bit. 7805 APInt ExpMask = Inf; 7806 APInt AllOneMantissa = APFloat::getLargest(Semantics).bitcastToAPInt() & ~Inf; 7807 APInt QNaNBitMask = 7808 APInt::getOneBitSet(BitSize, AllOneMantissa.getActiveBits() - 1); 7809 APInt InvertionMask = APInt::getAllOnes(DstTy.getScalarSizeInBits()); 7810 7811 auto SignBitC = MIRBuilder.buildConstant(IntTy, SignBit); 7812 auto ValueMaskC = MIRBuilder.buildConstant(IntTy, ValueMask); 7813 auto InfC = MIRBuilder.buildConstant(IntTy, Inf); 7814 auto ExpMaskC = MIRBuilder.buildConstant(IntTy, ExpMask); 7815 auto ZeroC = MIRBuilder.buildConstant(IntTy, 0); 7816 7817 auto Abs = MIRBuilder.buildAnd(IntTy, AsInt, ValueMaskC); 7818 auto Sign = 7819 MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_NE, DstTy, AsInt, Abs); 7820 7821 auto Res = MIRBuilder.buildConstant(DstTy, 0); 7822 // Clang doesn't support capture of structured bindings: 7823 LLT DstTyCopy = DstTy; 7824 const auto appendToRes = [&](MachineInstrBuilder ToAppend) { 7825 Res = MIRBuilder.buildOr(DstTyCopy, Res, ToAppend); 7826 }; 7827 7828 // Tests that involve more than one class should be processed first. 
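// Illustrative constants (assuming an IEEE binary32 input): Inf/ExpMask is
// 0x7f800000, ValueMask is 0x7fffffff and QNaNBitMask is 0x00400000, so the
// fcFinite test below reduces to (V & 0x7fffffff) u< 0x7f800000 and the fcNan
// test to (V & 0x7fffffff) u> 0x7f800000.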
7829 if ((Mask & fcFinite) == fcFinite) { 7830 // finite(V) ==> abs(V) u< exp_mask 7831 appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, Abs, 7832 ExpMaskC)); 7833 Mask &= ~fcFinite; 7834 } else if ((Mask & fcFinite) == fcPosFinite) { 7835 // finite(V) && V > 0 ==> V u< exp_mask 7836 appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, AsInt, 7837 ExpMaskC)); 7838 Mask &= ~fcPosFinite; 7839 } else if ((Mask & fcFinite) == fcNegFinite) { 7840 // finite(V) && V < 0 ==> abs(V) u< exp_mask && signbit == 1 7841 auto Cmp = MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, Abs, 7842 ExpMaskC); 7843 auto And = MIRBuilder.buildAnd(DstTy, Cmp, Sign); 7844 appendToRes(And); 7845 Mask &= ~fcNegFinite; 7846 } 7847 7848 if (FPClassTest PartialCheck = Mask & (fcZero | fcSubnormal)) { 7849 // fcZero | fcSubnormal => test all exponent bits are 0 7850 // TODO: Handle sign bit specific cases 7851 // TODO: Handle inverted case 7852 if (PartialCheck == (fcZero | fcSubnormal)) { 7853 auto ExpBits = MIRBuilder.buildAnd(IntTy, AsInt, ExpMaskC); 7854 appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, 7855 ExpBits, ZeroC)); 7856 Mask &= ~PartialCheck; 7857 } 7858 } 7859 7860 // Check for individual classes. 7861 if (FPClassTest PartialCheck = Mask & fcZero) { 7862 if (PartialCheck == fcPosZero) 7863 appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, 7864 AsInt, ZeroC)); 7865 else if (PartialCheck == fcZero) 7866 appendToRes( 7867 MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, ZeroC)); 7868 else // fcNegZero 7869 appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, 7870 AsInt, SignBitC)); 7871 } 7872 7873 if (FPClassTest PartialCheck = Mask & fcSubnormal) { 7874 // issubnormal(V) ==> unsigned(abs(V) - 1) u< (all mantissa bits set) 7875 // issubnormal(V) && V>0 ==> unsigned(V - 1) u< (all mantissa bits set) 7876 auto V = (PartialCheck == fcPosSubnormal) ? 
AsInt : Abs; 7877 auto OneC = MIRBuilder.buildConstant(IntTy, 1); 7878 auto VMinusOne = MIRBuilder.buildSub(IntTy, V, OneC); 7879 auto SubnormalRes = 7880 MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, VMinusOne, 7881 MIRBuilder.buildConstant(IntTy, AllOneMantissa)); 7882 if (PartialCheck == fcNegSubnormal) 7883 SubnormalRes = MIRBuilder.buildAnd(DstTy, SubnormalRes, Sign); 7884 appendToRes(SubnormalRes); 7885 } 7886 7887 if (FPClassTest PartialCheck = Mask & fcInf) { 7888 if (PartialCheck == fcPosInf) 7889 appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, 7890 AsInt, InfC)); 7891 else if (PartialCheck == fcInf) 7892 appendToRes( 7893 MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, InfC)); 7894 else { // fcNegInf 7895 APInt NegInf = APFloat::getInf(Semantics, true).bitcastToAPInt(); 7896 auto NegInfC = MIRBuilder.buildConstant(IntTy, NegInf); 7897 appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, 7898 AsInt, NegInfC)); 7899 } 7900 } 7901 7902 if (FPClassTest PartialCheck = Mask & fcNan) { 7903 auto InfWithQnanBitC = MIRBuilder.buildConstant(IntTy, Inf | QNaNBitMask); 7904 if (PartialCheck == fcNan) { 7905 // isnan(V) ==> abs(V) u> int(inf) 7906 appendToRes( 7907 MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC)); 7908 } else if (PartialCheck == fcQNan) { 7909 // isquiet(V) ==> abs(V) u>= (unsigned(Inf) | quiet_bit) 7910 appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGE, DstTy, Abs, 7911 InfWithQnanBitC)); 7912 } else { // fcSNan 7913 // issignaling(V) ==> abs(V) u> unsigned(Inf) && 7914 // abs(V) u< (unsigned(Inf) | quiet_bit) 7915 auto IsNan = 7916 MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC); 7917 auto IsNotQnan = MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, 7918 Abs, InfWithQnanBitC); 7919 appendToRes(MIRBuilder.buildAnd(DstTy, IsNan, IsNotQnan)); 7920 } 7921 } 7922 7923 if (FPClassTest PartialCheck = Mask & fcNormal) { 7924 // isnormal(V) ==> (0 u< exp u< max_exp) ==> (unsigned(exp-1) u< 7925 // (max_exp-1)) 7926 APInt ExpLSB = ExpMask & ~(ExpMask.shl(1)); 7927 auto ExpMinusOne = MIRBuilder.buildSub( 7928 IntTy, Abs, MIRBuilder.buildConstant(IntTy, ExpLSB)); 7929 APInt MaxExpMinusOne = ExpMask - ExpLSB; 7930 auto NormalRes = 7931 MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, ExpMinusOne, 7932 MIRBuilder.buildConstant(IntTy, MaxExpMinusOne)); 7933 if (PartialCheck == fcNegNormal) 7934 NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, Sign); 7935 else if (PartialCheck == fcPosNormal) { 7936 auto PosSign = MIRBuilder.buildXor( 7937 DstTy, Sign, MIRBuilder.buildConstant(DstTy, InvertionMask)); 7938 NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, PosSign); 7939 } 7940 appendToRes(NormalRes); 7941 } 7942 7943 MIRBuilder.buildCopy(DstReg, Res); 7944 MI.eraseFromParent(); 7945 return Legalized; 7946 } 7947 7948 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) { 7949 // Implement vector G_SELECT in terms of XOR, AND, OR. 
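  // i.e. build Res = (Mask & Op1) | (~Mask & Op2). This relies on the mask
  // elements being all-ones or all-zeroes, so a scalar condition is first
  // sign-extended and splatted across the vector below.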
7950 auto [DstReg, DstTy, MaskReg, MaskTy, Op1Reg, Op1Ty, Op2Reg, Op2Ty] = 7951 MI.getFirst4RegLLTs(); 7952 if (!DstTy.isVector()) 7953 return UnableToLegalize; 7954 7955 bool IsEltPtr = DstTy.getElementType().isPointer(); 7956 if (IsEltPtr) { 7957 LLT ScalarPtrTy = LLT::scalar(DstTy.getScalarSizeInBits()); 7958 LLT NewTy = DstTy.changeElementType(ScalarPtrTy); 7959 Op1Reg = MIRBuilder.buildPtrToInt(NewTy, Op1Reg).getReg(0); 7960 Op2Reg = MIRBuilder.buildPtrToInt(NewTy, Op2Reg).getReg(0); 7961 DstTy = NewTy; 7962 } 7963 7964 if (MaskTy.isScalar()) { 7965 // Turn the scalar condition into a vector condition mask. 7966 7967 Register MaskElt = MaskReg; 7968 7969 // The condition was potentially zero extended before, but we want a sign 7970 // extended boolean. 7971 if (MaskTy != LLT::scalar(1)) 7972 MaskElt = MIRBuilder.buildSExtInReg(MaskTy, MaskElt, 1).getReg(0); 7973 7974 // Continue the sign extension (or truncate) to match the data type. 7975 MaskElt = MIRBuilder.buildSExtOrTrunc(DstTy.getElementType(), 7976 MaskElt).getReg(0); 7977 7978 // Generate a vector splat idiom. 7979 auto ShufSplat = MIRBuilder.buildShuffleSplat(DstTy, MaskElt); 7980 MaskReg = ShufSplat.getReg(0); 7981 MaskTy = DstTy; 7982 } 7983 7984 if (MaskTy.getSizeInBits() != DstTy.getSizeInBits()) { 7985 return UnableToLegalize; 7986 } 7987 7988 auto NotMask = MIRBuilder.buildNot(MaskTy, MaskReg); 7989 auto NewOp1 = MIRBuilder.buildAnd(MaskTy, Op1Reg, MaskReg); 7990 auto NewOp2 = MIRBuilder.buildAnd(MaskTy, Op2Reg, NotMask); 7991 if (IsEltPtr) { 7992 auto Or = MIRBuilder.buildOr(DstTy, NewOp1, NewOp2); 7993 MIRBuilder.buildIntToPtr(DstReg, Or); 7994 } else { 7995 MIRBuilder.buildOr(DstReg, NewOp1, NewOp2); 7996 } 7997 MI.eraseFromParent(); 7998 return Legalized; 7999 } 8000 8001 LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) { 8002 // Split DIVREM into individual instructions. 8003 unsigned Opcode = MI.getOpcode(); 8004 8005 MIRBuilder.buildInstr( 8006 Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SDIV 8007 : TargetOpcode::G_UDIV, 8008 {MI.getOperand(0).getReg()}, {MI.getOperand(2), MI.getOperand(3)}); 8009 MIRBuilder.buildInstr( 8010 Opcode == TargetOpcode::G_SDIVREM ? 
TargetOpcode::G_SREM
                                        : TargetOpcode::G_UREM,
      {MI.getOperand(1).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
  // Expand %res = G_ABS %a into:
  // %v1 = G_ASHR %a, scalar_size-1
  // %v2 = G_ADD %a, %v1
  // %res = G_XOR %v2, %v1
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
  Register OpReg = MI.getOperand(1).getReg();
  auto ShiftAmt =
      MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - 1);
  auto Shift = MIRBuilder.buildAShr(DstTy, OpReg, ShiftAmt);
  auto Add = MIRBuilder.buildAdd(DstTy, OpReg, Shift);
  MIRBuilder.buildXor(MI.getOperand(0).getReg(), Add, Shift);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
  // Expand %res = G_ABS %a into:
  // %v1 = G_CONSTANT 0
  // %v2 = G_SUB %v1, %a
  // %res = G_SMAX %a, %v2
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(SrcReg);
  auto Zero = MIRBuilder.buildConstant(Ty, 0).getReg(0);
  auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg).getReg(0);
  MIRBuilder.buildSMax(MI.getOperand(0), SrcReg, Sub);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
  Register SrcReg = MI.getOperand(1).getReg();
  LLT SrcTy = MRI.getType(SrcReg);
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());

  // The source could be a scalar if the IR type was <1 x sN>.
  if (SrcTy.isScalar()) {
    if (DstTy.getSizeInBits() > SrcTy.getSizeInBits())
      return UnableToLegalize; // FIXME: handle extension.
    // This can be just a plain copy.
    Observer.changingInstr(MI);
    MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::COPY));
    Observer.changedInstr(MI);
    return Legalized;
  }
  return UnableToLegalize;
}

static Type *getTypeForLLT(LLT Ty, LLVMContext &C);

LegalizerHelper::LegalizeResult LegalizerHelper::lowerVAArg(MachineInstr &MI) {
  MachineFunction &MF = *MI.getMF();
  const DataLayout &DL = MIRBuilder.getDataLayout();
  LLVMContext &Ctx = MF.getFunction().getContext();
  Register ListPtr = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(ListPtr);

  // ListPtr is a pointer to the head of the list. Load it to get the current
  // head of the list.
  Align PtrAlignment = DL.getABITypeAlign(getTypeForLLT(PtrTy, Ctx));
  MachineMemOperand *PtrLoadMMO = MF.getMachineMemOperand(
      MachinePointerInfo(), MachineMemOperand::MOLoad, PtrTy, PtrAlignment);
  auto VAList = MIRBuilder.buildLoad(PtrTy, ListPtr, *PtrLoadMMO).getReg(0);

  const Align A(MI.getOperand(2).getImm());
  LLT PtrTyAsScalarTy = LLT::scalar(PtrTy.getSizeInBits());
  if (A > TLI.getMinStackArgumentAlignment()) {
    Register AlignAmt =
        MIRBuilder.buildConstant(PtrTyAsScalarTy, A.value() - 1).getReg(0);
    auto AddDst = MIRBuilder.buildPtrAdd(PtrTy, VAList, AlignAmt);
    auto AndDst = MIRBuilder.buildMaskLowPtrBits(PtrTy, AddDst, Log2(A));
    VAList = AndDst.getReg(0);
  }

  // Increment the pointer, VAList, to the next vaarg.
  // The list should be bumped by the size of the element in the current head
  // of the list.
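  // e.g. for a 4 byte element the head pointer is advanced by
  // DL.getTypeAllocSize(Ty) == 4, after any alignment padding applied above.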
8097 Register Dst = MI.getOperand(0).getReg(); 8098 LLT LLTTy = MRI.getType(Dst); 8099 Type *Ty = getTypeForLLT(LLTTy, Ctx); 8100 auto IncAmt = 8101 MIRBuilder.buildConstant(PtrTyAsScalarTy, DL.getTypeAllocSize(Ty)); 8102 auto Succ = MIRBuilder.buildPtrAdd(PtrTy, VAList, IncAmt); 8103 8104 // Store the increment VAList to the legalized pointer 8105 MachineMemOperand *StoreMMO = MF.getMachineMemOperand( 8106 MachinePointerInfo(), MachineMemOperand::MOStore, PtrTy, PtrAlignment); 8107 MIRBuilder.buildStore(Succ, ListPtr, *StoreMMO); 8108 // Load the actual argument out of the pointer VAList 8109 Align EltAlignment = DL.getABITypeAlign(Ty); 8110 MachineMemOperand *EltLoadMMO = MF.getMachineMemOperand( 8111 MachinePointerInfo(), MachineMemOperand::MOLoad, LLTTy, EltAlignment); 8112 MIRBuilder.buildLoad(Dst, VAList, *EltLoadMMO); 8113 8114 MI.eraseFromParent(); 8115 return Legalized; 8116 } 8117 8118 static bool shouldLowerMemFuncForSize(const MachineFunction &MF) { 8119 // On Darwin, -Os means optimize for size without hurting performance, so 8120 // only really optimize for size when -Oz (MinSize) is used. 8121 if (MF.getTarget().getTargetTriple().isOSDarwin()) 8122 return MF.getFunction().hasMinSize(); 8123 return MF.getFunction().hasOptSize(); 8124 } 8125 8126 // Returns a list of types to use for memory op lowering in MemOps. A partial 8127 // port of findOptimalMemOpLowering in TargetLowering. 8128 static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps, 8129 unsigned Limit, const MemOp &Op, 8130 unsigned DstAS, unsigned SrcAS, 8131 const AttributeList &FuncAttributes, 8132 const TargetLowering &TLI) { 8133 if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign()) 8134 return false; 8135 8136 LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes); 8137 8138 if (Ty == LLT()) { 8139 // Use the largest scalar type whose alignment constraints are satisfied. 8140 // We only need to check DstAlign here as SrcAlign is always greater or 8141 // equal to DstAlign (or zero). 8142 Ty = LLT::scalar(64); 8143 if (Op.isFixedDstAlign()) 8144 while (Op.getDstAlign() < Ty.getSizeInBytes() && 8145 !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, Op.getDstAlign())) 8146 Ty = LLT::scalar(Ty.getSizeInBytes()); 8147 assert(Ty.getSizeInBits() > 0 && "Could not find valid type"); 8148 // FIXME: check for the largest legal type we can load/store to. 8149 } 8150 8151 unsigned NumMemOps = 0; 8152 uint64_t Size = Op.size(); 8153 while (Size) { 8154 unsigned TySize = Ty.getSizeInBytes(); 8155 while (TySize > Size) { 8156 // For now, only use non-vector load / store's for the left-over pieces. 8157 LLT NewTy = Ty; 8158 // FIXME: check for mem op safety and legality of the types. Not all of 8159 // SDAGisms map cleanly to GISel concepts. 8160 if (NewTy.isVector()) 8161 NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32); 8162 NewTy = LLT::scalar(llvm::bit_floor(NewTy.getSizeInBits() - 1)); 8163 unsigned NewTySize = NewTy.getSizeInBytes(); 8164 assert(NewTySize > 0 && "Could not find appropriate type"); 8165 8166 // If the new LLT cannot cover all of the remaining bits, then consider 8167 // issuing a (or a pair of) unaligned and overlapping load / store. 8168 unsigned Fast; 8169 // Need to get a VT equivalent for allowMisalignedMemoryAccesses(). 8170 MVT VT = getMVTForLLT(Ty); 8171 if (NumMemOps && Op.allowOverlap() && NewTySize < Size && 8172 TLI.allowsMisalignedMemoryAccesses( 8173 VT, DstAS, Op.isFixedDstAlign() ? 
Op.getDstAlign() : Align(1), 8174 MachineMemOperand::MONone, &Fast) && 8175 Fast) 8176 TySize = Size; 8177 else { 8178 Ty = NewTy; 8179 TySize = NewTySize; 8180 } 8181 } 8182 8183 if (++NumMemOps > Limit) 8184 return false; 8185 8186 MemOps.push_back(Ty); 8187 Size -= TySize; 8188 } 8189 8190 return true; 8191 } 8192 8193 static Type *getTypeForLLT(LLT Ty, LLVMContext &C) { 8194 if (Ty.isVector()) 8195 return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()), 8196 Ty.getNumElements()); 8197 return IntegerType::get(C, Ty.getSizeInBits()); 8198 } 8199 8200 // Get a vectorized representation of the memset value operand, GISel edition. 8201 static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) { 8202 MachineRegisterInfo &MRI = *MIB.getMRI(); 8203 unsigned NumBits = Ty.getScalarSizeInBits(); 8204 auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI); 8205 if (!Ty.isVector() && ValVRegAndVal) { 8206 APInt Scalar = ValVRegAndVal->Value.trunc(8); 8207 APInt SplatVal = APInt::getSplat(NumBits, Scalar); 8208 return MIB.buildConstant(Ty, SplatVal).getReg(0); 8209 } 8210 8211 // Extend the byte value to the larger type, and then multiply by a magic 8212 // value 0x010101... in order to replicate it across every byte. 8213 // Unless it's zero, in which case just emit a larger G_CONSTANT 0. 8214 if (ValVRegAndVal && ValVRegAndVal->Value == 0) { 8215 return MIB.buildConstant(Ty, 0).getReg(0); 8216 } 8217 8218 LLT ExtType = Ty.getScalarType(); 8219 auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val); 8220 if (NumBits > 8) { 8221 APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01)); 8222 auto MagicMI = MIB.buildConstant(ExtType, Magic); 8223 Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0); 8224 } 8225 8226 // For vector types create a G_BUILD_VECTOR. 8227 if (Ty.isVector()) 8228 Val = MIB.buildSplatVector(Ty, Val).getReg(0); 8229 8230 return Val; 8231 } 8232 8233 LegalizerHelper::LegalizeResult 8234 LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val, 8235 uint64_t KnownLen, Align Alignment, 8236 bool IsVolatile) { 8237 auto &MF = *MI.getParent()->getParent(); 8238 const auto &TLI = *MF.getSubtarget().getTargetLowering(); 8239 auto &DL = MF.getDataLayout(); 8240 LLVMContext &C = MF.getFunction().getContext(); 8241 8242 assert(KnownLen != 0 && "Have a zero length memset length!"); 8243 8244 bool DstAlignCanChange = false; 8245 MachineFrameInfo &MFI = MF.getFrameInfo(); 8246 bool OptSize = shouldLowerMemFuncForSize(MF); 8247 8248 MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI); 8249 if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex())) 8250 DstAlignCanChange = true; 8251 8252 unsigned Limit = TLI.getMaxStoresPerMemset(OptSize); 8253 std::vector<LLT> MemOps; 8254 8255 const auto &DstMMO = **MI.memoperands_begin(); 8256 MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo(); 8257 8258 auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI); 8259 bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0; 8260 8261 if (!findGISelOptimalMemOpLowering(MemOps, Limit, 8262 MemOp::Set(KnownLen, DstAlignCanChange, 8263 Alignment, 8264 /*IsZeroMemset=*/IsZeroVal, 8265 /*IsVolatile=*/IsVolatile), 8266 DstPtrInfo.getAddrSpace(), ~0u, 8267 MF.getFunction().getAttributes(), TLI)) 8268 return UnableToLegalize; 8269 8270 if (DstAlignCanChange) { 8271 // Get an estimate of the type from the LLT. 
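    // The IR type of the first chosen memop (MemOps[0]) decides whether the
    // destination frame object's alignment is worth raising below.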
8272 Type *IRTy = getTypeForLLT(MemOps[0], C); 8273 Align NewAlign = DL.getABITypeAlign(IRTy); 8274 if (NewAlign > Alignment) { 8275 Alignment = NewAlign; 8276 unsigned FI = FIDef->getOperand(1).getIndex(); 8277 // Give the stack frame object a larger alignment if needed. 8278 if (MFI.getObjectAlign(FI) < Alignment) 8279 MFI.setObjectAlignment(FI, Alignment); 8280 } 8281 } 8282 8283 MachineIRBuilder MIB(MI); 8284 // Find the largest store and generate the bit pattern for it. 8285 LLT LargestTy = MemOps[0]; 8286 for (unsigned i = 1; i < MemOps.size(); i++) 8287 if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits()) 8288 LargestTy = MemOps[i]; 8289 8290 // The memset stored value is always defined as an s8, so in order to make it 8291 // work with larger store types we need to repeat the bit pattern across the 8292 // wider type. 8293 Register MemSetValue = getMemsetValue(Val, LargestTy, MIB); 8294 8295 if (!MemSetValue) 8296 return UnableToLegalize; 8297 8298 // Generate the stores. For each store type in the list, we generate the 8299 // matching store of that type to the destination address. 8300 LLT PtrTy = MRI.getType(Dst); 8301 unsigned DstOff = 0; 8302 unsigned Size = KnownLen; 8303 for (unsigned I = 0; I < MemOps.size(); I++) { 8304 LLT Ty = MemOps[I]; 8305 unsigned TySize = Ty.getSizeInBytes(); 8306 if (TySize > Size) { 8307 // Issuing an unaligned load / store pair that overlaps with the previous 8308 // pair. Adjust the offset accordingly. 8309 assert(I == MemOps.size() - 1 && I != 0); 8310 DstOff -= TySize - Size; 8311 } 8312 8313 // If this store is smaller than the largest store see whether we can get 8314 // the smaller value for free with a truncate. 8315 Register Value = MemSetValue; 8316 if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) { 8317 MVT VT = getMVTForLLT(Ty); 8318 MVT LargestVT = getMVTForLLT(LargestTy); 8319 if (!LargestTy.isVector() && !Ty.isVector() && 8320 TLI.isTruncateFree(LargestVT, VT)) 8321 Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0); 8322 else 8323 Value = getMemsetValue(Val, Ty, MIB); 8324 if (!Value) 8325 return UnableToLegalize; 8326 } 8327 8328 auto *StoreMMO = MF.getMachineMemOperand(&DstMMO, DstOff, Ty); 8329 8330 Register Ptr = Dst; 8331 if (DstOff != 0) { 8332 auto Offset = 8333 MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff); 8334 Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0); 8335 } 8336 8337 MIB.buildStore(Value, Ptr, *StoreMMO); 8338 DstOff += Ty.getSizeInBytes(); 8339 Size -= TySize; 8340 } 8341 8342 MI.eraseFromParent(); 8343 return Legalized; 8344 } 8345 8346 LegalizerHelper::LegalizeResult 8347 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) { 8348 assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE); 8349 8350 auto [Dst, Src, Len] = MI.getFirst3Regs(); 8351 8352 const auto *MMOIt = MI.memoperands_begin(); 8353 const MachineMemOperand *MemOp = *MMOIt; 8354 bool IsVolatile = MemOp->isVolatile(); 8355 8356 // See if this is a constant length copy 8357 auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI); 8358 // FIXME: support dynamically sized G_MEMCPY_INLINE 8359 assert(LenVRegAndVal && 8360 "inline memcpy with dynamic size is not yet supported"); 8361 uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue(); 8362 if (KnownLen == 0) { 8363 MI.eraseFromParent(); 8364 return Legalized; 8365 } 8366 8367 const auto &DstMMO = **MI.memoperands_begin(); 8368 const auto &SrcMMO = **std::next(MI.memoperands_begin()); 8369 Align DstAlign = DstMMO.getBaseAlign(); 8370 Align SrcAlign = 
SrcMMO.getBaseAlign(); 8371 8372 return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, 8373 IsVolatile); 8374 } 8375 8376 LegalizerHelper::LegalizeResult 8377 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src, 8378 uint64_t KnownLen, Align DstAlign, 8379 Align SrcAlign, bool IsVolatile) { 8380 assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE); 8381 return lowerMemcpy(MI, Dst, Src, KnownLen, 8382 std::numeric_limits<uint64_t>::max(), DstAlign, SrcAlign, 8383 IsVolatile); 8384 } 8385 8386 LegalizerHelper::LegalizeResult 8387 LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src, 8388 uint64_t KnownLen, uint64_t Limit, Align DstAlign, 8389 Align SrcAlign, bool IsVolatile) { 8390 auto &MF = *MI.getParent()->getParent(); 8391 const auto &TLI = *MF.getSubtarget().getTargetLowering(); 8392 auto &DL = MF.getDataLayout(); 8393 LLVMContext &C = MF.getFunction().getContext(); 8394 8395 assert(KnownLen != 0 && "Have a zero length memcpy length!"); 8396 8397 bool DstAlignCanChange = false; 8398 MachineFrameInfo &MFI = MF.getFrameInfo(); 8399 Align Alignment = std::min(DstAlign, SrcAlign); 8400 8401 MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI); 8402 if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex())) 8403 DstAlignCanChange = true; 8404 8405 // FIXME: infer better src pointer alignment like SelectionDAG does here. 8406 // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining 8407 // if the memcpy is in a tail call position. 8408 8409 std::vector<LLT> MemOps; 8410 8411 const auto &DstMMO = **MI.memoperands_begin(); 8412 const auto &SrcMMO = **std::next(MI.memoperands_begin()); 8413 MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo(); 8414 MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo(); 8415 8416 if (!findGISelOptimalMemOpLowering( 8417 MemOps, Limit, 8418 MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign, 8419 IsVolatile), 8420 DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(), 8421 MF.getFunction().getAttributes(), TLI)) 8422 return UnableToLegalize; 8423 8424 if (DstAlignCanChange) { 8425 // Get an estimate of the type from the LLT. 8426 Type *IRTy = getTypeForLLT(MemOps[0], C); 8427 Align NewAlign = DL.getABITypeAlign(IRTy); 8428 8429 // Don't promote to an alignment that would require dynamic stack 8430 // realignment. 8431 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); 8432 if (!TRI->hasStackRealignment(MF)) 8433 while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign)) 8434 NewAlign = NewAlign.previous(); 8435 8436 if (NewAlign > Alignment) { 8437 Alignment = NewAlign; 8438 unsigned FI = FIDef->getOperand(1).getIndex(); 8439 // Give the stack frame object a larger alignment if needed. 8440 if (MFI.getObjectAlign(FI) < Alignment) 8441 MFI.setObjectAlignment(FI, Alignment); 8442 } 8443 } 8444 8445 LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n"); 8446 8447 MachineIRBuilder MIB(MI); 8448 // Now we need to emit a pair of load and stores for each of the types we've 8449 // collected. I.e. for each type, generate a load from the source pointer of 8450 // that type width, and then generate a corresponding store to the dest buffer 8451 // of that value loaded. This can result in a sequence of loads and stores 8452 // mixed types, depending on what the target specifies as good types to use. 
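  // e.g. a 7 byte copy on a target that prefers 4 byte accesses may become a
  // 4 byte load/store at offset 0 plus an overlapping 4 byte load/store at
  // offset 3, rather than separate 4 + 2 + 1 byte accesses.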
8453 unsigned CurrOffset = 0; 8454 unsigned Size = KnownLen; 8455 for (auto CopyTy : MemOps) { 8456 // Issuing an unaligned load / store pair that overlaps with the previous 8457 // pair. Adjust the offset accordingly. 8458 if (CopyTy.getSizeInBytes() > Size) 8459 CurrOffset -= CopyTy.getSizeInBytes() - Size; 8460 8461 // Construct MMOs for the accesses. 8462 auto *LoadMMO = 8463 MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes()); 8464 auto *StoreMMO = 8465 MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes()); 8466 8467 // Create the load. 8468 Register LoadPtr = Src; 8469 Register Offset; 8470 if (CurrOffset != 0) { 8471 LLT SrcTy = MRI.getType(Src); 8472 Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset) 8473 .getReg(0); 8474 LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0); 8475 } 8476 auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO); 8477 8478 // Create the store. 8479 Register StorePtr = Dst; 8480 if (CurrOffset != 0) { 8481 LLT DstTy = MRI.getType(Dst); 8482 StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0); 8483 } 8484 MIB.buildStore(LdVal, StorePtr, *StoreMMO); 8485 CurrOffset += CopyTy.getSizeInBytes(); 8486 Size -= CopyTy.getSizeInBytes(); 8487 } 8488 8489 MI.eraseFromParent(); 8490 return Legalized; 8491 } 8492 8493 LegalizerHelper::LegalizeResult 8494 LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src, 8495 uint64_t KnownLen, Align DstAlign, Align SrcAlign, 8496 bool IsVolatile) { 8497 auto &MF = *MI.getParent()->getParent(); 8498 const auto &TLI = *MF.getSubtarget().getTargetLowering(); 8499 auto &DL = MF.getDataLayout(); 8500 LLVMContext &C = MF.getFunction().getContext(); 8501 8502 assert(KnownLen != 0 && "Have a zero length memmove length!"); 8503 8504 bool DstAlignCanChange = false; 8505 MachineFrameInfo &MFI = MF.getFrameInfo(); 8506 bool OptSize = shouldLowerMemFuncForSize(MF); 8507 Align Alignment = std::min(DstAlign, SrcAlign); 8508 8509 MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI); 8510 if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex())) 8511 DstAlignCanChange = true; 8512 8513 unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize); 8514 std::vector<LLT> MemOps; 8515 8516 const auto &DstMMO = **MI.memoperands_begin(); 8517 const auto &SrcMMO = **std::next(MI.memoperands_begin()); 8518 MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo(); 8519 MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo(); 8520 8521 // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due 8522 // to a bug in it's findOptimalMemOpLowering implementation. For now do the 8523 // same thing here. 8524 if (!findGISelOptimalMemOpLowering( 8525 MemOps, Limit, 8526 MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign, 8527 /*IsVolatile*/ true), 8528 DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(), 8529 MF.getFunction().getAttributes(), TLI)) 8530 return UnableToLegalize; 8531 8532 if (DstAlignCanChange) { 8533 // Get an estimate of the type from the LLT. 8534 Type *IRTy = getTypeForLLT(MemOps[0], C); 8535 Align NewAlign = DL.getABITypeAlign(IRTy); 8536 8537 // Don't promote to an alignment that would require dynamic stack 8538 // realignment. 
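    // If the function cannot realign the stack, clamp the candidate alignment
    // so it does not exceed the natural stack alignment before comparing it
    // against the current alignment below.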
8539 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); 8540 if (!TRI->hasStackRealignment(MF)) 8541 while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign)) 8542 NewAlign = NewAlign.previous(); 8543 8544 if (NewAlign > Alignment) { 8545 Alignment = NewAlign; 8546 unsigned FI = FIDef->getOperand(1).getIndex(); 8547 // Give the stack frame object a larger alignment if needed. 8548 if (MFI.getObjectAlign(FI) < Alignment) 8549 MFI.setObjectAlignment(FI, Alignment); 8550 } 8551 } 8552 8553 LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n"); 8554 8555 MachineIRBuilder MIB(MI); 8556 // Memmove requires that we perform the loads first before issuing the stores. 8557 // Apart from that, this loop is pretty much doing the same thing as the 8558 // memcpy codegen function. 8559 unsigned CurrOffset = 0; 8560 SmallVector<Register, 16> LoadVals; 8561 for (auto CopyTy : MemOps) { 8562 // Construct MMO for the load. 8563 auto *LoadMMO = 8564 MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes()); 8565 8566 // Create the load. 8567 Register LoadPtr = Src; 8568 if (CurrOffset != 0) { 8569 LLT SrcTy = MRI.getType(Src); 8570 auto Offset = 8571 MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset); 8572 LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0); 8573 } 8574 LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0)); 8575 CurrOffset += CopyTy.getSizeInBytes(); 8576 } 8577 8578 CurrOffset = 0; 8579 for (unsigned I = 0; I < MemOps.size(); ++I) { 8580 LLT CopyTy = MemOps[I]; 8581 // Now store the values loaded. 8582 auto *StoreMMO = 8583 MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes()); 8584 8585 Register StorePtr = Dst; 8586 if (CurrOffset != 0) { 8587 LLT DstTy = MRI.getType(Dst); 8588 auto Offset = 8589 MIB.buildConstant(LLT::scalar(DstTy.getSizeInBits()), CurrOffset); 8590 StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0); 8591 } 8592 MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO); 8593 CurrOffset += CopyTy.getSizeInBytes(); 8594 } 8595 MI.eraseFromParent(); 8596 return Legalized; 8597 } 8598 8599 LegalizerHelper::LegalizeResult 8600 LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) { 8601 const unsigned Opc = MI.getOpcode(); 8602 // This combine is fairly complex so it's not written with a separate 8603 // matcher function. 8604 assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE || 8605 Opc == TargetOpcode::G_MEMSET) && 8606 "Expected memcpy like instruction"); 8607 8608 auto MMOIt = MI.memoperands_begin(); 8609 const MachineMemOperand *MemOp = *MMOIt; 8610 8611 Align DstAlign = MemOp->getBaseAlign(); 8612 Align SrcAlign; 8613 auto [Dst, Src, Len] = MI.getFirst3Regs(); 8614 8615 if (Opc != TargetOpcode::G_MEMSET) { 8616 assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI"); 8617 MemOp = *(++MMOIt); 8618 SrcAlign = MemOp->getBaseAlign(); 8619 } 8620 8621 // See if this is a constant length copy 8622 auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI); 8623 if (!LenVRegAndVal) 8624 return UnableToLegalize; 8625 uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue(); 8626 8627 if (KnownLen == 0) { 8628 MI.eraseFromParent(); 8629 return Legalized; 8630 } 8631 8632 bool IsVolatile = MemOp->isVolatile(); 8633 if (Opc == TargetOpcode::G_MEMCPY_INLINE) 8634 return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, 8635 IsVolatile); 8636 8637 // Don't try to optimize volatile. 
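  // Expanding a volatile mem op into discrete loads and stores could change
  // the access widths and ordering the user asked for, so leave it for the
  // libcall path.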
8638 if (IsVolatile) 8639 return UnableToLegalize; 8640 8641 if (MaxLen && KnownLen > MaxLen) 8642 return UnableToLegalize; 8643 8644 if (Opc == TargetOpcode::G_MEMCPY) { 8645 auto &MF = *MI.getParent()->getParent(); 8646 const auto &TLI = *MF.getSubtarget().getTargetLowering(); 8647 bool OptSize = shouldLowerMemFuncForSize(MF); 8648 uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize); 8649 return lowerMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign, 8650 IsVolatile); 8651 } 8652 if (Opc == TargetOpcode::G_MEMMOVE) 8653 return lowerMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile); 8654 if (Opc == TargetOpcode::G_MEMSET) 8655 return lowerMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile); 8656 return UnableToLegalize; 8657 } 8658